def evaluate_classifier(predicted_output, y_test):
    """Compute accuracy, precision, recall and F1 score for a set of predictions."""
    # Unwrap the labels (e.g. a pandas Series) into a plain array before scoring
    y_test = y_test.array
    accuracy = evaluation_metrics.accuracy(y_test, predicted_output)
    precision = evaluation_metrics.precision(y_test, predicted_output)
    recall = evaluation_metrics.recall(y_test, predicted_output)
    # Note the argument order: this project's f1_score takes (recall, precision)
    f1_score = evaluation_metrics.f1_score(recall, precision)
    return accuracy, precision, recall, f1_score
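
# The evaluation_metrics helpers used above are project code not shown here;
# the sketch below gives the standard definitions they are assumed to
# implement, with spam = 1 as the positive class (matching the labels built
# in the evaluators further down).
def sketch_accuracy(actual, predicted):
    correct = sum(1 for a, p in zip(actual, predicted) if a == p)
    return float(correct) / len(actual)

def sketch_precision(actual, predicted):
    true_positives = sum(1 for a, p in zip(actual, predicted) if a == 1 and p == 1)
    predicted_positives = sum(1 for p in predicted if p == 1)
    return float(true_positives) / predicted_positives if predicted_positives else 0.0

def sketch_recall(actual, predicted):
    true_positives = sum(1 for a, p in zip(actual, predicted) if a == 1 and p == 1)
    actual_positives = sum(1 for a in actual if a == 1)
    return float(true_positives) / actual_positives if actual_positives else 0.0

def sketch_f1_score(recall, precision):  # recall first, matching the calls above
    denominator = precision + recall
    return 2.0 * precision * recall / denominator if denominator else 0.0
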
def evaluate_MCAP_bernoulli_model(dataset_name):
    """
    This is the method used for evaluation of multinomial NB on a particular dataset
    :param dataset_name: This is the given dataset name
    :return: All the evaluation metrics
    """
    # We first import training data for the training
    try:
        spam_email_bernoulli_model1, ham_email_bernoulli_model1, spam_mail_in_all_documents, ham_mail_in_all_documents, size_of_total_dataset, size_of_spam_dataset, size_of_ham_dataset, total_file_dictionary = bernoulli_model.convert_to_bernoulli_model(
            dataset_name, True)
    except:
        print("You have given a wrong file name, please check and run again")
        exit(-1)
    # First split the training data into training and validation sets
    train_data, validation_data = MCAP_logistic_regression.divide_into_validation_and_train(spam_email_bernoulli_model1,
                                                                                            ham_email_bernoulli_model1)
    # Find the regularization parameter lambda via grid search on the validation set (a sketch of this search appears after this function)
    lambda_parameter = MCAP_logistic_regression.mcap_validation(train_data, validation_data, total_file_dictionary)
    alpha_value = 0.01  # learning rate used during training
    # Merge the training and validation data back together for the final fit
    train_data = train_data + validation_data
    # Learn the weights on the full training data
    weights = MCAP_logistic_regression.mcap_logistic_regression_train(train_data, total_file_dictionary, alpha_value,
                                                                      lambda_parameter, 500)
    # We now import the data for testing
    spam_email_bernoulli_model_test, ham_email_bernoulli_model_test, spam_mail_in_all_documents_test, ham_mail_in_all_documents_test, size_of_total_dataset_test, size_of_spam_dataset_test, size_of_ham_dataset_test, total_file_dictionary_test = bernoulli_model.convert_to_bernoulli_model(
        dataset_name, False)
    spam_predict = []
    # Predict a label for every document in the test set
    for each_document in spam_email_bernoulli_model_test:
        spam_predict.append(MCAP_logistic_regression.mcap_logistic_regression_test(each_document, weights))
    # Spam is taken as the positive class, labeled 1
    spam_actual = [1] * len(spam_predict)
    ham_predict = []
    for each_document in ham_email_bernoulli_model_test:
        ham_predict.append(MCAP_logistic_regression.mcap_logistic_regression_test(each_document, weights))
    ham_actual = [0] * len(ham_predict)
    total_actual = spam_actual + ham_actual
    total_predict = spam_predict + ham_predict
    # Now we find the evaluation metrics for the method
    accuracy = evaluation_metrics.accuracy(total_actual, total_predict)
    precision = evaluation_metrics.precision(total_actual, total_predict)
    recall = evaluation_metrics.recall(total_actual, total_predict)
    f1_score = evaluation_metrics.f1_score(recall, precision)
    return accuracy, precision, recall, f1_score, lambda_parameter

# evaluate_MCAP_bag_of_words(dataset_name)  # for bag of words
# evaluate_MCAP_bernoulli_model(dataset_name)  # for bernoulli_model
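
# A minimal sketch of the grid search mcap_validation is assumed to perform:
# train with each candidate lambda and keep the one with the best validation
# accuracy. The candidate grid and the assumption that validation_data holds
# (document, label) pairs are illustrative, not the project's actual layout.
def sketch_mcap_lambda_search(train_data, validation_data, dictionary):
    best_lambda, best_accuracy = None, -1.0
    for candidate in [0.001, 0.01, 0.1, 1.0]:  # hypothetical candidate grid
        weights = MCAP_logistic_regression.mcap_logistic_regression_train(
            train_data, dictionary, 0.01, candidate, 500)
        predicted = [MCAP_logistic_regression.mcap_logistic_regression_test(doc, weights)
                     for doc, _ in validation_data]
        actual = [label for _, label in validation_data]
        accuracy = evaluation_metrics.accuracy(actual, predicted)
        if accuracy > best_accuracy:
            best_lambda, best_accuracy = candidate, accuracy
    return best_lambda
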
def evaluate_SGD_bernoulli_model(dataset_name):
    """
    This is the method used for evaluation of multinomial NB on a particular dataset
    :param dataset_name: This is the given dataset name
    :return: All the evaluation metrics
    """
    # We first import training data for the training
    try:
        spam_email_bernoulli_model1, ham_email_bernoulli_model1, spam_mail_in_all_documents, ham_mail_in_all_documents, size_of_total_dataset, size_of_spam_dataset, size_of_ham_dataset, total_file_dictionary = bernoulli_model.convert_to_bernoulli_model(
            dataset_name, True)
        spam_email_bernoulli_model_test, ham_email_bernoulli_model_test, spam_mail_in_all_documents_test, ham_mail_in_all_documents_test, size_of_total_dataset_test, size_of_spam_dataset_test, size_of_ham_dataset_test, total_file_dictionary_test = bernoulli_model.convert_to_bernoulli_model(
            dataset_name, False)
    except:
        print("You have given a wrong file name, please check and run again")
        exit(-1)
    train_data, validation_data = SGDClassifier.divide_into_validation_and_train(
        spam_email_bernoulli_model1, ham_email_bernoulli_model1)
    test_data = SGDClassifier.get_data_from_given_model(
        spam_email_bernoulli_model_test, ham_email_bernoulli_model_test)
    words_list = list(train_data[0])  # word list (feature vocabulary) taken from the first training item
    # Convert the train, test and validation data into feature matrices and label vectors
    train_x, train_y = SGDClassifier.convert_data_for_SGD_classifier(
        train_data, words_list)
    test_x, test_y = SGDClassifier.convert_data_for_SGD_classifier(
        test_data, words_list)
    valid_x, valid_y = SGDClassifier.convert_data_for_SGD_classifier(
        validation_data, words_list)
    # Tune the sklearn SGD classifier's hyperparameters on the validation set (a sketch of this tuning appears after this function)
    classifier_model = SGDClassifier.parameter_tuning(valid_x, valid_y)
    # Train the tuned classifier on the training dataset
    trained_classifier_model = SGDClassifier.train_SGD(train_x, train_y,
                                                       classifier_model)
    # Predict the labels for the test set
    predicted_y, actual_y = SGDClassifier.test_SGD(trained_classifier_model,
                                                   test_x, test_y)
    # Now calculate the evaluation metrics
    accuracy = evaluation_metrics.accuracy(actual_y, predicted_y)
    precision = evaluation_metrics.precision(actual_y, predicted_y)
    recall = evaluation_metrics.recall(actual_y, predicted_y)
    f1_score = evaluation_metrics.f1_score(recall, precision)
    return accuracy, precision, recall, f1_score
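
# A minimal sketch of what convert_data_for_SGD_classifier is assumed to do:
# turn dict-based documents into fixed-length 0/1 feature vectors ordered by
# words_list. Representing each item as a (word -> presence dict, label) pair
# is an assumption about the project's internals, not confirmed by this file.
def sketch_convert_for_sgd(labeled_documents, words_list):
    xs = [[doc.get(word, 0) for word in words_list] for doc, _ in labeled_documents]
    ys = [label for _, label in labeled_documents]
    return xs, ys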


# evaluate_SGD_bag_of_words(dataset_name) # for bow
# evaluate_SGD_bernoulli_model(dataset_name) # for bm
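
# A minimal sketch, calling scikit-learn directly, of the kind of tuning
# parameter_tuning is assumed to perform; the parameter grid and cv value are
# illustrative. The project's SGDClassifier module shadows sklearn's class
# name, so the sklearn estimator is referenced through its package here.
from sklearn import linear_model
from sklearn.model_selection import GridSearchCV

def sketch_sgd_parameter_tuning(valid_x, valid_y):
    param_grid = {"alpha": [1e-4, 1e-3, 1e-2], "penalty": ["l2", "l1"]}
    search = GridSearchCV(linear_model.SGDClassifier(max_iter=1000),
                          param_grid, cv=3)
    search.fit(valid_x, valid_y)
    return search.best_estimator_  # estimator refitted with the best parameters
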
def evaluate_multinomial_NB(dataset_name):
    """
    This is the method used for evaluation of multinomial NB on a particular dataset
    :param dataset_name: This is the given dataset name
    :return: The method returns the accuracy, precision, recall and f1_score for the given dataset
    """
    # We first import the training data
    try:
        spam_email_bag_of_words, ham_email_bag_of_words, text_in_all_document, spam_mail_in_all_documents, ham_mail_in_all_documents, size_of_total_dataset, size_of_spam_dataset, size_of_ham_dataset, total_file_dictionary = bag_of_words.convert_to_bag_of_words(
            dataset_name, True)
    except:
        print("You have given a wrong file name, please check and run again")
        exit(-1)
    prior, conditional_probability, conditional_probability_of_non_occurring_word = multi_nomial_naive_bayes.train_multinomial_NB(
        spam_email_bag_of_words, ham_email_bag_of_words, text_in_all_document, spam_mail_in_all_documents,
        ham_mail_in_all_documents, size_of_total_dataset, size_of_spam_dataset, size_of_ham_dataset,
        total_file_dictionary)
    # We now import the data for testing
    spam_email_bag_of_words, ham_email_bag_of_words, text_in_all_document, spam_mail_in_all_documents, ham_mail_in_all_documents, size_of_total_dataset, size_of_spam_dataset, size_of_ham_dataset, total_file_dictionary = bag_of_words.convert_to_bag_of_words(
        dataset_name, False)
    # Compute the predictions needed for the evaluation metrics:
    # first predict for the spam documents, then for the ham documents
    spam_predict = []
    for each_document in spam_email_bag_of_words:
        spam_predict.append(multi_nomial_naive_bayes.test_multinomial_naive_bayes(prior, conditional_probability,
                                                                                  conditional_probability_of_non_occurring_word,
                                                                                  each_document))
    # Spam is taken as the positive class, labeled 1
    spam_actual = [1] * len(spam_predict)
    ham_predict = []
    for each_document in ham_email_bag_of_words:
        ham_predict.append(multi_nomial_naive_bayes.test_multinomial_naive_bayes(prior, conditional_probability,
                                                                                 conditional_probability_of_non_occurring_word,
                                                                                 each_document))
    ham_actual = [0] * len(ham_predict)
    total_actual = spam_actual + ham_actual
    total_predict = spam_predict + ham_predict
    # Now we find the evaluation metrics for the method
    accuracy = evaluation_metrics.accuracy(total_actual, total_predict)
    precision = evaluation_metrics.precision(total_actual, total_predict)
    recall = evaluation_metrics.recall(total_actual, total_predict)
    f1_score = evaluation_metrics.f1_score(recall, precision)
    return accuracy, precision, recall, f1_score
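
# A minimal sketch of the multinomial NB decision rule the tester above is
# assumed to apply; the helper name and the {label: probability} / {word: count}
# layouts are assumptions about the project's data structures.
import math

def sketch_nb_predict(prior, conditional_probability, default_probability, document):
    scores = {}
    for label in prior:  # e.g. 1 for spam, 0 for ham
        score = math.log(prior[label])
        for word, count in document.items():
            probability = conditional_probability.get((word, label), default_probability)
            score += count * math.log(probability)
        scores[label] = score
    return max(scores, key=scores.get)  # label with the highest log posterior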