def binary_naive_bayes():
    model = nb.NaiveBayesModel()
    clean = cn.DataCLean()
    doc_vector = dv.DocumentVector()
    df_clean, uniqueWords = clean.Clean()
    df_clean_test, df_clean_train = split(
        df_clean, 0, int(.3 * (df_clean['class'].count())))
    docVector = doc_vector.binary_docvector(df_clean_train, uniqueWords)
    # print(docVector)
    df_WordGivenPI, df_WordGivenNoPi, Prob_PI, Prob_NoPI, numWordsInPI, numWordsInNoPI = model.TrainModel(
        docVector, uniqueWords)
    # print("Model Trained")
    predict_df, test_data = model.predict(Prob_PI, Prob_NoPI, uniqueWords,
                                          df_WordGivenPI, df_WordGivenNoPi,
                                          numWordsInPI, numWordsInNoPI,
                                          df_clean_test, clean)

    print(
        "--------------Binary Naive Bayes Accuracy Stats---------------------------"
    )
    stats = em.Evaluate()
    TP, FN, TN, FP = stats.confusion_matrix(test_data, predict_df)
    print("Accuracy = ", stats.Accuracy(TP, TN, FP, FN))
    print("Precision = ", stats.Precision(TP, FP))
    print("Recall = ", stats.Recall(TP, FN))
    print("fScore = ", stats.fScore(TP, FN, FP))
    print("True Negative = ", stats.TrueNegative(TN, FP))
    print(
        "---------------------------------------------------------------------"
    )
def binary_naive_bayes_kfold():
    model = nb.NaiveBayesModel()
    clean = cn.DataCLean()
    doc_vector = dv.DocumentVector()
    final_df, df = clean.extract(pathData)
    count = 0
    start = -200
    end = 0
    accuracy = []
    precision = []
    recall = []
    fscore = []
    true_neg = []
    stats = em.Evaluate()
    for count in range(5):
        start = start + 200
        end = end + 200
        df_test, df_train = split(final_df, start, end)
        # print(df_train)
        li_clean_text, df_clean = clean.clean_data(df_train)
        uniqueWords = clean.make_unique_li(li_clean_text)
        # # print(uniqueWords)
        docVector = doc_vector.binary_docvector(df_clean, uniqueWords)
        df_WordGivenPI, df_WordGivenNoPi, Prob_PI, Prob_NoPI, numWordsInPI, numWordsInNoPI = model.TrainModel(
            docVector, uniqueWords)
        predict_df, punc_df = model.predict(Prob_PI, Prob_NoPI, uniqueWords,
                                            df_WordGivenPI, df_WordGivenNoPi,
                                            numWordsInPI, numWordsInNoPI,
                                            df_test, clean)
        # print("--------------Naive Bayes Accuracy Stats---------------------------")
        TP, FN, TN, FP = stats.confusion_matrix(punc_df, predict_df)
        accuracy.append(stats.Accuracy(TP, TN, FP, FN))
        precision.append(stats.Precision(TP, FP))
        recall.append(stats.Recall(TP, FN))
        fscore.append(stats.fScore(TP, FN, FP))
        true_neg.append(stats.TrueNegative(TN, FP))
        # print("---------------------------------------------------------------------")
    print(
        "---------------------------------------------------------------------"
    )
    print("Binary Naive Bayes wit k-fold Accuracy Stats")
    print("accuracy = ", accuracy)
    print("precison = ", precision)
    print("recall = ", recall)
    print("f-score = ", fscore)
    print("True Negative = ", true_neg)
    print("accuracy = ", Average(accuracy))
    print("precison = ", Average(precision))
    print("recall = ", Average(recall))
    print("f-score = ", Average(fscore))
    print("true negative = ", Average(true_neg))
Пример #3
0
def generatingTrainSet():
    _dcl = cl.DataCLean()
    final_df, uniqueWords = _dcl.Clean()
    _dv = dv.DocumentVector()
    # docVector = _dv.tf_idf(final_df, uniqueWords)
    docVector = _dv.DocVector(final_df, uniqueWords)
    # docVector = _dv.binary_docvector(final_df, uniqueWords)

    # -------------------------------------------------------------------------
    # using textblob dict approach
    # import NaiveBayesTextBlob as tb

    # polarity_docVector = tb.text_blob(docVector, uniqueWords)
    # docVector = polarity_docVector
    # -------------------------------------------------------------------------

    df = docVector.values
    X_train, Y = df[:, :-1], df[:, -1]
    Y_train = convert_to_0_or_1(Y)
    return (X_train, Y_train)