Exemplo n.º 1
0
def _log(message):
    """Print ``message`` prefixed with the current time as ``HH:MM:SS : ``.

    The hour is formatted with ``%I`` (12-hour clock).  The call form
    ``print(<single expression>)`` produces the same output whether ``print``
    is the Python 2 statement or the Python 3 function.
    """
    print(datetime.datetime.now().strftime("%I:%M:%S") + " : " + message)


def main():
    """Build word counts from the training data, then classify and report.

    Steps, in order:

    1. ``parse_labels()`` supplies the train and test data; ``stats()`` is
       run on the parsed test data.
    2. The test data is replaced by a single hard-coded entry (see the
       review note below).
    3. ``create_word_counts()`` is run on the training data and yields the
       word dictionary plus the spam and ham counts.
    4. For every lambda in ``la_set``: ``compute_probs()`` builds the word
       probabilities, ``classify()`` is run on the test data and
       ``summarize_findings()`` reports the outcome.

    Raises:
        ZeroDivisionError: if ``spam_count + ham_count`` is zero.
    """
    train_data, test_data = parse_labels()
    stats(test_data)

    # NOTE(review): this discards the parsed test set that stats() just
    # reported on, so classify() below only ever sees this one entry.
    # Looks like a debugging leftover -- confirm before relying on results.
    test_data = {
        'test/test0': 'spam',
    }

    _log(" Generating Counts")
    word_dict, spam_count, ham_count = create_word_counts(train_data)
    _log(" Done.")

    # The priors depend only on the counts, not on lambda, so they are
    # computed once up front instead of once per loop iteration.
    total_count = spam_count + ham_count
    spam_prior_prob = float(spam_count) / total_count
    ham_prior_prob = float(ham_count) / total_count

    # Values of lambda to iterate over for lambda smoothing.  A wider sweep
    # would be e.g. [0.005, 0.1, 0.5, 1.0, 2.0]; only 0 is enabled here.
    la_set = [0]

    for la in la_set:
        _log(" Computing Probs for lambda={0}".format(la))
        word_probs = compute_probs(word_dict, spam_count, ham_count, la)
        _log(" Done")

        TP, FP, TN, FN, problems = classify(
            test_data, word_probs, spam_prior_prob, ham_prior_prob)
        summarize_findings(la, TP, FP, TN, FN, problems)
Exemplo n.º 2
0
    print datetime.datetime.now().strftime("%I:%M:%S") + " : " + " Generating Counts"
    word_dict, total_doc_count, spam_doc_count, ham_doc_count = create_word_counts(train_data)
    print datetime.datetime.now().strftime("%I:%M:%S") + " : " + " Done."

    la = 2.000

    print datetime.datetime.now().strftime("%I:%M:%S") + " : " + " Computing Probs for lambda={0}".format(la)
    word_probs = compute_probs(word_dict, spam_doc_count, ham_doc_count, la)
    spam_prior_prob = (float)(spam_doc_count) / (spam_doc_count + ham_doc_count)
    ham_prior_prob = (float)(ham_doc_count) / (spam_doc_count + ham_doc_count)
    print datetime.datetime.now().strftime("%I:%M:%S") + " : " + " Spam Prior Prob : {0}".format(spam_prior_prob)
    print datetime.datetime.now().strftime("%I:%M:%S") + " : " + " Ham Prior Prob : {0}".format(ham_prior_prob)

    best_mi = compute_mi(word_probs, spam_prior_prob, ham_prior_prob)
    top_200 = best_mi[0:200]
    print [word[0] for word in top_200]
    mi_word_probs = {}

    for (word, mi) in top_200:
        mi_word_probs[word] = word_probs[word]
    mi_word_probs["*"] = word_probs["*"]

    TP, FP, TN, FN = classify(test_data, mi_word_probs, spam_prior_prob, ham_prior_prob)
    summarize_findings(TP, FP, TN, FN)

    print "Done."


# Call main() only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()