예제 #1
0
import sys
import parse
import probabilities

# Command-line inputs: the first two arguments are handed to the project's
# `parse.parse_input_files` helper, whose result is unpacked into the
# training and testing file collections; the third argument is read as a float.
# NOTE(review): argv[1] / argv[2] are presumably the training and testing
# locations, in that order -- confirm against `parse.parse_input_files`.
training_files, testing_files = parse.parse_input_files(
    sys.argv[1], sys.argv[2])
# Passed unchanged to both `probability_word_given_outcome` calls below.
# Presumably an additive (Laplace-style) smoothing constant -- TODO confirm.
smoother = float(sys.argv[3])

## Training phase: everything below is derived from the training files only.
# Vocabulary built from the whole training set; it is shared by both
# per-class tables and is also passed to the classifier later in the script.
distinct_vocab = parse.get_distinct_vocabulary(training_files)

# Split the training files into the two subsets this script calls `lib` and
# `con`. NOTE(review): how a file is assigned to a subset is decided inside
# `parse.get_subset_files` -- not visible here.
lib_files, con_files = parse.get_subset_files(training_files)

# One vocabulary structure per subset, built by the same helper.
vocab_lib = parse.get_subset_vocab(lib_files)
vocab_con = parse.get_subset_vocab(con_files)

# Two outcome probabilities computed from the full training set and the
# `lib` subset; presumably the class priors P(lib) and P(con) -- confirm in
# `probabilities.probability_outcome`.
p_lib, p_con = probabilities.probability_outcome(training_files, lib_files)

# Per-class word tables, computed with the same shared vocabulary and the
# same `smoother` so the two classes are treated identically. Together with
# `p_lib` / `p_con` these are the inputs to
# `probabilities.classify_naive_bayes` in the test loop that follows.
p_word_lib = probabilities.probability_word_given_outcome(
    distinct_vocab, vocab_lib, smoother)
p_word_con = probabilities.probability_word_given_outcome(
    distinct_vocab, vocab_con, smoother)

##test
correct = 0
for doc in testing_files:
    outcome = probabilities.classify_naive_bayes(distinct_vocab, p_word_con,
                                                 p_word_lib, p_con, p_lib,
                                                 parse.get_test_vocab(doc))
    print outcome
    if "lib" in doc and outcome == "L":
예제 #2
0
                lib_max = lib_dict[word]
                lib_max_word = word
        lib_max_dict[lib_max] = lib_max_word
        del lib_dict[lib_max_word]

        for word in con_dict:
            if con_dict[word] > con_max:
                con_max = con_dict[word]
                con_max_word = word
        con_max_dict[con_max] = con_max_word
        del con_dict[con_max_word]

    return lib_max_dict, con_max_dict


# Only one command-line argument is used here, and the helper's return value
# is bound as a single object (no testing set is unpacked in this script).
# NOTE(review): argv[1] is presumably the training data location -- confirm
# against `parse.parse_input_files`.
training_files = parse.parse_input_files(sys.argv[1])

## Training phase: build the vocabulary and the per-class probability tables.
# Vocabulary built from the whole training set; shared by both tables below.
distinct_vocab = parse.get_distinct_vocabulary(training_files)

# Split the training files into the two subsets this script calls `lib` and
# `con`. NOTE(review): how a file is assigned to a subset is decided inside
# `parse.get_subset_files` -- not visible here.
lib_files, con_files = parse.get_subset_files(training_files)

# One vocabulary structure per subset, built by the same helper.
vocab_lib = parse.get_subset_vocab(lib_files)
vocab_con = parse.get_subset_vocab(con_files)

# Two outcome probabilities computed from the full training set and the
# `lib` subset; presumably the class priors P(lib) and P(con) -- confirm in
# `probabilities.probability_outcome`.
p_lib, p_con = probabilities.probability_outcome(training_files, lib_files)

# Per-class word tables over the shared vocabulary. The third argument is
# hard-coded to 1.0 for both classes rather than read from the command line;
# presumably this is the smoothing constant (add-one) -- TODO confirm.
p_word_lib = probabilities.probability_word_given_outcome(
    distinct_vocab, vocab_lib, 1.0)
p_word_con = probabilities.probability_word_given_outcome(
    distinct_vocab, vocab_con, 1.0)