import sys
import parse
import probabilities

# Script: builds word/outcome probabilities from a training set and then
# classifies every document in a testing set, printing each predicted outcome.
#
# Command-line arguments read here:
#   sys.argv[1], sys.argv[2] -- both passed to parse.parse_input_files, whose
#                               two results are bound to training_files and
#                               testing_files.
#   sys.argv[3]              -- converted to float and stored as `smoother`.
training_files, testing_files = parse.parse_input_files(
    sys.argv[1], sys.argv[2])
# `smoother` is forwarded as the third argument of
# probabilities.probability_word_given_outcome below (presumably an additive
# smoothing constant -- confirm against the probabilities module).
smoother = float(sys.argv[3])

##train
# Vocabulary derived from all training files, plus one vocabulary per subset
# ("lib" and "con") as split by parse.get_subset_files.
distinct_vocab = parse.get_distinct_vocabulary(training_files)
lib_files, con_files = parse.get_subset_files(training_files)
vocab_lib = parse.get_subset_vocab(lib_files)
vocab_con = parse.get_subset_vocab(con_files)

# Outcome-level values for the two classes, computed from the full training
# set and the "lib" subset.
p_lib, p_con = probabilities.probability_outcome(training_files, lib_files)

# Per-word values for each class; both calls share the same overall vocabulary
# and the same `smoother` value.
p_word_lib = probabilities.probability_word_given_outcome(
    distinct_vocab, vocab_lib, smoother)
p_word_con = probabilities.probability_word_given_outcome(
    distinct_vocab, vocab_con, smoother)

##test
# Counter initialised before the loop (presumably the number of correct
# predictions -- the code that updates it is beyond the visible source).
correct = 0
for doc in testing_files:
    # Note the argument order: the "con" values come before the "lib" values.
    outcome = probabilities.classify_naive_bayes(distinct_vocab, p_word_con,
                                                 p_word_lib, p_con, p_lib,
                                                 parse.get_test_vocab(doc))
    print outcome  # Python 2 print statement
    # Substring test on `doc` (presumably a file name or path -- confirm),
    # combined with a check that the predicted outcome is "L".
    # NOTE(review): the body of this `if` is cut off in the visible source.
    if "lib" in doc and outcome == "L":
# NOTE(review): the next line is the tail of a function whose `def` line and
# opening statements are outside the visible source. It is kept verbatim and
# still collapsed onto one line, because its nesting cannot be recovered from
# here without guessing. It ends by returning (lib_max_dict, con_max_dict).
lib_max = lib_dict[word] lib_max_word = word lib_max_dict[lib_max] = lib_max_word del lib_dict[lib_max_word] for word in con_dict: if con_dict[word] > con_max: con_max = con_dict[word] con_max_word = word con_max_dict[con_max] = con_max_word del con_dict[con_max_word] return lib_max_dict, con_max_dict

# Top-level script: only sys.argv[1] is read here, and the single result of
# parse.parse_input_files is bound to training_files.
training_files = parse.parse_input_files(sys.argv[1])

##train
# Vocabulary derived from all training files, plus one vocabulary per subset
# ("lib" and "con") as split by parse.get_subset_files.
distinct_vocab = parse.get_distinct_vocabulary(training_files)
lib_files, con_files = parse.get_subset_files(training_files)
vocab_lib = parse.get_subset_vocab(lib_files)
vocab_con = parse.get_subset_vocab(con_files)

# Outcome-level values for the two classes, computed from the full training
# set and the "lib" subset.
p_lib, p_con = probabilities.probability_outcome(training_files, lib_files)

# Per-word values for each class. The third argument is fixed at 1.0 here
# rather than taken from the command line.
p_word_lib = probabilities.probability_word_given_outcome(
    distinct_vocab, vocab_lib, 1.0)
p_word_con = probabilities.probability_word_given_outcome(
    distinct_vocab, vocab_con, 1.0)