import sys
import pickle

from nltk.tokenize import sent_tokenize


def format_review(raw_data):
    """Split each raw review into sentences, drop the leading review id, and tokenize into words."""
    formatted_data = []
    for review in raw_data:
        formatted_review = []
        tokenized_review = sent_tokenize(review)
        tokenized_review = list(flatten(tokenized_review))
        # The first sentence begins with the review id; keep only the text after it
        tokenized_review[0] = tokenized_review[0].partition(" ")[2]
        for sentence in tokenized_review:
            formatted_review.append(sentence.split())
        formatted_data.append(formatted_review)
    return formatted_data
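# format_review relies on a flatten() helper that is not defined in this file.
# The following is only a minimal sketch of what such a helper might look like,
# assuming it just needs to walk arbitrarily nested lists/tuples of strings;
# the project's real helper may come from a library instead.
def flatten(nested):
    """Yield the leaves of an arbitrarily nested list/tuple structure."""
    for item in nested:
        if isinstance(item, (list, tuple)):
            # Recurse into nested containers (assumed behaviour, not confirmed by the source)
            yield from flatten(item)
        else:
            yield item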
def main():
    input_dictionary = {}
    try:
        input_reviews = open(sys.argv[1])
    except (IndexError, IOError):
        print("Please supply an input file to be classified as the first argument and try again")
        sys.exit(1)
    try:
        input_dictionary = pickle.load(open(sys.argv[2], 'rb'))
    except (IndexError, IOError):
        print("Please supply a pickled data set as the second argument and try again")
        sys.exit(1)

    data_dictionary = input_dictionary["freq"]
    prior_dictionary = input_dictionary["prior"]

    TCount = 0
    FCount = 0
    correct = 0
    incorrect = 0
    # The test file is expected to contain 20 'T' reviews followed by 20 'F' reviews
    test_list = list(flatten((('T',) * 20, ('F',) * 20)))

    for review in format_review(input_reviews):
        intermediate = classify(data_dictionary, prior_dictionary, review)
        probable_class = max(intermediate, key=lambda k: intermediate[k])
        if probable_class == 'final_T':
            TCount += 1
        else:
            FCount += 1
        # Strip the 'final_' prefix and compare against the expected label
        if probable_class[6:] == test_list.pop(0):
            correct += 1
        else:
            incorrect += 1

    print(TCount)
    print(FCount)
    print(TCount / (TCount + FCount))
    print(FCount / (TCount + FCount))
    if TCount > FCount:
        print('T')
    else:
        print('F')
    print(correct / (correct + incorrect))
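# classify() is defined elsewhere in the project. The sketch below is one way a
# Naive Bayes scorer could be written so that it matches how main() uses the
# result: a dict keyed by 'final_T' / 'final_F' whose largest value wins. It
# assumes (not confirmed by the source) that data_dictionary maps each word to
# per-class counts and that prior_dictionary maps each class to its review count.
import math

def classify(data_dictionary, prior_dictionary, review):
    """Score a tokenized review against each class with add-one smoothed Naive Bayes."""
    total_docs = sum(prior_dictionary.values())
    vocab_size = len(data_dictionary)
    scores = {}
    for label in ('T', 'F'):
        # Log prior of the class plus log likelihood of every word in the review
        score = math.log(prior_dictionary[label] / total_docs)
        class_total = sum(counts.get(label, 0) for counts in data_dictionary.values())
        for sentence in review:
            for word in sentence:
                word_count = data_dictionary.get(word, {}).get(label, 0)
                score += math.log((word_count + 1) / (class_total + vocab_size))
        scores['final_' + label] = score
    return scores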
def main():
    global input_files
    global output_file
    processed_dictionary = {}
    output_dictionary = {}
    prior_counts = {}

    # Populate input_files: every argument except the last names a training corpus
    for i in range(1, len(sys.argv) - 1):
        input_files.append(open(sys.argv[i]))

    # Populate output_file: the last argument names the pickle the model is written to
    output_file = open(sys.argv[len(sys.argv) - 1], 'wb')

    # The first corpus holds true ('T') reviews, the second holds false ('F') reviews
    process_counts(input_files[0], 'T', processed_dictionary, prior_counts)
    process_counts(input_files[1], 'F', processed_dictionary, prior_counts)

    output_dictionary["freq"] = processed_dictionary
    output_dictionary["prior"] = prior_counts
    pickle.dump(output_dictionary, output_file)
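# process_counts() is also defined elsewhere. A rough sketch consistent with the
# pickled layout used above ("freq" mapping word -> per-class counts, "prior"
# mapping class -> review count), under the assumption that every corpus line
# starts with a review id followed by the review text:
def process_counts(corpus_file, label, processed_dictionary, prior_counts):
    """Tally per-class word frequencies and class priors for one training corpus."""
    for line in corpus_file:
        prior_counts[label] = prior_counts.get(label, 0) + 1
        # Skip the leading review id and count every remaining token
        for word in line.split()[1:]:
            word_counts = processed_dictionary.setdefault(word, {})
            word_counts[label] = word_counts.get(label, 0) + 1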
from sklearn.feature_selection import SelectKBest, chi2
from sklearn import svm


def main():
    global input_files
    global output_file
    processed_dictionary = {}
    output_dictionary = {}
    prior_counts = {}
    input_corpora = []
    corpus = {"text": [], "id": [], "class": []}

    # Populate input_files: every argument except the last two names a training corpus
    for i in range(1, len(sys.argv) - 2):
        input_files.append(open(sys.argv[i]))
    # The second-to-last argument is the test file; the last is the output file
    test_file = open(sys.argv[len(sys.argv) - 2])

    for corpus_file in input_files:
        input_corpora.append([line for line in corpus_file])

    test_set = {"text": [], "id": []}
    for line in test_file:
        split_line = line.split()
        for k, word in enumerate(split_line):
            if k == 0:  # the first token is the review id; leave it unstemmed
                continue
            split_line[k] = stem_word(word)
        line = " ".join(split_line)
        review_id = line[0:7]
        test_set["text"].append(line[8:])
        test_set["id"].append(review_id)

    for i, corpus_file in enumerate(input_corpora):
        # The class label ('T' or 'F') is the first character of the corpus file name
        current_class = input_files[i].name[0]
        for line in corpus_file:
            split_line = line.split()
            for k, word in enumerate(split_line):
                if k == 0:
                    continue
                split_line[k] = stem_word(word)
            line = " ".join(split_line)
            review_id = line[0:7]
            corpus["text"].append(line[8:])
            corpus["id"].append(review_id)
            corpus["class"].append(current_class)

    # Keep only the 100 features most correlated with the class labels (chi-squared test)
    selector = SelectKBest(chi2, k=100)
    training_data = vectorize_for_training(corpus["text"])
    test_data = vectorize_for_testing(test_set["text"])
    filtered_training_data = selector.fit_transform(training_data, corpus["class"])
    filtered_test_data = selector.transform(test_data)

    linear_svm = svm.LinearSVC()
    linear_svm.fit(filtered_training_data, corpus["class"])
    predictions = linear_svm.predict(filtered_test_data)
    print(predictions)

    t_count = 0
    f_count = 0
    for c in predictions:
        if c == "T":
            t_count += 1
        else:
            f_count += 1
    print("True Count: " + str(t_count))
    print("False Count: " + str(f_count))

    # Populate output_file: one "<prediction>\t<review id>" line per test review
    output_file = open(sys.argv[len(sys.argv) - 1], "w")
    for i, prediction in enumerate(predictions):
        output_file.write(prediction + "\t" + test_set["id"][i] + "\n")
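# stem_word(), vectorize_for_training(), and vectorize_for_testing() are helpers
# defined elsewhere in the project. The sketch below shows one plausible
# implementation, assuming NLTK's PorterStemmer for stemming and a shared
# scikit-learn CountVectorizer so the test matrix is built over the same
# vocabulary the training matrix was fitted on; the real helpers may differ.
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

_stemmer = PorterStemmer()
_vectorizer = CountVectorizer()

def stem_word(word):
    """Reduce a single token to its Porter stem."""
    return _stemmer.stem(word)

def vectorize_for_training(texts):
    """Fit the shared vectorizer on the training texts and return their term-count matrix."""
    return _vectorizer.fit_transform(texts)

def vectorize_for_testing(texts):
    """Map test texts into the vocabulary learned from the training texts."""
    return _vectorizer.transform(texts)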