def make_lex_based_on_sent(sentimentfile, trainfile, lexname, sentiment, mode, class_column):
    """Build a sentiment-specific lexicon from a training set and write it to disk.

    Collects every distinct word that appears in utterances whose sentiment
    label (column 1 of *sentimentfile*) matches *sentiment* (compared both as-is
    and as str, since labels may be read back as strings), then writes the
    lexicon to *lexname*, one word per line, sorted alphabetically.

    Parameters
    ----------
    sentimentfile : str   CSV whose column 1 holds one sentiment label per utterance.
    trainfile     : str   CSV processed into a token list per utterance.
    lexname       : str   Output path for the lexicon file.
    sentiment     : any   Target sentiment label to filter utterances by.
    mode          : int   1 -> write all collected words; otherwise write only
                          words occurring more than once in the WHOLE dataset
                          (frequency is counted over all utterances, not just
                          the sentiment-matching ones — preserved from the
                          original behavior).
    class_column  : int   Column index passed through to process_data.
    """
    from collections import Counter
    from itertools import chain

    sentiment_list = machine_learning_processing.make_list_of_column(sentimentfile, 1)
    data_list = machine_learning_processing.process_data(trainfile, class_column)

    # Unique words from utterances whose label matches the target sentiment.
    # A set replaces the original O(n^2) "if word not in lex: lex.append(word)".
    vocab = set()
    for utterance_id, tokens in enumerate(data_list):
        # Labels may come back as strings; accept either representation.
        if sentiment_list[utterance_id] in (sentiment, str(sentiment)):
            vocab.update(tokens)
    lex = sorted(vocab)  # sort list alphabetically

    # Total occurrences over the whole dataset in one pass (Counter) instead of
    # re-scanning every utterance for every word (was O(words * dataset)).
    counts = Counter(chain.from_iterable(data_list))
    # Only words that occur at least twice in the dataset make the filtered lexicon.
    lex2 = [word for word in lex if counts[word] > 1]

    with open(lexname, 'w') as f:
        words = lex if mode == 1 else lex2
        f.writelines("%s\n" % word for word in words)
# NOTE(review): mangled/collapsed fragment — it begins mid elif-chain (the opening
# `if` and its enclosing loop/function are outside this chunk) and ends inside an
# unfinished `estimation.test_results(...)` call. The middle portion appears to be
# top-level driver code running memory-based classification on the labeled-data
# test set — presumably; verify against the full file. Left byte-identical: it
# cannot be safely reformatted without the missing surrounding context.
elif utterance_id == 500: print(500) elif utterance_id == 600: print(600) elif utterance_id == 700: print(700) elif utterance_id == 800: print(800) elif utterance_id == 900: print(900) utterance_id += 1 # labeled data test_list_ld = machine_learning_processing.process_data( "labeled_data_test.csv", 6) test_list_ld_unprocessed = machine_learning_processing.make_list_of_column( "labeled_data_test.csv", 6) do_test_set_mem_sent_pos_ld(test_list_ld, test_list_ld_unprocessed, "labeled_data_mem_final.csv", "lexicon_with_occurences_ld.txt", "labeled_data_train.csv", 5, "labeled_data_train_with_sentiment.csv", "labeled_data_test_with_sentiment.csv", 2) do_test_set_mem_sent_pos_hs_ld(test_list_ld, test_list_ld_unprocessed, "labeled_data_mem_final_hs.csv", "lexicon_with_occurences_hs_ld.txt", "labeled_data_train.csv", 5, "labeled_data_train_with_sentiment.csv", "labeled_data_test_with_sentiment.csv", 2) estimation.test_results("labeled_data_test.csv", 5,
# NOTE(review): mangled/collapsed fragment — it begins mid elif-chain (progress
# printing every 100 utterances, opening `if` not visible here) and ends inside an
# unfinished `support_vector_machine.do_matrix(training_list,` call. The middle
# portion appears to build term-utterance matrices from several sentiment lexicons
# for an SVM — presumably; confirm against the full file. Left byte-identical: it
# cannot be safely reformatted without the missing surrounding context.
print(utterance_id) elif utterance_id == 400: print(utterance_id) elif utterance_id == 500: print(utterance_id) elif utterance_id == 600: print(utterance_id) elif utterance_id == 700: print(utterance_id) elif utterance_id == 800: print(utterance_id) elif utterance_id == 900: print(utterance_id) test_list = machine_learning_processing.process_data("test_set.csv", 5) training_list = machine_learning_processing.process_data("train_set.csv", 5) test_list_unprocessed = machine_learning_processing.make_list_of_column( "test_set.csv", 5) term_utterance_matrix = support_vector_machine.do_matrix( training_list, "lexicon.txt") matrix_pos = support_vector_machine.do_matrix(training_list, "lexicon_pos.txt") matrix_neut = support_vector_machine.do_matrix(training_list, "lexicon_neut.txt") matrix_neg = support_vector_machine.do_matrix(training_list, "lexicon_neg.txt") matrix_pos2 = support_vector_machine.do_matrix(training_list, "lexicon_pos2.txt") matrix_neut2 = support_vector_machine.do_matrix(training_list, "lexicon_neut2.txt") matrix_neg2 = support_vector_machine.do_matrix(training_list,