def do_test_set_naive_bayes_sent_hs_other(utterances, filename, lex, file, column,
                                          sentimentfile_train, sentimentfile_test, mode):
    # annotation using naive bayes will be saved in a new file
    with open(filename, 'w') as f:
        writer = csv.writer(f, delimiter=';')
        writer.writerow(["Utterance", "Hate Speech"])  # header
        sentimentlist = senti_strength.estimate_sentiment_probabilities_other_datasets(
            sentimentfile_train, file, column, mode)
        list_of_sentiments = machine_learning_processing.make_list_of_column(
            sentimentfile_test, 1)
        utterance_id = 0
        for utterance in utterances:
            # determine the class of the utterance using do_sentiment_naive_bayes_hs()
            class_hs = do_sentiment_naive_bayes_hs(
                utterance, lex, list_of_sentiments[utterance_id], sentimentlist)
            # write the utterance and its assigned class into the file
            utterance_string = ""
            for word in utterance:
                utterance_string = utterance_string + word + " "
            writer.writerow([utterance_string, class_hs])
            utterance_id += 1
def make_lex_based_on_sent(sentimentfile, trainfile, lexname, sentiment, mode, class_column):
    lex = []
    sentiment_list = machine_learning_processing.make_list_of_column(sentimentfile, 1)
    data_list = machine_learning_processing.process_data(trainfile, class_column)
    utterance_id = 0
    for utterance in data_list:
        # collect the vocabulary of all utterances carrying the requested sentiment
        if sentiment_list[utterance_id] == sentiment or sentiment_list[utterance_id] == str(sentiment):
            for word in utterance:
                if word not in lex:
                    lex.append(word)
        utterance_id += 1
    lex = sorted(lex)  # sort the lexicon alphabetically
    # for mode 2, only words that occur at least twice in the dataset become part of the lexicon
    lex2 = [word for word in lex if sum(x.count(word) for x in data_list) > 1]
    with open(lexname, 'w') as f:
        for word in (lex if mode == 1 else lex2):
            f.write("%s\n" % word)
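# Usage sketch (the sentiment value and column index are assumptions; the
# lexicon file names match the driver code further below): one lexicon per
# sentiment, with mode 1 keeping every word and mode 2 keeping only words
# that occur at least twice:
#
#   make_lex_based_on_sent("train_set_with_sentiment.csv", "train_set.csv",
#                          "lexicon_pos.txt", 1, 1, 5)
#   make_lex_based_on_sent("train_set_with_sentiment.csv", "train_set.csv",
#                          "lexicon_pos2.txt", 1, 2, 5)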
def do_test_set_svm_sent_pos_bt(utterances_test, utterances_unprocessed, utterances_training,
                                filename, lex_pos, lex_neut, lex_neg, file, column,
                                matrix_pos, matrix_neut, matrix_neg, sentimentfile, columnname):
    # annotation using svm will be saved in a new file
    with open(filename, 'w') as f:
        writer = csv.writer(f, delimiter=';')
        writer.writerow(["Utterance", columnname])  # header
        list_of_sentiments = machine_learning_processing.make_list_of_column(
            sentimentfile, 1)
        utterance_id = 0
        for utterance in utterances_test:
            utterance_unprocessed = utterances_unprocessed[utterance_id]
            # pick the lexicon with positive, neutral or negative vocabulary based on the
            # sentiment of the utterance, then determine its class using do_svm_sent_pos_bt()
            if list_of_sentiments[utterance_id] == 1 or list_of_sentiments[utterance_id] == "1":
                class_cb = do_svm_sent_pos_bt(utterances_training, utterance_unprocessed,
                                              lex_pos, utterance, file, column, matrix_pos)
            elif list_of_sentiments[utterance_id] == 0 or list_of_sentiments[utterance_id] == "0":
                class_cb = do_svm_sent_pos_bt(utterances_training, utterance_unprocessed,
                                              lex_neut, utterance, file, column, matrix_neut)
            else:
                class_cb = do_svm_sent_pos_bt(utterances_training, utterance_unprocessed,
                                              lex_neg, utterance, file, column, matrix_neg)
            # write the utterance and its assigned class into the file
            utterance_string = ""
            for word in utterance:
                utterance_string = utterance_string + word + " "
            writer.writerow([utterance_string, class_cb])
            utterance_id += 1
            # progress output every 100 utterances
            if utterance_id in range(100, 1000, 100):
                print(utterance_id)
def do_test_set_naive_bayes_sent_pos_strength(utterances, utterances_unprocessed, filename, lex,
                                              file, column, sentimentfile_train, sentimentfile_test):
    # annotation using naive bayes will be saved in a new file
    with open(filename, 'w') as f:
        writer = csv.writer(f, delimiter=';')
        writer.writerow(["Utterance", "Cyberbullying Strength"])  # header
        sentimentlist = senti_strength.estimate_sentiment_probabilities_strengths(
            sentimentfile_train, file, column)
        list_of_sentiments = machine_learning_processing.make_list_of_column(
            sentimentfile_test, 1)
        utterance_id = 0
        for utterance in utterances:
            utterance_unprocessed = utterances_unprocessed[utterance_id]
            # determine the class of the utterance using do_naive_bayes_sent_pos_strength()
            class_strength = do_naive_bayes_sent_pos_strength(
                utterance, utterance_unprocessed, lex,
                list_of_sentiments[utterance_id], sentimentlist)
            # write the utterance and its assigned class into the file
            utterance_string = ""
            for word in utterance:
                utterance_string = utterance_string + word + " "
            writer.writerow([utterance_string, class_strength])
            # progress output every 100 utterances
            if utterance_id in range(100, 1000, 100):
                print(utterance_id)
            utterance_id += 1
def do_test_set_mem_sent_pos_hs_ths(utterances_test, utterances_unprocessed, filename, lex,
                                    file, column, sentimentfile_train, sentimentfile_test, mode):
    # annotation using the maximum entropy model will be saved in a new file
    with open(filename, 'w') as f:
        writer = csv.writer(f, delimiter=';')
        writer.writerow(["Utterance", "Hate Speech"])  # header
        sentimentlist = senti_strength.estimate_sentiment_probabilities_other_datasets(
            sentimentfile_train, file, column, mode)
        list_of_sentiments = machine_learning_processing.make_list_of_column(
            sentimentfile_test, 1)
        utterance_id = 0
        for utterance in utterances_test:
            utterance_unprocessed = utterances_unprocessed[utterance_id]
            # determine the class of the utterance using do_mem_sent_pos_hs_ths()
            class_hs = do_mem_sent_pos_hs_ths(
                utterance, utterance_unprocessed, lex,
                list_of_sentiments[utterance_id], sentimentlist)
            # write the utterance and its assigned class into the file
            utterance_string = ""
            for word in utterance:
                utterance_string = utterance_string + word + " "
            writer.writerow([utterance_string, class_hs])
            # progress output every 100 utterances
            if utterance_id in range(100, 1000, 100):
                print(utterance_id)
            utterance_id += 1
def estimate_sentiment_probabilities(sentimentfile, cbfile, cbrow):
    cb_list = machine_learning_processing.make_list_of_column(cbfile, cbrow)  # list of cyberbullying labels
    count_pos = 0   # number of positive utterances
    count_neut = 0  # number of neutral utterances
    count_neg = 0   # number of negative utterances
    pos_cb = 0      # number of positive utterances in class cyberbullying
    pos_no_cb = 0   # number of positive utterances in class no_cyberbullying
    neut_cb = 0     # number of neutral utterances in class cyberbullying
    neut_no_cb = 0  # number of neutral utterances in class no_cyberbullying
    neg_cb = 0      # number of negative utterances in class cyberbullying
    neg_no_cb = 0   # number of negative utterances in class no_cyberbullying
    with open(sentimentfile, 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter=';')
        next(reader, None)  # skip header
        utterance_id = 0
        for row in reader:
            if row[1] == 1 or row[1] == "1":
                count_pos += 1
                if cb_list[utterance_id] == 1 or cb_list[utterance_id] == "1":
                    pos_cb += 1
                else:
                    pos_no_cb += 1
            elif row[1] == -1 or row[1] == "-1":
                count_neg += 1
                if cb_list[utterance_id] == 1 or cb_list[utterance_id] == "1":
                    neg_cb += 1
                else:
                    neg_no_cb += 1
            else:
                count_neut += 1
                if cb_list[utterance_id] == 1 or cb_list[utterance_id] == "1":
                    neut_cb += 1
                else:
                    neut_no_cb += 1
            utterance_id += 1
    # probabilities of an utterance with a given sentiment being in class
    # cyberbullying / no_cyberbullying, rounded to 3 decimal places
    p_pos_cb = round(pos_cb / count_pos, 3)
    p_pos_no_cb = round(pos_no_cb / count_pos, 3)
    p_neut_cb = round(neut_cb / count_neut, 3)
    p_neut_no_cb = round(neut_no_cb / count_neut, 3)
    p_neg_cb = round(neg_cb / count_neg, 3)
    p_neg_no_cb = round(neg_no_cb / count_neg, 3)
    sentiment_list = [p_pos_cb, p_pos_no_cb, p_neut_cb, p_neut_no_cb, p_neg_cb, p_neg_no_cb]
    return sentiment_list
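# Worked example with hypothetical counts: if the training set holds 40
# positive utterances of which 10 are labeled as cyberbullying, then
# p_pos_cb = 10 / 40 = 0.25 and p_pos_no_cb = 30 / 40 = 0.75. The returned
# list therefore holds the conditional probabilities P(class | sentiment)
# in the order [pos_cb, pos_no_cb, neut_cb, neut_no_cb, neg_cb, neg_no_cb].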
def estimate_sentiment_probabilities_strengths(sentimentfile, cbfile, cbrow):
    strength_list = machine_learning_processing.make_list_of_column(cbfile, cbrow)  # list of strength labels
    counts = {"pos": 0, "neut": 0, "neg": 0}  # number of utterances per sentiment
    # number of utterances per sentiment and strength class s1..s5
    strengths = {s: [0, 0, 0, 0, 0] for s in ("pos", "neut", "neg")}
    with open(sentimentfile, 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter=';')
        next(reader, None)  # skip header
        for utterance_id, row in enumerate(reader):
            if str(row[1]) == "1":
                sentiment = "pos"
            elif str(row[1]) == "-1":
                sentiment = "neg"
            else:
                sentiment = "neut"
            counts[sentiment] += 1
            strength = str(strength_list[utterance_id])
            # strengths 1-4 map to s1..s4, everything else counts as s5
            index = int(strength) - 1 if strength in ("1", "2", "3", "4") else 4
            strengths[sentiment][index] += 1
    # probabilities of an utterance with a given sentiment being in each strength
    # class, rounded to 3 decimal places; returned in the order
    # [pos s1..s5, neut s1..s5, neg s1..s5]
    sentiment_list = []
    for s in ("pos", "neut", "neg"):
        for count in strengths[s]:
            sentiment_list.append(round(count / counts[s], 3))
    return sentiment_list
        elif utterance_id == 600:
            print(600)
        elif utterance_id == 700:
            print(700)
        elif utterance_id == 800:
            print(800)
        elif utterance_id == 900:
            print(900)
        utterance_id += 1


# labeled data
test_list_ld = machine_learning_processing.process_data("labeled_data_test.csv", 6)
test_list_ld_unprocessed = machine_learning_processing.make_list_of_column(
    "labeled_data_test.csv", 6)
do_test_set_mem_sent_pos_ld(test_list_ld, test_list_ld_unprocessed,
                            "labeled_data_mem_final.csv",
                            "lexicon_with_occurences_ld.txt",
                            "labeled_data_train.csv", 5,
                            "labeled_data_train_with_sentiment.csv",
                            "labeled_data_test_with_sentiment.csv", 2)
do_test_set_mem_sent_pos_hs_ld(test_list_ld, test_list_ld_unprocessed,
                               "labeled_data_mem_final_hs.csv",
                               "lexicon_with_occurences_hs_ld.txt",
                               "labeled_data_train.csv", 5,
                               "labeled_data_train_with_sentiment.csv",
                               "labeled_data_test_with_sentiment.csv", 2)
estimation.test_results("labeled_data_test.csv", 5, "labeled_data_mem_final.csv", 1)
estimation.test_results("labeled_data_test.csv", 5,
def do_svm_other_hs(data_list, lex, test_utterance, file, column, matrix, mode):
    """
    We determine the class of each tweet with a k-nearest-neighbour search
    (the surrounding code refers to it as SVM). We work with our data_list
    containing all stemmed and processed utterances and our lexicon without
    occurrence probabilities. First we estimate the vector of the utterance
    from the test set: the values of the vector are the number of times each
    word of our lexicon appears in the utterance. Then we compare this vector
    with all vectors of the training set to find the most similar ones, using
    the normalized dot product as the similarity measure. The class of our
    tweet is estimated from the (intellectually assigned) classes of the k
    tweets with the most similar vectors (k=3 or k=5, whichever gets the
    better results).
    """
    # make the vector of the utterance from the test set
    vec_test_utterance = []
    for line in open(lex):
        line_split = line.split()
        # count how often each lexicon word appears in the utterance
        vec_test_utterance.append(test_utterance.count(line_split[0]))
    distances = []  # similarity to each vector of the training set
    vec_len = np.linalg.norm(np.array(vec_test_utterance))  # length of the test vector
    for vector in matrix:  # loop through all vectors of the training set
        vec2_len = np.linalg.norm(np.array(vector))  # length of the training vector
        # normalized dot product between the test vector and the training vector
        dot_product_first = sum(a * b for a, b in zip(vec_test_utterance, vector))
        if vec_len != 0 and vec2_len != 0:
            dot_product = dot_product_first / (vec_len * vec2_len)
        else:
            dot_product = 0
        distances.append(dot_product)
    # find the 5 most similar vectors for our test vector (only the first 3 are
    # used if k=3 gets the better results); the corresponding utterances can be
    # inspected via data_list[k] for debugging
    k_indices = sorted(range(len(distances)), key=distances.__getitem__, reverse=True)[:5]
    # find the classes of the utterances corresponding to the most similar vectors
    cb_list = machine_learning_processing.make_list_of_column(file, column)
    classes = [cb_list[k] for k in k_indices]
    if mode == 2:
        cb = classes.count("0")  # number of most similar utterances with the class cyberbullying
        no_cb = classes.count("2") + classes.count("1")  # number with the class no_cyberbullying
    else:
        cb = classes.count("2")
        no_cb = classes.count("0") + classes.count("1")
    # utterances that contain curse words will automatically be labeled as cyberbullying
    curses = machine_learning_processing.make_list_of_curse_words("curses.txt")
    contains_curses = any(word in curses for word in test_utterance)
    # determine the class of the test utterance
    if cb >= 2 or contains_curses:
        cb_class = 1
    else:
        cb_class = 0
    return cb_class
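# A minimal vectorized sketch of the same nearest-neighbour search. This
# helper is illustrative only and is not called anywhere; it assumes numpy
# is imported as np (as in the functions above) and that every row of
# `matrix` has the same length as `test_vec`.
def _cosine_knn_indices_sketch(test_vec, matrix, k=5):
    """Return the indices of the k training vectors most similar to test_vec."""
    test = np.array(test_vec, dtype=float)
    rows = np.array(matrix, dtype=float)
    norms = np.linalg.norm(rows, axis=1) * np.linalg.norm(test)
    # zero similarity where either vector has length 0, like the loop above
    sims = np.divide(rows @ test, norms, out=np.zeros(len(rows)), where=norms != 0)
    # stable descending sort breaks ties by index order, like repeated max()
    return list(np.argsort(-sims, kind="stable")[:k])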
def do_svm_strength(data_list, lex, test_utterance, file, column, matrix):
    """
    We determine the class of each tweet with a k-nearest-neighbour search
    (the surrounding code refers to it as SVM). We work with our data_list
    containing all stemmed and processed utterances and our lexicon without
    occurrence probabilities. First we estimate the vector of the utterance
    from the test set: the values of the vector are the number of times each
    word of our lexicon appears in the utterance. Then we compare this vector
    with all vectors of the training set to find the most similar ones, using
    the normalized dot product as the similarity measure. The class of our
    tweet is estimated from the (intellectually assigned) classes of the k
    tweets with the most similar vectors (k=3 or k=5, whichever gets the
    better results).
    """
    # make the vector of the utterance from the test set
    vec_test_utterance = []
    for line in open(lex):
        line_split = line.split()
        # count how often each lexicon word appears in the utterance
        vec_test_utterance.append(test_utterance.count(line_split[0]))
    distances = []  # similarity to each vector of the training set
    vec_len = np.linalg.norm(np.array(vec_test_utterance))  # length of the test vector
    for vector in matrix:  # loop through all vectors of the training set
        vec2_len = np.linalg.norm(np.array(vector))  # length of the training vector
        # normalized dot product between the test vector and the training vector
        dot_product_first = sum(a * b for a, b in zip(vec_test_utterance, vector))
        if vec_len != 0 and vec2_len != 0:
            dot_product = dot_product_first / (vec_len * vec2_len)
        else:
            dot_product = 0
        distances.append(dot_product)
    # find the 5 most similar vectors for our test vector (only the first 3 are
    # used if k=3 gets the better results)
    k_indices = sorted(range(len(distances)), key=distances.__getitem__, reverse=True)[:5]
    # find the classes of the utterances corresponding to the most similar vectors
    cb_list = machine_learning_processing.make_list_of_column(file, column)
    classes = [cb_list[k] for k in k_indices]
    # number of most similar utterances in each strength class s1..s5
    values = [classes.count("1"), classes.count("2"), classes.count("3"),
              classes.count("4"), classes.count("5")]
    # determine the class of the test utterance (strength class with the most votes)
    strength_class = values.index(max(values)) + 1
    return strength_class
def do_svm_sent_pos_hs(data_list, utterance_unprocessed, lex, test_utterance, file, column, matrix):
    """
    We determine the class of each tweet with a k-nearest-neighbour search
    (the surrounding code refers to it as SVM). We work with our data_list
    containing all stemmed and processed utterances and our lexicon without
    occurrence probabilities. First we estimate the vector of the utterance
    from the test set: the values of the vector are the number of times each
    word of our lexicon appears in the utterance. Then we compare this vector
    with all vectors of the training set to find the most similar ones, using
    the normalized dot product as the similarity measure. The class of our
    tweet is estimated from the (intellectually assigned) classes of the k
    tweets with the most similar vectors (k=3 or k=5, whichever gets the
    better results).
    """
    # make the vector of the utterance from the test set
    vec_test_utterance = []
    for line in open(lex):
        line_split = line.split()
        # count how often each lexicon word appears in the utterance
        vec_test_utterance.append(test_utterance.count(line_split[0]))
    distances = []  # similarity to each vector of the training set
    vec_len = np.linalg.norm(np.array(vec_test_utterance))  # length of the test vector
    for vector in matrix:  # loop through all vectors of the training set
        vec2_len = np.linalg.norm(np.array(vector))  # length of the training vector
        # normalized dot product between the test vector and the training vector
        dot_product_first = sum(a * b for a, b in zip(vec_test_utterance, vector))
        if vec_len != 0 and vec2_len != 0:
            dot_product = dot_product_first / (vec_len * vec2_len)
        else:
            dot_product = 0
        distances.append(dot_product)
    # find the 5 most similar vectors for our test vector (only the first 3 are
    # used if k=3 gets the better results)
    k_indices = sorted(range(len(distances)), key=distances.__getitem__, reverse=True)[:5]
    # find the classes of the utterances corresponding to the most similar vectors
    hs_list = machine_learning_processing.make_list_of_column(file, column)
    classes = [hs_list[k] for k in k_indices]
    hs = classes.count("1")     # number of most similar utterances with the class hate_speech
    no_hs = classes.count("0")  # number of most similar utterances with the class no_hate_speech
    # weight the votes with the probability that the utterance is hate speech /
    # no hate speech based on pos-tagging
    hs_pos_prob = compare_tweets_hatespeech.compare_vec_tweet_hatespeech(
        utterance_unprocessed, 5)
    no_hs_pos_prob = 1 - hs_pos_prob
    hs = hs * hs_pos_prob
    no_hs = no_hs * no_hs_pos_prob
    # determine the class of the test utterance
    if hs >= no_hs:
        hs_class = 1
    else:
        hs_class = 0
    return hs_class
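# Worked example with hypothetical numbers: if 3 of the 5 nearest neighbours
# vote for hate speech and compare_vec_tweet_hatespeech() returns
# hs_pos_prob = 0.4, the weighted scores are hs = 3 * 0.4 = 1.2 and
# no_hs = 2 * 0.6 = 1.2; the >= comparison then breaks the tie in favour of
# the hate speech class.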
            print(utterance_id)
        elif utterance_id == 500:
            print(utterance_id)
        elif utterance_id == 600:
            print(utterance_id)
        elif utterance_id == 700:
            print(utterance_id)
        elif utterance_id == 800:
            print(utterance_id)
        elif utterance_id == 900:
            print(utterance_id)


test_list = machine_learning_processing.process_data("test_set.csv", 5)
training_list = machine_learning_processing.process_data("train_set.csv", 5)
test_list_unprocessed = machine_learning_processing.make_list_of_column("test_set.csv", 5)
term_utterance_matrix = support_vector_machine.do_matrix(training_list, "lexicon.txt")
matrix_pos = support_vector_machine.do_matrix(training_list, "lexicon_pos.txt")
matrix_neut = support_vector_machine.do_matrix(training_list, "lexicon_neut.txt")
matrix_neg = support_vector_machine.do_matrix(training_list, "lexicon_neg.txt")
matrix_pos2 = support_vector_machine.do_matrix(training_list, "lexicon_pos2.txt")
matrix_neut2 = support_vector_machine.do_matrix(training_list, "lexicon_neut2.txt")
matrix_neg2 = support_vector_machine.do_matrix(training_list, "lexicon_neg2.txt")

# cyberbullying