Example #1
def do_test_set_naive_bayes_sent_hs_other(utterances, filename, lex, file,
                                          column, sentimentfile_train,
                                          sentimentfile_test, mode):
    # annotation using naive_bayes will be saved in a new file
    with open(filename, 'w') as f:
        writer = csv.writer(f, delimiter=';')
        writer.writerow(["Utterance", "Hate Speech"])  # header

        sentimentlist = senti_strength.estimate_sentiment_probabilities_other_datasets(
            sentimentfile_train, file, column, mode)
        list_of_sentiments = machine_learning_processing.make_list_of_column(
            sentimentfile_test, 1)

        utterance_id = 0
        for utterance in utterances:
            class_hs = do_sentiment_naive_bayes_hs(
                utterance, lex, list_of_sentiments[utterance_id], sentimentlist
            )  # determine class of the utterance using do_sentiment_naive_bayes_hs()

            # write utterance and its assigned class into the file
            utterance_string = ""
            for word in utterance:
                utterance_string = utterance_string + word + " "
            writer.writerow([utterance_string, class_hs])
            utterance_id += 1
Example #2
def make_lex_based_on_sent(sentimentfile, trainfile, lexname, sentiment, mode, class_column):
    lex = []

    sentiment_list = machine_learning_processing.make_list_of_column(sentimentfile, 1)

    data_list = machine_learning_processing.process_data(trainfile, class_column)

    utterance_id = 0
    for utterance in data_list:
        if sentiment_list[utterance_id] == sentiment or sentiment_list[utterance_id] == str(sentiment):
            for word in utterance:
                if word not in lex:
                    lex.append(word)
                #else:
                    #print("already in lexicon")
        utterance_id += 1

    lex = sorted(lex)                                                       # sort list alphabetically

    # only words that occur at least twice in the dataset will be part of the lexicon
    lex2 = []
    for word in lex:
        count = sum(x.count(word) for x in data_list)
        if count > 1:
            lex2.append(word)

    with open(lexname, 'w') as f:
        if mode == 1:
            for word in lex:
                f.write("%s\n" % word)
        else:
            for word in lex2:
                f.write("%s\n" % word)
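
# A minimal alternative sketch for the "occurs at least twice" filter above, using
# collections.Counter instead of repeated list.count() calls.  The helper name and
# its standalone form are illustrative only; it assumes data_list is a list of token
# lists as produced by machine_learning_processing.process_data().
from collections import Counter

def filter_rare_words(data_list, lex):
    token_counts = Counter(word for utterance in data_list for word in utterance)
    # keep only lexicon entries that appear more than once in the whole dataset
    return [word for word in lex if token_counts[word] > 1]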
Example #3
def do_test_set_svm_sent_pos_bt(utterances_test, utterances_unprocessed,
                                utterances_training, filename, lex_pos,
                                lex_neut, lex_neg, file, column, matrix_pos,
                                matrix_neut, matrix_neg, sentimentfile,
                                columnname):
    # annotation using svm will be saved in a new file
    with open(filename, 'w') as f:
        writer = csv.writer(f, delimiter=';')
        writer.writerow(["Utterance", columnname])  # header

        list_of_sentiments = machine_learning_processing.make_list_of_column(
            sentimentfile, 1)

        utterance_id = 0
        for utterance in utterances_test:
            utterance_unprocessed = utterances_unprocessed[utterance_id]
            # use lexicon with positive, neutral or negative vocabulary based on the sentiment of the utterance
            if list_of_sentiments[utterance_id] == 1 or list_of_sentiments[
                    utterance_id] == "1":
                class_cb = do_svm_sent_pos_bt(
                    utterances_training, utterance_unprocessed, lex_pos,
                    utterance, file, column, matrix_pos
                )  # determine class of the utterance using do_svm_sent_pos_bt()
            elif list_of_sentiments[utterance_id] == 0 or list_of_sentiments[
                    utterance_id] == "0":
                class_cb = do_svm_sent_pos_bt(utterances_training,
                                              utterance_unprocessed, lex_neut,
                                              utterance, file, column,
                                              matrix_neut)
            else:
                class_cb = do_svm_sent_pos_bt(utterances_training,
                                              utterance_unprocessed, lex_neg,
                                              utterance, file, column,
                                              matrix_neg)

            # write utterance and its assigned class into the file
            utterance_string = ""
            for word in utterance:
                utterance_string = utterance_string + word + " "
            writer.writerow([utterance_string, class_cb])
            utterance_id += 1

            # progress indicator
            if utterance_id % 100 == 0 and 0 < utterance_id < 1000:
                print(utterance_id)
Example #4
def do_test_set_naive_bayes_sent_pos_strength(utterances,
                                              utterances_unprocessed, filename,
                                              lex, file, column,
                                              sentimentfile_train,
                                              sentimentfile_test):
    # annotation using naive_bayes will be saved in a new file
    with open(filename, 'w') as f:
        writer = csv.writer(f, delimiter=';')
        writer.writerow(["Utterance", "Cyberbullying Strength"])  # header

        sentimentlist = senti_strength.estimate_sentiment_probabilities_strengths(
            sentimentfile_train, file, column)
        list_of_sentiments = machine_learning_processing.make_list_of_column(
            sentimentfile_test, 1)

        utterance_id = 0
        for utterance in utterances:
            utterance_unprocessed = utterances_unprocessed[utterance_id]
            class_strength = do_naive_bayes_sent_pos_strength(
                utterance, utterance_unprocessed, lex,
                list_of_sentiments[utterance_id], sentimentlist
            )  # determine class of the utterance using do_naive_bayes_sent_pos_strength()

            # write utterance and its assigned class into the file
            utterance_string = ""
            for word in utterance:
                utterance_string = utterance_string + word + " "
            writer.writerow([utterance_string, class_strength])

            # progress indicator
            if utterance_id % 100 == 0 and 0 < utterance_id < 1000:
                print(utterance_id)

            utterance_id += 1
Example #5
def do_test_set_mem_sent_pos_hs_ths(utterances_test, utterances_unprocessed,
                                    filename, lex, file, column,
                                    sentimentfile_train, sentimentfile_test,
                                    mode):
    # annotation using maximum entropy model will be saved in a new file
    with open(filename, 'w') as f:
        writer = csv.writer(f, delimiter=';')
        writer.writerow(["Utterance", "Hate Speech"])  # header

        sentimentlist = senti_strength.estimate_sentiment_probabilities_other_datasets(
            sentimentfile_train, file, column, mode)
        list_of_sentiments = machine_learning_processing.make_list_of_column(
            sentimentfile_test, 1)

        utterance_id = 0
        for utterance in utterances_test:
            utterance_unprocessed = utterances_unprocessed[utterance_id]
            class_hs = do_mem_sent_pos_hs_ths(
                utterance, utterance_unprocessed, lex,
                list_of_sentiments[utterance_id], sentimentlist
            )  # determine class of the utterance using do_mem_sent_pos_hs_ths()

            # write utterance and its assigned class into the file
            utterance_string = ""
            for word in utterance:
                utterance_string = utterance_string + word + " "
            writer.writerow([utterance_string, class_hs])

            # progress indicator
            if utterance_id % 100 == 0 and 0 < utterance_id < 1000:
                print(utterance_id)

            utterance_id += 1
Example #6
def estimate_sentiment_probabilities(sentimentfile, cbfile, cbrow):
    cb_list = machine_learning_processing.make_list_of_column(cbfile, cbrow)    # list of cyberbullying values

    count_pos = 0                                           # number of positive utterances
    count_neut = 0                                          # number of neutral utterances
    count_neg = 0                                           # number of negative utterances

    pos_cb = 0                                              # number of positive utterances in class cyberbullying
    pos_no_cb = 0                                           # number of positive utterances in class no_cyberbullying
    neut_cb = 0                                             # number of neutral utterances in class cyberbullying
    neut_no_cb = 0                                          # number of neutral utterances in class no_cyberbullying
    neg_cb = 0                                              # number of negative utterances in class cyberbullying
    neg_no_cb = 0                                           # number of negative utterances in class no_cyberbullying

    with open(sentimentfile, 'r') as file:
        reader = csv.reader(file, delimiter=';')
        next(reader, None)                                  # skip header

        utterance_id = 0
        for row in reader:
            if row[1] == 1 or row[1] == "1":
                count_pos += 1

                if cb_list[utterance_id] == 1 or cb_list[utterance_id] == "1":
                    pos_cb += 1
                else:
                    pos_no_cb += 1
            elif row[1] == -1 or row[1] == "-1":
                count_neg += 1

                if cb_list[utterance_id] == 1 or cb_list[utterance_id] == "1":
                    neg_cb += 1
                else:
                    neg_no_cb += 1
            else:
                count_neut += 1

                if cb_list[utterance_id] == 1 or cb_list[utterance_id] == "1":
                    neut_cb += 1
                else:
                    neut_no_cb += 1

            utterance_id += 1

    #print(count_pos, count_neut, count_neg)
    #print(pos_cb, pos_no_cb, neut_cb, neut_no_cb, neg_cb, neg_no_cb)

    p_pos_cb = pos_cb / count_pos                           # probability of a positive utterance being in class cyberbullying
    p_pos_no_cb = pos_no_cb / count_pos                     # probability of a positive utterance being in class no_cyberbullying
    p_neut_cb = neut_cb / count_neut                        # probability of a neutral utterance being in class cyberbullying
    p_neut_no_cb = neut_no_cb / count_neut                  # probability of a neutral utterance being in class no_cyberbullying
    p_neg_cb = neg_cb / count_neg                           # probability of a negative utterance being in class cyberbullying
    p_neg_no_cb = neg_no_cb / count_neg                     # probability of a negative utterance being in class no_cyberbullying

    p_pos_cb = round(p_pos_cb, 3)                           # round probabilities to values with 3 positions behind decimal point
    p_pos_no_cb = round(p_pos_no_cb, 3)
    p_neut_cb = round(p_neut_cb, 3)
    p_neut_no_cb = round(p_neut_no_cb, 3)
    p_neg_cb = round(p_neg_cb, 3)
    p_neg_no_cb = round(p_neg_no_cb, 3)

    sentiment_list = [p_pos_cb, p_pos_no_cb, p_neut_cb, p_neut_no_cb, p_neg_cb, p_neg_no_cb]
    return sentiment_list
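
# The six values returned above are the conditional probabilities
# [p_pos_cb, p_pos_no_cb, p_neut_cb, p_neut_no_cb, p_neg_cb, p_neg_no_cb].
# A hypothetical helper (not part of the original code) showing how a Naive Bayes
# classifier could look up the pair of priors matching an utterance's sentiment:
def lookup_sentiment_priors(sentiment_list, sentiment):
    if sentiment == 1 or sentiment == "1":       # positive utterance
        return sentiment_list[0], sentiment_list[1]
    elif sentiment == -1 or sentiment == "-1":   # negative utterance
        return sentiment_list[4], sentiment_list[5]
    else:                                        # neutral utterance
        return sentiment_list[2], sentiment_list[3]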
Example #7
def estimate_sentiment_probabilities_strengths(sentimentfile, cbfile, cbrow):
    strength_list = machine_learning_processing.make_list_of_column(cbfile, cbrow)    # list of cyberbullying values

    count_pos = 0                                           # number of positive utterances
    count_neut = 0                                          # number of neutral utterances
    count_neg = 0                                           # number of negative utterances

    pos_s1 = 0                                              # number of positive utterances in class s1
    pos_s2 = 0
    pos_s3 = 0
    pos_s4 = 0
    pos_s5 = 0

    neut_s1 = 0                                             # number of neutral utterances in class s1
    neut_s2 = 0
    neut_s3 = 0
    neut_s4 = 0
    neut_s5 = 0

    neg_s1 = 0                                              # number of negative utterances in class s1
    neg_s2 = 0
    neg_s3 = 0
    neg_s4 = 0
    neg_s5 = 0

    with open(sentimentfile, 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter=';')
        next(reader, None)                                  # skip header

        utterance_id = 0
        for row in reader:
            if row[1] == 1 or row[1] == "1":
                count_pos += 1

                if strength_list[utterance_id] == 1 or strength_list[utterance_id] == "1":
                    pos_s1 += 1
                elif strength_list[utterance_id] == 2 or strength_list[utterance_id] == "2":
                    pos_s2 += 1
                elif strength_list[utterance_id] == 3 or strength_list[utterance_id] == "3":
                    pos_s3 += 1
                elif strength_list[utterance_id] == 4 or strength_list[utterance_id] == "4":
                    pos_s4 += 1
                else:
                    pos_s5 += 1
            elif row[1] == -1 or row[1] == "-1":
                count_neg += 1

                if strength_list[utterance_id] == 1 or strength_list[utterance_id] == "1":
                    neg_s1 += 1
                elif strength_list[utterance_id] == 2 or strength_list[utterance_id] == "2":
                    neg_s2 += 1
                elif strength_list[utterance_id] == 3 or strength_list[utterance_id] == "3":
                    neg_s3 += 1
                elif strength_list[utterance_id] == 4 or strength_list[utterance_id] == "4":
                    neg_s4 += 1
                else:
                    neg_s5 += 1
            else:
                count_neut += 1

                if strength_list[utterance_id] == 1 or strength_list[utterance_id] == "1":
                    neut_s1 += 1
                elif strength_list[utterance_id] == 2 or strength_list[utterance_id] == "2":
                    neut_s2 += 1
                elif strength_list[utterance_id] == 3 or strength_list[utterance_id] == "3":
                    neut_s3 += 1
                elif strength_list[utterance_id] == 4 or strength_list[utterance_id] == "4":
                    neut_s4 += 1
                else:
                    neut_s5 += 1

            utterance_id += 1

    #print(count_pos, count_neut, count_neg)
    #print(pos_s1, pos_s2, pos_s3, pos_s4, pos_s5)
    #print(neut_s1,neut_s2,neut_s3,neut_s4,neut_s5)
    #print(neg_s1, neg_s2, neg_s3, neg_s4, neg_s5)

    p_pos_s1 = pos_s1 / count_pos                           # probability of a positive utterance being in class s1
    p_pos_s2 = pos_s2 / count_pos
    p_pos_s3 = pos_s3 / count_pos
    p_pos_s4 = pos_s4 / count_pos
    p_pos_s5 = pos_s5 / count_pos
    p_neut_s1 = neut_s1 / count_neut                        # probability of a neutral utterance being in class s1
    p_neut_s2 = neut_s2 / count_neut
    p_neut_s3 = neut_s3 / count_neut
    p_neut_s4 = neut_s4 / count_neut
    p_neut_s5 = neut_s5 / count_neut
    p_neg_s1 = neg_s1 / count_neg                           # probability of a negative utterance being in class s1
    p_neg_s2 = neg_s2 / count_neg
    p_neg_s3 = neg_s3 / count_neg
    p_neg_s4 = neg_s4 / count_neg
    p_neg_s5 = neg_s5 / count_neg

    p_pos_s1 = round(p_pos_s1, 3)                           # round probabilities to values with 3 positions behind decimal point
    p_pos_s2 = round(p_pos_s2, 3)
    p_pos_s3 = round(p_pos_s3, 3)
    p_pos_s4 = round(p_pos_s4, 3)
    p_pos_s5 = round(p_pos_s5, 3)
    p_neut_s1 = round(p_neut_s1, 3)
    p_neut_s2 = round(p_neut_s2, 3)
    p_neut_s3 = round(p_neut_s3, 3)
    p_neut_s4 = round(p_neut_s4, 3)
    p_neut_s5 = round(p_neut_s5, 3)
    p_neg_s1 = round(p_neg_s1, 3)
    p_neg_s2 = round(p_neg_s2, 3)
    p_neg_s3 = round(p_neg_s3, 3)
    p_neg_s4 = round(p_neg_s4, 3)
    p_neg_s5 = round(p_neg_s5, 3)

    sentiment_list = [p_pos_s1, p_pos_s2, p_pos_s3, p_pos_s4, p_pos_s5, p_neut_s1, p_neut_s2, p_neut_s3, p_neut_s4, p_neut_s5, p_neg_s1, p_neg_s2, p_neg_s3, p_neg_s4, p_neg_s5]
    return sentiment_list
Example #8
            elif utterance_id == 600:
                print(600)
            elif utterance_id == 700:
                print(700)
            elif utterance_id == 800:
                print(800)
            elif utterance_id == 900:
                print(900)

            utterance_id += 1


# labeled data
test_list_ld = machine_learning_processing.process_data(
    "labeled_data_test.csv", 6)
test_list_ld_unprocessed = machine_learning_processing.make_list_of_column(
    "labeled_data_test.csv", 6)
do_test_set_mem_sent_pos_ld(test_list_ld, test_list_ld_unprocessed,
                            "labeled_data_mem_final.csv",
                            "lexicon_with_occurences_ld.txt",
                            "labeled_data_train.csv", 5,
                            "labeled_data_train_with_sentiment.csv",
                            "labeled_data_test_with_sentiment.csv", 2)
do_test_set_mem_sent_pos_hs_ld(test_list_ld, test_list_ld_unprocessed,
                               "labeled_data_mem_final_hs.csv",
                               "lexicon_with_occurences_hs_ld.txt",
                               "labeled_data_train.csv", 5,
                               "labeled_data_train_with_sentiment.csv",
                               "labeled_data_test_with_sentiment.csv", 2)
estimation.test_results("labeled_data_test.csv", 5,
                        "labeled_data_mem_final.csv", 1)
estimation.test_results("labeled_data_test.csv", 5,
Example #9
def do_svm_other_hs(data_list, lex, test_utterance, file, column, matrix,
                    mode):
    """
    We use the Support Vector Machine (with k-nearest neighbour) algorithm to determine the class of each tweet.
    We work with our data_list containing all stemmed and processed utterances and our lexicon without occurrence probabilities.

    First we need to estimate the vectors for each utterance of the training set.
    The values of the vector are the number of times each word of our lexicon appears in our utterance.
    Then we compare for each utterance its vector with all other vectors to find the ones most similar.
    To do this, we need to estimate the normalized dot product.

    The class of our tweet is estimated by the (intellectually assigned) class of the k tweets with the most similar vector
    (k=3 or k=5, whichever gets the better results).
    """

    # make vector from the utterance of the test set
    vec_test_utterance = []  # vector of the utterance from the test_set
    for line in open(lex):
        line_split = line.split()
        n = 0
        m = 0
        if line_split[0] in test_utterance:
            #print(line_split[0])
            while m < len(test_utterance):
                if line_split[0] == test_utterance[m]:
                    n += 1  # count how often the word appears in the utterance
                    m += 1  # loop through the words of the utterance
                else:
                    m += 1
        vec_test_utterance.append(n)
    #print(vec_test_utterance)

    distances = []  # array of distances to each vector of the training set
    np_vec = np.array(vec_test_utterance)
    vec_len = np.linalg.norm(np_vec)  # length of test_vector
    #print(vec_len)

    for vector in matrix:  # loop through all vectors from the training set
        np_vec2 = np.array(vector)
        vec2_len = np.linalg.norm(
            np_vec2)  # length of vector from training set

        # estimate normalized dot product between the test vector and all vectors from the training set to calculate the distance
        dot_product_first = 0
        i = 0
        while i < len(vec_test_utterance):
            dot_product_first = dot_product_first + vec_test_utterance[
                i] * vector[i]
            i += 1
        if vec_len != 0 and vec2_len != 0:
            dot_product = dot_product_first / (vec_len * vec2_len
                                               )  # normalized dot product
        else:
            dot_product = 0
        distances.append(dot_product)  # save all dot products into the list

    # find the 3 or 5 most similar vectors for our test vector
    k1 = distances.index(max(distances))
    k1_distance = max(distances)
    distances[k1] = 0  # set dot product to 0 to determine the next best vector
    k2 = distances.index(max(distances))
    k2_distance = max(distances)
    distances[k2] = 0
    k3 = distances.index(max(distances))
    k3_distance = max(distances)
    distances[k3] = 0  # if k=5 is used instead of k=3
    k4 = distances.index(max(distances))
    k4_distance = max(distances)
    distances[k4] = 0
    k5 = distances.index(max(distances))
    k5_distance = max(distances)
    distances[k5] = 0

    u1 = ""  # utterances to which the dot products belong
    u2 = ""
    u3 = ""
    u4 = ""  # if k=5 is used instead of k=3
    u5 = ""

    u1 = data_list[k1]
    u2 = data_list[k2]
    u3 = data_list[k3]
    u4 = data_list[k4]  # if k=5 is used instead of k=3
    u5 = data_list[k5]

    #print(k1, k1_distance, u1)
    #print(k2, k2_distance, u2)
    #print(k3, k3_distance, u3)
    #print(k4, k4_distance, u4)
    #print(k5, k5_distance, u5)

    # find the classes of the utterances corresponding to the most similar vectors
    classes = []
    cb_list = machine_learning_processing.make_list_of_column(file, column)
    classes.append(cb_list[k1])
    classes.append(cb_list[k2])
    classes.append(cb_list[k3])
    classes.append(cb_list[k4])
    classes.append(cb_list[k5])
    #print(classes)

    if mode == 2:
        cb = classes.count("0")  # number of most similar utterances with the class cyberbullying
        no_cb = classes.count("2") + classes.count("1")  # number of most similar utterances with the class no_cyberbullying
    else:
        cb = classes.count("2")
        no_cb = classes.count("0") + classes.count("1")

    cb_class = ""  # determine class of the test_utterance
    #if cb == max([cb, no_cb]):
    #    cb_class = 1
    #else:
    #    cb_class = 0
    #print(cb_class)

    contains_curses = False
    curses = machine_learning_processing.make_list_of_curse_words("curses.txt")

    for word in test_utterance:
        if word in curses:
            contains_curses = True  # utterances that contain curses will automatically be labeled as cyberbullying later

    if cb >= 2:
        cb_class = 1
    elif contains_curses == True:  # utterances that contain curses will automatically be labeled as cyberbullying
        cb_class = 1
    else:
        cb_class = 0

    return cb_class
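
# The "normalized dot product" computed in the while-loop above is the cosine
# similarity between the two count vectors.  A minimal numpy sketch of the same
# quantity (the helper name is illustrative, not part of the original code):
import numpy as np

def cosine_similarity(vec_a, vec_b):
    a = np.array(vec_a, dtype=float)
    b = np.array(vec_b, dtype=float)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    if norm_a == 0 or norm_b == 0:   # same zero-vector guard as above
        return 0.0
    return float(np.dot(a, b) / (norm_a * norm_b))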
Example #10
def do_svm_strength(data_list, lex, test_utterance, file, column, matrix):
    """
    We use the Support Vector Machine (with k-nearest neighbour) algorithm to determine the class of each tweet.
    We work with our data_list containing all stemmed and processed utterances and our lexicon without occurrence probabilities.

    First we need to estimate the vectors for each utterance of the training set.
    The values of the vector are the number of times each word of our lexicon appears in our utterance.
    Then we compare for each utterance its vector with all other vectors to find the ones most similar.
    To do this, we need to estimate the normalized dot product.

    The class of our tweet is estimated by the (intellectually assigned) class of the k tweets with the most similar vector
    (k=3 or k=5, whichever gets the better results).
    """

    # make vector from the utterance of the test set
    vec_test_utterance = []  # vector of the utterance from the test_set
    for line in open(lex):
        line_split = line.split()
        n = 0
        m = 0
        if line_split[0] in test_utterance:
            #print(line_split[0])
            while m < len(test_utterance):
                if line_split[0] == test_utterance[m]:
                    n += 1  # count how often the word appears in the utterance
                    m += 1  # loop through the words of the utterance
                else:
                    m += 1
        vec_test_utterance.append(n)
    #print(vec_test_utterance)

    distances = []  # array of distances to each vector of the training set
    np_vec = np.array(vec_test_utterance)
    vec_len = np.linalg.norm(np_vec)  # length of test_vector
    #print(vec_len)

    for vector in matrix:  # loop through all vectors from the training set
        np_vec2 = np.array(vector)
        vec2_len = np.linalg.norm(
            np_vec2)  # length of vector from training set

        # estimate normalized dot product between the test vector and all vectors from the training set to calculate the distance
        dot_product_first = 0
        i = 0
        while i < len(vec_test_utterance):
            dot_product_first = dot_product_first + vec_test_utterance[
                i] * vector[i]
            i += 1
        if vec_len != 0 and vec2_len != 0:
            dot_product = dot_product_first / (vec_len * vec2_len
                                               )  # normalized dot product
        else:
            dot_product = 0
        distances.append(dot_product)  # save all dot products into the list

    # find the 3 or 5 most similar vectors for our test vector
    k1 = distances.index(max(distances))
    k1_distance = max(distances)
    distances[k1] = 0  # set dot product to 0 to determine the next best vector
    k2 = distances.index(max(distances))
    k2_distance = max(distances)
    distances[k2] = 0
    k3 = distances.index(max(distances))
    k3_distance = max(distances)
    distances[k3] = 0  # if k=5 is used instead of k=3
    k4 = distances.index(max(distances))
    k4_distance = max(distances)
    distances[k4] = 0
    k5 = distances.index(max(distances))
    k5_distance = max(distances)
    distances[k5] = 0

    u1 = ""  # utterances to which the dot products belong
    u2 = ""
    u3 = ""
    u4 = ""  # if k=5 is used instead of k=3
    u5 = ""

    u1 = data_list[k1]
    u2 = data_list[k2]
    u3 = data_list[k3]
    u4 = data_list[k4]  # if k=5 is used instead of k=3
    u5 = data_list[k5]

    #print(k1, k1_distance, u1)
    #print(k2, k2_distance, u2)
    #print(k3, k3_distance, u3)
    #print(k4, k4_distance, u4)
    #print(k5, k5_distance, u5)

    # find the classes of the utterances corresponding to the most similar vectors
    classes = []
    cb_list = machine_learning_processing.make_list_of_column(file, column)
    classes.append(cb_list[k1])
    classes.append(cb_list[k2])
    classes.append(cb_list[k3])
    classes.append(cb_list[k4])
    classes.append(cb_list[k5])
    #print(classes)

    s1 = classes.count("1")  # number of most similar utterances with the class s1
    s2 = classes.count("2")  # number of most similar utterances with the class s2
    s3 = classes.count("3")  # number of most similar utterances with the class s3
    s4 = classes.count("4")  # number of most similar utterances with the class s4
    s5 = classes.count("5")  # number of most similar utterances with the class s5
    values = [s1, s2, s3, s4, s5]
    strength_class = ""  # determine class of the test_utterance

    if max(values) == values[0]:  # determine class (max P(class|tweet))
        strength_class = 1
    elif max(values) == values[1]:
        strength_class = 2
    elif max(values) == values[2]:
        strength_class = 3
    elif max(values) == values[3]:
        strength_class = 4
    else:
        strength_class = 5

    return strength_class
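
# The strength vote above (count classes "1".."5", pick the maximum, break ties in
# favour of the lower strength) can be written more compactly.  A hypothetical
# equivalent using collections.Counter, assuming `classes` holds the labels of the
# k nearest neighbours as strings:
from collections import Counter

def majority_strength(classes):
    votes = Counter(label for label in classes if label in {"1", "2", "3", "4", "5"})
    if not votes:
        return 1  # no valid labels: the if/elif chain above falls through to class 1
    best = max(votes.values())
    # ties break toward the lower strength, matching the if/elif order above
    return min(int(label) for label, count in votes.items() if count == best)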
Example #11
def do_svm_sent_pos_hs(data_list, utterance_unprocessed, lex, test_utterance,
                       file, column, matrix):
    """
    We use the Support Vector Machine (with k-nearest neighbour) algorithm to determine the class of each tweet.
    We work with our data_list containing all stemmed and processed utterances and our lexicon without occurrence probabilities.

    First we need to estimate the vectors for each utterance of the training set.
    The values of the vector are the number of times each word of our lexicon appears in our utterance.
    Then we compare for each utterance its vector with all other vectors to find the ones most similar.
    To do this, we need to estimate the normalized dot product.

    The class of our tweet is estimated by the (intellectually assigned) class of the k tweets with the most similar vector
    (k=3 or k=5, whichever gets the better results).
    """

    # make vector from the utterance of the test set
    vec_test_utterance = []  # vector of the utterance from the test_set
    for line in open(lex):
        line_split = line.split()
        n = 0
        m = 0
        if line_split[0] in test_utterance:
            while m < len(test_utterance):
                if line_split[0] == test_utterance[m]:
                    n += 1  # count how often the word appears in the utterance
                    m += 1  # loop through the words of the utterance
                else:
                    m += 1
        vec_test_utterance.append(n)

    distances = []  # array of distances to each vector of the training set
    np_vec = np.array(vec_test_utterance)
    vec_len = np.linalg.norm(np_vec)  # length of test_vector

    for vector in matrix:  # loop through all vectors from the training set
        np_vec2 = np.array(vector)
        vec2_len = np.linalg.norm(
            np_vec2)  # length of vector from training set

        # estimate normalized dot product between the test vector and all vectors from the training set to calculate the distance
        dot_product_first = 0
        i = 0
        while i < len(vec_test_utterance):
            dot_product_first = dot_product_first + vec_test_utterance[
                i] * vector[i]
            i += 1
        if vec_len != 0 and vec2_len != 0:
            dot_product = dot_product_first / (vec_len * vec2_len
                                               )  # normalized dot product
        else:
            dot_product = 0
        distances.append(dot_product)  # save all dot products into the list

    # find the 3 or 5 most similar vectors for our test vector
    k1 = distances.index(max(distances))
    k1_distance = max(distances)
    distances[k1] = 0  # set dot product to 0 to determine the next best vector
    k2 = distances.index(max(distances))
    k2_distance = max(distances)
    distances[k2] = 0
    k3 = distances.index(max(distances))
    k3_distance = max(distances)
    distances[k3] = 0  # if k=5 is used instead of k=3
    k4 = distances.index(max(distances))
    k4_distance = max(distances)
    distances[k4] = 0
    k5 = distances.index(max(distances))
    k5_distance = max(distances)
    distances[k5] = 0

    u1 = ""  # utterances to which the dot products belong
    u2 = ""
    u3 = ""
    u4 = ""  # if k=5 is used instead of k=3
    u5 = ""

    u1 = data_list[k1]
    u2 = data_list[k2]
    u3 = data_list[k3]
    u4 = data_list[k4]  # if k=5 is used instead of k=3
    u5 = data_list[k5]

    # find the classes of the utterances corresponding to the most similar vectors
    classes = []
    hs_list = machine_learning_processing.make_list_of_column(file, column)
    classes.append(hs_list[k1])
    classes.append(hs_list[k2])
    classes.append(hs_list[k3])
    classes.append(hs_list[k4])
    classes.append(hs_list[k5])

    hs = classes.count("1")  # number of most similar utterances with the class hate_speech
    no_hs = classes.count("0")  # number of most similar utterances with the class no_hate_speech
    hs_class = ""  # determine class of the test_utterance

    # probability that utterance is in class cyberbullying / no cyberbullying based on pos-tagging
    hs_pos_prob = compare_tweets_hatespeech.compare_vec_tweet_hatespeech(
        utterance_unprocessed, 5)
    no_hs_pos_prob = 1 - hs_pos_prob

    hs = hs * hs_pos_prob
    no_hs = no_hs * no_hs_pos_prob

    if hs >= no_hs:
        hs_class = 1
    else:
        hs_class = 0

    return hs_class
Example #12
                print(utterance_id)
            elif utterance_id == 500:
                print(utterance_id)
            elif utterance_id == 600:
                print(utterance_id)
            elif utterance_id == 700:
                print(utterance_id)
            elif utterance_id == 800:
                print(utterance_id)
            elif utterance_id == 900:
                print(utterance_id)


test_list = machine_learning_processing.process_data("test_set.csv", 5)
training_list = machine_learning_processing.process_data("train_set.csv", 5)
test_list_unprocessed = machine_learning_processing.make_list_of_column(
    "test_set.csv", 5)

term_utterance_matrix = support_vector_machine.do_matrix(
    training_list, "lexicon.txt")
matrix_pos = support_vector_machine.do_matrix(training_list, "lexicon_pos.txt")
matrix_neut = support_vector_machine.do_matrix(training_list,
                                               "lexicon_neut.txt")
matrix_neg = support_vector_machine.do_matrix(training_list, "lexicon_neg.txt")
matrix_pos2 = support_vector_machine.do_matrix(training_list,
                                               "lexicon_pos2.txt")
matrix_neut2 = support_vector_machine.do_matrix(training_list,
                                                "lexicon_neut2.txt")
matrix_neg2 = support_vector_machine.do_matrix(training_list,
                                               "lexicon_neg2.txt")

# cyberbullying