Example #1
import re

import numpy as np

# get_translated_text, valid_index, is_valid, and DATA_LEN are assumed to
# come from the surrounding module.
def get_np_array(myDb, engLines, users, words, year):
    reviewed = []
    nparr = np.zeros((len(users), len(words)), dtype=int)
    errors = [0, 0, 0, 0, 0, 0]

    rawLines = get_translated_text("Translated_text.txt")

    for i in range(0, DATA_LEN):
        if valid_index(myDb, rawLines, i, year, errors):
            user = myDb.ID_coded[i]
            body = engLines[i].lower()

            if user not in reviewed:
                reviewed.append(user)
                wlist = body.split()
                for word in wlist:
                    # Strip everything except letters before counting.
                    word = re.sub('[^a-zA-Z]+', '', word)
                    if is_valid(word):
                        nparr[users.index(user), words.index(word)] += 1

    # Check which users have no words counted in nparr:
    #for i in range(len(users)):
    #    if sum(nparr[i]) == 0:
    #        print("SOMETHING IS WRONG ({})!!!".format(users[i]))

    return nparr
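
The counting scheme above is a plain document-term matrix. A minimal
self-contained sketch of the same idea, using hypothetical toy users and
vocabulary instead of the thesis database:

import numpy as np

# Hypothetical toy inputs; the real code derives these from the database.
users = ["u1", "u2"]
words = ["cat", "dog", "fish"]
texts = {"u1": "cat dog cat", "u2": "fish"}

counts = np.zeros((len(users), len(words)), dtype=int)
for user, body in texts.items():
    for word in body.split():
        if word in words:
            counts[users.index(user), words.index(word)] += 1

print(counts)  # row u1: [2 1 0], row u2: [0 0 1]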
Example #2
import lda
import numpy as np

# Helper functions and the DATA_LEN/N/ITERATIONS constants come from the
# surrounding module.
def run_tm_and_dump(cand_year, files):
    db = get_cands_data('thesis_db.xls', DATA_LEN)
    engLines = get_translated_text("Translated_text.txt")

    engLines = engLines[:DATA_LEN]

    reviewed_cands = []
    cand_ids = []
    index2cand = {}
    run_text = []
    errors = [0, 0, 0, 0, 0, 0]
    # Keep only the first valid entry per candidate for the given year.
    for index, line in enumerate(engLines):
        this_cand_id = db.ID_coded[index]
        if this_cand_id not in reviewed_cands:
            reviewed_cands.append(this_cand_id)
            cand = get_cand(db, engLines, index, [cand_year], errors)
            if cand is not None:
                run_text.append(line)
                cand_ids.append(cand.id)
                index2cand[index] = cand

    lem_text = get_data_lemmatized(run_text)
    id2word, corpus = text2corpus(lem_text)
    X2 = corpus2nparray(corpus, id2word)

    users, words, cands = get_users_and_words(db, engLines, cand_year,
                                              files[0])
    X = get_np_array(db, engLines, users, words, cand_year)

    model = lda.LDA(n_topics=N, n_iter=ITERATIONS, random_state=1)
    model.fit(X2)  # model.fit_transform(X2) is also available
    topic_word = model.topic_word_  # model.components_ also works
    n_top_words = 8

    # The model was fit on X2, whose columns follow id2word, so the topic
    # vocabulary is taken from id2word (assuming a gensim-style id-to-token
    # mapping) rather than from the unrelated words list.
    vocab = np.array([id2word[j] for j in range(len(id2word))])
    for i, topic_dist in enumerate(topic_word):
        topic_words = vocab[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
        print('Topic {}: {}'.format(i, ' '.join(topic_words)))

    #dump_tm(db, model.doc_topic_, users, cands, files)
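
The corpus2nparray helper is project-specific, but gensim ships an
equivalent conversion; a minimal sketch, assuming a gensim Dictionary and
bag-of-words corpus like the ones text2corpus appears to produce:

import numpy as np
from gensim.corpora import Dictionary
from gensim.matutils import corpus2dense

texts = [["topic", "model", "topic"], ["model", "test"]]
id2word = Dictionary(texts)
corpus = [id2word.doc2bow(t) for t in texts]

# corpus2dense returns a terms-by-documents float matrix; lda.LDA expects
# documents-by-terms integer counts, hence the transpose and cast.
X2 = corpus2dense(corpus, num_terms=len(id2word)).T.astype(np.int64)
print(X2.shape)  # (2, 3)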
Example #3
import re

# get_translated_text, valid_index, is_valid, and DATA_LEN come from the
# surrounding module.
def get_users_and_words(myDb, engLines, year, outFile):
    users = []
    words = []
    cands = {}
    duplicate = 0
    errors = [0, 0, 0, 0, 0, 0]

    rawLines = get_translated_text("Translated_text.txt")

    for i in range(DATA_LEN):
        if valid_index(myDb, rawLines, i, year, errors):
            user = myDb.ID_coded[i]
            body = engLines[i].lower()
            if user not in users:
                users.append(user)
                cands[user] = i
                for word in body.split():
                    word = re.sub('[^a-zA-Z]+', '', word)
                    if is_valid(word) and word not in words:
                        words.append(word)
            else:
                duplicate += 1

    #outFile.write("Users: {} (Diff Year: {}, Inv text: {}, T.ratio: {}, "
    #              "Dup: {}, Inv Off: {}, Inv Sadir: {}, Inv Fail: {})\n".format(
    #                  len(users), errors[YEAR_MISMATCH], errors[INVALID_TEXT],
    #                  errors[INVALID_TRANS_RATIO], duplicate,
    #                  errors[INVALID_OFFICER], errors[INVALID_SADIR],
    #                  errors[INVALID_FAIL]))
    return users, words, cands
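
The per-word cleanup above reduces to a regex strip plus a validity check.
A tiny self-contained sketch of that tokenization step, with a simple
length check standing in for the module's is_valid:

import re

def clean_tokens(body):
    """Lowercase, strip non-letters, and keep unique surviving words."""
    words = []
    for word in body.lower().split():
        word = re.sub('[^a-zA-Z]+', '', word)
        # Stand-in for is_valid(): keep words of three or more letters.
        if len(word) >= 3 and word not in words:
            words.append(word)
    return words

print(clean_tokens("The officer's report, dated 2015, was re-checked."))
# ['the', 'officers', 'report', 'dated', 'was', 'rechecked']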
Example #4
import lda
import numpy as np

# Helper functions and the DATA_LEN/N/ITERATIONS constants come from the
# surrounding module.
def run_topic_modeling(cand_year, outFile):
    outFile.write("Results for {}\n".format(cand_year))

    db = get_cands_data('thesis_db.xls', DATA_LEN)
    #engLines = get_translated_text("Translated_text.txt")
    engLines = get_translated_text("lemmatized_db.txt")

    users, words, cands = get_users_and_words(db, engLines, cand_year, outFile)
    X = get_np_array(db, engLines, users, words, cand_year)

    model = lda.LDA(n_topics=N, n_iter=ITERATIONS, random_state=1)
    model.fit(X)  # model.fit_transform(X) is also available
    topic_word = model.topic_word_  # model.components_ also works
    n_top_words = 8

    for i, topic_dist in enumerate(topic_word):
        # Take the n_top_words highest-weight words for this topic.
        topic_words = np.array(words)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
        msg = 'Topic {}: {}'.format(i, ' '.join(topic_words))
        print(msg)
        outFile.write(msg + '\n')

    #print_topic_modeling_stats(db, model.doc_topic_, users, cands, outFile)
    outFile.write('\n')
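
For reference, the lda package's fit/top-words pattern used in both runs
works on any non-negative integer document-term matrix. A minimal sketch
with random toy counts, where small literals stand in for the module's N
and ITERATIONS constants:

import lda
import numpy as np

rng = np.random.RandomState(1)
X = rng.randint(0, 5, size=(20, 30))  # 20 documents, 30-word vocabulary
vocab = ['w{}'.format(j) for j in range(30)]

model = lda.LDA(n_topics=3, n_iter=100, random_state=1)
model.fit(X)  # X must hold non-negative integer counts

for i, topic_dist in enumerate(model.topic_word_):
    # argsort ascending, then take the last eight entries in reverse order.
    top = np.array(vocab)[np.argsort(topic_dist)][:-9:-1]
    print('Topic {}: {}'.format(i, ' '.join(top)))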
Example #5
            #print("prediction: {}, result: {} - TRUE".format(prediction[0], predict_results[i]))
            right[prediction[0]] += 1
        else:
            print("prediction: {}, result: {} - FALSE".format(
                prediction[0], predict_results[i]))

    print("Train data: {}/{}".format(sum(train_results), len(train_results)))
    print("Predict data: {}/{}".format(sum(predict_results),
                                       len(predict_results)))
    print("predictions: [0]={}/{}, [1]={}/{}, rate = {}".format(
        right[0], results[0], right[1], results[1],
        sum(right) / len(predict_samples)))


print(datetime.datetime.now())
full_text = get_translated_text("Translated_text.txt")
#full_text = get_translated_text("fake_translated.txt")
text = full_text[:MAX_LINE]
db = get_cands_data('thesis_db.xls', MAX_LINE)
#db = get_cands_data('fake_db.xls', MAX_LINE)
reviewed_cands = []
characters_map = {}
close_type_map = {}
errors = [0, 0, 0, 0, 0, 0]
lda_text = [[], [], [], []]
accum_kkz_text = [""] * 4
accum_grade_text = [""] * 41
entire_text = []
sample1_text = []
sample2_text = []
boys = []
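
The evaluation loop at the top of this example is cut off mid-branch. A
hedged reconstruction of the tallying pattern, using a hypothetical
scikit-learn classifier and made-up samples in place of whatever the
original trained and held out:

from sklearn.linear_model import LogisticRegression

# Hypothetical stand-ins for the excerpt's train/predict split.
train_samples = [[0.0], [0.2], [0.8], [1.0]]
train_results = [0, 0, 1, 1]
predict_samples = [[0.1], [0.9], [0.4]]
predict_results = [0, 1, 1]

clf = LogisticRegression().fit(train_samples, train_results)

right = [0, 0]
results = [predict_results.count(0), predict_results.count(1)]
for i, sample in enumerate(predict_samples):
    prediction = clf.predict([sample])
    if prediction[0] == predict_results[i]:
        right[prediction[0]] += 1
    else:
        print("prediction: {}, result: {} - FALSE".format(
            prediction[0], predict_results[i]))

print("predictions: [0]={}/{}, [1]={}/{}, rate = {}".format(
    right[0], results[0], right[1], results[1],
    sum(right) / len(predict_samples)))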
Example #6
import gensim

# data = df.content.values.tolist()

# Remove Emails
# data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]

# Remove new line characters
# data = [re.sub('\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
# data = [re.sub("\'", "", sent) for sent in data]

# print("after remove mails, new lines and quotes")
# pprint(data[:1])
# print("\n\n")

sentences = get_translated_text("Translated_text.txt")
sen = sentences[:2000]  # optional truncation; unused below

data_words = list(sent_to_words(sentences))

# Build the bigram and trigram models (a higher threshold yields fewer phrases).
bigram = gensim.models.Phrases(data_words, min_count=3, threshold=20)
trigram = gensim.models.Phrases(bigram[data_words], threshold=20)

# Phraser wraps a trained Phrases model into a faster, lighter applicator.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
# print(trigram_mod[bigram_mod[data_words[0]]])
# print("\n\n")
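
A minimal self-contained sketch of the same bigram detection on toy token
lists; the thresholds are lowered so the tiny corpus actually yields a
phrase:

import gensim

docs = [["new", "york", "is", "big"],
        ["new", "york", "city", "never", "sleeps"],
        ["i", "love", "new", "york", "city"]]

bigram = gensim.models.Phrases(docs, min_count=2, threshold=0.5)
bigram_mod = gensim.models.phrases.Phraser(bigram)

print(bigram_mod[docs[1]])  # e.g. ['new_york', 'city', 'never', 'sleeps']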
Example #7
def analyze_characters():
    print("Analyzing characters\n")
    NUM_RECORDS = 12110  # record count used throughout this function
    full_text = get_translated_text("Translated_text.txt")
    db = get_cands_data('thesis_db_copy.xls', NUM_RECORDS)
    reviewed_cands = []
    errors = [0, 0, 0, 0, 0, 0]
    to_drop = []
    found_char = []
    no_char = []
    tmp_cnt = 0

    cnt = 0
    char_cnt = 0
    found_cnt = 0

    for index in range(NUM_RECORDS):
        this_cand_id = db.ID_coded[index]
        if this_cand_id in reviewed_cands:
            to_drop.append(index)
        else:
            reviewed_cands.append(this_cand_id)
            cand = get_cand(db, full_text, index, [2015], errors)
            if cand is None:
                to_drop.append(index)
            else:
                found = False
                ffound = False
                mfound = False
                cnt += 1
                db.f_char_type[index] = 0
                db.m_char_type[index] = 0
                # temp: module-level list of probe strings matched against
                # the candidate's Hebrew text.
                for char in temp:
                    if char in cand.hebText:
                        tmp_cnt += 1
                        print(cand.hebText)

                # Each (name list, column, type code) entry marks the
                # candidate when one of its characters appears in the text;
                # later matches overwrite earlier codes.
                for char_list, col, code in [(fiction_males, 'm_char_type', 3),
                                             (historic_males, 'm_char_type', 2),
                                             (real_males, 'm_char_type', 1),
                                             (fiction_females, 'f_char_type', 3),
                                             (historic_females, 'f_char_type', 2),
                                             (real_females, 'f_char_type', 1)]:
                    if search_characters(char_list, cand):
                        found = True
                        char_cnt += 1
                        getattr(db, col)[index] = code
                        if col == 'm_char_type':
                            mfound = True
                        else:
                            ffound = True

                if found:
                    # A candidate matching both male and female characters
                    # is treated as ambiguous and cleared.
                    if mfound and ffound:
                        db.f_char_type[index] = ""
                        db.m_char_type[index] = ""
                        no_char.append(index)
                    else:
                        found_cnt = found_cnt + 1
                        found_char.append(index)
                else:
                    db.f_char_type[index] = ""
                    db.m_char_type[index] = ""
                    no_char.append(index)

    print("Last index processed: {}".format(index))
    print("Valid candidates: {}".format(cnt))
    print("Different candidates that found a match: {}".format(found_cnt))
    print("Different characters found: {}".format(char_cnt))
    print("temps {}".format(tmp_cnt))
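
search_characters is project-specific, but the marking pattern above
reduces to a substring scan followed by a type-code assignment. A toy
sketch with a hypothetical helper standing in for it:

# Hypothetical stand-in for the module's search_characters(char_list, cand).
def search_characters(char_list, text):
    return any(name in text for name in char_list)

fiction_males = ["harry potter", "frodo"]
real_males = ["herzl"]

text = "the essay mentions frodo twice"
m_char_type = 0
for char_list, code in [(fiction_males, 3), (real_males, 1)]:
    if search_characters(char_list, text):
        m_char_type = code  # later matches overwrite earlier codes

print(m_char_type)  # 3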