def get_np_array(myDb, engLines, users, words, year):
    """Build a (len(users) x len(words)) term-count matrix for LDA.

    For each valid line of the translated corpus, the FIRST text seen for a
    given user is tokenized and its valid words are counted into that user's
    row; later lines from the same user are ignored.

    :param myDb: candidates dataframe (read via ``ID_coded``)
    :param engLines: translated text lines, parallel to ``myDb`` rows
    :param users: ordered user-id list (row index of the matrix)
    :param words: ordered vocabulary list (column index of the matrix)
    :param year: year filter passed through to ``valid_index``
    :return: numpy int array of shape (len(users), len(words))
    """
    errors = [0, 0, 0, 0, 0, 0]
    rawLines = get_translated_text("Translated_text.txt")
    nparr = np.zeros((len(users), len(words)), dtype=int)
    # Precompute position maps once: the original called list.index() twice
    # per token, which is O(n) per call and quadratic overall.
    user_pos = {u: row for row, u in enumerate(users)}
    word_pos = {w: col for col, w in enumerate(words)}
    reviewed = set()  # users whose first text was already counted
    for i in range(DATA_LEN):
        if not valid_index(myDb, rawLines, i, year, errors):
            continue
        user = myDb.ID_coded[i]
        if user in reviewed:
            continue  # only the first occurrence of each user is counted
        reviewed.add(user)
        for token in engLines[i].lower().split():
            word = re.sub('[^a-zA-Z]+', '', token)  # strip non-letters
            if is_valid(word):
                # Raises KeyError if a valid word is missing from `words`
                # (the original raised ValueError via list.index).
                nparr[user_pos[user], word_pos[word]] += 1
    return nparr
def run_tm_and_dump(cand_year, files):
    """Run LDA topic modeling for one candidate year and print the topics.

    Collects the first text of each unique candidate for ``cand_year``,
    lemmatizes it, builds a corpus matrix, fits an LDA model and prints the
    top words of each topic.

    :param cand_year: year to filter candidates by
    :param files: sequence of output files; ``files[0]`` is forwarded to
        ``get_users_and_words``
    """
    db = get_cands_data('thesis_db.xls', DATA_LEN)
    engLines = get_translated_text("Translated_text.txt")[:DATA_LEN]
    reviewed_cands = set()  # set: O(1) membership vs. list scan
    cand_ids = []
    index2cand = {}
    run_text = []
    errors = [0, 0, 0, 0, 0, 0]
    # enumerate replaces the original manual `index = index + 1` counter.
    for index, line in enumerate(engLines):
        this_cand_id = db.ID_coded[index]
        if this_cand_id in reviewed_cands:
            continue  # keep only the first text per candidate
        reviewed_cands.add(this_cand_id)
        cand = get_cand(db, engLines, index, [cand_year], errors)
        if cand is not None:
            run_text.append(line)
            cand_ids.append(cand.id)
            index2cand[index] = cand
    lem_text = get_data_lemmatized(run_text)
    id2word, corpus = text2corpus(lem_text)
    X2 = corpus2nparray(corpus, id2word)
    users, words, cands = get_users_and_words(db, engLines, cand_year, files[0])
    X = get_np_array(db, engLines, users, words, cand_year)
    model = lda.LDA(n_topics=N, n_iter=ITERATIONS, random_state=1)
    model.fit(X2)  # model.fit_transform(X) is also available
    topic_word = model.topic_word_  # model.components_ also works
    n_top_words = 8
    for i, topic_dist in enumerate(topic_word):
        # NOTE(review): the model is fitted on X2 (vocabulary of `id2word`),
        # but top words are drawn from `words` (vocabulary of X) — confirm
        # the two vocabularies line up, otherwise the printed words are wrong.
        topic_words = np.array(words)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
        topic_line = 'Topic {}: {}'.format(i, ' '.join(topic_words))
        print(topic_line)
    #dump_tm(db, model.doc_topic_, users, cands, files)
def get_users_and_words(myDb, engLines, year, outFile):
    """Collect the unique users and vocabulary for ``year``.

    Scans all ``DATA_LEN`` rows; for each valid row whose user was not seen
    before, records the user, its row index, and every valid word of its
    (lower-cased, letters-only) text, preserving first-seen order.

    :param myDb: candidates dataframe (read via ``ID_coded``)
    :param engLines: translated text lines, parallel to ``myDb`` rows
    :param year: year filter passed through to ``valid_index``
    :param outFile: output file handle (currently unused; kept for callers)
    :return: tuple ``(users, words, cands)`` where ``cands`` maps
        user id -> first row index
    """
    users = []
    words = []
    cands = {}
    duplicate = 0
    errors = [0, 0, 0, 0, 0, 0]
    rawLines = get_translated_text("Translated_text.txt")
    # Shadow sets give O(1) membership tests; the lists keep insertion order
    # so the returned values are identical to the original's.
    seen_users = set()
    seen_words = set()
    for i in range(DATA_LEN):
        if not valid_index(myDb, rawLines, i, year, errors):
            continue
        user = myDb.ID_coded[i]
        if user in seen_users:
            duplicate += 1  # valid row, but user already collected
            continue
        seen_users.add(user)
        users.append(user)
        cands[user] = i
        for token in engLines[i].lower().split():
            word = re.sub('[^a-zA-Z]+', '', token)  # strip non-letters
            if is_valid(word) and word not in seen_words:
                seen_words.add(word)
                words.append(word)
    return users, words, cands
def run_topic_modeling(cand_year, outFile):
    """Fit an LDA model on the lemmatized corpus for ``cand_year``.

    Builds the user/word count matrix, fits LDA, then prints and writes the
    top 8 words of each topic to ``outFile``.

    :param cand_year: year to filter candidates by
    :param outFile: open writable text file for the results
    """
    outFile.write("Results for {}\n".format(cand_year))
    db = get_cands_data('thesis_db.xls', DATA_LEN)
    #engLines = get_translated_text("Translated_text.txt")
    engLines = get_translated_text("lemmatized_db.txt")
    users, words, cands = get_users_and_words(db, engLines, cand_year, outFile)
    X = get_np_array(db, engLines, users, words, cand_year)
    model = lda.LDA(n_topics=N, n_iter=ITERATIONS, random_state=1)
    model.fit(X)  # model.fit_transform(X) is also available
    topic_word = model.topic_word_  # model.components_ also works
    n_top_words = 8
    for i, topic_dist in enumerate(topic_word):
        topic_words = np.array(words)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
        # Renamed from `str`, which shadowed the builtin.
        topic_line = 'Topic {}: {}'.format(i, ' '.join(topic_words))
        print(topic_line)
        outFile.write(topic_line + '\n')
    #print_topic_modeling_stats(db, model.doc_topic_, users, cands, outFile)
    outFile.write('\n')
#print("prediction: {}, result: {} - TRUE".format(prediction[0], predict_results[i])) right[prediction[0]] = right[prediction[0]] + 1 else: print("prediction: {}, result: {} - FALSE".format( prediction[0], predict_results[i])) print("Train data: {}/{}".format(sum(train_results), len(train_results))) print("Predict data: {}/{}".format(sum(predict_results), len(predict_results))) print("predictions: [0]={}/{}, [1]={}/{}, rate = {}".format( right[0], results[0], right[1], results[1], sum(right) / len(predict_samples))) print(datetime.datetime.now()) full_text = get_translated_text("Translated_text.txt") #full_text = get_translated_text("fake_translated.txt") text = full_text[:MAX_LINE] db = get_cands_data('thesis_db.xls', MAX_LINE) #db = get_cands_data('fake_db.xls', MAX_LINE) reviewed_cands = [] characters_map = {} close_type_map = {} errors = [0, 0, 0, 0, 0, 0] lda_text = [[], [], [], []] accum_kkz_text = [""] * 4 accum_grade_text = [""] * 41 entire_text = [] sample1_text = [] sample2_text = [] boys = []
# data = df.content.values.tolist() # Remove Emails # data = [re.sub('\S*@\S*\s?', '', sent) for sent in data] # Remove new line characters # data = [re.sub('\s+', ' ', sent) for sent in data] # Remove distracting single quotes # data = [re.sub("\'", "", sent) for sent in data] # print("after remove mails, new lines and quotes") # pprint(data[:1]) # print("\n\n") sentences = get_translated_text("Translated_text.txt") sen = sentences[:2000] data_words = list(sent_to_words(sentences)) # Build the bigram and trigram models bigram = gensim.models.Phrases(data_words, min_count=3, threshold=20) # higher threshold fewer phrases. trigram = gensim.models.Phrases(bigram[data_words], threshold=20) # Faster way to get a sentence clubbed as a trigram/bigram bigram_mod = gensim.models.phrases.Phraser(bigram) trigram_mod = gensim.models.phrases.Phraser(trigram) # See trigram example # print(trigram_mod[bigram_mod[data_words[0]]]) # print("\n\n")
def analyze_characters():
    """Scan all candidate essays and classify mentioned characters.

    For each unique candidate of 2015, searches the Hebrew text for known
    male/female characters (fictional=3, historic=2, real=1) and records the
    type into ``db.m_char_type`` / ``db.f_char_type`` in place. Candidates
    with no match — or with both a male AND a female match — get their type
    fields blanked. Prints progress and summary counters.

    NOTE(review): relies on module-level name lists (``temp``,
    ``fiction_males`` … ``real_females``) and ``search_characters`` defined
    elsewhere in the file — behavior depends on their contents.
    """
    print("Analyzing characters\n")
    full_text = get_translated_text("Translated_text.txt")
    db = get_cands_data('thesis_db_copy.xls', 12110)
    reviewed_cands = []          # candidate ids already processed
    errors = [0, 0, 0, 0, 0, 0]  # error counters filled by get_cand
    to_drop = []                 # indices of duplicate/invalid rows
    found_char = []              # indices with exactly one gender matched
    no_char = []                 # indices with no (or ambiguous) match
    tmp_cnt = 0                  # hits of the `temp` debug word list
    cnt = 0                      # valid unique candidates seen
    char_cnt = 0                 # total character-list matches
    found_cnt = 0                # candidates counted as a definite match
    for index in range(12110):
        this_cand_id = db.ID_coded[index]
        if this_cand_id in reviewed_cands:
            # Duplicate candidate row — keep only the first occurrence.
            to_drop.append(index)
        else:
            reviewed_cands.append(this_cand_id)
            cand = get_cand(db, full_text, index, [2015], errors)
            if cand is None:
                to_drop.append(index)
            else:
                found = False
                ffound = False  # any female character matched
                mfound = False  # any male character matched
                cnt = cnt + 1
                db.f_char_type[index] = 0
                db.m_char_type[index] = 0
                # Debug probe: count/echo texts containing words from `temp`.
                for char in temp:
                    if char in cand.hebText:
                        tmp_cnt = tmp_cnt + 1
                        print(cand.hebText)
                # Later checks overwrite earlier ones, so precedence within a
                # gender is fiction(3) -> historic(2) -> real(1), real winning.
                if search_characters(fiction_males, cand):
                    found = True
                    char_cnt = char_cnt + 1
                    db.m_char_type[index] = 3
                    mfound = True
                if search_characters(historic_males, cand):
                    found = True
                    char_cnt = char_cnt + 1
                    db.m_char_type[index] = 2
                    mfound = True
                if search_characters(real_males, cand):
                    found = True
                    char_cnt = char_cnt + 1
                    db.m_char_type[index] = 1
                    mfound = True
                if search_characters(fiction_females, cand):
                    found = True
                    char_cnt = char_cnt + 1
                    db.f_char_type[index] = 3
                    ffound = True
                if search_characters(historic_females, cand):
                    found = True
                    char_cnt = char_cnt + 1
                    db.f_char_type[index] = 2
                    ffound = True
                if search_characters(real_females, cand):
                    found = True
                    char_cnt = char_cnt + 1
                    db.f_char_type[index] = 1
                    ffound = True
                #print(cand.hebText)
                if found:
                    if mfound and ffound:
                        # Ambiguous (both genders matched) — treat as no match.
                        db.f_char_type[index] = ""
                        db.m_char_type[index] = ""
                        no_char.append(index)
                    else:
                        found_cnt = found_cnt + 1
                        found_char.append(index)
                else:
                    db.f_char_type[index] = ""
                    db.m_char_type[index] = ""
                    no_char.append(index)
        print(index)  # progress trace, one line per row
    print(cnt)
    print("Different candidates that found a match: {}".format(found_cnt))
    print("Different characters found: {}".format(char_cnt))
    #print(len())
    print("temps {}".format(tmp_cnt))