import os
import argparse

import numpy as np
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD

# The remaining names used below -- utils, commonutils, textvectorizer, TOPDMM, DPMFS,
# cosine_similarity and the WC_* / QUORA_* / COMMON_CH_WORDS_FILE path constants -- are
# assumed to come from this repository's own modules and config.


def __topdmm_wc(name, dst_vocab_file, dst_topics_file):
    all_doc_contents = utils.read_lines_to_list(WC_SEG_DOC_CONTENT_NODUP_FILE)
    name_doc_dict = utils.load_entity_name_to_doc_file(WC_NAME_DOC_ND_FILE)
    doc_idxs = name_doc_dict[name]
    contents = [all_doc_contents[idx] for idx in doc_idxs]
    print(len(contents), 'docs')

    docs_words = [content.split(' ') for content in contents]
    words_exist = utils.get_word_set(docs_words)
    extra_exclude_words = {name}
    if name == '姜子牙':
        extra_exclude_words = {'姜', '子牙'}
    cv = textvectorizer.CountVectorizer(
        (WC_DF_ND_FILE, 20, 700), remove_stopwords=True,
        words_exist=words_exist, extra_exclude_words=extra_exclude_words)
    print(len(cv.vocab), 'words in vocab')
    X = cv.get_vecs(contents, normalize=False)
    # D_codoc = utils.get_codoc_matrix(cv.vocab, contents)

    n_topic_words_disp = 10
    print('starting training ...')
    k = 10
    dmm = TOPDMM(k, 100, alpha=0.01, beta=0.01, n_top=-1)
    dmm.fit(X)
    for t in dmm.topic_word_:
        widxs = np.argpartition(-t, range(n_topic_words_disp))[:n_topic_words_disp]
        topic_words = [cv.vocab[i] for i in widxs]
        print(' '.join(topic_words))
    dmm.save(cv.vocab, dst_vocab_file, dst_topics_file)
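
# A minimal, self-contained illustration (toy weights only, not project data) of the
# top-word selection used above: np.argpartition(-t, range(n))[:n] returns the indices
# of the n largest entries of t, already sorted in descending order, because every
# position in range(n) is used as a partition pivot.
def __demo_top_word_indexes():
    t = np.array([0.05, 0.30, 0.10, 0.40, 0.15])
    n = 3
    widxs = np.argpartition(-t, range(n))[:n]
    print(widxs)     # [3 1 4]
    print(t[widxs])  # [0.4  0.3  0.15]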
def process_quora():
    name = 'DC'
    all_doc_contents = utils.read_lines_to_list(QUORA_ANSWER_TOK_LOWER_FILE)
    name_doc_dict = utils.load_entity_name_to_doc_file(QUORA_NAME_DOC_FILE)
    doc_idxs = name_doc_dict[name]
    contents = [all_doc_contents[idx] for idx in doc_idxs]

    docs_words = [content.split(' ') for content in contents]
    words_exist = utils.get_word_set(docs_words)
    cv = textvectorizer.CountVectorizer((QUORA_DF_FILE, 50, 6000), remove_stopwords=True,
                                        words_exist=words_exist)
    print(len(cv.vocab), 'words in vocab')
    word_idfs = [np.log(QUORA_NUM_TOTAL_DOCS / cv.word_cnts[w]) for w in cv.vocab]

    docs = list()
    for words in docs_words:
        doc = list()
        for w in words:
            widx = cv.word_dict.get(w, -1)
            if widx > -1:
                doc.append(widx)
        docs.append(doc)
    return docs, cv.vocab, word_idfs
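
# The word_idfs computed in process_quora follow the standard formula
# idf(w) = log(N / df(w)), where N is the corpus size and df(w) is the document
# frequency kept in cv.word_cnts. A toy check with made-up frequencies (not the
# Quora corpus):
def __demo_idf():
    n_total_docs = 1000
    word_dfs = {'movie': 200, 'batman': 25}
    word_idfs = {w: np.log(n_total_docs / df) for w, df in word_dfs.items()}
    print(word_idfs)  # the rarer word gets the larger idf: batman > movie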
def __run_quora():
    name = 'DC'
    all_doc_contents = utils.read_lines_to_list(QUORA_ANSWER_TOK_LOWER_FILE)
    name_doc_dict = utils.load_entity_name_to_doc_file(QUORA_NAME_DOC_FILE)
    doc_idxs = name_doc_dict[name]
    contents = [all_doc_contents[idx] for idx in doc_idxs]

    docs_words = [content.split(' ') for content in contents]
    words_exist = utils.get_word_set(docs_words)
    cv = textvectorizer.CountVectorizer((QUORA_DF_FILE, 100, 5000), remove_stopwords=True,
                                        words_exist=words_exist)
    print(len(cv.vocab), 'words in vocab')
    X = cv.get_vecs(contents)

    parser = argparse.ArgumentParser()
    parser.add_argument("-lamb", "--lamb", type=float)
    parser.add_argument("-N", "--N", type=int)
    args = parser.parse_args()
    lamb = 1 if args.lamb is None else args.lamb
    N = 10 if args.N is None else args.N

    dst_file = os.path.join(QUORA_DATA_DIR, 'dpmfs_z_{}.txt'.format(lamb))
    print(dst_file)
    n_docs = len(doc_idxs)
    dpmfs = DPMFS(cv.n_words, N=N, n_docs=n_docs, n_iter=1000, lamb=lamb)
    dpmfs.fit(X, dst_file, cv.vocab)
    np.savetxt(dst_file, dpmfs.z, fmt='%d')
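
# __run_quora reads its DPMFS hyperparameters from the command line; both flags are
# optional and default to lamb=1 and N=10. Assuming this module is run as a script
# (the file name below is hypothetical), an invocation would look like:
#
#     python run_dpmfs.py --lamb 0.5 --N 20
#
# which writes the sampled assignments dpmfs.z to dpmfs_z_0.5.txt under QUORA_DATA_DIR.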
def __topdmm_wc_minidocs(name, dst_vocab_file, dst_topics_file):
    # all_doc_contents = utils.read_lines_to_list(WC_MINIDOC_TEXT_SEG_NODUP_FILE)
    # name_doc_dict = utils.load_entity_name_to_minidoc_file(WC_MINIDOC_INFO_NODUP_FILE)
    all_doc_contents = utils.read_lines_to_list(
        'd:/data/indec/docs-14k-minidocs-text-seg-new.txt')
    name_doc_dict = utils.load_entity_name_to_minidoc_file(
        'd:/data/indec/docs-14k-minidocs-info-new.txt')
    doc_idxs = name_doc_dict[name]
    contents = [all_doc_contents[idx] for idx in doc_idxs]
    print(len(contents), 'docs')

    common_words = utils.read_lines_to_list(COMMON_CH_WORDS_FILE)
    docs_words = [content.split(' ') for content in contents]
    words_exist = utils.get_word_set(docs_words)
    extra_exclude_words = set(common_words)
    extra_exclude_words.add(name)
    if name == '姜子牙':
        extra_exclude_words.add('姜')
        extra_exclude_words.add('子牙')
    if name == '夏侯惇':
        extra_exclude_words.add('夏侯')
        extra_exclude_words.add('惇')
    cv = textvectorizer.CountVectorizer(
        (WC_DF_ND_FILE, 20, 700), remove_stopwords=True,
        words_exist=words_exist, extra_exclude_words=extra_exclude_words)
    print(len(cv.vocab), 'words in vocab')
    X = cv.get_vecs(contents, normalize=False)

    n_topic_words_disp = 10
    print('starting training ...')
    k = 10
    dmm = TOPDMM(k, 80, alpha=0.01, beta=0.01, n_top=-1)
    dmm.fit(X)
    for t in dmm.topic_word_:
        widxs = np.argpartition(-t, range(n_topic_words_disp))[:n_topic_words_disp]
        topic_words = [cv.vocab[i] for i in widxs]
        print(' '.join(topic_words))
    dmm.save(cv.vocab, dst_vocab_file, dst_topics_file)
def __filter_duplicate_minidocs():
    df_minidocs = pd.read_csv(WC_MINIDOC_INFO_FILE)
    all_doc_contents = commonutils.read_lines_to_list(WC_MINIDOC_TEXT_SEG_FILE)
    cv = textvectorizer.CountVectorizer((WC_DF_FILE, 100, 2000), remove_stopwords=True)
    print(cv.n_words, 'words in vocab')
    X = cv.get_vecs(all_doc_contents)
    n_docs = len(all_doc_contents)
    print(n_docs, 'docs', X.shape)

    # mark a minidoc as a duplicate when it is highly similar to an earlier minidoc
    # that belongs to the same entity name
    dup_docs = set()
    for i, x1 in enumerate(X):
        cur_name = df_minidocs['entity_name'][i]
        if i % 100 == 0:
            print(i)
        if i in dup_docs:
            continue
        for j in range(i + 1, n_docs):
            if j in dup_docs:
                continue
            sim = cosine_similarity(x1, X[j])
            if sim > 0.9 and cur_name == df_minidocs['entity_name'][j]:
                dup_docs.add(j)

    dup_docs_list = list(dup_docs)
    dup_docs_list.sort()
    print(dup_docs_list[:30])
    # TODO: the minidoc ids in the info file are not corrected after dropping duplicates
    df_fil = df_minidocs.drop(dup_docs_list)
    with open(WC_MINIDOC_INFO_NODUP_FILE, 'w', encoding='utf-8', newline='\n') as fout:
        df_fil.to_csv(fout, index=False)
    commonutils.remove_lines(WC_MINIDOC_TEXT_FILE, dup_docs, WC_MINIDOC_TEXT_NODUP_FILE)
    commonutils.remove_lines(WC_MINIDOC_TEXT_SEG_FILE, dup_docs, WC_MINIDOC_TEXT_SEG_NODUP_FILE)
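
# The deduplication loop above calls cosine_similarity once per document pair. A
# vectorized sketch of the same idea (an alternative, not part of the original
# pipeline), assuming X has one row per minidoc and names[i] is the entity name of
# row i; it uses sklearn's pairwise cosine_similarity to build the full similarity
# matrix in one call.
def __find_duplicate_rows(X, names, sim_threshold=0.9):
    from sklearn.metrics.pairwise import cosine_similarity as pairwise_cos
    sims = pairwise_cos(X)  # n_docs x n_docs similarity matrix
    n_docs = sims.shape[0]
    dup_docs = set()
    for i in range(n_docs):
        if i in dup_docs:
            continue
        for j in range(i + 1, n_docs):
            if j not in dup_docs and sims[i, j] > sim_threshold and names[i] == names[j]:
                dup_docs.add(j)
    return dup_docs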
def __process_quora():
    cv = textvectorizer.CountVectorizer(QUORA_DF_FILE, 50, 10000, True)
    print(cv.n_words, 'words in vocabulary')

    name = 'DC'
    all_doc_contents = utils.read_lines_to_list(QUORA_ANSWER_TOK_LOWER_FILE)
    name_doc_dict = utils.load_entity_name_to_doc_file(QUORA_NAME_DOC_FILE)
    doc_idxs = name_doc_dict[name]
    contents = [all_doc_contents[idx] for idx in doc_idxs]
    X = cv.get_vecs(contents)

    k = 10
    lda = LatentDirichletAllocation(k, learning_method='batch', doc_topic_prior=0.1,
                                    topic_word_prior=0.01)
    X_new = lda.fit_transform(X)

    # count how many documents have each topic as their most probable topic
    topic_cnts = {i: 0 for i in range(k)}
    for i, x in enumerate(X_new):
        max_topic_idxs = np.argpartition(-x, np.arange(3))[:3]
        topic_cnts[max_topic_idxs[0]] += 1

    # print the document count and the ten highest-weighted words of each topic
    for tidx, cnt in topic_cnts.items():
        print(tidx, cnt)
        max_word_idxs = np.argpartition(-lda.components_[tidx], np.arange(10))[:10]
        for idx in max_word_idxs:
            print('{}*{:.3f}'.format(cv.vocab[idx], lda.components_[tidx][idx]), end=' ')
        print()
def __filter_duplicate_docs():
    all_doc_contents = commonutils.read_lines_to_list(WC_SEG_DOC_CONTENT_FILE)
    cv = textvectorizer.CountVectorizer((WC_DF_FILE, 100, 2000), remove_stopwords=True)
    print(cv.n_words, 'words in vocab')
    X = cv.get_vecs(all_doc_contents)
    n_docs = len(all_doc_contents)
    print(n_docs, 'docs', X.shape)

    # mark a document as a duplicate when it is highly similar to an earlier document
    dup_docs = set()
    for i, x1 in enumerate(X):
        if i % 100 == 0:
            print(i)
        if i in dup_docs:
            continue
        for j in range(i + 1, n_docs):
            if j in dup_docs:
                continue
            sim = cosine_similarity(x1, X[j])
            if sim > 0.8:
                dup_docs.add(j)

    # the original read from an undefined `doc_file`; WC_DOC_INFO_FILE is assumed here
    # to be the doc-info CSV that pairs with WC_DOC_INFO_NODUP_FILE
    doc_info_df = pd.read_csv(WC_DOC_INFO_FILE)
    dup_docs_list = list(dup_docs)
    dup_docs_list.sort()
    print(dup_docs_list[:30])
    df_fil = doc_info_df.drop(dup_docs_list)
    with open(WC_DOC_INFO_NODUP_FILE, 'w', encoding='utf-8', newline='\n') as fout:
        df_fil.to_csv(fout, index=False)
    commonutils.remove_lines(WC_DOC_CONTENT_FILE, dup_docs, WC_DOC_CONTENT_NODUP_FILE)
    commonutils.remove_lines(WC_SEG_DOC_CONTENT_FILE, dup_docs, WC_SEG_DOC_CONTENT_NODUP_FILE)
def __run_with_quora():
    name = 'DC'
    # name = 'WP'
    # name = 'Austin'
    # name = 'Mark'
    all_doc_contents = utils.read_lines_to_list(QUORA_ANSWER_TOK_LOWER_FILE)
    name_doc_dict = utils.load_entity_name_to_doc_file(QUORA_NAME_DOC_FILE)
    doc_idxs = name_doc_dict[name]
    contents = [all_doc_contents[idx] for idx in doc_idxs]

    docs_words = [content.split(' ') for content in contents]
    words_exist = utils.get_word_set(docs_words)
    cv = textvectorizer.CountVectorizer((QUORA_DF_FILE, 50, 6000), remove_stopwords=True,
                                        words_exist=words_exist)
    print(len(cv.vocab), 'words in vocab')
    X = cv.get_vecs(contents, normalize=False)
    # D_codoc = utils.get_codoc_matrix(cv.vocab, contents)

    # k = 3
    n_topic_words_disp = 10
    for k in range(10, 11):
        dmm = TOPDMM(k, 100, alpha=0.01, beta=0.01)
        dmm.fit(X)
        for t in dmm.topic_word_:
            widxs = np.argpartition(-t, range(n_topic_words_disp))[:n_topic_words_disp]
            topic_words = [cv.vocab[i] for i in widxs]
            print(' '.join(topic_words))
        # __show_coherences(k, dmm.topic_word_, D_codoc)

        test_vocab_file = os.path.join(QUORA_DATA_DIR, '{}_vocab.txt'.format(name))
        test_topic_file = os.path.join(QUORA_DATA_DIR, '{}_topics.txt'.format(name))
        dmm.save(cv.vocab, test_vocab_file, test_topic_file)
def __process_quora():
    name = 'DC'
    # name = 'Mark'
    all_doc_contents = utils.read_lines_to_list(QUORA_ANSWER_TOK_LOWER_FILE)
    name_doc_dict = utils.load_entity_name_to_doc_file(QUORA_NAME_DOC_FILE)
    doc_idxs = name_doc_dict[name]
    contents = [all_doc_contents[idx] for idx in doc_idxs]

    docs_words = [content.split(' ') for content in contents]
    words_exist = utils.get_word_set(docs_words)
    cv = textvectorizer.CountVectorizer((QUORA_DF_FILE, 50, 6000), remove_stopwords=True,
                                        words_exist=words_exist)
    print(len(cv.vocab), 'words in vocab')
    X = cv.get_vecs(contents, normalize=True)
    print(X.shape)

    k = 10
    tsvd = TruncatedSVD(n_components=k)
    X_new = tsvd.fit_transform(X)
    for i in range(k):
        max_idxs = np.argpartition(-tsvd.components_[i], range(20))[:20]
        words = [cv.vocab[idx] for idx in max_idxs]
        print(tsvd.explained_variance_[i], tsvd.singular_values_[i])
        print(words)
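
# Hypothetical entry point (not in the original source) showing how the helpers above
# might be invoked; uncomment the call you want to run.
if __name__ == '__main__':
    # __filter_duplicate_docs()
    # __process_quora()
    __run_with_quora()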