def __topdmm_wc_minidocs(name, dst_vocab_file, dst_topics_file):
    """Train a TOPDMM topic model on the mini-docs of one ambiguous entity name.

    name: entity name, key into the name -> mini-doc-index mapping.
    dst_vocab_file / dst_topics_file: output paths passed to TOPDMM.save.
    """
    # NOTE(review): paths are hard-coded to a local Windows data dir; the
    # previously commented-out WC_MINIDOC_*_FILE constants suggest these
    # should come from the shared config -- confirm before running elsewhere.
    all_doc_contents = utils.read_lines_to_list(
        'd:/data/indec/docs-14k-minidocs-text-seg-new.txt')
    name_doc_dict = utils.load_entity_name_to_minidoc_file(
        'd:/data/indec/docs-14k-minidocs-info-new.txt')
    doc_idxs = name_doc_dict[name]
    contents = [all_doc_contents[idx] for idx in doc_idxs]
    print(len(contents), 'docs')

    common_words = utils.read_lines_to_list(COMMON_CH_WORDS_FILE)
    docs_words = [content.split(' ') for content in contents]
    words_exist = utils.get_word_set(docs_words)

    # Exclude common Chinese words, the entity name itself, and -- for
    # segmented person names -- the name's sub-tokens. The per-name table
    # replaces two duplicated if-blocks of repeated .add() calls.
    extra_exclude_words = set(common_words)
    extra_exclude_words.add(name)
    segmented_name_parts = {
        '姜子牙': ('姜', '子牙'),
        '夏侯惇': ('夏侯', '惇'),
    }
    extra_exclude_words.update(segmented_name_parts.get(name, ()))

    cv = textvectorizer.CountVectorizer(
        (WC_DF_ND_FILE, 20, 700), remove_stopwords=True,
        words_exist=words_exist, extra_exclude_words=extra_exclude_words)
    print(len(cv.vocab), 'words in vocab')
    X = cv.get_vecs(contents, normalize=False)

    n_topic_words_disp = 10
    print('starting training ...')
    k = 10
    dmm = TOPDMM(k, 80, alpha=0.01, beta=0.01, n_top=-1)
    dmm.fit(X)
    # Show the top words of each learned topic.
    for t in dmm.topic_word_:
        widxs = np.argpartition(-t, range(n_topic_words_disp))[:n_topic_words_disp]
        topic_words = [cv.vocab[i] for i in widxs]
        print(' '.join(topic_words))
    dmm.save(cv.vocab, dst_vocab_file, dst_topics_file)
def process_quora(name='DC'):
    """Load the Quora answers for *name*; return (docs, vocab, word_idfs).

    docs: each document as a list of vocabulary indices (OOV words dropped).
    vocab: the CountVectorizer vocabulary (list of words).
    word_idfs: idf weight per vocabulary word, aligned with vocab.

    Generalized: *name* was previously hard-coded to 'DC'; the default
    preserves the original behavior for existing callers.
    """
    all_doc_contents = utils.read_lines_to_list(QUORA_ANSWER_TOK_LOWER_FILE)
    name_doc_dict = utils.load_entity_name_to_doc_file(QUORA_NAME_DOC_FILE)
    doc_idxs = name_doc_dict[name]
    contents = [all_doc_contents[idx] for idx in doc_idxs]
    docs_words = [content.split(' ') for content in contents]
    words_exist = utils.get_word_set(docs_words)
    cv = textvectorizer.CountVectorizer(
        (QUORA_DF_FILE, 50, 6000), remove_stopwords=True,
        words_exist=words_exist)
    print(len(cv.vocab), 'words in vocab')
    word_idfs = [
        np.log(QUORA_NUM_TOTAL_DOCS / cv.word_cnts[w]) for w in cv.vocab
    ]
    # Map every word to its vocabulary index; words outside the vocabulary
    # are silently dropped (same effect as the old get(w, -1) filter).
    docs = [
        [cv.word_dict[w] for w in words if w in cv.word_dict]
        for words in docs_words
    ]
    return docs, cv.vocab, word_idfs
def __topdmm_wc(name, dst_vocab_file, dst_topics_file):
    """Fit a TOPDMM model on the wechat docs of one entity name and save it."""
    doc_texts = utils.read_lines_to_list(WC_SEG_DOC_CONTENT_NODUP_FILE)
    entity_docs = utils.load_entity_name_to_doc_file(WC_NAME_DOC_ND_FILE)
    idxs = entity_docs[name]
    texts = [doc_texts[i] for i in idxs]
    print(len(texts), 'docs')

    tokenized = [text.split(' ') for text in texts]
    present_words = utils.get_word_set(tokenized)

    # The entity name itself must not surface as a topic word; for this
    # segmented name, exclude its parts instead of the full name.
    exclude = {'姜', '子牙'} if name == '姜子牙' else {name}

    cv = textvectorizer.CountVectorizer(
        (WC_DF_ND_FILE, 20, 700), remove_stopwords=True,
        words_exist=present_words, extra_exclude_words=exclude)
    print(len(cv.vocab), 'words in vocab')
    vecs = cv.get_vecs(texts, normalize=False)

    n_disp = 10
    print('starting training ...')
    n_topics = 10
    model = TOPDMM(n_topics, 100, alpha=0.01, beta=0.01, n_top=-1)
    model.fit(vecs)
    # Print the highest-weighted words of each topic.
    for word_dist in model.topic_word_:
        top = np.argpartition(-word_dist, range(n_disp))[:n_disp]
        print(' '.join(cv.vocab[i] for i in top))
    model.save(cv.vocab, dst_vocab_file, dst_topics_file)
def __run_quora():
    """Run DPMFS on the Quora docs of one entity name; dump z assignments."""
    name = 'DC'
    answer_texts = utils.read_lines_to_list(QUORA_ANSWER_TOK_LOWER_FILE)
    entity_docs = utils.load_entity_name_to_doc_file(QUORA_NAME_DOC_FILE)
    idxs = entity_docs[name]
    texts = [answer_texts[i] for i in idxs]
    tokenized = [text.split(' ') for text in texts]
    present_words = utils.get_word_set(tokenized)

    cv = textvectorizer.CountVectorizer(
        (QUORA_DF_FILE, 100, 5000), remove_stopwords=True,
        words_exist=present_words)
    print(len(cv.vocab), 'words in vocab')
    vecs = cv.get_vecs(texts)

    # Hyper-parameters may be overridden from the command line; fall back
    # to lamb=1, N=10 when not given.
    parser = argparse.ArgumentParser()
    parser.add_argument("-lamb", "--lamb", type=float)
    parser.add_argument("-N", "--N", type=int)
    args = parser.parse_args()
    lamb = args.lamb if args.lamb is not None else 1
    N = args.N if args.N is not None else 10

    dst_file = os.path.join(QUORA_DATA_DIR, 'dpmfs_z_{}.txt'.format(lamb))
    print(dst_file)
    model = DPMFS(cv.n_words, N=N, n_docs=len(idxs), n_iter=1000, lamb=lamb)
    model.fit(vecs, dst_file, cv.vocab)
    np.savetxt(dst_file, model.z, fmt='%d')
def __check_topics():
    # Inspect a saved topic model: load its vocabulary and prepare an empty
    # topic-word matrix.
    # NOTE(review): n_top and nzw are created but never used in this body --
    # the function looks unfinished or truncated; confirm against version
    # control before relying on it.
    k = 10  # number of topics
    n_top = 10
    vocab = utils.read_lines_to_list(test_vocab_file)
    n_words = len(vocab)
    print(n_words, 'words')
    nzw = np.zeros((k, n_words), np.float32)
def __process_quora():
    """Cluster the Quora docs of entity 'DC' with DBSCAN over tf-idf vectors."""
    vectorizer = textvectorizer.TfIdf(QUORA_DF_FILE, 10, 24600, 143479, True)
    answer_texts = utils.read_lines_to_list(QUORA_ANSWER_TOK_LOWER_FILE)
    entity_docs = utils.load_entity_name_to_doc_file(QUORA_NAME_DOC_FILE)
    idxs = entity_docs['DC']
    texts = [answer_texts[i] for i in idxs]
    # DBSCAN parameters (eps=0.7, min_samples=3) as tuned for this dataset.
    eps, min_samples = 0.7, 3
    __dbscan_docs(texts, vectorizer, eps, min_samples, result_dir_quora)
def __process_wechat():
    """DBSCAN-cluster wechat docs; deliberately stops after the first entity."""
    vectorizer = textvectorizer.TfIdf(WC_DF_FILE, 5, 62000, 14357)
    doc_texts = utils.read_lines_to_list(WC_SEG_DOC_CONTENT_FILE)
    entity_docs = utils.load_entity_name_to_doc_file(WC_NAME_DOC_FILE)
    eps, min_samples = 0.55, 20
    for entity, idxs in entity_docs.items():
        print(entity)
        texts = [doc_texts[i] for i in idxs]
        __dbscan_docs(texts, vectorizer, eps, min_samples, result_dir_wc)
        break  # only the first entity is processed (original behavior)
def __process_quora():
    """Fit batch LDA on the Quora docs of entity 'DC'; report, per topic,
    how many documents it dominates plus its top-weighted words.
    """
    cv = textvectorizer.CountVectorizer(QUORA_DF_FILE, 50, 10000, True)
    print(cv.n_words, 'words in vocabulary')
    name = 'DC'
    all_doc_contents = utils.read_lines_to_list(QUORA_ANSWER_TOK_LOWER_FILE)
    name_doc_dict = utils.load_entity_name_to_doc_file(QUORA_NAME_DOC_FILE)
    doc_idxs = name_doc_dict[name]
    contents = [all_doc_contents[idx] for idx in doc_idxs]
    X = cv.get_vecs(contents)

    k = 10
    lda = LatentDirichletAllocation(k, learning_method='batch',
                                    doc_topic_prior=.1, topic_word_prior=0.01)
    X_new = lda.fit_transform(X)

    # Count, for each topic, the documents where it carries the highest
    # weight. np.argmax replaces the previous top-3 argpartition of which
    # only element 0 was ever used.
    topic_cnts = {i: 0 for i in range(k)}
    for x in X_new:
        topic_cnts[int(np.argmax(x))] += 1

    # For every topic: its document count, then its ten strongest words
    # with their weights.
    for tidx, cnt in topic_cnts.items():
        print(tidx, cnt)
        max_word_idxs = np.argpartition(-lda.components_[tidx],
                                        np.arange(10))[:10]
        for idx in max_word_idxs:
            print('{}*{:.3f}'.format(cv.vocab[idx],
                                     lda.components_[tidx][idx]), end=' ')
        print()
def __check_data():
    """Scan the docs CSV, collecting for each ambiguous name the indices of
    docs whose content mentions it; print any doc mentioning '王刚'.
    """
    docs_file = 'd:/data/indec/title_content_new_entities-09-08.csv'
    names_file = 'd:/data/indec/ambig-names-from-wiki-wz-dhl.txt'
    names = utils.read_lines_to_list(names_file)
    name_docs_dict = {n: list() for n in names}
    # with-statement guarantees the handle is closed even when a line raises
    # (the original open()/close() pair leaked the file on any exception).
    with open(docs_file, encoding='utf-8') as f:
        next(f)  # skip the CSV header row
        for i, line in enumerate(f):
            # Naive split at the first comma: everything after it is the
            # content column (the title part was never used).
            p = line.find(',')
            content = line[p + 1:]
            for name in names:
                if name in content:
                    name_docs_dict[name].append(i)
            if '王刚' in content:
                print(content)
def __run_with_quora():
    """Train TOPDMM on the Quora docs of one entity name and save the model."""
    name = 'DC'
    answer_texts = utils.read_lines_to_list(QUORA_ANSWER_TOK_LOWER_FILE)
    entity_docs = utils.load_entity_name_to_doc_file(QUORA_NAME_DOC_FILE)
    idxs = entity_docs[name]
    texts = [answer_texts[i] for i in idxs]
    tokenized = [text.split(' ') for text in texts]
    present_words = utils.get_word_set(tokenized)

    cv = textvectorizer.CountVectorizer(
        (QUORA_DF_FILE, 50, 6000), remove_stopwords=True,
        words_exist=present_words)
    print(len(cv.vocab), 'words in vocab')
    vecs = cv.get_vecs(texts, normalize=False)

    n_disp = 10
    # Single-value range kept so more topic counts can be swept later.
    for n_topics in range(10, 11):
        model = TOPDMM(n_topics, 100, alpha=0.01, beta=0.01)
        model.fit(vecs)
        for word_dist in model.topic_word_:
            top = np.argpartition(-word_dist, range(n_disp))[:n_disp]
            print(' '.join(cv.vocab[i] for i in top))
        vocab_path = os.path.join(QUORA_DATA_DIR, '{}_vocab.txt'.format(name))
        topic_path = os.path.join(QUORA_DATA_DIR, '{}_topics.txt'.format(name))
        model.save(cv.vocab, vocab_path, topic_path)
def __process_quora():
    """Run truncated SVD (LSA) on the Quora docs of entity 'DC' and print
    the strongest words of each component.
    """
    name = 'DC'
    answer_texts = utils.read_lines_to_list(QUORA_ANSWER_TOK_LOWER_FILE)
    entity_docs = utils.load_entity_name_to_doc_file(QUORA_NAME_DOC_FILE)
    idxs = entity_docs[name]
    texts = [answer_texts[i] for i in idxs]
    tokenized = [text.split(' ') for text in texts]
    present_words = utils.get_word_set(tokenized)

    cv = textvectorizer.CountVectorizer(
        (QUORA_DF_FILE, 50, 6000), remove_stopwords=True,
        words_exist=present_words)
    print(len(cv.vocab), 'words in vocab')
    vecs = cv.get_vecs(texts, normalize=True)
    print(vecs.shape)

    n_components = 10
    svd = TruncatedSVD(n_components=n_components)
    transformed = svd.fit_transform(vecs)
    for ci in range(n_components):
        top = np.argpartition(-svd.components_[ci], range(20))[:20]
        words = [cv.vocab[i] for i in top]
        print(svd.explained_variance_[ci], svd.singular_values_[ci])
        print(words)
def __init__(self, vocab_file, topic_file):
    """Load a saved topic model.

    vocab_file: text file with one vocabulary word per line.
    topic_file: header-less CSV, one row per topic (topic-word weights).
    """
    self.vocab = utils.read_lines_to_list(vocab_file)
    df = pd.read_csv(topic_file, header=None)
    # DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in
    # 1.0; to_numpy() is the supported replacement with the same result.
    self.topics = df.to_numpy()
    self.n_topics = len(self.topics)