def __topdmm_wc(name, dst_vocab_file, dst_topics_file):
    # Load all segmented documents and keep only those linked to this entity name.
    all_doc_contents = utils.read_lines_to_list(WC_SEG_DOC_CONTENT_NODUP_FILE)
    name_doc_dict = utils.load_entity_name_to_doc_file(WC_NAME_DOC_ND_FILE)
    doc_idxs = name_doc_dict[name]
    contents = [all_doc_contents[idx] for idx in doc_idxs]
    print(len(contents), 'docs')

    docs_words = [content.split(' ') for content in contents]
    words_exist = utils.get_word_set(docs_words)
    # The entity name itself is excluded from the vocabulary; for '姜子牙' the
    # segmented parts of the name are excluded instead.
    extra_exclude_words = {name}
    if name == '姜子牙':
        extra_exclude_words = {'姜', '子牙'}
    cv = textvectorizer.CountVectorizer(
        (WC_DF_ND_FILE, 20, 700), remove_stopwords=True,
        words_exist=words_exist, extra_exclude_words=extra_exclude_words)
    print(len(cv.vocab), 'words in vocab')
    X = cv.get_vecs(contents, normalize=False)
    # D_codoc = utils.get_codoc_matrix(cv.vocab, contents)

    n_topic_words_disp = 10
    print('starting training ...')
    k = 10
    dmm = TOPDMM(k, 100, alpha=0.01, beta=0.01, n_top=-1)
    dmm.fit(X)
    # Print the top words of each learned topic.
    for t in dmm.topic_word_:
        widxs = np.argpartition(-t, range(n_topic_words_disp))[:n_topic_words_disp]
        topic_words = [cv.vocab[i] for i in widxs]
        print(' '.join(topic_words))
    dmm.save(cv.vocab, dst_vocab_file, dst_topics_file)

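# A minimal usage sketch for __topdmm_wc (not part of the original code). The
# entity name and the two output paths are illustrative assumptions; the
# directory mirrors the one used in __topdmm_wc_minidocs below and the file
# naming follows the '{}_vocab.txt' / '{}_topics.txt' pattern used elsewhere.
def __topdmm_wc_example():
    name = '姜子牙'
    dst_vocab_file = 'd:/data/indec/{}_vocab.txt'.format(name)
    dst_topics_file = 'd:/data/indec/{}_topics.txt'.format(name)
    __topdmm_wc(name, dst_vocab_file, dst_topics_file)
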
def process_quora():
    name = 'DC'
    # Load the tokenized, lower-cased Quora answers and keep only the documents
    # associated with this entity name.
    all_doc_contents = utils.read_lines_to_list(QUORA_ANSWER_TOK_LOWER_FILE)
    name_doc_dict = utils.load_entity_name_to_doc_file(QUORA_NAME_DOC_FILE)
    doc_idxs = name_doc_dict[name]
    contents = [all_doc_contents[idx] for idx in doc_idxs]

    docs_words = [content.split(' ') for content in contents]
    words_exist = utils.get_word_set(docs_words)
    cv = textvectorizer.CountVectorizer(
        (QUORA_DF_FILE, 50, 6000), remove_stopwords=True, words_exist=words_exist)
    print(len(cv.vocab), 'words in vocab')
    # IDF values aligned with cv.vocab.
    word_idfs = [np.log(QUORA_NUM_TOTAL_DOCS / cv.word_cnts[w]) for w in cv.vocab]

    # Convert each document to a list of vocabulary indices, dropping
    # out-of-vocabulary words.
    docs = list()
    for words in docs_words:
        doc = list()
        for w in words:
            widx = cv.word_dict.get(w, -1)
            if widx > -1:
                doc.append(widx)
        docs.append(doc)
    return docs, cv.vocab, word_idfs

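# A small inspection sketch (an addition, not part of the original pipeline)
# showing how the values returned by process_quora relate to each other: each
# document is a list of vocabulary indices, and word_idfs is aligned with vocab.
def __inspect_quora_docs():
    docs, vocab, word_idfs = process_quora()
    print(len(docs), 'docs,', len(vocab), 'words in vocab')
    # Reconstruct the in-vocabulary words of the first document from its indices.
    first_doc_words = [vocab[widx] for widx in docs[0]]
    print('first doc (in-vocab words only):', ' '.join(first_doc_words))
    print('idf of first vocab word:', vocab[0], word_idfs[0])
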
def __run_quora():
    name = 'DC'
    all_doc_contents = utils.read_lines_to_list(QUORA_ANSWER_TOK_LOWER_FILE)
    name_doc_dict = utils.load_entity_name_to_doc_file(QUORA_NAME_DOC_FILE)
    doc_idxs = name_doc_dict[name]
    contents = [all_doc_contents[idx] for idx in doc_idxs]

    docs_words = [content.split(' ') for content in contents]
    words_exist = utils.get_word_set(docs_words)
    cv = textvectorizer.CountVectorizer(
        (QUORA_DF_FILE, 100, 5000), remove_stopwords=True, words_exist=words_exist)
    print(len(cv.vocab), 'words in vocab')
    X = cv.get_vecs(contents)

    # lamb and N can be overridden from the command line; defaults are 1 and 10.
    parser = argparse.ArgumentParser()
    parser.add_argument("-lamb", "--lamb", type=float)
    parser.add_argument("-N", "--N", type=int)
    args = parser.parse_args()
    lamb = 1 if args.lamb is None else args.lamb
    N = 10 if args.N is None else args.N

    dst_file = os.path.join(QUORA_DATA_DIR, 'dpmfs_z_{}.txt'.format(lamb))
    print(dst_file)
    n_docs = len(doc_idxs)
    dpmfs = DPMFS(cv.n_words, N=N, n_docs=n_docs, n_iter=1000, lamb=lamb)
    dpmfs.fit(X, dst_file, cv.vocab)
    # Save the final cluster assignment of each document.
    np.savetxt(dst_file, dpmfs.z, fmt='%d')

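# Expected command-line usage for __run_quora, based only on the argparse flags
# defined above; the module file name here is an assumption:
#   python dpmfs_quora.py -lamb 0.5 -N 10
# The lambda value is embedded in the output file name, e.g. dpmfs_z_0.5.txt.
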
def __topdmm_wc_minidocs(name, dst_vocab_file, dst_topics_file):
    # all_doc_contents = utils.read_lines_to_list(WC_MINIDOC_TEXT_SEG_NODUP_FILE)
    # name_doc_dict = utils.load_entity_name_to_minidoc_file(WC_MINIDOC_INFO_NODUP_FILE)
    all_doc_contents = utils.read_lines_to_list(
        'd:/data/indec/docs-14k-minidocs-text-seg-new.txt')
    name_doc_dict = utils.load_entity_name_to_minidoc_file(
        'd:/data/indec/docs-14k-minidocs-info-new.txt')
    doc_idxs = name_doc_dict[name]
    contents = [all_doc_contents[idx] for idx in doc_idxs]
    print(len(contents), 'docs')

    # Exclude common Chinese words, the entity name itself, and (for these
    # multi-token names) the segmented parts of the name from the vocabulary.
    common_words = utils.read_lines_to_list(COMMON_CH_WORDS_FILE)
    docs_words = [content.split(' ') for content in contents]
    words_exist = utils.get_word_set(docs_words)
    extra_exclude_words = set(common_words)
    extra_exclude_words.add(name)
    if name == '姜子牙':
        extra_exclude_words.add('姜')
        extra_exclude_words.add('子牙')
    if name == '夏侯惇':
        extra_exclude_words.add('夏侯')
        extra_exclude_words.add('惇')
    cv = textvectorizer.CountVectorizer(
        (WC_DF_ND_FILE, 20, 700), remove_stopwords=True,
        words_exist=words_exist, extra_exclude_words=extra_exclude_words)
    print(len(cv.vocab), 'words in vocab')
    X = cv.get_vecs(contents, normalize=False)

    n_topic_words_disp = 10
    print('starting training ...')
    k = 10
    dmm = TOPDMM(k, 80, alpha=0.01, beta=0.01, n_top=-1)
    dmm.fit(X)
    # Print the top words of each learned topic.
    for t in dmm.topic_word_:
        widxs = np.argpartition(-t, range(n_topic_words_disp))[:n_topic_words_disp]
        topic_words = [cv.vocab[i] for i in widxs]
        print(' '.join(topic_words))
    dmm.save(cv.vocab, dst_vocab_file, dst_topics_file)

def __run_with_quora():
    name = 'DC'
    # name = 'WP'
    # name = 'Austin'
    # name = 'Mark'
    all_doc_contents = utils.read_lines_to_list(QUORA_ANSWER_TOK_LOWER_FILE)
    name_doc_dict = utils.load_entity_name_to_doc_file(QUORA_NAME_DOC_FILE)
    doc_idxs = name_doc_dict[name]
    contents = [all_doc_contents[idx] for idx in doc_idxs]

    docs_words = [content.split(' ') for content in contents]
    words_exist = utils.get_word_set(docs_words)
    cv = textvectorizer.CountVectorizer(
        (QUORA_DF_FILE, 50, 6000), remove_stopwords=True, words_exist=words_exist)
    print(len(cv.vocab), 'words in vocab')
    X = cv.get_vecs(contents, normalize=False)
    # D_codoc = utils.get_codoc_matrix(cv.vocab, contents)

    # k = 3
    n_topic_words_disp = 10
    for k in range(10, 11):
        dmm = TOPDMM(k, 100, alpha=0.01, beta=0.01)
        dmm.fit(X)
        for t in dmm.topic_word_:
            widxs = np.argpartition(
                -t, range(n_topic_words_disp))[:n_topic_words_disp]
            topic_words = [cv.vocab[i] for i in widxs]
            print(' '.join(topic_words))
        # __show_coherences(k, dmm.topic_word_, D_codoc)

    test_vocab_file = os.path.join(QUORA_DATA_DIR, '{}_vocab.txt'.format(name))
    test_topic_file = os.path.join(QUORA_DATA_DIR, '{}_topics.txt'.format(name))
    dmm.save(cv.vocab, test_vocab_file, test_topic_file)

def __process_quora():
    name = 'DC'
    # name = 'Mark'
    all_doc_contents = utils.read_lines_to_list(QUORA_ANSWER_TOK_LOWER_FILE)
    name_doc_dict = utils.load_entity_name_to_doc_file(QUORA_NAME_DOC_FILE)
    doc_idxs = name_doc_dict[name]
    contents = [all_doc_contents[idx] for idx in doc_idxs]

    docs_words = [content.split(' ') for content in contents]
    words_exist = utils.get_word_set(docs_words)
    cv = textvectorizer.CountVectorizer(
        (QUORA_DF_FILE, 50, 6000), remove_stopwords=True, words_exist=words_exist)
    print(len(cv.vocab), 'words in vocab')
    X = cv.get_vecs(contents, normalize=True)
    print(X.shape)

    # Project the normalized doc-term matrix onto k components with truncated SVD
    # and show the top-weighted words of each component.
    k = 10
    tsvd = TruncatedSVD(n_components=k)
    X_new = tsvd.fit_transform(X)
    for i in range(k):
        max_idxs = np.argpartition(-tsvd.components_[i], range(20))[:20]
        words = [cv.vocab[idx] for idx in max_idxs]
        print(tsvd.explained_variance_[i], tsvd.singular_values_[i])
        print(words)
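
# A possible entry point for this module; which function to run (and the
# hard-coded entity names and output paths inside each) is an assumption made
# here for illustration only.
if __name__ == '__main__':
    __process_quora()
    # __run_with_quora()
    # __run_quora()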