Пример #1
0
def __topdmm_wc(name, dst_vocab_file, dst_topics_file):
    """Train a TOPDMM topic model on the documents linked to *name*.

    Loads the segmented no-duplicate corpus, selects the documents mapped to
    the given entity name, vectorizes them, fits a 10-topic TOPDMM model,
    prints the top 10 words of each topic, and saves the vocabulary and
    topic-word distributions to *dst_vocab_file* / *dst_topics_file*.
    """
    doc_texts = utils.read_lines_to_list(WC_SEG_DOC_CONTENT_NODUP_FILE)
    entity_to_docs = utils.load_entity_name_to_doc_file(WC_NAME_DOC_ND_FILE)
    selected_texts = [doc_texts[i] for i in entity_to_docs[name]]
    print(len(selected_texts), 'docs')

    tokenized_docs = [text.split(' ') for text in selected_texts]
    present_words = utils.get_word_set(tokenized_docs)
    # Keep the entity name itself out of the vocabulary; for this particular
    # name the corpus contains it split into pieces, so exclude those instead.
    excluded = {'姜', '子牙'} if name == '姜子牙' else {name}
    cv = textvectorizer.CountVectorizer(
        (WC_DF_ND_FILE, 20, 700),
        remove_stopwords=True,
        words_exist=present_words,
        extra_exclude_words=excluded)
    print(len(cv.vocab), 'words in vocab')
    X = cv.get_vecs(selected_texts, normalize=False)

    n_top_display = 10
    print('starting training ...')
    n_topics = 10
    model = TOPDMM(n_topics, 100, alpha=0.01, beta=0.01, n_top=-1)
    model.fit(X)
    for word_dist in model.topic_word_:
        top_idxs = np.argpartition(
            -word_dist, range(n_top_display))[:n_top_display]
        print(' '.join(cv.vocab[i] for i in top_idxs))

    model.save(cv.vocab, dst_vocab_file, dst_topics_file)
Пример #2
0
def process_quora():
    """Prepare the Quora answers for entity 'DC' for topic modeling.

    Returns a tuple ``(docs, vocab, word_idfs)`` where ``docs`` is a list of
    documents, each a list of vocabulary indices (out-of-vocabulary words are
    dropped), ``vocab`` is the vectorizer's vocabulary, and ``word_idfs`` is
    the idf value for each vocabulary word, aligned with ``vocab``.
    """
    name = 'DC'
    all_doc_contents = utils.read_lines_to_list(QUORA_ANSWER_TOK_LOWER_FILE)
    name_doc_dict = utils.load_entity_name_to_doc_file(QUORA_NAME_DOC_FILE)
    doc_idxs = name_doc_dict[name]
    contents = [all_doc_contents[idx] for idx in doc_idxs]
    docs_words = [content.split(' ') for content in contents]
    words_exist = utils.get_word_set(docs_words)
    cv = textvectorizer.CountVectorizer((QUORA_DF_FILE, 50, 6000),
                                        remove_stopwords=True,
                                        words_exist=words_exist)
    print(len(cv.vocab), 'words in vocab')
    # idf = log(N / count); cv.word_cnts presumably holds per-word document
    # frequencies over the whole corpus — TODO confirm in textvectorizer.
    word_idfs = [
        np.log(QUORA_NUM_TOTAL_DOCS / cv.word_cnts[w]) for w in cv.vocab
    ]

    # Map each tokenized document to vocabulary indices, silently dropping
    # words that did not make it into the vocabulary. A membership test
    # replaces the original get(..., -1) sentinel-and-compare loop.
    word_dict = cv.word_dict
    docs = [[word_dict[w] for w in words if w in word_dict]
            for words in docs_words]

    return docs, cv.vocab, word_idfs
Пример #3
0
def __run_quora():
    """Fit a DPMFS model on the Quora answers for entity 'DC'.

    Reads ``-lamb`` (float) and ``-N`` (int) from the command line, trains
    DPMFS for 1000 iterations, and writes the final topic assignments ``z``
    to a text file under QUORA_DATA_DIR.
    """
    name = 'DC'
    all_doc_contents = utils.read_lines_to_list(QUORA_ANSWER_TOK_LOWER_FILE)
    name_doc_dict = utils.load_entity_name_to_doc_file(QUORA_NAME_DOC_FILE)
    doc_idxs = name_doc_dict[name]
    contents = [all_doc_contents[idx] for idx in doc_idxs]
    docs_words = [content.split(' ') for content in contents]
    words_exist = utils.get_word_set(docs_words)
    cv = textvectorizer.CountVectorizer((QUORA_DF_FILE, 100, 5000),
                                        remove_stopwords=True,
                                        words_exist=words_exist)
    print(len(cv.vocab), 'words in vocab')
    X = cv.get_vecs(contents)

    # Let argparse supply the defaults instead of post-parse None checks.
    parser = argparse.ArgumentParser()
    parser.add_argument("-lamb", "--lamb", type=float, default=1)
    parser.add_argument("-N", "--N", type=int, default=10)
    args = parser.parse_args()
    lamb = args.lamb
    N = args.N

    dst_file = os.path.join(QUORA_DATA_DIR, 'dpmfs_z_{}.txt'.format(lamb))
    print(dst_file)

    n_docs = len(doc_idxs)
    dpmfs = DPMFS(cv.n_words, N=N, n_docs=n_docs, n_iter=1000, lamb=lamb)
    dpmfs.fit(X, dst_file, cv.vocab)
    np.savetxt(dst_file, dpmfs.z, fmt='%d')
Пример #4
0
def __topdmm_wc_minidocs(name, dst_vocab_file, dst_topics_file):
    """Train a TOPDMM topic model on the mini-documents linked to *name*.

    Selects the mini-docs mapped to the entity, vectorizes them with common
    Chinese words and the entity name excluded, fits a 10-topic TOPDMM model,
    prints the top 10 words per topic, and saves the vocabulary and topics to
    *dst_vocab_file* / *dst_topics_file*.

    NOTE(review): input paths are hard-coded to a local drive rather than the
    WC_MINIDOC_* constants — confirm before running on another machine.
    """
    doc_texts = utils.read_lines_to_list(
        'd:/data/indec/docs-14k-minidocs-text-seg-new.txt')
    entity_to_docs = utils.load_entity_name_to_minidoc_file(
        'd:/data/indec/docs-14k-minidocs-info-new.txt')
    selected_texts = [doc_texts[i] for i in entity_to_docs[name]]
    print(len(selected_texts), 'docs')

    common_words = utils.read_lines_to_list(COMMON_CH_WORDS_FILE)

    tokenized_docs = [text.split(' ') for text in selected_texts]
    present_words = utils.get_word_set(tokenized_docs)
    # Exclude common Chinese words plus the entity name; for names the
    # segmenter splits apart, exclude the split pieces as well.
    excluded = set(common_words)
    excluded.add(name)
    if name == '姜子牙':
        excluded.update(['姜', '子牙'])
    if name == '夏侯惇':
        excluded.update(['夏侯', '惇'])
    cv = textvectorizer.CountVectorizer(
        (WC_DF_ND_FILE, 20, 700),
        remove_stopwords=True,
        words_exist=present_words,
        extra_exclude_words=excluded)
    print(len(cv.vocab), 'words in vocab')
    X = cv.get_vecs(selected_texts, normalize=False)

    n_top_display = 10
    print('starting training ...')
    n_topics = 10
    model = TOPDMM(n_topics, 80, alpha=0.01, beta=0.01, n_top=-1)
    model.fit(X)
    for word_dist in model.topic_word_:
        top_idxs = np.argpartition(
            -word_dist, range(n_top_display))[:n_top_display]
        print(' '.join(cv.vocab[i] for i in top_idxs))

    model.save(cv.vocab, dst_vocab_file, dst_topics_file)
Пример #5
0
def __run_with_quora():
    """Train TOPDMM on the Quora answers for one entity and save the model.

    Prints the top 10 words of every topic, then writes the vocabulary and
    topic-word distributions to ``{name}_vocab.txt`` / ``{name}_topics.txt``
    under QUORA_DATA_DIR.
    """
    name = 'DC'
    answer_texts = utils.read_lines_to_list(QUORA_ANSWER_TOK_LOWER_FILE)
    entity_to_docs = utils.load_entity_name_to_doc_file(QUORA_NAME_DOC_FILE)
    selected_texts = [answer_texts[i] for i in entity_to_docs[name]]
    tokenized_docs = [text.split(' ') for text in selected_texts]
    present_words = utils.get_word_set(tokenized_docs)
    cv = textvectorizer.CountVectorizer(
        (QUORA_DF_FILE, 50, 6000),
        remove_stopwords=True,
        words_exist=present_words)
    print(len(cv.vocab), 'words in vocab')
    X = cv.get_vecs(selected_texts, normalize=False)

    n_top_display = 10
    # Single-value range kept so additional topic counts can be swept later.
    for k in range(10, 11):
        model = TOPDMM(k, 100, alpha=0.01, beta=0.01)
        model.fit(X)
        for word_dist in model.topic_word_:
            top_idxs = np.argpartition(
                -word_dist, range(n_top_display))[:n_top_display]
            print(' '.join(cv.vocab[i] for i in top_idxs))

        vocab_file = os.path.join(QUORA_DATA_DIR,
                                  '{}_vocab.txt'.format(name))
        topic_file = os.path.join(QUORA_DATA_DIR,
                                  '{}_topics.txt'.format(name))
        model.save(cv.vocab, vocab_file, topic_file)
Пример #6
0
Файл: lsa.py Проект: hldai/indec
def __process_quora():
    """Run truncated-SVD (LSA) on the Quora answers for entity 'DC'.

    Vectorizes the selected answers with normalized counts, fits a 10-component
    TruncatedSVD, and for each component prints its explained variance,
    singular value, and the 20 highest-weighted vocabulary words.
    """
    name = 'DC'
    answer_texts = utils.read_lines_to_list(QUORA_ANSWER_TOK_LOWER_FILE)
    entity_to_docs = utils.load_entity_name_to_doc_file(QUORA_NAME_DOC_FILE)
    selected_texts = [answer_texts[i] for i in entity_to_docs[name]]
    tokenized_docs = [text.split(' ') for text in selected_texts]
    present_words = utils.get_word_set(tokenized_docs)
    cv = textvectorizer.CountVectorizer(
        (QUORA_DF_FILE, 50, 6000),
        remove_stopwords=True,
        words_exist=present_words)
    print(len(cv.vocab), 'words in vocab')
    X = cv.get_vecs(selected_texts, normalize=True)
    print(X.shape)

    n_components = 10
    tsvd = TruncatedSVD(n_components=n_components)
    tsvd.fit_transform(X)  # fit the model; the transformed matrix is unused
    for comp_idx in range(n_components):
        top_idxs = np.argpartition(
            -tsvd.components_[comp_idx], range(20))[:20]
        top_words = [cv.vocab[i] for i in top_idxs]
        print(tsvd.explained_variance_[comp_idx],
              tsvd.singular_values_[comp_idx])
        print(top_words)