Example #1
def perform_lda(dictionary, corpus, num_topics, wiki_path=None, passes=1, iterations=50, chunksize=200):
    """


    :param dictionary:
    :param corpus:
    :param wiki_path:
    :param num_topics:
    :param passes:
    :param iterations:
    :param chunksize:
    :return:
    """
    if wiki_path is not None:
        logging.info('Generating wiki corpus...')
        wikis = unpickle(wiki_path)
        wiki_corpus = [dictionary.doc2bow(wiki) for wiki in wikis]

        logging.info('Combining original corpus and wiki corpus...')
        corpus = corpus + wiki_corpus  # wiki_corpus is merged after the original corpus

    lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, passes=passes,
                         iterations=iterations, alpha='auto', chunksize=chunksize)
    # map corpus row indices back to their original document ids
    corpus_ids = get_corpus_ids(dictionary.corpus_id2orig_id)
    doc_vector_ids = [dictionary.corpus_id2orig_id[corpus_id] for corpus_id in corpus_ids]

    # inference() returns unnormalized topic weights (gamma); keep the rows for the
    # selected documents and normalize each row into a topic distribution
    doc_vectors = lda_model.inference(corpus)[0]
    doc_vectors = doc_vectors[corpus_ids, :]
    doc_vectors = doc_vectors / doc_vectors.sum(axis=1).reshape(doc_vectors.shape[0], 1)

    return lda_model, doc_vectors, doc_vector_ids
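
A minimal usage sketch for perform_lda, assuming the dictionary saved in Example #3 and the model path used in Example #4; the paths, the attached dictionary.corpus attribute, and the chosen hyper-parameters are assumptions taken from those snippets, not part of this one.

# Hedged usage sketch: load a previously prepared dictionary, train the model, save it.
# Paths and hyper-parameters are assumed from Examples #3 and #4.
from gensim.corpora import Dictionary

dictionary = Dictionary.load('data/dictionary/tweets.dict')
corpus = dictionary.corpus  # bag-of-words corpus attached by preprocess_corpora (Example #3)
lda_model, doc_vectors, doc_vector_ids = perform_lda(dictionary=dictionary, corpus=corpus,
                                                     num_topics=100, passes=10)
lda_model.save('data/model/tweets_100.lda')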
Example #2
def write_doc_topics(vector_path, id_path, csv_name):
    """Write per-document topic distributions to a CSV file (one row per document)."""
    vectors = unpickle(vector_path)
    ids = unpickle(id_path)
    writer = csv.writer(open(csv_name, 'wb'))  # binary mode for the csv module under Python 2

    # 1st row
    row = ['']
    for topic_id in range(vectors.shape[1]):
        row.append('z = ' + str(topic_id))
    writer.writerow(row)

    # 2nd row and onwards
    for row_num, doc_id in enumerate(ids):  # avoid shadowing the builtin id()
        row = [doc_id]

        for topic_id in range(vectors.shape[1]):
            row.append(vectors[row_num, topic_id])

        writer.writerow(row)
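
A hedged usage sketch for write_doc_topics; the pickle and CSV paths below are hypothetical placeholders for wherever the doc_vectors and doc_vector_ids returned by perform_lda in Example #1 were stored.

# Hypothetical paths: assumes the outputs of perform_lda (Example #1) were pickled
# beforehand with the project's pickle helper.
write_doc_topics(vector_path='data/vectors/doc_vectors.pkl',
                 id_path='data/vectors/doc_vector_ids.pkl',
                 csv_name='data/csv/doc_topics.csv')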
Example #3
        # (snippet starts inside preprocess_corpora's per-document loop)
        # keep lowercase tokens 2-15 characters long that are not stopwords
        corpus = [w for w in corpus if w not in stopwords and 2 <= len(w) <= 15 and w.islower()]
        processed_corpora.append(corpus)
        corpus_id2orig_id.append(index)

    print '\n'

    logging.info('Creating dictionary and corpus...')
    dictionary = Dictionary(processed_corpora)
    dictionary.corpus_id2orig_id = corpus_id2orig_id

    logging.info('Filtering unimportant terms...')
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n)
    dictionary.compactify()

    logging.info('Generating corpus...')
    dictionary.corpus = [dictionary.doc2bow(corpus) for corpus in processed_corpora]
    dictionary.id2token = revdict(dictionary.token2id)

    return dictionary


if __name__ == '__main__':
    # logging
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)

    logging.info('Loading corpora...')
    tweets_corpora = unpickle('data/processed/tweets.pkl')

    logging.info('Preprocessing corpora...')
    dictionary = preprocess_corpora(corpora=tweets_corpora, stopwords=stopwords, allowed_pos=re.compile('(NN)'))
    dictionary.save('data/dictionary/tweets.dict')
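
A short follow-up sketch, assuming the saved dictionary is reloaded in a later step (for example before Example #1); Dictionary.load is gensim's standard loader, and the custom corpus attribute attached above should be pickled along with the object.

# Reloading the saved dictionary in a later step (e.g. before Example #1).
from gensim.corpora import Dictionary

dictionary = Dictionary.load('data/dictionary/tweets.dict')
print len(dictionary.token2id), 'terms,', len(dictionary.corpus), 'documents'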
Example #4
if __name__ == '__main__':
    # hyper-parameters
    allowed_pos = re.compile('(NN)')
    crawl = True
    # target = 'user'
    # topic_num = 100
    model_path = 'data/model/tweets_100.lda'

    # logging
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)

    if crawl:
        logging.info('Crawling wikipedia...')
        wikis = crawl_wiki(model_path=model_path)
    else:
        wikis = unpickle('data/others/wikis.pkl')

    logging.info('Lemmatizing wikipedia texts...')
    count = 0
    doc_num = len(wikis)
    new_wikis = []
    keywords = []
    for keyword, wiki in wikis.items():
        count += 1

        print '\r', count, '/', doc_num,
        text = wiki['text']
        cleaned = clean_text(text)  # delete irrelevant characters

        wiki = []
        tokens = lemmatize(content=cleaned, allowed_tags=allowed_pos)  # lemmatize