def perform_lda(dictionary, corpus, num_topics, wiki_path=None, passes=1, iterations=50, chunksize=200):
    """
    Train an LDA model and return normalized document-topic vectors for the original documents.

    :param dictionary: gensim Dictionary with a corpus_id2orig_id attribute attached
    :param corpus: bag-of-words corpus (list of doc2bow vectors)
    :param num_topics: number of latent topics
    :param wiki_path: optional path to pickled, tokenized wiki documents used to augment the corpus
    :param passes: number of passes over the corpus during training
    :param iterations: maximum number of iterations per document
    :param chunksize: number of documents per training chunk
    :return: (lda_model, doc_vectors, doc_vector_ids)
    """
    if wiki_path is not None:
        logging.info('Generating wiki corpus...')
        wikis = unpickle(wiki_path)
        wiki_corpus = [dictionary.doc2bow(wiki) for wiki in wikis]

        logging.info('Combining original corpus and wiki corpus...')
        corpus = corpus + wiki_corpus  # wiki_corpus is appended after the original corpus

    lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, passes=passes,
                         iterations=iterations, alpha='auto', chunksize=chunksize)

    corpus_ids = get_corpus_ids(dictionary.corpus_id2orig_id)
    doc_vector_ids = [dictionary.corpus_id2orig_id[corpus_id] for corpus_id in corpus_ids]

    doc_vectors = lda_model.inference(corpus)[0]  # variational gamma for every document
    doc_vectors = doc_vectors[corpus_ids, :]  # keep only the original (non-wiki) documents
    doc_vectors = doc_vectors / doc_vectors.sum(axis=1).reshape(doc_vectors.shape[0], 1)  # normalize rows to topic proportions

    return lda_model, doc_vectors, doc_vector_ids
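# Hedged usage sketch (not part of the original source): one way perform_lda might be
# driven once the tweet dictionary built below has been saved. The paths, the topic
# count, and the assumption that the loaded Dictionary still carries the .corpus and
# .corpus_id2orig_id attributes attached in preprocess_corpora are illustrative only.
from gensim.corpora import Dictionary

dictionary = Dictionary.load('data/dictionary/tweets.dict')
lda_model, doc_vectors, doc_vector_ids = perform_lda(dictionary=dictionary,
                                                     corpus=dictionary.corpus,
                                                     num_topics=100,
                                                     passes=1)
lda_model.save('data/model/tweets_100.lda')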
def write_doc_topics(vector_path, id_path, csv_name):
    """
    Write the document-topic matrix to a CSV file: one row per document, one column per topic.

    :param vector_path: path to the pickled document-topic matrix (docs x topics)
    :param id_path: path to the pickled list of document ids, in the same order as the matrix rows
    :param csv_name: name of the output CSV file
    """
    vectors = unpickle(vector_path)
    ids = unpickle(id_path)
    writer = csv.writer(open(csv_name, 'w'))

    # 1st row: header with one 'z = k' column per topic
    row = ['']
    for topic_id in range(vectors.shape[1]):
        row.append('z = ' + str(topic_id))
    writer.writerow(row)

    # 2nd row and onwards: document id followed by its topic proportions
    for row_num, id in enumerate(ids):
        row = [id]
        for topic_id in range(vectors.shape[1]):
            row.append(vectors[row_num, topic_id])
        writer.writerow(row)
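# Hedged usage sketch (not part of the original source), continuing the perform_lda
# sketch above: pickle the returned matrices and export them as a CSV with one row per
# document and one 'z = k' column per topic. The file names and the use of the standard
# pickle module (assuming the repo's unpickle helper reads ordinary pickle files) are
# assumptions for illustration only.
import pickle

doc_vectors_path = 'data/vector/doc_vectors.pkl'
doc_vector_ids_path = 'data/vector/doc_vector_ids.pkl'
with open(doc_vectors_path, 'wb') as f:
    pickle.dump(doc_vectors, f)
with open(doc_vector_ids_path, 'wb') as f:
    pickle.dump(doc_vector_ids, f)

write_doc_topics(vector_path=doc_vectors_path,
                 id_path=doc_vector_ids_path,
                 csv_name='data/csv/doc_topics.csv')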
        corpus = [w for w in corpus if w not in stopwords and 2 <= len(w) <= 15 and w.islower()]
        processed_corpora.append(corpus)
        corpus_id2orig_id.append(index)
    print '\n'

    logging.info('Creating dictionary and corpus...')
    dictionary = Dictionary(processed_corpora)
    dictionary.corpus_id2orig_id = corpus_id2orig_id

    logging.info('Filtering unimportant terms...')
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n)
    dictionary.compactify()

    logging.info('Generating corpus...')
    dictionary.corpus = [dictionary.doc2bow(corpus) for corpus in processed_corpora]
    dictionary.id2token = revdict(dictionary.token2id)

    return dictionary


if __name__ == '__main__':
    # logging
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)

    logging.info('Loading corpora...')
    tweets_corpora = unpickle('data/processed/tweets.pkl')

    logging.info('Preprocessing corpora...')
    dictionary = preprocess_corpora(corpora=tweets_corpora, stopwords=stopwords, allowed_pos=re.compile('(NN)'))
    dictionary.save('data/dictionary/tweets.dict')
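# The revdict helper used in preprocess_corpora is not shown in this excerpt; gensim's
# utils module ships an equivalent. A minimal sketch of the assumed behaviour: invert
# the token2id mapping so that dictionary.id2token[token_id] returns the token string.
def revdict(d):
    """Return a new dict that maps the values of d back to their keys."""
    # e.g. revdict({'lda': 0, 'tweet': 1}) -> {0: 'lda', 1: 'tweet'}
    return dict((value, key) for key, value in d.items())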
if __name__ == '__main__':
    # hyper-parameters
    allowed_pos = re.compile('(NN)')
    crawl = True
    # target = 'user'
    # topic_num = 100
    model_path = 'data/model/tweets_100.lda'

    # logging
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)

    if crawl is True:
        logging.info('Crawling wikipedia...')
        wikis = crawl_wiki(model_path=model_path)
    else:
        wikis = unpickle('data/others/wikis.pkl')

    logging.info('Lemmatizing wikipedia texts...')
    count = 0
    doc_num = len(wikis)
    new_wikis = []
    keywords = []
    for keyword, wiki in wikis.items():
        count += 1
        print '\r', count, '/', doc_num,

        text = wiki['text']
        cleaned = clean_text(text)  # delete irrelevant characters
        wiki = []
        tokens = lemmatize(content=cleaned, allowed_tags=allowed_pos)  # lemmatize; keeps only NN tokens, returned as 'token/POS' strings