import pickle

from nltk.corpus import brown
from gensim import models
from sklearn.feature_extraction.text import TfidfVectorizer

import basic_utils  # project-local helper; provides get_corpus_by_lists()


def cal_idf():
    """Build document-frequency models over the Brown corpus: an sklearn
    TfidfVectorizer (for IDF weights) and a gensim TfidfModel, both pickled."""
    total_wordlists = []  # one flat, lowercased word list per Brown document
    doc_sents = []        # one newline-joined plain-text string per document
    for fileid in brown.fileids():
        print(fileid)
        doc_wordlist = []
        doc_sentlist = brown.sents(fileids=[fileid])
        d_sents = ''
        for sent in doc_sentlist:
            # sent = stem_tokens(sent)  # optional stemming hook, left disabled
            words = [w.lower() for w in sent]
            d_sents += ' '.join(words) + '\n'
            doc_wordlist.extend(words)  # lowercased to match the text corpus
        total_wordlists.append(doc_wordlist)
        doc_sents.append(d_sents)

    print('start calculating tf-idf')
    vectorizer = TfidfVectorizer(min_df=1)
    vectorizer.fit_transform(doc_sents)
    # print(dict(zip(vectorizer.get_feature_names_out(), vectorizer.idf_)))
    with open('idf_vectorizer', 'wb') as fout:  # pickle requires binary mode
        pickle.dump(vectorizer, fout)

    dic, corps = basic_utils.get_corpus_by_lists(total_wordlists)
    tfidf = models.TfidfModel(corps, id2word=dic)
    with open('brown_tfidf', 'wb') as fout:
        pickle.dump(tfidf, fout)
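
# Usage sketch (illustrative, not part of the original module): reload the
# vectorizer pickled by cal_idf() and look up per-term IDF weights. The file
# name 'idf_vectorizer' comes from cal_idf(); the query term is arbitrary.
# get_feature_names_out() assumes scikit-learn >= 1.0 (older releases exposed
# get_feature_names() instead).
def load_idf_example(term='government'):
    with open('idf_vectorizer', 'rb') as fin:
        vectorizer = pickle.load(fin)
    idf_by_term = dict(zip(vectorizer.get_feature_names_out(), vectorizer.idf_))
    print(term, idf_by_term.get(term))
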
def train_model_by_wordlists(wordlists, num_topics=5, iterations=100,
                             passes=10, is_tfidf=False):
    """Train a gensim LDA model on tokenized documents; optionally re-weight
    the bag-of-words corpus with tf-idf first."""
    dic, corpus = basic_utils.get_corpus_by_lists(wordlists)
    if is_tfidf:
        tfidf = models.TfidfModel(corpus)
        corpus = tfidf[corpus]  # replace raw counts with tf-idf weights
    lda_model = models.LdaModel(corpus, id2word=dic, num_topics=num_topics,
                                iterations=iterations, passes=passes)
    return lda_model
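
# Usage sketch (illustrative, not part of the original module): build word
# lists from a small slice of the Brown corpus, the same way cal_idf() does,
# train a 5-topic LDA model through the tf-idf path, and print the topics.
# The 20-file slice is only to keep the demo fast.
if __name__ == '__main__':
    sample_wordlists = [
        [w.lower() for sent in brown.sents(fileids=[fid]) for w in sent]
        for fid in brown.fileids()[:20]
    ]
    model = train_model_by_wordlists(sample_wordlists, num_topics=5,
                                     is_tfidf=True)
    for topic_id, terms in model.print_topics(num_topics=5, num_words=8):
        print(topic_id, terms)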