def lda(dataframe, num_topics=100):
    """Returns an LDA model for documents stored in a DataFrame.

    Precomputed models are read from file if previously cached, or
    generated then cached otherwise.

    Parameters
    ----------
    dataframe : Pandas DataFrame
        The DataFrame containing the documents to process.
    num_topics : int (default is 100)
        The number of topics to train the LDA model with.

    Returns
    -------
    model : Gensim LdaModel
        LDA model for documents stored in the DataFrame.
    """
    filename = 'caches/models/lda.model'
    if not os.path.isfile(filename):
        dictionary = dictionary_corpus(dataframe)
        bow = bow_corpus(dataframe)
        lda_model = LdaModel(bow, id2word=dictionary,
                             num_topics=num_topics, passes=20)
        lda_model.save(filename)
    else:
        lda_model = LdaModel.load(filename)
    return lda_model
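# A minimal usage sketch for the cached-model helper above (not part of the
# original code): `df` is a hypothetical DataFrame prepared by the same
# pipeline that `dictionary_corpus` and `bow_corpus` expect.
model = lda(df, num_topics=100)  # trains and caches on the first call, loads the cache afterwards
for topic_id, words in model.print_topics(num_topics=5, num_words=8):
    print(topic_id, words)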
def train_lda(load_model, corpus, num_topics, dictionary):
    # Train the LDA model, or load a previously saved one.
    if load_model and os.path.exists(MODEL_PATH):
        model = LdaModel.load(MODEL_PATH)
    else:
        # Set training parameters.
        chunksize = 2000
        passes = 20  # number of passes (epochs) over the corpus
        iterations = 400
        eval_every = None  # don't evaluate model perplexity; it takes too much time

        # Make an index-to-word dictionary.
        temp = dictionary[0]  # this is only to "load" the dictionary
        id2word = dictionary.id2token

        model = LdaModel(corpus=corpus, id2word=id2word, chunksize=chunksize,
                         alpha='auto', eta='auto', iterations=iterations,
                         num_topics=num_topics, passes=passes,
                         eval_every=eval_every)
        model.save(MODEL_PATH)
    return model
def lda(clean_docs, model_name, topics):
    # Turn the documents into a dictionary mapping of normalized words and their integer ids.
    from gensim import corpora
    dictionary = corpora.Dictionary(clean_docs)

    # Convert each document into a bag-of-words representation: a list of
    # (token_id, token_count) tuples, i.e. how often each word occurs in each document.
    corpus = []
    for doc in clean_docs:
        corpus.append(dictionary.doc2bow(doc))

    # Serialize: save dictionary and corpus for future use.
    from gensim.corpora import MmCorpus
    MmCorpus.serialize('corpus_' + model_name + '.mm', corpus)
    dictionary.save('dictionary_' + model_name + '.gensim')

    # Train the LDA model.
    from gensim.models import LdaModel
    num_topics = topics  # find this number of topics in the data
    passes = 15
    ldamodel = LdaModel(corpus, num_topics=num_topics,
                        id2word=dictionary, passes=passes)
    ldamodel.save('model_' + model_name + '.gensim')

    topics = ldamodel.print_topics(num_words=5)
    for topic in topics:
        print(topic)
class LdaVec(TopicVec):
    def __init__(self, vec_num):
        TopicVec.__init__(self, vec_num)

    def __gen_model(self, corpus):
        # if self.p_corpus == 'onehot':
        #     model_name = 'lda_one_hot.model'
        # else:
        #     model_name = 'lda_tfidf.model'
        model_name = 'lda.model'
        self.model = LdaModel(corpus, id2word=self.dictionary,
                              num_topics=self.vec_num)
        self.model.save(os.path.join(self.out_dir, model_name))

    def __get_model(self):
        model_name = 'lda.model'
        if os.path.exists(os.path.join(self.out_dir, model_name)):
            self.model = LdaModel.load(os.path.join(self.out_dir, model_name))
        else:
            raise FileNotFoundError('"{}" file not found!'.format(model_name))

    def fit(self, doc, out_dir, use_exist_dictionary=False):
        TopicVec.fit(self, doc, out_dir, use_exist_dictionary)
        self.__gen_model(self.corpus)
def train_lda(is_tfidf, num_topics):
    # Create the corpus.
    print('Create corpus')
    corpus = doc_processor.create_corpus(dictionary, doc_list, is_tfidf)

    # Set training parameters.
    chunksize = 20000
    # passes = 20
    # iterations = 400
    eval_every = None

    print('Start LDA training')
    start = time.time()
    id2word = dictionary.id2token
    lda_model = LdaModel(
        corpus=corpus,
        # id2word=id2word,
        chunksize=chunksize,
        # alpha='auto',
        # eta='auto',
        num_topics=num_topics,
        # passes=passes,
        # iterations=iterations,
        eval_every=eval_every
    )

    ir_method = 'tfidf' if is_tfidf else 'bow'
    lda_model.save('saved_models/lda_model_%s_%s' % (ir_method, num_topics))
    print('LDA for %s %s done in %.1f seconds' % (ir_method, num_topics, time.time() - start))
def train_model():
    dictionary = pickle.load(open('dictionary.pkl', 'rb'))
    train = pickle.load(open('train.pkl', 'rb'))
    corpus = [dictionary.doc2bow(text) for text in train]
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=100)
    # Save / load the model.
    lda.save('test_lda.model')
def train_model(num_topic):
    train = get_dict()
    dictionary = Dictionary.load('train_data.dict')  # path to the saved dictionary
    corpus = [dictionary.doc2bow(text) for text in train]
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topic)
    # Save the model.
    lda.save('LDA_trained_model/lda.model')  # output path
def train(corpuspath, modelpath):
    train = []
    # stopwords = codecs.open('stopWords/1893(utf8).txt', 'r', encoding='utf8').readlines()
    # stopwords = [w.strip() for w in stopwords]
    fp = codecs.open(corpuspath, 'r', encoding='utf8')
    for line in fp:
        line = line.strip()
        if line == '':
            continue
        line = line.split()
        train.append([w for w in line])

    dictionary = corpora.Dictionary(train)
    corpus = [dictionary.doc2bow(text) for text in train]
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=10, passes=20)
    lda.save(modelpath)

    topic_words = open('../result/lda/fact_lad_print-10.txt', 'w', encoding='utf-8')
    print_str = ''
    for topic in lda.print_topics(num_words=100):
        termNumber = topic[0]
        listOfTerms = topic[1].split('+')
        # for term in listOfTerms:
        #     listItems = term.split('*')
        #     # print(listItems)
        #     print(' ', listItems[1], '(', listItems[0], ')', sep='')
        print_str += topic[1] + '\n'
    topic_words.write(print_str)
    topic_words.close()
def lda_features(
    sentence_words,
    lexicon,
    model_path,
    num_topics=50,
    mode='train',
):
    dictionary = corpora.Dictionary([[lex] for lex in lexicon])
    corpus = [dictionary.doc2bow(words) for words in sentence_words]
    if mode == 'train' and not os.path.exists(model_path):
        ldamodel = LdaModel(corpus, num_topics=num_topics)
        ldamodel.save(model_path)
    else:
        ldamodel = LdaModel.load(model_path)

    features = []
    for sentence in corpus:
        lda_f = ldamodel[sentence]
        feats = np.zeros((num_topics, ))
        for (n_t, s_t) in lda_f:
            feats[n_t] = s_t
        features.append(feats)
    result = np.asarray(features)
    return result
def main(num_topics):
    f_path = os.path.join(DATA_PATH, 'interim', 'onsite_search_nlp_gensim_dictionary.pkl')
    with open(f_path, 'rb') as f:
        dictionary = pickle.load(f)
    print('Loaded dictionary: {}'.format(dictionary))

    f_path = os.path.join(DATA_PATH, 'interim', 'onsite_search_terms_2017_2019_nlp.pkl')
    df_search_terms = pd.read_pickle(f_path)
    print('Loaded search corpus: {} rows'.format(len(df_search_terms)))

    print('Logging to terminal')
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    print('Starting model training...')
    print()
    ldamodel = LdaModel(
        corpus=df_search_terms.corpus.dropna().tolist(),
        num_topics=num_topics,
        id2word=dictionary,
    )
    print()
    print('Done training, saving to file')
    f_path = 'onsite_search_terms_lda_2017_2019_{}_topic.model'.format(num_topics)
    ldamodel.save(f_path)
def save_model():
    """
    Save the LDA model.

    corpus: [
        [(word_id, word_count), (word_id, word_count), ...],
        [(word_id, word_count), (word_id, word_count), ...],
        ...
    ]  # a set of sparse vectors
    id2word: {'word1': 0, 'word2': 1, ...}
    """
    train_set = get_train_set()
    word_dict = Dictionary(train_set)  # build the dictionary; each word maps to an integer id
    corpus_list = [word_dict.doc2bow(text) for text in train_set]  # word counts as sparse vectors

    lda = LdaModel(
        corpus=corpus_list,
        id2word=word_dict,
        num_topics=100,
        # passes=5,  # epochs
        alpha='auto')
    lda.print_topic(99)
    # Save the LDA model.
    lda.save(lda_model_path)
def train_model():
    data = get_dict()
    dictionary = data[0]
    train = data[1]
    corpus = [dictionary.doc2bow(text) for text in train]
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=7)
    # Save / load the model.
    lda.save('test_lda.model')
def get_lda_model():
    """
    Train a topic model; the resulting topic-word matrix has shape (50, 28767).
    :return:
    """
    text_array = list()
    with open("jobs-unigrams-filter") as f:
        for line in tqdm(f):
            line = line.strip().split(" ")
            line.remove(line[0])
            text_array.append(line)

    dictionary = Dictionary(text_array)
    # print(common_dictionary)
    common_corpus = [dictionary.doc2bow(text) for text in text_array]

    # Train the model on the corpus.
    lda = LdaModel(common_corpus, id2word=dictionary, num_topics=50,
                   passes=10, iterations=1000)
    temp_file = datapath("LDA_twitter")
    lda.save(temp_file)

    topics = lda.get_topics()
    print(topics.shape)
    topic_list = lda.print_topics(50)
    for topic in topic_list:
        print(topic)
def run(data_name):
    print('Working on ' + data_name)
    corpus = []

    # preprocess
    with open('../data/' + data_name + '/' + data_name + '.tsv') as dfile:
        dfile.readline()
        for line in dfile:
            line = line.strip().split('\t')
            corpus.append(line[1].split())

    # build dictionary
    dictionary = Dictionary(corpus)
    dictionary.save(data_name + '.dict')

    # documents to indices
    doc_matrix = [dictionary.doc2bow(doc) for doc in corpus]
    del corpus  # release memory

    ldamodel = LdaModel(doc_matrix, id2word=dictionary, num_topics=10,
                        passes=2, alpha='symmetric', eta=None)
    ldamodel.save(data_name + '.model')
def create_lda_model(project, corpus, id2word, name, use_level=True, force=False):
    model_fname = project.full_path + name + str(project.num_topics)
    if use_level:
        model_fname += project.level
    model_fname += '.lda.gz'

    if not os.path.exists(model_fname) or force:
        if corpus:
            update_every = None  # run in batch if we have a pre-supplied corpus
        else:
            update_every = 1

        model = LdaModel(corpus=corpus,
                         id2word=id2word,
                         alpha=project.alpha,
                         eta=project.eta,
                         passes=project.passes,
                         num_topics=project.num_topics,
                         iterations=project.iterations,
                         eval_every=None,  # disable perplexity tests for speed
                         update_every=update_every,
                         )
        if corpus:
            model.save(model_fname)
    else:
        model = LdaModel.load(model_fname)

    return model, model_fname
def train(data, valid_h1, valid_h2, vocab):
    # logging.basicConfig(filename=args.save_path + 'lda.log',
    #                     format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    model = LdaModel(id2word=vocab, num_topics=args.topics, random_state=0,
                     chunksize=args.batch_size, update_every=args.batch_size,
                     alpha='auto', eta=None, decay=args.decay, offset=args.offset,
                     per_word_topics=True)
    best_perplexity = float('inf')
    for epoch in range(args.epochs):
        model.update(data, passes=1, eval_every=1, gamma_threshold=0.001)
        print("Epoch number {}".format(epoch), end=' ')
        val_perplexity = evaluate(data, valid_h1, valid_h2, model, 'valid')
        if val_perplexity < best_perplexity:
            best_perplexity = val_perplexity
            model.save(os.path.join(args.save_path, 'model.ckpt'))
def lda_main(word_with_pos=WORD_WITH_POS, topic_num=LDA_TOPIC_NUM):
    LDA_MODEL = './models/lda_{}.model'.format(topic_num)
    stop_word = read_stopword()
    begin_t = time.time()

    def func(line):
        '''Extract words, optionally binding each word to its part-of-speech tag.'''
        line = line.strip()
        json_data = json.loads(line)
        content = json_data['content']
        if word_with_pos:
            word_list = [j[0] + j[1] for j in content if j[0] not in stop_word]
        else:
            word_list = [j[0] for j in content if j[0] not in stop_word]
        return word_list

    with open(DATA_JSONLINE) as f:
        # words = [func(i) for i in f.readlines()]
        words = []
        for i in f.readlines():
            words.append(func(i))

    print('Data loading finished! Took', time.time() - begin_t, 'sec.\nStarting LDA modeling')
    dic = corpora.Dictionary(words)
    corpus = [dic.doc2bow(text) for text in words]
    dic.save(DICTIONARY_PATH)
    corpora.MmCorpus.serialize(CORPUS_PATH, corpus)

    lda = LdaModel(corpus=corpus, id2word=dic, num_topics=topic_num)
    lda.save(LDA_MODEL)

    vis_data = pyLDAvis.gensim.prepare(lda, corpus, dic)
    vis_html_path = 'ldavis_{}.html'.format(topic_num)
    pyLDAvis.save_html(vis_data, vis_html_path)
    print('LDA modeling finished!\nTotal time:', time.time() - begin_t, 'sec.')
def chosen_lda(corpus, dictionary, data, n_topics, alpha=.1, eta=0.01):
    '''
    This function trains a Gensim LDA model on chosen hyperparameters.

    Arguments:
    ----------
    corpus : matrix-format corpus (BOW or TF-IDF)
    dictionary : corpus-related dictionary
    data : text data for coherence score computation
    n_topics : number of desired topics
    alpha : alpha parameter (from 0 to infinity)
    eta : beta parameter (from 0 to infinity)

    Outputs:
    ----------
    lda : trained model
    '''
    lda = LdaModel(corpus=corpus,
                   id2word=dictionary,
                   num_topics=n_topics,
                   random_state=100,
                   alpha=alpha,
                   eta=eta)
    ldatopics = [[word for word, prob in topic]
                 for topicid, topic in lda.show_topics(formatted=False)]
    lda_coherence = CoherenceModel(topics=ldatopics, texts=data,
                                   dictionary=dictionary,
                                   window_size=10).get_coherence()
    print(lda_coherence)
    lda.print_topics(num_topics=n_topics)
    lda.save('../03_Dump/model')
    return lda
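# Hedged call example for chosen_lda (illustrative only): `bow_corpus`,
# `id2word`, and `tokenized_docs` are placeholder names for objects the caller
# is assumed to have built with gensim's Dictionary/doc2bow beforehand.
model = chosen_lda(bow_corpus, id2word, tokenized_docs, n_topics=35, alpha=0.1, eta=0.01)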
def set_model(self, lang: str, data_version: int, dictionary_version: float,
              model_version: str, param_name: str, param_version: int,
              model_file_path: str, language_processed_data: list):
    """
    'alpha' and 'eta' are hyperparameters that affect the sparsity of the topics.
    According to the Gensim docs, both default to a 1.0/num_topics prior.
    :return:
    """
    # Make an index-to-word dictionary.
    logging.info("---- Creating LDA model")
    temp = self.essentials.dictionary[0]
    # For the multicore model the optimal setting is workers=3, one less than the number of cores.
    model = LdaModel(
        # workers=self.workers,
        corpus=self.essentials.corpus,
        id2word=self.essentials.dictionary.id2token,
        chunksize=self.chunk_size,
        alpha=self.alpha,
        eta=self.beta,
        iterations=self.iterations,
        num_topics=self.number_of_topics,
        passes=self.passes,
        eval_every=self.eval_every
    )
    model.save(model_file_path)
    self.model = model
    logging.info("---- LDA model is created")
    metrics = self.get_model_evaluation_metrics(language_processed_data)
    parameters = self.get_model_parameters()
    self.write_model_evaluation_metrics(lang, data_version, dictionary_version,
                                        model_version, param_name, param_version,
                                        metrics, parameters)
    return
def save_model(model: LdaModel, path='../artefacts/model', suffix=''):
    """Helper function to save a Gensim LdaModel at the specified path."""
    if suffix:
        path = path + '_' + suffix
    model.save(path)
    print(f'model saved at {path}')
def train_lda(
        corpus,
        dictionary,
        save=False,
        file=os.path.join(config.map("Storage")['storage_dir'] + 'lda.mdl')):
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=50,
                   update_every=1, chunksize=10000, passes=10)
    if save:
        lda.save(file)

    # cm = CoherenceModel(model=lda, corpus=corpus, dictionary=dictionary, coherence='u_mass')
    # print(cm.get_coherence())

    import pyLDAvis.gensim
    topicmodel = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
    html = pyLDAvis.display(topicmodel)

    import webbrowser
    with open('viz.html', 'w') as f:
        message = html.data
        f.write(message)
    webbrowser.open_new_tab('viz.html')
def LDA_model_from_token(text_file_name):
    token_file_name = text_file_name[:-4] + '.csv'
    print("loading " + token_file_name)
    data_word = []
    with codecs.open(token_file_name, 'r') as f:
        rdr = csv.reader(f)
        next(rdr)
        for i, line in enumerate(rdr):
            data_word.append(line)
    print("Complete loading")

    id2word = corpora.Dictionary(data_word)
    id2word.filter_extremes(no_below=10)  # drop words that appear 10 times or fewer
    texts = data_word
    corpus = [id2word.doc2bow(text) for text in texts]

    lda = LdaModel(corpus, num_topics=10, id2word=id2word)
    temp_file = datapath(token_file_name[:-4])
    lda.save(temp_file)
    lda = LdaModel.load(temp_file)

    topics = lda.print_topics(num_words=10)
    for topic in topics:
        print(topic)
def build_lda_model(dictionary, corpus, should_rebuild):
    lda = list()

    # DEBUG should_rebuild = True
    # debug_print('datapath:LDA', datapath(cfg.LDA_BACKUP))

    if not should_rebuild:
        try:
            print('Loading LDA Model backup...')
            lda_file = utils.get_file_path(cfg.LDA_BACKUP)
            print('LDA file = {}'.format(lda_file))
            lda = LdaModel.load(lda_file)
        except Exception as exc:
            utils.print_exception_details('Building LDA Model', exc)
    else:
        print('Building LDA Model...')
        lda = LdaModel(corpus, id2word=dictionary,
                       random_state=cfg.RANDOM_STATE,
                       num_topics=cfg.NUM_TOPICS,
                       passes=cfg.NUM_PASSES)
        print('Done!')

        # Save model structures.
        LDA_FILE = utils.get_file_path(cfg.LDA_BACKUP)
        lda.save(LDA_FILE)

    return lda
def save_ldamodel(dictionary, text_data, cnt_cata):
    corpus = [dictionary.doc2bow(text) for text in text_data]
    ldamodel = LdaModel(corpus, num_topics=cnt_cata, id2word=dictionary)
    # Inspect the topics.
    for topic in ldamodel.print_topics():
        print(topic[1])
    ldamodel.save('model/{}/ADA.gensim'.format(cnt_cata))
def lda_gensim(id2word, doc2bow, n_topics=params.lda_params_default['n_topics']):
    """
    Implements the gensim LDA algorithm.

    Parameters
    ----------
    id2word
        Maps token IDs to words
    doc2bow
        Maps documents to bag-of-words lists
    n_topics : int
        Total number of topics

    Returns
    -------
    model
        Trained LDA model
    """
    try:
        model = LdaModel.load('lda_model_{}'.format(n_topics))
        # coh_model_umass = CoherenceModel.load('umass_coherence_model_{}'.format(n_topics))
        # coh_model_cv = CoherenceModel.load('cv_coherence_model_{}'.format(n_topics))
    except FileNotFoundError:
        # Train the LDA model and return key words for each topic.
        model = LdaModel(corpus=doc2bow,
                         id2word=id2word,
                         iterations=500,
                         num_topics=n_topics,
                         random_state=1,
                         alpha='auto',
                         eta='auto',
                         )
        model.save('lda_model_{}'.format(n_topics))
        '''
        print('Training coherence models...')
        coh_model_umass = CoherenceModel(model=model,
                                         corpus=doc2bow,
                                         dictionary=id2word,
                                         coherence='u_mass',
                                         )
        coh_model_umass.save('umass_coherence_model_{}'.format(n_topics))
        coh_model_cv = CoherenceModel(model=model,
                                      texts=corpus_text.values,
                                      dictionary=id2word,
                                      coherence='c_v',
                                      )
        # coh_model_cv.save('cv_coherence_model_{}'.format(n_topics))
        '''
    return model
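# Illustrative call for lda_gensim (an assumption, not from the original):
# `tokenized_docs` is a placeholder for a list of token lists produced upstream.
from gensim import corpora
dictionary = corpora.Dictionary(tokenized_docs)
bow = [dictionary.doc2bow(doc) for doc in tokenized_docs]
model = lda_gensim(dictionary, bow, n_topics=20)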
class Lda(ModelABC):
    """Represent news articles as vectors using Latent Dirichlet Allocation."""

    def __init__(self, dictionary: Dictionary, corpus=None, size: int = 100,
                 decay=0.5, lda_filename: str = None):
        """
        :param dictionary: A dictionary
        :param corpus: A corpus for training
        :param size: The length of feature vector
        :param decay: The decay parameter
        :param lda_filename: File name of a previously trained model
        """
        super().__init__(size)

        # Check if we have already trained the Lda model
        if lda_filename is not None and os.path.exists(lda_filename):
            self.lda = LdaModel.load(lda_filename)
            logging.info("LDA model loaded")
        else:
            if corpus is None:
                raise ValueError("Corpus must be provided to train LDA")
            self.lda = LdaModel(corpus=corpus, id2word=dictionary,
                                num_topics=size, passes=1, decay=decay,
                                minimum_probability=0.0)

    def update(self, documents):
        """
        Update model using documents.

        :param documents: The new documents used for update
        """
        self.lda.update(documents)

    def save(self, filename: str):
        """
        Save model to a file.

        :param filename: A model file name
        """
        self.lda.save(filename)

    def _get_vector_representation(self, items):
        """
        Represent documents as vectors.

        :param items: A list of documents
        :return: A list of feature vectors.
        """
        return self.lda[items]
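# Sketch of how the Lda wrapper above might be used (assumes `dictionary` is a
# gensim Dictionary and `bow_docs` a BoW corpus built elsewhere; the file name
# is illustrative, not from the original code).
lda_vectors = Lda(dictionary, corpus=bow_docs, size=100)
lda_vectors.save("news_lda.model")
reloaded = Lda(dictionary, size=100, lda_filename="news_lda.model")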
def get_topics(data, filepath='./data/spam_topics.pkl'):
    if not os.path.exists(filepath):
        import pyLDAvis.gensim
        from gensim.corpora import Dictionary
        from gensim.models import LdaModel, CoherenceModel

        texts = [sample['lemmas'] for sample in data]
        dictionary = Dictionary(texts)
        dictionary.filter_extremes(no_below=20, no_above=0.4)
        corpus = [dictionary.doc2bow(text) for text in texts]

        chunksize = 500
        passes = 5
        iterations = 400
        eval_every = None

        temp = dictionary[0]  # This is only to "load" the dictionary.
        id2word = dictionary.id2token

        best_coherence = 0
        best_model_filepath = ''
        for num_topics in list(range(2, 20)):
            for alpha in ['asymmetric', 'symmetric']:
                for eta in ['symmetric', 'auto']:
                    model_filepath = 'out/topics/{}_{}_{}'.format(num_topics, alpha, eta)
                    model = LdaModel(corpus=corpus, id2word=id2word,
                                     chunksize=chunksize, alpha=alpha, eta=eta,
                                     iterations=iterations, num_topics=num_topics,
                                     passes=passes, eval_every=eval_every)
                    coherence = float(CoherenceModel(model=model, texts=texts,
                                                     dictionary=dictionary,
                                                     coherence='c_v').get_coherence())
                    model_filepath += '_{:.4f}'.format(coherence)
                    model.save(model_filepath + '_model.pkl')

                    prepared = pyLDAvis.gensim.prepare(model, corpus, dictionary)
                    pyLDAvis.save_html(prepared, model_filepath + '_plot.html')

                    if coherence > best_coherence:
                        best_coherence = coherence
                        best_model_filepath = model_filepath + '_model.pkl'

        model = LdaModel.load(best_model_filepath)
        print('Best model: {}'.format(best_model_filepath))

        topics = [x[0] for x in model.top_topics(corpus=corpus, texts=texts,
                                                 dictionary=dictionary, topn=100)]
        data_topics = []
        for i, text in enumerate(texts):
            data_topics.append({k: v for k, v in model.get_document_topics(
                dictionary.doc2bow(text), minimum_probability=0.0)})

        pickle.dump([topics, data_topics], open(filepath, 'wb'))
    else:
        [topics, data_topics] = pickle.load(open(filepath, 'rb'))

    for i in range(len(data_topics)):
        data[i]['topics'] = data_topics[i]

    return topics, data
def save_model(model_path):
    train_set = get_train_set()
    # Build the training corpus.
    dictionary = Dictionary(train_set)
    corpus = [dictionary.doc2bow(text) for text in train_set]
    # Train the LDA model.
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=100)
    lda.print_topics(100)
    lda.save(model_path)
def build_topic(df, load_existing=True):
    words = ut.get_text_array(df)
    dictionary = corpora.Dictionary(words)
    if load_existing and os.path.exists('lda_model.h5'):
        model = LdaModel.load('lda_model.h5')
        return model, dictionary
    corpus = [dictionary.doc2bow(text) for text in words]
    model = LdaModel(corpus, num_topics=NUM_TOPICS)
    model.save('lda_model.h5')
    return model, dictionary
def train_model(self, num_topics):
    corpus = self.get_corpus()
    model = LdaModel(corpus, chunksize=2000, passes=20, iterations=200,
                     num_topics=num_topics, eval_every=None)
    tmp_fname = self.path + "lda.model"
    model.save(tmp_fname)
    return model
def create_model(config, Kind):
    model_fname = config.model_fname % Kind.__name__
    corpus_fname = config.corpus_fname % Kind.__name__

    if not os.path.exists(model_fname):
        try:
            id2word = Dictionary.load(corpus_fname + '.dict')
            corpus = MalletCorpus(corpus_fname, id2word=id2word)
            logger.info('Opened previously created corpus: %s' % corpus_fname)
        except:
            error('Corpora for building file models not found!')

        file_model = LdaModel(corpus, id2word=corpus.id2word,
                              alpha=config.alpha, passes=config.passes,
                              num_topics=config.num_topics)
        file_model.save(model_fname)
def lda():
    # Remove stop words.
    stopwords = codecs.open('../conf/stop_words_ch.txt', mode='r', encoding='utf8').readlines()
    stopwords = [w.strip() for w in stopwords]

    fp = codecs.open('D:\\nlp\\corpora\\segs.txt', mode='r', encoding='utf8')
    train = []
    for line in fp:
        line = line.split()
        train.append([w for w in line if w not in stopwords])

    dictionary = corpora.Dictionary(train)
    corpus = [dictionary.doc2bow(text) for text in train]
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=100)
    lda.print_topics(30)

    # print topic id=20
    lda.print_topic(20)

    # save/load model
    lda.save('D:\\nlp\\corpora\\news.model')
# # logging.info('combine report and wiki dictionary...')
# wiki_to_report = report_dict.merge_with(wiki_dict)
# merged_dict = report_dict

# # logging.info('combine report and wiki corpus...')
# merged_corpus = wiki_to_report[wiki_corpus].corpus + report_corpus

logging.info('generate wiki corpus...')
wiki_txt = unpickle('data/txt/processed_wiki.pkl')
wiki_corpus = [report_dict.doc2bow(wiki) for wiki in wiki_txt]

logging.info('combine report and wiki corpus...')
merged_corpus = wiki_corpus + report_corpus

# compute TFIDF
# logging.info('compute TFIDF...')
# tfidf = TfidfModel(dictionary=report_dict, id2word=report_dict)

# perform PLSA
logging.info('perform PLSA...')
if use_wiki is True:
    lda = LdaModel(corpus=merged_corpus, id2word=report_dict,
                   num_topics=num_topics, passes=passes,
                   iterations=iterations, chunksize=chunksize,
                   alpha=1., eta=1.)
    lda.save('result/model_wiki.plsa')
    lda.print_topics(topics=num_topics, topn=10)
else:
    lda = LdaModel(corpus=report_corpus, id2word=report_dict,
                   num_topics=num_topics, passes=passes,
                   iterations=iterations, chunksize=chunksize,
                   alpha=1., eta=1.)
    lda.save('result/model.plsa')
    lda.print_topics(topics=num_topics, topn=10)
class DMP(object):
    def __init__(self):
        self.dic = None
        self.lda = None
        self.topic_num = config.getint('dmp', 'topic_num')
        self.corpus_file = config.get('dmp', 'corpus_file')

    @staticmethod
    def __text2doc(iterator, sep=u' '):
        '''Convert text into documents.

        Each line is split into a list of words with the split function.

        Parameters
            sep: separator

        Returns
            The list of tokenized documents.
        '''
        docs = []
        for line in iterator:
            text = line.strip().split(sep)
            docs.append(text)
        return docs

    def __load_corpus(self):
        '''Read the corpus.

        Calls __text2doc to convert each line of text into a list of words.

        Returns
            The list of processed documents.
        '''
        docs = None
        with codecs.open(self.corpus_file, 'r', 'utf-8') as iterator:
            docs = self.__text2doc(iterator)
        return docs

    def train(self):
        '''Train the model; this produces the dictionary (dic) and model (lda) objects.

        dic: stores the words; each word has an id and can be looked up with dic[id].
        lda: the model, containing the list of topics. Each topic has an id, and the
             words of a topic can be retrieved with lda.print_topic(id).
        '''
        docs = self.__load_corpus()
        self.dic = Dictionary(docs)
        bow = [self.dic.doc2bow(doc) for doc in docs]
        self.lda = LdaModel(bow, id2word=self.dic, num_topics=self.topic_num)

    def infer(self, doc):
        '''Infer the topics of a new document.

        Parameters
            doc: the new document, given as a list of words.

        Returns
            An iterator over the topic list, where topics are given by id;
            call lda.print_topic to make them human-readable.
        '''
        bow = self.dic.doc2bow(doc)
        topics = self.lda[bow]
        return topics

    def dump(self):
        '''Export the lda model and the dic dictionary.'''
        lda_file = config.get('dmp', 'lda_file')
        dic_file = config.get('dmp', 'dic_file')
        self.lda.save(lda_file)
        self.dic.save(dic_file)

    def load(self):
        '''Load the lda model and the dic dictionary.'''
        lda_file = config.get('dmp', 'lda_file')
        dic_file = config.get('dmp', 'dic_file')
        self.lda = LdaModel.load(lda_file)
        self.dic = Dictionary.load(dic_file)
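# Possible end-to-end usage of DMP (assumes the 'dmp' config section defines
# topic_num, corpus_file, lda_file and dic_file; the sample tokens are made up).
dmp = DMP()
dmp.train()   # builds dmp.dic and dmp.lda from corpus_file
dmp.dump()    # persists both to lda_file / dic_file
dmp.load()    # reload them in a later session
print(list(dmp.infer([u'economy', u'market'])))  # [(topic_id, probability), ...]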
# # logging.info('combine report and wiki dictionary...')
# wiki_to_report = report_dict.merge_with(wiki_dict)
# merged_dict = report_dict

# # logging.info('combine report and wiki corpus...')
# merged_corpus = wiki_to_report[wiki_corpus].corpus + report_corpus

logging.info('generate wiki corpus...')
wiki_txt = unpickle('data/txt/processed_wiki.pkl')
wiki_corpus = [report_dict.doc2bow(wiki) for wiki in wiki_txt]

logging.info('combine report and wiki corpus...')
merged_corpus = wiki_corpus + report_corpus

# compute TFIDF
# logging.info('compute TFIDF...')
# tfidf = TfidfModel(dictionary=report_dict, id2word=report_dict)

# perform LDA
logging.info('perform LDA...')
if use_wiki is True:
    lda = LdaModel(corpus=merged_corpus, id2word=report_dict,
                   num_topics=num_topics, passes=passes,
                   iterations=iterations, alpha='auto', chunksize=chunksize)
    lda.save('result/model_wiki.lda')
    lda.print_topics(topics=num_topics, topn=10)
else:
    lda = LdaModel(corpus=report_corpus, id2word=report_dict,
                   num_topics=num_topics, passes=passes,
                   iterations=iterations, alpha='auto', chunksize=chunksize)
    lda.save('result/model.lda')
    lda.print_topics(topics=num_topics, topn=10)
def create_lda_model():
    logging.info('about to create all docs from chunks')
    start_time = datetime.datetime.now()
    create_all_docs()
    end_time = datetime.datetime.now()
    logging.info('total time is: %s', end_time - start_time)

    logging.info('about to load all docs')
    with open('./resources/LDA_processing/all_docs.pkl', mode='rb') as f:
        all_docs = pickle.load(f)

    logging.info('about to load english words')
    with open('./resources/LDA_input/english_full_list.txt') as f:
        english_words = f.read().splitlines()
    good_english_words = set(english_words[75:21000])
    del english_words

    logging.info('about to remove all stop-words and unknown words')
    texts = []
    for i, doc in enumerate(all_docs):
        filtered_doc = [word for word in doc if word in good_english_words]
        texts.append(filtered_doc)
        if i % 5000 == 0:
            logging.info('Finished doc: %s', i)

    logging.info('about to release memory of all_docs and english_words')
    del all_docs
    del good_english_words

    logging.info('about to save texts')
    with open('./resources/LDA_processing/texts.pkl', mode='wb') as f:
        pickle.dump(texts, f)

    logging.info('about to load texts')
    with open('./resources/LDA_processing/texts.pkl', mode='rb') as f:
        texts = pickle.load(f)

    logging.info('about to create dictionary')
    dictionary = corpora.Dictionary(texts)
    keys = dictionary.keys()
    logging.info('dict size before filter: %s', len(keys))
    dictionary.filter_extremes(keep_n=150000)
    dictionary.filter_extremes(no_below=150, no_above=0.05)
    keys = dictionary.keys()
    logging.info('dict size after filter: %s', len(keys))
    dictionary.save('./resources/LDA_processing/lda.dict')
    dictionary.save_as_text('./resources/LDA_processing/lda_dict.txt')

    logging.info('about to create corpus')
    corpus = [dictionary.doc2bow(text) for text in texts]
    logging.info('about to save corpus as mm file')
    corpora.MmCorpus.serialize('./resources/LDA_processing/corpus.mm', corpus)

    logging.info('about to load dictionary file')
    dictionary = corpora.Dictionary.load('./resources/LDA_processing/lda.dict')
    logging.info('about to load corpus as mm file')
    corpus = corpora.MmCorpus('./resources/LDA_processing/corpus.mm')

    logging.info('about to start LDA model')
    lda = LdaModel(corpus, id2word=dictionary, num_topics=num_topics)
    logging.info('finished LDA model')

    logging.info('about to save ldaModel')
    lda.save('./resources/LDA_processing/LdaModel')
    logging.info('about to load ldaModel')
    lda = LdaModel.load('./resources/LDA_processing/LdaModel')

    logging.info('about to find topics')
    topics = lda.show_topics(num_topics=num_topics, num_words=10000, log=True, formatted=False)
    logging.info('about to save topics')
    with open('./resources/LDA_processing/topics.pkl', mode='wb') as f:
        pickle.dump(topics, f)

    dict_word_sets = find_words_from_lda_model()
    with open('./resources/LDA_processing/dict_word_sets.pkl', mode='wb') as f:
        pickle.dump(dict_word_sets, f)

    topics_words = extract_words_from_word_sets()
    with open('./resources/LDA_result/topic_words', mode='wt', encoding='utf-8') as f:
        f.write('\n'.join(topics_words))
print("fitting the model ...\n") model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=no_of_topics, passes=no_of_passes, eval_every=eval, chunksize=chunk, alpha=alpha, eta=eta) #model = LdaMulticore(corpus=corpus, id2word=dictionary, num_topics=no_of_topics, passes=no_of_passes, # eval_every=eval, chunksize=chunk, alpha=alpha, eta=eta) print(model, "\n") topics = model.show_topics(num_topics=no_of_topics) for item, i in zip(topics, enumerate(topics)): print("topic #"+str(i[0])+": "+str(item)+"\n") print("saving ...\n") if not os.path.exists("out"): os.makedirs("out") with open("out/"+foldername+"_doclabels.txt", "w") as f: for item in doc_labels: f.write(item+"\n") with open("out/"+foldername+"_topics.txt", "w") as f: for item, i in zip(topics, enumerate(topics)): f.write("topic #"+str(i[0])+": "+str(item)+"\n") dictionary.save("out/"+foldername+".dict") MmCorpus.serialize("out/"+foldername+".mm", corpus) model.save("out/"+foldername+".lda")
def main():
    '''
    Runs cuisine similarity and clustering analysis according to default args at top of file
    '''
    # Select cuisines based on random sample
    print("Selecting cuisines...")
    start = clock()
    cuisine_files = [file for file in os.listdir(CUISINE_DIRECTORY)
                     if os.path.isfile(os.path.join(CUISINE_DIRECTORY, file))]
    if os.path.isfile(CUISINE_IGNORE):
        cuisine_types = []
        with open(CUISINE_IGNORE, "rt") as f:
            for line in f:
                cuisine_types.append(line.rstrip())
        cuisine_files = list(set(cuisine_files) - set(cuisine_types))
    random.seed(RSEED)
    random.shuffle(cuisine_files)
    cuisine_files = cuisine_files[:MAX_CUISINES]
    num_cuisines = len(cuisine_files)
    finish = clock()
    print("Running time: %.2f seconds" % (finish - start,))
    print()

    # process reviews for each category, retaining only those with ratings in stars
    # and calculate total document word lengths for each cluster (prior to processing)
    print("Processing cuisine reviews...")
    start = clock()
    doc_lens = []
    for filename in cuisine_files:
        doclen = 0
        with open(os.path.join(CUISINE_DIRECTORY, filename), "rt") as f:
            with open(os.path.join(CUISINE_DIRECTORY, "processed", filename), "wt") as g:
                for line in f:
                    if int(line[RATING_INDEX]) in STARS:
                        info = line[:REVIEW_INDEX]
                        line = line[REVIEW_INDEX:]
                        doclen += len(line.split())
                        line = process_document(line)
                        g.write(info + line + "\n")
        print("Processing of %s is complete!" % (filename,))
        doc_lens.append(doclen)
    with open("cuisine_selected.txt", "wt") as f:
        for i in range(len(cuisine_files)):
            f.write(",".join([cuisine_files[i], str(doc_lens[i])]) + "\n")
    finish = clock()
    print("Processing of reviews is complete!")
    print("Running time: %.2f seconds" % (finish - start,))
    print()

    # Build corpus from selected cuisine documents
    print("Building cuisine document corpus...")
    start = clock()
    cuisine_corpus = MyCorpus(cuisine_files, os.path.join(CUISINE_DIRECTORY, "processed"))
    with open("cuisine_corpus.pyobject", "wb") as f:
        pickle.dump(cuisine_corpus, f)
    finish = clock()
    print("Running time: %.2f seconds" % (finish - start,))
    print()

    # create Tfidf-Model
    print("Running TFIDF model on cuisine corpus...")
    start = clock()
    cuisine_corpus.agglomerate = False
    cuisine_tfidf = TfidfModel(corpus=cuisine_corpus, id2word=cuisine_corpus.dictionary,
                               wlocal=tf_func, wglobal=idf_func, normalize=False)
    with open("cuisine_tfidf.pyobject", "wb") as f:
        pickle.dump(cuisine_tfidf, f)
    finish = clock()
    print("Running time: %.2f seconds" % (finish - start,))
    print()

    # create list of doc sparse vectors and perform document length normalization
    print("Calculating similarity matrices and writing to files...")
    start = clock()
    cuisine_corpus.agglomerate = True
    cuisine_types = list(map(lambda x: x.split(".")[0], cuisine_corpus.file_list))
    vec_len = len(cuisine_corpus.dictionary.keys())
    avgl = sum(doc_lens) / len(doc_lens)
    doc_sparse_list = []
    for index, doc in enumerate(cuisine_corpus):
        vec = SparseVector(cuisine_tfidf[doc], vec_len)
        vec = vec / (1 - DOCLEN_NORM_B + DOCLEN_NORM_B * doc_lens[index] / avgl)
        doc_sparse_list.append((cuisine_types[index], vec))

    # calculate similarity for all clusters and write to file
    dat_cs = np.zeros((len(cuisine_types), len(cuisine_types)))
    dat_ts = np.zeros((len(cuisine_types), len(cuisine_types)))
    dat_js = np.zeros((len(cuisine_types), len(cuisine_types)))
    for ind1 in range(len(doc_sparse_list)):
        for ind2 in range(ind1, len(doc_sparse_list)):
            csimvalue = doc_sparse_list[ind1][1].cosine_similarity(doc_sparse_list[ind2][1])
            tsimvalue = doc_sparse_list[ind1][1].tanimoto_similarity(doc_sparse_list[ind2][1])
            jsimvalue = doc_sparse_list[ind1][1].jaccard_similarity(doc_sparse_list[ind2][1])
            dat_cs[ind1][ind2] = csimvalue
            dat_ts[ind1][ind2] = tsimvalue
            dat_js[ind1][ind2] = jsimvalue
            if ind1 != ind2:
                dat_cs[ind2][ind1] = csimvalue
                dat_ts[ind2][ind1] = tsimvalue
                dat_js[ind2][ind1] = jsimvalue
    cs_df = pd.DataFrame(dat_cs, index=cuisine_types, columns=cuisine_types)
    ts_df = pd.DataFrame(dat_ts, index=cuisine_types, columns=cuisine_types)
    js_df = pd.DataFrame(dat_js, index=cuisine_types, columns=cuisine_types)
    cs_df.to_csv("cosine_similarity_df.csv", header=True, index=True)
    ts_df.to_csv("tanimoto_similarity_df.csv", header=True, index=True)
    js_df.to_csv("jaccard_similarity_df.csv", header=True, index=True)
    finish = clock()
    print("Running time: %.2f seconds" % (finish - start,))
    print()

    # clear potential large objects from memory prior to running any further analyses
    del doc_sparse_list
    del dat_cs
    del dat_ts
    del dat_js
    del cs_df
    del ts_df
    del js_df

    # run lda analysis
    print("Running LDA with %d topics..." % (LDA_TOPICS,))
    start = clock()
    cuisine_corpus.agglomerate = False
    lda = LdaModel(corpus=cuisine_tfidf[cuisine_corpus], id2word=cuisine_corpus.dictionary,
                   num_topics=LDA_TOPICS, eval_every=None, chunksize=LDA_CHUNKSIZE,
                   iterations=200, passes=2)
    lda.save("lda_cuisines.pyobject")

    # create dense numpy matrix
    cuisine_corpus.agglomerate = True
    rows, cols = len(cuisine_files), LDA_TOPICS
    lda_array = np.zeros(rows * cols).reshape(rows, cols)
    for row, doc in enumerate(cuisine_corpus):
        entries = lda[doc]
        for col, value in entries:
            lda_array[row][col] = value
    with open("lda_array.npy", "wb") as f:
        np.save(f, lda_array)
    finish = clock()
    print("LDA complete!")
    print("Running time: %.2f seconds" % (finish - start,))
    print()

    # calculate similarity for all lda documents and write to file
    print("Calculating LDA similarity matrices...")
    start = clock()
    dat_cs = np.zeros((len(cuisine_types), len(cuisine_types)))
    dat_ts = np.zeros((len(cuisine_types), len(cuisine_types)))
    for ind1 in range(lda_array.shape[0]):
        vec1 = lda_array[ind1, :]
        for ind2 in range(ind1, lda_array.shape[0]):
            vec2 = lda_array[ind2, :]
            csimvalue = vec1.dot(vec2) / np.sqrt(vec1.dot(vec1) * vec2.dot(vec2))
            tsimvalue = vec1.dot(vec2) / (vec1.dot(vec1) + vec2.dot(vec2) - vec1.dot(vec2))
            dat_cs[ind1][ind2] = csimvalue
            dat_ts[ind1][ind2] = tsimvalue
            if ind1 != ind2:
                dat_cs[ind2][ind1] = csimvalue
                dat_ts[ind2][ind1] = tsimvalue
    cs_df = pd.DataFrame(dat_cs, index=cuisine_types, columns=cuisine_types)
    ts_df = pd.DataFrame(dat_ts, index=cuisine_types, columns=cuisine_types)
    cs_df.to_csv("lda_cosine_similarity_df.csv", header=True, index=True)
    ts_df.to_csv("lda_tanimoto_similarity_df.csv", header=True, index=True)
    finish = clock()
    print("Running time: %.2f seconds" % (finish - start,))
    print()

    # clear up memory
    del dat_cs
    del dat_ts
    del cs_df
    del ts_df

    # perform k-means clustering analysis on 50 clusters using 5-fold with penalty coefficient
    start = clock()
    cv_folds = np.tile(np.arange(10), np.ceil(len(cuisine_files) / 10))
    np.random.seed(RSEED)
    np.random.shuffle(cv_folds)
    kmeans_results = []
    for n_clusters in range(1, int(len(cuisine_files) - np.ceil(len(cuisine_files) / 10) + 1)):
        print("Analyzing for %d cluster(s)..." % (n_clusters,))
        penalty = len(cuisine_files) / (len(cuisine_files) - n_clusters + 1)
        total_ssr = 0
        for i in range(10):
            train_index = np.where(cv_folds != i)[0]
            test_index = np.where(cv_folds == i)[0]
            kmeans_model = KMeans(n_clusters)
            kmeans_model.fit(lda_array[train_index, :])
            total_ssr += np.sum(np.min(kmeans_model.transform(lda_array[test_index, :]), axis=1) ** 2)
        kmeans_results.append((n_clusters, total_ssr * penalty))
    with open("kmeans_results.txt", "wt") as f:
        for tup in kmeans_results:
            f.write(str(tup[0]) + "," + str(tup[1]) + "\n")
    finish = clock()
    print("Cross-Validation analysis complete!")
    print("Running time: %.2f seconds" % (finish - start,))
    print()

    # fit final model
    sel_clusters = min(kmeans_results, key=lambda x: x[1])[0]
    print("Fitting final optimal kmeans model...")
    print("Minimum occurs at %d clusters." % (sel_clusters,))
    kmeans_final = KMeans(sel_clusters)
    final_clusters = np.argmin(kmeans_final.fit_transform(lda_array), axis=1)
    clusters = {key: [] for key in range(sel_clusters)}
    for index, cuisine in enumerate(cuisine_types):
        clusters[final_clusters[index]].append(cuisine)
    with open("optimal_clusters.txt", "wt") as f:
        for i in range(sel_clusters):
            f.write(",".join(clusters[i]) + "\n")
    print()

    # fit model with 20 clusters
    sel_clusters = 20 if len(cuisine_files) > 20 else len(cuisine_files)
    print("Fitting kmeans model with %d clusters..." % (sel_clusters,))
    kmeans_final = KMeans(sel_clusters, n_init=100)
    final_clusters = np.argmin(kmeans_final.fit_transform(lda_array), axis=1)
    clusters = {key: [] for key in range(sel_clusters)}
    for index, cuisine in enumerate(cuisine_types):
        clusters[final_clusters[index]].append(cuisine)
    with open("many_clusters.txt", "wt") as f:
        for i in range(sel_clusters):
            f.write(",".join(clusters[i]) + "\n")
    print()
log.info('generated topics...')

# print topics
topics = model.show_topics(num_topics=no_of_topics)
for item, i in zip(topics, enumerate(topics)):
    log.info('topic #%s: %s', i[0], item)

log.info('saving results...')

# create output folder
if not os.path.exists("out"):
    os.makedirs("out")

# save doc_labels for further use
with open(os.path.join(os.path.join(os.getcwd(), "out"),
                       ''.join([foldername, "_doclabels.txt"])), "w", encoding="utf-8") as f:
    for item in doc_labels:
        f.write(item + "\n")

# save topics for further use
with open(os.path.join(os.path.join(os.getcwd(), "out"),
                       ''.join([foldername, "_topics.txt"])), "w", encoding="utf-8") as f:
    for item, i in zip(topics, enumerate(topics)):
        f.write("".join(["topic #", str(i[0]), ": ", str(item), "\n"]))

# save dictionary for further use
dictionary.save(os.path.join(os.path.join(os.getcwd(), "out"), '.'.join([foldername, 'dict'])))

# save model for further use
model.save(os.path.join(os.path.join(os.getcwd(), "out"), '.'.join([foldername, 'lda'])))

log.info('topic modeling finished')
class W2V_cpp2(W2V_base):
    def __init__(self, n_topic, path, folder):
        self.n_topic = n_topic
        W2V_base.__init__(self, path, folder)

        # process dict: shift product ids
        for prod_id in list(self.idx2prod.keys()):
            prod = self.idx2prod[prod_id]
            n_prod_id = prod_id - len(self.word_count) - 1
            del self.idx2prod[prod_id]
            self.idx2prod[n_prod_id] = prod
            self.prod2idx[prod] = n_prod_id

        # shift user ids
        for user_id in list(self.idx2user.keys()):
            user = self.idx2user[user_id]
            n_user_id = user_id - len(self.word_count) - len(self.prod2idx) - 1
            del self.idx2user[user_id]
            self.idx2user[n_user_id] = user
            self.user2idx[user] = n_user_id

    def train(self):
        data = []
        entity2id = {}
        id2entity = []
        for obj in self.data:
            doc = []
            obj_sents = obj["text_data"]
            entity = obj["prod"]
            if entity not in entity2id:
                entity2id[entity] = len(entity2id)
                id2entity.append(entity)
            doc_id = entity2id[entity]

            for obj_sent in obj_sents:
                for pair in obj_sent:
                    if pair[0] >= 0:
                        doc.append((pair[0], doc_id))
            data.append(doc)

        self.ldamodel = LdaModel(corpus=data, id2word=self.idx2word, num_topics=self.n_topic)

        f_entity = open("lda/prod.txt", "w")
        f_model = open("lda/model.txt", "w")
        f_model.write(str(len(entity2id)))
        f_model.write(" ")
        f_model.write(str(self.n_topic))
        f_model.write("\n")
        for entity in id2entity:
            f_entity.write(entity)
            f_entity.write("\n")
            f_model.write(entity)
            f_model.write(" ")
            distr = self.ldamodel.get_document_topics(data[1], minimum_phi_value=0,
                                                      minimum_probability=0)
            distr = [pair[1] for pair in distr]
            for prod in distr:
                f_model.write(str(prod))
                f_model.write(" ")
            f_model.write("\n")
        self.ldamodel.save("lda/model_200")
# try with BoW vectors too?
# We use Latent Dirichlet Allocation to try to categorize the abstracts.
# This is slow the first time you run it, since the model has to be trained.
print("lda")
lda_filename = 'model.lda'
if not os.path.isfile(lda_filename):
    lda = LdaModel(corpus, num_topics=5, id2word=dictionary,
                   update_every=5, chunksize=10000, passes=100)
    lda.save(lda_filename)
else:
    lda = LdaModel.load(lda_filename)

lda.show_topics()
topics_matrix = lda.show_topics(formatted=False, num_words=7)
print(topics_matrix)
print(len(topics_matrix))

for topic in topics_matrix:
    i = topic[1]
    print([str(word) for word in i])

# topics_matrix = np.array(topics_matrix)
# topic_words = topics_matrix[:, :, 1]
def upload_file():
    """
    Upload csv files and create:

    * ~/out/corpus.dict
    * ~/out/corpus.lda
    * ~/out/corpus.lda.state
    * ~/out/corpus.mm
    * ~/out/corpus.mm.index
    * ~/out/corpus_doclabels.txt
    * ~/out/corpus_topics.txt
    * ~/mycorpus.txt

    As well as (for example):

    * ~/swcorp/Doyle_AStudyinScarlet.txt
    * ~/swcorp/Lovecraft_AttheMountainofMadness.txt
    * etc.
    """

    # INPUT
    # columns to read from csv file
    columns = ['ParagraphId', 'TokenId', 'Lemma', 'CPOS', 'NamedEntity']

    # parts-of-speech to include into the model
    pos_tags = ['ADJ', 'NN', 'V']

    # stopwords
    regex = re.compile(r'\w+')
    stopwords = request.files['stoplist']
    stopwords = str(stopwords.readlines())
    stopwords = regex.findall(stopwords)
    stopwords.extend(("'", "'d", "'s"))  # temporary solution
    print(stopwords)

    # document size (in words)
    doc_size = 1000

    # uses the pipeline's ParagraphId to split text into documents,
    # overrides doc_size - 1: on, 0: off
    doc_split = 0

    # no. of topics to be generated
    no_of_topics = 30

    # no. of lda iterations - usually, the more the better, but
    # increases computing time
    no_of_passes = 1

    # perplexity estimation every n chunks -
    # the smaller the better, but increases computing time
    eval = 1

    # documents to process at once
    chunk = 100

    # "symmetric", "asymmetric", "auto", or array
    # (default: a symmetric 1.0/num_topics prior) affects sparsity of
    # the document-topic (theta) distribution
    alpha = "symmetric"

    # custom alpha may increase topic coherence, but may also produce
    # more topics with zero probability
    # alpha = np.array([0.02, 0.02, 0.02, 0.03, 0.03, 0.03, 0.04, 0.04, 0.04,
    #                   0.05, 0.05, 0.04, 0.04, 0.04, 0.03, 0.03, 0.03, 0.02,
    #                   0.02, 0.02])

    # can be a number (int/float), an array, or None
    # affects topic-word (lambda) distribution - not necessarily
    # beneficial to topic coherence
    eta = None

    # PREPROCESSING
    files = request.files.getlist('files')
    docs = []
    doc_labels = []

    print("\n reading files ...\n")

    for file in files:
        file_label = secure_filename(file.filename).split('.')[0]

        df = pd.read_csv(file, sep="\t", quoting=csv.QUOTE_NONE)
        df = df[columns]
        df = df.groupby('CPOS')

        doc = pd.DataFrame()
        for p in pos_tags:  # collect only the specified parts-of-speech
            doc = doc.append(df.get_group(p))

        # construct documents
        if doc_split:  # size according to paragraph id
            doc = doc.groupby('ParagraphId')
            for para_id, para in doc:
                docs.append(para['Lemma'].values.astype(str))
                doc_labels.append(''.join([file_label, " #", str(para_id)]))
        else:  # size according to doc_size
            doc = doc.sort_values(by='TokenId')
            i = 1
            while doc_size < doc.shape[0]:
                docs.append(doc[:doc_size]['Lemma'].values.astype(str))
                doc_labels.append(''.join([file_label, " #", str(i)]))
                doc = doc.drop(doc.index[:doc_size])
                i += 1
            docs.append(doc['Lemma'].values.astype(str))
            doc_labels.append(''.join([file_label, " #", str(i)]))

        if not os.path.exists(os.path.join(os.getcwd(), "swcorp")):
            os.makedirs(os.path.join(os.getcwd(), "swcorp"))

        swpath = os.path.join('swcorp', "".join(file_label))

        with open(swpath + ".txt", 'w', encoding="utf-8") as text:
            text.write(" ".join(
                word for word in doc['Lemma'].values.astype(str)
                if word not in stopwords))

    print("\n normalizing and vectorizing ...\n")

    # texts = [
    #     [word for word in doc if word not in stopwords] for doc in docs]

    print("\n stopwords removed ...\n")

    print("\n writing mastercorpus ...\n")

    mastercorpus = os.path.join(os.getcwd(), 'mycorpus.txt')

    with open(mastercorpus, 'w', encoding="utf-8") as data:
        folder = glob.glob("swcorp/*")
        for text in folder:
            with open(text, 'r', encoding="utf-8") as infile:
                textline = [re.sub(r'\\n\\r', '', document) for document
                            in ' '.join(infile.read().split())]
                if text != folder[-1]:
                    data.write("".join(textline) + "\n")
                else:
                    data.write("".join(textline))

    # MAIN PART
    mastercorpus = os.path.join(os.getcwd(), 'mycorpus.txt')

    dictionary = corpora.Dictionary(
        line.lower().split() for line in open(mastercorpus, encoding="utf-8"))

    class MyCorpus(object):
        def __iter__(self):
            for line in open('mycorpus.txt'):
                # assume there's one document per line, tokens
                # separated by whitespace
                yield dictionary.doc2bow(line.lower().split())

    # corpus = buildCorpus(mastercorpus, dictionary)
    corpus = MyCorpus()

    # corpus = glob.glob("swcorpus/*")

    if not os.path.exists("out"):
        os.makedirs("out")
    # if not os.path.exists(os.path.join(os.path.join(os.getcwd(),
    # 'out'), foldername)): os.makedirs(os.path.join
    # (os.path.join(os.getcwd(), 'out'), foldername))

    MmCorpus.serialize(
        os.path.join(os.path.join(os.getcwd(), "out"), '.'.join(['corpus.mm'])), corpus)
    mm = MmCorpus('out/corpus.mm')

    print(mm)

    # doc_labels = glob.glob("corpus/*")

    print("fitting the model ...\n")

    model = LdaModel(
        corpus=mm, id2word=dictionary, num_topics=no_of_topics,
        passes=no_of_passes, eval_every=eval, chunksize=chunk,
        alpha=alpha, eta=eta)
    # model = LdaMulticore(corpus=corpus, id2word=dictionary,
    #                      num_topics=no_of_topics, passes=no_of_passes,
    #                      eval_every=eval, chunksize=chunk, alpha=alpha, eta=eta)

    print(model, "\n")

    topics = model.show_topics(num_topics=no_of_topics)

    for item, i in zip(topics, enumerate(topics)):
        print("topic #" + str(i[0]) + ": " + str(item) + "\n")

    print("saving ...\n")

    if not os.path.exists("out"):
        os.makedirs("out")
    # if not os.path.exists(os.path.join(os.path.join(os.getcwd(),
    # 'out'), foldername)):
    #     os.makedirs(os.path.join(os.path.join(os.getcwd(), 'out'),
    #                              foldername))

    with open(
        os.path.join(os.path.join(os.getcwd(), "out"), ''.join(
            ["corpus_doclabels.txt"])), "w", encoding="utf-8") as f:
        for item in doc_labels:
            f.write(item + "\n")

    with open(
        os.path.join(os.path.join(os.getcwd(), "out"), ''.join(
            ["corpus_topics.txt"])), "w", encoding="utf-8") as f:
        for item, i in zip(topics, enumerate(topics)):
            f.write("".join(["topic #", str(i[0]), ": ", str(item), "\n"]))

    dictionary.save(
        os.path.join(os.path.join(os.getcwd(), "out"), '.'.join(['corpus', 'dict'])))

    # MmCorpus.serialize(
    #     os.path.join(os.path.join(os.getcwd(), "out"), '.'.join(
    #         [foldername, 'mm'])), corpus)

    model.save(
        os.path.join(os.path.join(os.getcwd(), "out"), '.'.join(['corpus', 'lda'])))

    print("\n ta-daaaa ...\n")

    # VISUALIZATION
    no_of_topics = model.num_topics
    no_of_docs = len(doc_labels)
    doc_topic = np.zeros((no_of_docs, no_of_topics))

    for doc, i in zip(corpus, range(no_of_docs)):
        # topic_dist is a list of tuples (topic_id, topic_prob)
        topic_dist = model.__getitem__(doc)
        for topic in topic_dist:
            doc_topic[i][topic[0]] = topic[1]

    # get plot labels
    topic_labels = []
    for i in range(no_of_topics):
        # show_topic() returns tuples (word_prob, word)
        topic_terms = [x[0] for x in model.show_topic(i, topn=3)]
        topic_labels.append(" ".join(topic_terms))

    # cf. https://de.dariah.eu/tatom/topic_model_visualization.html
    if no_of_docs > 20 or no_of_topics > 20:
        plt.figure(figsize=(20, 20))  # if many items, enlarge figure
    plt.pcolor(doc_topic, norm=None, cmap='Reds')
    plt.yticks(np.arange(doc_topic.shape[0]) + 1.0, doc_labels)
    plt.xticks(np.arange(doc_topic.shape[1]) + 0.5, topic_labels, rotation='90')
    plt.gca().invert_yaxis()
    plt.colorbar(cmap='Reds')
    plt.tight_layout()
    plt.savefig("./static/corpus_heatmap.svg")
    return render_template('success.html')
vocab = Dictionary.load_from_text('./vocab.txt')
corpus = UnlabeledCorpus('./rumor_train.csv', vocab)
valid_corpus = UnlabeledCorpus('./rumor_valid.csv', vocab)
valid_sentences = [doc for doc in valid_corpus][5000:]

# varying number of topics
# result = {}
# for num_topics in [2, 4, 8, 16, 32, 64]:
#     best_value = -100
#     for i in range(5):
#         model = LdaModel(corpus=corpus, id2word=vocab, num_topics=num_topics)
#         likelihood = model.log_perplexity(valid_sentences)
#         best_value = max(best_value, likelihood)
#     result[num_topics] = best_value
#
# for num_topics, likelihood in result.iteritems():
#     print 'num_topics: %d, best word_likelihood: %f' % (num_topics, likelihood)

model = LdaModel(corpus=corpus, id2word=vocab, num_topics=8, passes=2)
model.save('./lda_model.txt')

# print topics to a file
topics = model.show_topics(num_topics=100, num_words=50)
with codecs.open('./topics.txt', 'w', 'utf-8') as out_f:
    for topic in topics:
        topic_id, topic_str = topic[0], topic[1]
        out_f.write('%d:\n%s\n' % (topic_id, topic_str))
        out_f.write('\n')
print 'Saving dictionary (%s)...' % DICT
dictionary.save(DICT)

print 'Building bag-of-words corpus ...'
bow_corpus = [dictionary.doc2bow(t) for t in texts]

print 'Serializing corpus (%s) ...' % BOW
MmCorpus.serialize(BOW, bow_corpus)

size = len(bow_corpus) * 4 / 5
training = bow_corpus[:size]
testing = bow_corpus[size:]

print 'Training LDA w/ %d topics on first %d texts ...' % (Num_Topics, len(training))
lda = LdaModel(training, id2word=dictionary, num_topics=Num_Topics, passes=5, iterations=1000)

print 'Saving LDA model (%s) ...' % NSFLDA
lda.save(NSFLDA)

print 'Random subset of topics:'
print '\n'.join(lda.print_topics())

print 'Computing perplexity on %d held-out documents ...' % len(testing)
perplexity = 2 ** -(lda.log_perplexity(testing))
print 'Perplexity: %.2f' % perplexity
def run(lda_model_path, corpus_path, num_topics, id2word):
    corpus = corpora.BleiCorpus(corpus_path)
    lda = LdaModel(corpus, num_topics=num_topics, id2word=id2word)
    lda.save(lda_model_path)
    return lda