def make_lda_model():
    tfidf_model = TfidfModel.load((output_dir / 'tfidf_model.pkl').as_posix())
    lda_model = LdaModel(
        nmf_iterator(CONTENT_FILES,
                     Dict.load((output_dir / 'dict.pkl').as_posix()),
                     tfidf_model),
        num_topics=TOPIC_NUM)
    lda_model.save((output_dir / 'lda_model.pkl').as_posix())
def get_lda_model(doc_term_matrix, id2word, fname):
    try:
        lda_model = LdaModel.load(fname)
    except (FileNotFoundError, OSError):  # no saved model yet: train a fresh one
        lda_model = LdaModel(
            corpus=doc_term_matrix,
            id2word=id2word,
            num_topics=params['num_topics'],
            chunksize=params['chunksize'],
            random_state=100,
            update_every=1,  # online iterative learning
            passes=2,
            distributed=False,
            # alpha='auto',
            per_word_topics=True)
        _save_model(lda_model, fname=fname)
    return lda_model
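# A minimal usage sketch of the load-or-train helper above. The `params`
# dict, toy corpus, and cache filename are assumptions for illustration,
# and `_save_model` is expected to come from the surrounding module.
from gensim.corpora import Dictionary

params = {'num_topics': 10, 'chunksize': 2000}
texts = [['human', 'interface', 'computer'],
         ['survey', 'user', 'computer', 'system']]
id2word = Dictionary(texts)
doc_term_matrix = [id2word.doc2bow(text) for text in texts]
lda = get_lda_model(doc_term_matrix, id2word, 'lda_cache.model')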
def _load_model(model_type, fname='../../model/'):
    # `model_type` avoids shadowing the built-in `type`
    try:
        if model_type == 'lsi':
            return LsiModel.load(fname)
        elif model_type == 'lda':
            return LdaModel.load(fname)
        elif model_type == 'mallet':
            return LdaMallet.load(fname)
    except Exception:  # loading failed: signal with None instead of raising
        return None
def topic_model_visualize(textlist: list, num_topics: int) -> None:
    """Visualize the topic model."""
    textlist = [textlist]  # treat the whole token list as a single document
    common_dictionary = Dictionary(textlist)
    common_corpus = [common_dictionary.doc2bow(text) for text in textlist]
    lda = LdaModel(common_corpus, num_topics=num_topics)
    vis = pyLDAvis.gensim.prepare(lda, common_corpus, common_dictionary)
    pyLDAvis.save_html(vis, 'LDA.html')
    pyLDAvis.show(data=vis, open_browser=True)
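# Illustrative call of topic_model_visualize; the token list is a toy
# example. Note that pyLDAvis.show() blocks while it serves the page.
tokens = ['topic', 'model', 'lda', 'topic', 'gensim', 'visualization']
topic_model_visualize(tokens, num_topics=2)  # writes LDA.html, then opens a browser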
def get_lda_model(doc_term_matrix, id2word, fname, num_topics=None):
    if params['training']:
        lda_model = LdaModel(
            corpus=doc_term_matrix,
            id2word=id2word,
            num_topics=params['num_topics'] if num_topics is None else num_topics,
            passes=5,
            per_word_topics=True)
        _save_model('lda', lda_model, fname=fname)
    else:
        lda_model = _load_model('lda', fname)
    return lda_model
def _load_model(model_type, fname):
    logger.info(f'{model_type} model {fname} is loading...')
    try:
        if model_type == 'lsi':
            return LsiModel.load(f'../model/lsi_model/{fname}')
        elif model_type == 'lda':
            return LdaModel.load(f'../model/lda_model/{fname}')
        elif model_type == 'mallet':
            return LdaMallet.load(f'../model/mallet_model/{fname}')
        elif model_type == 'hdp':
            return HdpModel.load(f'../model/hdp_model/{fname}')  # was mallet_model/, a copy-paste slip
    except Exception as ex:
        logger.warning(f'{model_type} model {fname} could not be loaded.', exc_info=ex)
        return None
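# The loader signals failure with None rather than raising, so callers
# should check the result; 'news_2020.model' is a hypothetical filename.
lda = _load_model('lda', 'news_2020.model')
if lda is None:
    logger.info('No cached LDA model; training from scratch instead.')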
def get_lda_model_byDomains(domains):
    """
    Create an LDA model from the given links.
    :param domains: names of VK communities
    """
    common_texts = normilize_texts(domains[0])
    for i in range(1, len(domains)):
        common_texts += normilize_texts(domains[i])
    common_dictionary = Dictionary(common_texts)
    common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]
    lda = LdaModel(common_corpus, num_topics=len(domains))
    return lda
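# Hypothetical call; 'habr' and 'tproger' stand in for real VK community
# names, and one topic is learned per community.
lda = get_lda_model_byDomains(['habr', 'tproger'])
for topic_id, words in lda.print_topics(num_words=8):
    print(topic_id, words)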
def process_content(filename, chunk_size=CHUNK_SIZE):
    model = LdaModel.load((output_dir / 'lda_model.pkl').as_posix())
    temp_dict = Dict.load((output_dir / 'dict.pkl').as_posix())
    out_file = output_dir / (filename.stem + '_lda.csv')
    # create the output file if it does not exist yet
    if not out_file.is_file():
        Path(out_file).touch()
    for chunk in pd.read_csv(filename, usecols=['id', 'content'],
                             chunksize=chunk_size):
        chunk['content'] = chunk['content'].str.lower().str.split() \
            .apply(lambda doc: model[temp_dict.doc2bow(doc)])
        chunk.to_csv(out_file, mode='a')
def build(self, _label):
    modelfile = "./models/{0}.model".format(_label)
    dictfile = "./models/{0}.dict".format(_label)
    corpusfile = "./models/{0}.mm".format(_label)
    if os.path.isfile(modelfile):
        dictionary = corpora.Dictionary.load(dictfile)
        corpus = corpora.MmCorpus(corpusfile)
        ldamodel = LdaModel.load(modelfile)
    else:
        texts = []
        for tweetid, label in self.labels.items():
            if label == _label:
                texts.extend(self.getTweetTexts(tweetid))
        dictionary, corpus, ldamodel = self.buildModel(texts)
        dictionary.save(dictfile)
        corpora.MmCorpus.serialize(corpusfile, corpus)
        ldamodel.save(modelfile)
    return dictionary, corpus, ldamodel
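# A sketch of how build() might be driven; `classifier` stands in for an
# instance of the class that owns build(), labels, and getTweetTexts().
dictionary, corpus, ldamodel = classifier.build('sports')
for topic_id, words in ldamodel.print_topics(num_topics=5, num_words=8):
    print(topic_id, words)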
def get_lda_model2(doc_term_matrix, id2word, fname, num_topics=None):
    if params['training']:
        lda_model = LdaModel(
            corpus=doc_term_matrix,
            id2word=id2word,
            num_topics=params['num_topics'] if num_topics is None else num_topics,
            chunksize=3000,
            passes=20,
            alpha='auto',
            # eta='auto',
            iterations=100,
            per_word_topics=True)
        _save_model('lda', lda_model, fname=fname)
    else:
        lda_model = _load_model('lda', fname)
    return lda_model
def LDA(textlist: list, num_topics: int):
    textlist = [textlist]  # treat the whole token list as a single document
    common_dictionary = Dictionary(textlist)
    common_corpus = [common_dictionary.doc2bow(text) for text in textlist]
    lda = LdaModel(common_corpus, num_topics=num_topics)
    return lda
end_date = args.end_date
n_topics = args.num_topics
n_articles = args.top_na
n_vocabs = args.top_nv

DIR_NAME = os.path.join('./dirs', f"{category}-{start_date}-{end_date}")
with open(os.path.join(DIR_NAME, 'corpus'), 'rb') as f:
    corpus = pickle.load(f)

dictionary = Dictionary(corpus)
gensim_corpus = [dictionary.doc2bow(doc) for doc in corpus]

if args.estimate == 'y':
    print("\nEstimating parameters of LDA model")
    start = time.time()
    model = LdaModel(gensim_corpus, id2word=dictionary, num_topics=n_topics)
    model.save(datapath(f"{category}-{start_date}-{end_date}"))
    minute, second = list(map(int, divmod(time.time() - start, 60)))
    print(f">>> Elapsed time : {minute}m {second}s")

print(f"\nSaving DataFrame of top {n_articles} relevant articles per topic "
      f"and {n_vocabs} vocabularies from each topic")
model = LdaModel.load(datapath(f"{category}-{start_date}-{end_date}"))
start = time.time()
topn_articles = docs_by_topic(model, gensim_corpus, n_articles)
with open(os.path.join(DIR_NAME, f"topn_articles_{start_date[:6]}"), 'wb') as f:
    pickle.dump(topn_articles, f)
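# `docs_by_topic` is called above but not defined in this snippet; a
# minimal sketch of one possible implementation, assuming it should return
# the n most relevant document indices per topic with their probabilities.
def docs_by_topic(model, gensim_corpus, n_articles):
    per_topic = {t: [] for t in range(model.num_topics)}
    for doc_id, bow in enumerate(gensim_corpus):
        for topic_id, prob in model.get_document_topics(bow, minimum_probability=0):
            per_topic[topic_id].append((doc_id, float(prob)))
    # keep only the top-n documents per topic, sorted by probability
    return {t: sorted(docs, key=lambda d: d[1], reverse=True)[:n_articles]
            for t, docs in per_topic.items()}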
def grid_search_lda_ASM(texts, n_topics_range, iterations, passes,
                        out_dir, verbose=True, save_doc_top=True):
    '''Fit topic models and search for optimal hyperparameters.

    An LDA model will be fitted for each number of topics; returned will be
    the model, its coherence score, and the corresponding _asymmetrical_
    priors the model learned (alpha and eta).

    Parameters
    ----------
    texts : list
        preprocessed corpus, where texts[0] is a document
        and texts[0][0] is a token.

    n_topics_range : range of int
        range of integers to use as the number of topics
        in iterations of the topic model.

    iterations : int
        maximum number of iterations for each topic model

    passes : int
        maximum number of passes (restarts of iteration) for each topic model

    out_dir : str
        path to a directory, where results will be saved
        (in a child directory).

    verbose : bool
        give comments about the progress?

    save_doc_top : bool
        save document-topic matrices from models?

    Exports
    -------
    out_dir/report_lines/*
        pickled dict with model information
        (n topics, model coherence, per-topic coherence, hyperparameters)

    out_dir/models/*
        gensim objects, where the model is saved.

    out_dir/plots/*
        pyLDAvis visualizations of the model
    '''
    # check how legit out_dir is
    make_folders(out_dir)

    # if a single model is to be fitted,
    # make sure it can be "iterated"
    if isinstance(n_topics_range, int):
        n_topics_range = [n_topics_range]

    # input texts to gensim format
    dictionary = corpora.Dictionary(texts)
    bows = [dictionary.doc2bow(tl) for tl in texts]

    # iterate
    report_list = []
    for n_top in chain(n_topics_range):

        if verbose:
            print("{} topics".format(n_top))

        start_time = time()

        # paths for saving
        ## it's not very elegant to define the paths here,
        ## when there already is the function make_folders
        filename = str(n_top) + "T_" + 'ASM'
        report_path = os.path.join(out_dir, 'report_lines', filename + '.ndjson')
        model_path = os.path.join(out_dir, 'models', filename + '.model')
        pyldavis_path = os.path.join(out_dir, 'plots', filename + '_pyldavis.html')
        doctop_path = os.path.join(out_dir, 'doctop_mats', filename + '_mat.ndjson')

        # train model
        # TODO: higher / customizable fine hyperparameters?
        model = LdaModel(
            corpus=bows,
            iterations=iterations,
            ## optimized hyperparameters
            num_topics=n_top,
            alpha='auto',
            eta='auto',
            ## fine hyperparameters
            decay=0.5,
            offset=1.0,
            eval_every=10,
            gamma_threshold=0.001,
            minimum_probability=0.01,
            minimum_phi_value=0.01,
            ## utility
            random_state=None,
            per_word_topics=False,
            id2word=dictionary,
            passes=passes)

        # track time usage
        training_time = time() - start_time
        if verbose:
            print('    Time: {}'.format(training_time))

        # coherence
        coherence_model = CoherenceModel(
            model=model, texts=texts,
            corpus=bows, coherence='c_v')
        coh_score = coherence_model.get_coherence()
        coh_topics = coherence_model.get_coherence_per_topic()

        if verbose:
            print('    Coherence: {}'.format(coh_score.round(2)))

        # save priors
        alpha = model.alpha.tolist()
        eta = model.eta.tolist()

        # save report
        report = (n_top, alpha, eta, training_time, coh_score, coh_topics)
        report_list.append(report)
        with open(report_path, 'w') as f:
            ndjson.dump(report, f)

        # save model
        model.save(model_path)

        # produce a visualization
        # it is imperative that sort_topics should never be turned on!
        vis = pyLDAvis.gensim.prepare(
            model, bows, dictionary, sort_topics=False)

        pyLDAvis.save_html(vis, pyldavis_path)

        # save document-topic matrix
        if save_doc_top:
            # keep minimum_probability at 0 for a complete matrix
            # (iterate bows directly; model[bows] would yield topic
            # distributions, not the bags-of-words this call expects)
            doc_top = [model.get_document_topics(doc, minimum_probability=0)
                       for doc in bows]

            # unnest (n topic, prob) tuples;
            # float() converts from np.float32, which is not JSON serializable
            doc_top_prob = [
                [float(prob) for i, prob in doc]
                for doc in doc_top
            ]

            # save the matrix as ndjson
            with open(doctop_path, 'w') as f:
                ndjson.dump(doc_top_prob, f)

    return None
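# Driving the grid search on a toy corpus; the texts and output directory
# are illustrative, and make_folders() is assumed to create out_dir's
# report_lines/, models/, plots/, and doctop_mats/ children.
toy_texts = [['cat', 'dog', 'pet'], ['stock', 'market', 'price'],
             ['dog', 'bark', 'pet'], ['price', 'trade', 'market']]
grid_search_lda_ASM(toy_texts, n_topics_range=range(2, 4),
                    iterations=100, passes=2, out_dir='lda_grid_out')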
def lda_mod(domains):
    common_texts = redact_finish(domains)
    common_dictionary = Dictionary(common_texts)
    common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]
    lda = LdaModel(common_corpus, num_topics=len(domains))
    return lda