def createLDA(self, fileName='', modelName='', ldaPasses='', topicNum=''):
    '''
    fileName -> file for the dictionary (.dict) and corpus (.mm) files
    modelName -> model name for LDA to save to disk
    ldaPasses -> number of passes, 10 by default
    topicNum -> number of topics to generate, 100 by default
    '''
    if fileName == '':
        fileName = self.__fileName
    if ldaPasses == '':
        ldaPasses = self.__ldaPasses
    if topicNum == '':
        topicNum = self.__topicNum
    if modelName == '':
        modelName = fileName + '_' + str(ldaPasses) + 'P_' + str(topicNum) + 'T'

    # named `dictionary` rather than `dict`, which would shadow the builtin
    dictionary = corpora.Dictionary.load(self.__destination + fileName + '.dict')
    mm = corpora.MmCorpus(self.__destination + fileName + '.mm')
    # single-core alternative:
    # lda = models.ldamodel.LdaModel(corpus=mm, id2word=dictionary, num_topics=6,
    #                                update_every=1, chunksize=10000, passes=10)
    lda = LdaMulticore(corpus=mm, num_topics=topicNum, id2word=dictionary,
                       chunksize=30000, passes=ldaPasses, workers=3)
    lda.save(self.__destination + modelName + '.lda')
    print('Created LDA model %s' % self.__fileName)
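# A minimal follow-up sketch (the path below is hypothetical, following the
# fileName_<passes>P_<topics>T naming scheme above): models saved with
# lda.save() are loaded back with gensim's standard load() classmethod.
from gensim.models import LdaMulticore

lda = LdaMulticore.load('/data/models/corpus_10P_100T.lda')
for topic_id, words in lda.show_topics(num_topics=5, num_words=8, formatted=False):
    print(topic_id, [word for word, _ in words])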
def train(self):
    # Assumes `corpus` (bag-of-words) and `dictionary` exist at module scope.
    tfidf = TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]  # wrap the corpus with TF-IDF weights
    lda = LdaMulticore(corpus=corpus_tfidf, id2word=dictionary, num_topics=100)
    lda.save('lda.model')
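# For completeness, a minimal sketch of how the `corpus` and `dictionary` the
# method above relies on are typically built; the token lists are illustrative.
from gensim.corpora import Dictionary

texts = [['human', 'interface', 'computer'],
         ['graph', 'trees', 'minors']]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]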
def getLDA(self):
    logging.info("Creating bag of words for LDA model")
    self.dictionary = gensim.corpora.Dictionary(self.corpus)
    self.dictionary.filter_extremes(no_below=2, no_above=0.1)
    self.bow_corpus = [self.dictionary.doc2bow(doc) for doc in self.corpus]
    del self.corpus
    lda_models_coherence_cV = []
    for num_topics in tqdm(range(3, 13)):
        model_lda = LdaMulticore(corpus=self.bow_corpus,
                                 num_topics=num_topics,
                                 id2word=self.dictionary,
                                 workers=8)
        # coherencemodel = CoherenceModel(model=model_lda,
        #                                 texts=self.corpus,
        #                                 dictionary=self.dictionary,
        #                                 coherence='c_v')
        # coherence_value = coherencemodel.get_coherence()
        # lda_models_coherence_cV.append(coherence_value)
        if "topic" not in os.listdir(self.path):
            os.mkdir(self.path + "/topic")
        model_lda.save(self.path + "/topic/lda_" + str(num_topics) + ".model")
        # coherencemodel.save(self.path + "/topic/coherence_" + str(num_topics) + ".model")
        self.lda_models[num_topics] = model_lda
    return
def train(corpus, dct, docs, ids, num_topics, field):
    model_dir = f'./models/{field}/k_{num_topics}/'
    os.makedirs(model_dir, exist_ok=True)
    model_file = model_dir + 'model'
    # Log file for training callbacks; if it already exists, new entries are
    # appended rather than overwriting it.
    log_file = model_dir + 'model_callbacks.log'
    logging.basicConfig(filename=log_file,
                        format="%(asctime)s:%(levelname)s:%(message)s",
                        level=logging.NOTSET)
    lda_model = LdaMulticore(
        corpus=corpus,
        id2word=dct,
        random_state=2020,
        num_topics=num_topics,
        # passes=100,
        chunksize=5000,
        # batch=False,
        alpha='asymmetric',
        decay=0.5,
        offset=64,
        eta='auto',
        eval_every=0,
        iterations=10,
        # gamma_threshold=0.001,
        per_word_topics=True)
    lda_model.save(model_file)
    return lda_model
def lda_train(train_data, part, save_root):
    ids = list(train_data['id'])
    texts = list(train_data[part])
    with Pool() as pool:
        texts = list(tqdm.tqdm(pool.imap(tokenize, texts),
                               total=len(texts), ncols=100))
    text_dictionary = Dictionary(texts)
    text_dictionary.save(os.path.join(save_root, 'dict'))
    with Pool(initializer=make_dictionary_global,
              initargs=(text_dictionary, )) as pool:
        texts = list(tqdm.tqdm(pool.imap(doc2bow_unit, texts),
                               total=len(texts), ncols=100))
    lda_model = LdaMulticore(texts, workers=7)
    lda_model.save(os.path.join(save_root, 'model'))
    with Pool(initializer=make_model_global, initargs=(lda_model, )) as pool:
        rows = list(tqdm.tqdm(pool.imap(get_document_topics_unit, texts),
                              total=len(texts), ncols=100))
    topics = pd.DataFrame(rows, columns=['topics', 'topic_num'])
    topics.insert(0, 'id', ids)
    topics.to_csv(os.path.join(save_root, 'train.csv'), index=False)
    return text_dictionary, lda_model
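# The Pool initializers and worker functions referenced above are not shown in
# the snippet. A plausible sketch follows; the names match the snippet, but the
# bodies are assumptions about how such helpers are usually written.
def make_dictionary_global(dictionary):
    # Runs once per worker process; stashes the dictionary in a module global.
    global _dictionary
    _dictionary = dictionary

def doc2bow_unit(tokens):
    return _dictionary.doc2bow(tokens)

def make_model_global(model):
    global _model
    _model = model

def get_document_topics_unit(bow):
    # Returns (all topic assignments, dominant topic id) for one document.
    topics = _model.get_document_topics(bow)
    topic_num = max(topics, key=lambda t: t[1])[0] if topics else None
    return topics, topic_num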
def train_lda():
    """
    Usage: python Wechat_LDA.py wechat.csv
    """
    with open(sys.argv[1], 'r') as wx:
        for f in wx:
            seg = jieba.cut(f)
            seg = [word for word in seg if word not in stopwords]
            with codecs.open('wechat_seg.txt', encoding='utf-8', mode='ab') as wx_seg:
                wx_seg.write(' '.join(seg))
    # Pass the path rather than an open handle so LineSentence can be
    # iterated more than once (a file object would be exhausted after one pass).
    documents = 'wechat_seg.txt'
    dictionary = corpora.Dictionary(LineSentence(documents))
    corpus = [dictionary.doc2bow(text) for text in LineSentence(documents)]
    tfidf_model = TfidfModel(corpus, id2word=dictionary, normalize=True)
    tfidf_model.save('wechat_seg.txt.tfidf_model')
    # corpora.MmCorpus.serialize('wechat_seg.txt.tfidf_model.mm', tfidf_model[corpus])
    lda_model = LdaMulticore(corpus, id2word=dictionary, num_topics=100,
                             workers=cpu_count() - 1)
    lda_model.save('wechat_lda_model.pkl')
    topics = [lda_model[doc] for doc in corpus]
    counts = np.zeros(100)
    for top_doc in topics:
        for ti, _ in top_doc:
            counts[ti] += 1
    words = lda_model.show_topic(counts.argmax(), 64)
    with open('top_words.txt', 'w') as tw:
        writer = UnicodeWriter(tw)
        for w in words:
            writer.writerow((w[0], int(float(w[1]) * 1000)))
def train(file=DATA_FILE, type=JSON):
    delete_previous_models()
    faq_df = get_dataframe(os.path.join(DATA_DIR, file), type=type)
    faq_df = clean_data(faq_df)
    faq_df[PROCESSED_QUESTION] = faq_df[CLEAN_QUESTION].apply(preprocess)
    faq_df[PROCESSED_ANSWER] = faq_df[CLEAN_ANSWER].apply(preprocess)
    print('Preprocessing Done')
    if DEBUG:
        print(faq_df.head())
    for mode in modes:
        model = modes[mode]
        dictionary = corpora.Dictionary(faq_df[model.column])
        dictionary.save(os.path.join(MODEL_DIR, model.dictionary))
        corpus = faq_df[model.column].map(dictionary.doc2bow)
        if DEBUG:
            print(f'{model.corpus} generated')
            print(corpus.head())
        corpora.MmCorpus.serialize(os.path.join(MODEL_DIR, model.corpus), corpus)
        tfidf_model = TfidfModel(corpus)
        if DEBUG:
            print(f'{model.tfidf} generated')
        tfidf_model.save(os.path.join(MODEL_DIR, model.tfidf))
        tfidf = tfidf_model[corpus]
        lda_model = LdaMulticore(corpus=tfidf, id2word=dictionary, num_topics=30)
        lda_model.save(os.path.join(MODEL_DIR, model.model))
        if DEBUG:
            print(f'{model.model} generated')
            print(lda_model.print_topics(5))
    print('Training completed')
def train(self, num_topics, chunksize=10000, passes=6, iterations=40,
          eval_every=40):
    fmodel = f"./models/lda_{num_topics}top_{iterations}iter_{passes}pass"
    # logging.basicConfig(filename=fmodel + ".log",
    #                     format="%(asctime)s:%(levelname)s:%(message)s",
    #                     level=logging.INFO)
    temp = self.dictionary[0]  # touch the dictionary so id2token is populated
    id2word = self.dictionary.id2token
    model = LdaMulticore(
        corpus=self.corpus,
        id2word=id2word,
        chunksize=chunksize,
        iterations=iterations,
        num_topics=num_topics,
        passes=passes,
        eval_every=eval_every)
    model.save(fmodel + ".pt")
    self.model = model
    # Convergence plot built from the training log:
    # p = re.compile(r"(-*\d+\.\d+) per-word .* (\d+\.\d+) perplexity")
    # matches = [p.findall(l) for l in open(fmodel + '.log')]
    # matches = [m for m in matches if len(m) > 0]
    # tuples = [t[0] for t in matches]
    # perplexity = [float(t[1]) for t in tuples]
    # likelihood = [float(t[0]) for t in tuples]
    # iter = list(range(0, len(tuples) * 10, 10))
    # plt.plot(iter, likelihood, c="black")
    # plt.ylabel("log likelihood")
    # plt.xlabel("iteration")
    # plt.title("Topic Model Convergence")
    # plt.grid()
    # plt.savefig(fmodel + ".pdf")
    return model
def updateLDA():
    api_file = "./newsapi.key"
    categories = ['business', 'entertainment', 'general', 'health',
                  'science', 'sports', 'technology']
    with open(api_file, "r") as apikey:
        newsapi = NewsApiClient(api_key=apikey.read().strip())
    headlines = {cat: newsapi.get_top_headlines(category=cat,
                                                language='en',
                                                country='in')
                 for cat in categories}
    pp_docs = []
    for category in headlines:
        for article in headlines[category]['articles']:
            # print(lemma_pp(article['title']))
            pp_docs.append(lemma_pp(article['title']))
    if os.path.exists(MODEL_DIR + "corpus_dict.model"):
        corp_d = Dictionary.load(MODEL_DIR + "corpus_dict.model")
        corp_d.add_documents(pp_docs)
    else:
        corp_d = Dictionary(pp_docs)
    corp_d.filter_extremes(no_below=2, no_above=0.5)
    dtm = [corp_d.doc2bow(doc) for doc in pp_docs]
    tfidf = TfidfModel(dtm)
    corp_tfidf = tfidf[dtm]
    lda = LdaMulticore(corp_tfidf, num_topics=5, id2word=corp_d,
                       passes=60, workers=3)
    print(lda.print_topics(num_topics=5, num_words=5))
    checkdir(MODEL_DIR)
    corp_d.save(MODEL_DIR + "corpus_dict.model")
    # corp_tfidf.save(MODEL_DIR + "corpus_tfidf.model")
    lda.save(MODEL_DIR + "lda.model")
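# A hedged follow-up sketch: score a fresh headline against the artifacts saved
# above. MODEL_DIR and lemma_pp come from the snippet; the headline text is
# illustrative.
corp_d = Dictionary.load(MODEL_DIR + "corpus_dict.model")
lda = LdaMulticore.load(MODEL_DIR + "lda.model")
bow = corp_d.doc2bow(lemma_pp("example headline about the monsoon season"))
print(lda.get_document_topics(bow))  # [(topic_id, probability), ...]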
def exec_lda():
    client = MongoClient()
    db = client.epistemonikos_files
    num_topics = range(2, 51)
    files = range(12)  # number of possible preprocessing combinations
    # ALL documents
    for f in files:
        for t in num_topics:
            data_cursor = DBDataCursor(db, 'preprocess_{0}'.format(f))
            corpus_cursor = CorpusCursor(data_cursor)
            lda = LdaMulticore(corpus=corpus_cursor,
                               id2word=corpus_cursor.dictionary,
                               num_topics=t)
            lda.save(
                'processing_data/lda/all_docs/preprocess_{0}_topics_{1}'.format(
                    f, t))
    # For each TYPE of document
    config_list = utils.create_config_list(
        'processing_data/lda/types/config_list.json')
    for i, config in enumerate(config_list):
        for f in files:
            for t in num_topics:
                data_cursor = DBDataCursor(db, 'preprocess_{0}'.format(f),
                                           **config)
                corpus_cursor = CorpusCursor(data_cursor, config=i)
                lda = LdaMulticore(corpus=corpus_cursor,
                                   id2word=corpus_cursor.dictionary,
                                   num_topics=t)
                lda.save(
                    'processing_data/lda/types/preprocess_{0}-topics_{1}-config_{2}'.format(
                        f, t, i))
def lda_matrix(matrix_id, preprocess, topics, data_path):
    data_cursor, corpus_cursor = get_file_cursor(matrix_id, preprocess,
                                                 data_path)
    lda = LdaMulticore(corpus=corpus_cursor,
                       id2word=corpus_cursor.dictionary,
                       num_topics=topics)
    lda.save(os.path.join(
        data_path,
        '{2}-preprocess_{0}-topics_{1}.lda'.format(preprocess, topics,
                                                   matrix_id)))
    return lda, corpus_cursor, data_cursor
def create_model(session, df, feature):
    print(f"Updating model for feature {feature}")
    freeze_support()
    # Load the persisted dictionary before building the corpus so the
    # bag-of-words ids and the model's id2word mapping agree.
    get_dict(feature, session)
    dct = Dictionary.load(session + "LDA-dictionary-" + feature + ".pk1")
    corpus = common.remove_stopwords(df[feature]).tolist()
    corpus = [doc.split() for doc in corpus]
    corpus = [dct.doc2bow(text) for text in corpus]
    lda_model = LdaMulticore(corpus=corpus, id2word=dct, workers=5,
                             iterations=1500, alpha=0.01)
    lda_model.save(session + "LDA-model-" + feature)
class LDA(object):
    def __init__(self, max_workers, num_topics, passes, preprocessor=None):
        self.log = logging.getLogger('lda_model')
        self.passes = passes
        self.num_topics = num_topics
        self.max_workers = max_workers
        self.preprocessor = preprocessor if preprocessor is not None \
            else Preprocessor(max_workers=max_workers)
        self.model, self.dictionary = None, None

    def train(self, doc_list):
        self.log.info('LDA.train called. Starting preprocessing %d documents',
                      len(doc_list))
        preprocessed_docs = self.preprocessor.process_docs(doc_list)
        self.log.info('Preprocessing ended. Building dictionary')
        self.dictionary = Dictionary(preprocessed_docs)
        self.log.info('Dictionary built with %d words. Building corpus',
                      len(self.dictionary))
        corpus = self.build_corpus(preprocessed_docs, self.dictionary)
        self.log.info('Built corpus. Starting actual training with '
                      '%d topics, %d workers, %d passes',
                      self.num_topics, self.max_workers, self.passes)
        self.model = LdaMulticore(corpus,
                                  num_topics=self.num_topics,
                                  id2word=self.dictionary,
                                  workers=self.max_workers,
                                  passes=self.passes)

    def save_model(self, model_path):
        self.log.info('Saving LDA model to file: %s', model_path)
        self.model.save(model_path)

    def save_dictionary(self, dict_path):
        self.log.info('Saving dictionary to file: %s', dict_path)
        self.dictionary.save(dict_path)

    def build_corpus(self, doc_list, dictionary):
        with ProcessPoolExecutor(max_workers=self.max_workers) as executor:
            return list(executor.map(dictionary.doc2bow, doc_list))

    @staticmethod
    def with_url_handling(max_workers, num_topics, passes):
        return LDA(max_workers, num_topics, passes,
                   preprocessor=WithUrlPreprocessor(max_workers=max_workers))
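# Hypothetical usage of the wrapper above. The raw document strings are
# illustrative, and the exact input format depends on the external
# Preprocessor.process_docs, which is not shown in the snippet.
lda = LDA(max_workers=4, num_topics=20, passes=5)
lda.train(['first raw document ...', 'second raw document ...'])
lda.save_model('lda.model')
lda.save_dictionary('lda.dict')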
def run(self):
    data = pd.read_pickle(self.input().path)
    sentences = (data['question1'].str.split().tolist() +
                 data['question2'].str.split().tolist())
    dictionary = corpora.Dictionary(sentences)
    corpus = list(map(dictionary.doc2bow, sentences))
    lda = GensimLdaModel(corpus,
                         num_topics=self.num_topics,
                         id2word=dictionary,
                         chunksize=1000,
                         passes=self.passes,
                         minimum_probability=-1.0)
    lda_file, dictionary_file = self.output()
    lda.save(lda_file.path)
    dictionary.save(dictionary_file.path)
def calculate_keys(vol, n_top, n_pass, cache_corpus=True, cache_model=True):
    texts_path = "../arxiv/{0}/{1}/".format(vol.section, vol.year)
    if not os.path.isdir(texts_path):
        raise Exception('There is no such path: {}'.format(texts_path))
    files_list = shared.random_glob(texts_path, n_proc_articles)
    print(len(files_list))
    texts = prepare_sentences(files_list, n_proc_articles)
    print(len(texts))
    print("Searching for bigrams...")
    if config.biGram:
        bigram_transformer = Phrases(texts, min_count=10)
        texts = list(bigram_transformer[texts])
    texts = shared.plural_filter(texts)
    print("Building corpus...")
    dictionary = corpora.Dictionary(texts)
    dictionary.filter_extremes(no_below=20)
    corpus = [dictionary.doc2bow(text) for text in texts]
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    print("Running LDA...")
    lda = LdaMulticore(corpus, num_topics=n_top, id2word=dictionary,
                       workers=4, passes=n_pass, iterations=400,
                       eval_every=None)
    if cache_corpus:
        # use the `vol` parameter (the original referenced an undefined `volume`)
        with open(config.lda_stat + "{0}.corpus".format(vol), 'wb') as f:
            pickle.dump(corpus, f)
        with open(config.lda_stat + "{0}.dict".format(vol), 'wb') as f:
            pickle.dump(texts, f)
    if cache_model:
        lda.save("{0}{1}".format(config.lda_stat, vol))
    return lda
def LDA(dictionnaire, corpus, nbtopic=5):
    lda_model = LdaMulticore(corpus=corpus,
                             id2word=dictionnaire,
                             random_state=100,
                             num_topics=nbtopic,
                             passes=10,
                             chunksize=1000,
                             batch=False,
                             alpha='asymmetric',
                             decay=0.5,
                             offset=64,
                             eta=None,
                             eval_every=0,
                             iterations=100,
                             gamma_threshold=0.001,
                             per_word_topics=True)
    # save the LDA model
    lda_model.save('lda_model.model')
def train_lda_multicore(articles, n_topics, outfile="lda", workers=3):
    common_dictionary = corpora.Dictionary(articles)
    common_corpus = [common_dictionary.doc2bow(d) for d in articles]
    print("Documents: ", str(len(articles)))
    print("Vocabulary: ", len(common_dictionary))
    print("Topics: ", n_topics)
    print("Training LDA...")
    lda = LdaMulticore(common_corpus, id2word=common_dictionary,
                       num_topics=n_topics, workers=workers)
    model_file = "trained_models/" + outfile
    lda.save(model_file)
    dict_filename = model_file + "_dict.pkl"
    pickle.dump(common_dictionary, open(dict_filename, "wb"))
    corpus_filename = model_file + "_corpus.pkl"
    pickle.dump(common_corpus, open(corpus_filename, "wb"))
    print("Saved trained LDA model as", model_file, "!")
class LDA(object):
    def __init__(self, max_workers, num_topics, passes):
        self.passes = passes
        self.num_topics = num_topics
        self.max_workers = max_workers
        self.model, self.dictionary = None, None

    def train(self, preprocessed_docs):
        logger.info('Building dictionary')
        self.dictionary = Dictionary(preprocessed_docs)
        logger.info('Dictionary built with %d words. Building corpus',
                    len(self.dictionary))
        corpus = self.build_corpus(preprocessed_docs, self.dictionary)
        logger.info('Built corpus. Starting actual training with '
                    '%d topics, %d workers, %d passes',
                    self.num_topics, self.max_workers, self.passes)
        self.model = LdaMulticore(corpus,
                                  num_topics=self.num_topics,
                                  id2word=self.dictionary,
                                  workers=self.max_workers,
                                  passes=self.passes)

    def save_model(self, model_path):
        logger.info('Saving LDA model to file: %s', model_path)
        self.model.save(model_path)

    def save_dictionary(self, dict_path):
        logger.info('Saving dictionary to file: %s', dict_path)
        self.dictionary.save(dict_path)

    def build_corpus(self, doc_list, dictionary):
        with ProcessPoolExecutor(max_workers=self.max_workers) as executor:
            return list(executor.map(dictionary.doc2bow, doc_list))

    @staticmethod
    def with_url_handling(max_workers, num_topics, passes):
        return LDA(max_workers, num_topics, passes)
def train(self, dataset):
    corpus, dictionary = self._prepare(dataset)
    dictionary.save('../models.nosync/lda/dict')
    print('starting LDA')
    model = LdaMulticore(
        corpus=corpus,
        # distributed=True,
        workers=3,
        id2word=dictionary.id2token,  # id2token must already be populated
        chunksize=4000,
        alpha=self.c.alpha,  # optimized alpha
        eta='auto',
        iterations=self.c.lda_iter,
        num_topics=self.c.lda_topics,
        passes=self.c.lda_passes,
        eval_every=5000)
    path = '../models.nosync/lda/model'
    model.save(path)
    return model, corpus
def lda3(corpus, dictionary):
    lda_model = LdaMulticore(corpus=corpus,
                             id2word=dictionary,
                             random_state=22,
                             num_topics=100,
                             passes=10,
                             chunksize=1000,
                             batch=False,
                             alpha='asymmetric',
                             decay=0.5,
                             offset=64,
                             eta=None,
                             eval_every=0,
                             iterations=100,
                             gamma_threshold=0.001,
                             per_word_topics=True)
    # save the model
    lda_model.save('lda_model.model')
    # see the topics
    for topic in lda_model.print_topics(100, 20):
        print(topic)
def train_lda_multicore(corpus_bow, dictionary, topic_num, model_path):
    """
    Multicore training.
    :param corpus_bow: corpus
    :param dictionary: dictionary
    :param topic_num: number of topics
    :param model_path: path to save the model to
    :return:
    """
    start = time.time()
    print('Starting training: %d topics' % topic_num)
    model_lda = LdaMulticore(corpus=corpus_bow,
                             id2word=dictionary,
                             num_topics=topic_num,
                             alpha='asymmetric',
                             minimum_probability=0.0001,
                             minimum_phi_value=0.00001,
                             passes=4,
                             workers=2)
    print('Multicore training took %ds' % (time.time() - start))
    # save the model
    model_lda.save(model_path)
def get_lda(self, lower_bound, higher_bound, read_corpus=None, save=True):
    if read_corpus is not None:
        with open(self.path + read_corpus, "rb") as file:
            self.corpus = pickle.load(file)
    self.dictionary = gensim.corpora.Dictionary(self.corpus)
    self.dictionary.filter_extremes(no_below=2, no_above=0.1)
    self.bow_corpus = [self.dictionary.doc2bow(doc) for doc in self.corpus]
    for num_topics in tqdm(range(lower_bound, higher_bound),
                           desc="Training LDAs"):
        model_lda = LdaMulticore(corpus=self.bow_corpus,
                                 num_topics=num_topics,
                                 id2word=self.dictionary,
                                 workers=8)
        coherencemodel = CoherenceModel(model=model_lda,
                                        texts=self.corpus,
                                        dictionary=self.dictionary,
                                        coherence='c_v')
        coherence_value = coherencemodel.get_coherence()
        if num_topics < 10:
            num_topics = "0" + str(num_topics)  # zero-pad for sortable filenames
        self.lda_models[str(num_topics)] = model_lda
        self.coherence[str(num_topics)] = coherence_value
        model_lda.save(self.path + "/topic/lda_" + str(num_topics) + ".model")
        coherencemodel.save(self.path + "/coherence/coherence_" +
                            str(num_topics) + ".model")
    if save:
        pickle.dump(self.bow_corpus,
                    open(self.path + "/data/bow_corpus.pkl", 'wb'))
        pickle.dump(self.dictionary,
                    open(self.path + "/data/Ldictionary.pkl", 'wb'))
        pickle.dump(self.lda_models,
                    open(self.path + "/data/models.pkl", 'wb'))
        # write coherences to their own file instead of overwriting models.pkl
        pickle.dump(self.coherence,
                    open(self.path + "/data/coherence.pkl", 'wb'))
        json.dump(self.coherence, open(self.path + "/coherences.json", 'w'))
def train_lda(n_topics=10):
    with open("../result/ad_issue_reviews") as fin:
        reviews = json.load(fin)
    # build the bag-of-words corpus
    reviews = [[word for word in review
                if word not in stopwords.words('english')]
               for review in reviews]
    from collections import defaultdict
    freq = defaultdict(int)
    for review in reviews:
        for token in review:
            freq[token] += 1
    reviews = [[token for token in review if freq[token] > 1]
               for review in reviews]
    # dictionary = corpora.Dictionary(reviews)
    # only keep ad-related words
    with open("../result/relevant_ad_issues.json") as fin:
        ad_words = json.load(fin)
    ad_words = ad_words["ad"]
    dictionary = corpora.Dictionary([ad_words])
    corpus = [dictionary.doc2bow(review) for review in reviews]
    logging.info("LDA start training...")
    lda = LdaMulticore(corpus, num_topics=n_topics)
    lda.save("../model/lda_ad_%d.model" % n_topics)
    return lda
def train(n_topics=num_topics):
    '''Train LDA model'''
    docs = read_ap.get_processed_docs()
    docs = [d for i, d in docs.items()]
    dictionary = corpora.Dictionary(docs)
    dictionary.filter_extremes(no_below=50)
    # save the dictionary
    with open('./objects/dictionary_lda', 'wb') as f:
        pkl.dump(dictionary, f)
    # create the bag-of-words corpus
    print('creating bow corpus')
    corpus_bow = [dictionary.doc2bow(d) for d in docs]
    # create a binary (presence/absence) bag-of-words
    print('creating binary bow')
    corpus_binary = [[(i, 1) for i, _ in d] for d in corpus_bow]
    # with open(os.path.join(objects_path, 'corpus'), 'wb') as f:
    #     pickle.dump(corpus_tfidf, f)
    print(f'{time.ctime()} Start training LDA (BOW)')
    lda_bow = LdaMulticore(workers=5,
                           corpus=corpus_binary,
                           id2word=dictionary,
                           chunksize=1000,
                           num_topics=n_topics,
                           dtype=np.float64)
    # save the model to disk
    os.makedirs(models_path, exist_ok=True)
    lda_bow.save(os.path.join(models_path, 'lda_bow_multi'))
class Embedding(metaclass=SingletonMetaclass):
    def __init__(self):
        '''
        @description: This is the embedding class. It may be called many
        times, so we use the singleton pattern. This class provides tfidf,
        word2vec, fasttext and autoencoder word embeddings.
        @param {type} None
        @return: None
        '''
        # stopwords
        self.stopwords = []
        with open(config.stopwords, encoding='utf-8', mode='r') as f:
            for line in f.readlines():
                self.stopwords.append(line.strip())
        self.tfidf = None
        self.w2v = None
        self.LDAmodel = None

    def load_data(self, path):
        '''
        @description: Load all data, then do word segmentation
        @param {type} None
        @return: None
        '''
        data = pd.read_csv(path, sep='\t', header=0)
        data = data.fillna("")
        # split the tokens in data['text'] and remove stopwords
        data['text'] = data["text"].apply(lambda x: " ".join(x))
        data['text'] = data['text'].apply(lambda x: " ".join(
            [w for w in x.split() if w not in self.stopwords and w != '']))
        self.labelToIndex = label2idx(data)
        data['label'] = data['label'].map(self.labelToIndex)
        data['label'] = data.apply(lambda row: float(row['label']), axis=1)
        data = data[['text', 'label']]
        # self.train, _, _ = np.split(data[['text', 'label']].sample(frac=1),
        #                             [int(data.shape[0] * 0.7), int(data.shape[0] * 0.9)])
        self.train = data['text'].tolist()
        vocab = {}
        for sentence in self.train:
            for word in sentence.split():
                if word not in vocab:
                    vocab[word] = 1
                else:
                    vocab[word] += 1
        with open(config.vocab_path, "w", encoding='utf-8') as f:
            for k, v in vocab.items():
                f.write("%s %s\n" % (k, v))

    def trainer(self):
        '''
        @description: Train tfidf, word2vec, fasttext and autoencoder
        @param {type} None
        @return: None
        '''
        # initialize the TfidfVectorizer
        logging.info("Training tfidf..........")
        count_vect = TfidfVectorizer(stop_words=self.stopwords,
                                     max_df=0.4,
                                     min_df=0.001,
                                     ngram_range=(1, 2))
        self.tfidf = count_vect.fit(self.train)
        self.train = [sample.split() for sample in self.train]
        # initialize word2vec, build the vocabulary, and train
        logging.info("Training word2vec..........")
        self.w2v = models.Word2Vec(sentences=self.train,
                                   min_count=2,
                                   window=5,
                                   vector_size=300,
                                   sample=6e-5,
                                   alpha=0.03,
                                   min_alpha=0.0007,
                                   negative=15,
                                   workers=4,
                                   max_vocab_size=50000)
        self.w2v.build_vocab(self.train, update=True)
        self.w2v.train(self.train,
                       total_examples=self.w2v.corpus_count,
                       epochs=15)
        self.id2word = gensim.corpora.Dictionary(self.train)
        corpus = [self.id2word.doc2bow(text) for text in self.train]
        logging.info(corpus[:5])
        # build the LDA model
        logging.info("Training LDA model..........")
        self.LDAmodel = LdaMulticore(corpus=corpus,
                                     id2word=self.id2word,
                                     num_topics=30)

    def saver(self):
        '''
        @description: save all models
        @param {type} None
        @return: None
        '''
        if not os.path.exists("model"):
            os.makedirs("model")
        joblib.dump(self.tfidf, './model/tfidf')
        self.w2v.wv.save_word2vec_format('./model/w2v.bin', binary=False)
        self.LDAmodel.save('./model/lda')

    def load(self):
        '''
        @description: Load all embedding models
        @param {type} None
        @return: None
        '''
        self.tfidf = joblib.load('./model/tfidf')
        self.w2v = models.KeyedVectors.load_word2vec_format('./model/w2v.bin',
                                                            binary=False)
        self.lda = models.ldamodel.LdaModel.load('./model/lda')
class Embedding(metaclass=SingletonMetaclass):
    def __init__(self):
        '''
        @description: This is the embedding class. It may be called many
        times, so we use the singleton pattern. This class provides tfidf,
        word2vec, fasttext and autoencoder word embeddings.
        @param {type} None
        @return: None
        '''
        # stopwords
        self.stopWords = open(root_path + '/data/stopwords.txt',
                              encoding='utf-8').readlines()
        # autoencoder
        self.ae = AutoEncoder()

    def load_data(self):
        '''
        @description: Load all data, then do word segmentation
        @param {type} None
        @return: None
        '''
        logger.info('load data')
        self.data = pd.concat([
            pd.read_csv(root_path + '/data/train.tsv', sep='\t'),
            pd.read_csv(root_path + '/data/dev.tsv', sep='\t'),
            pd.read_csv(root_path + '/data/test.tsv', sep='\t')
        ])
        self.data["text"] = self.data['title'] + self.data['desc']
        self.data["text"] = self.data["text"].apply(query_cut)
        self.data['text'] = self.data["text"].apply(lambda x: " ".join(x))

    def trainer(self):
        '''
        @description: Train tfidf, word2vec, fasttext and autoencoder
        @param {type} None
        @return: None
        '''
        logger.info('train tfidf')
        count_vect = TfidfVectorizer(stop_words=self.stopWords,
                                     max_df=0.4,
                                     min_df=0.001,
                                     ngram_range=(1, 2))
        self.tfidf = count_vect.fit(self.data["text"])
        logger.info('train word2vec')
        self.data['text'] = self.data["text"].apply(lambda x: x.split(' '))
        self.w2v = models.Word2Vec(min_count=2,
                                   window=5,
                                   size=300,
                                   sample=6e-5,
                                   alpha=0.03,
                                   min_alpha=0.0007,
                                   negative=15,
                                   workers=4,
                                   iter=30,
                                   max_vocab_size=50000)
        self.w2v.build_vocab(self.data["text"])
        self.w2v.train(self.data["text"],
                       total_examples=self.w2v.corpus_count,
                       epochs=15,
                       report_delay=1)
        logger.info('train fast')
        # train fastText word vectors
        self.fast = models.FastText(
            self.data["text"],
            size=300,      # vector dimensionality
            window=3,      # sliding window size
            alpha=0.03,
            min_count=2,   # truncate the vocabulary: words rarer than this are
                           # dropped; raising it shrinks the vocabulary
            iter=30,       # number of iterations
            max_n=3,
            word_ngrams=2,
            max_vocab_size=50000)
        logger.info('train lda')
        self.id2word = gensim.corpora.Dictionary(self.data.text)
        corpus = [self.id2word.doc2bow(text) for text in self.data.text]
        self.LDAmodel = LdaMulticore(corpus=corpus,
                                     id2word=self.id2word,
                                     num_topics=30,
                                     workers=2,
                                     chunksize=4000,
                                     passes=7,
                                     alpha='asymmetric')
        logger.info('train autoencoder')
        self.ae.train(self.data)

    def saver(self):
        '''
        @description: save all models
        @param {type} None
        @return: None
        '''
        logger.info('save autoencoder model')
        self.ae.save()
        logger.info('save tfidf model')
        joblib.dump(self.tfidf, root_path + '/model/embedding/tfidf')
        logger.info('save w2v model')
        self.w2v.wv.save_word2vec_format(root_path + '/model/embedding/w2v.bin',
                                         binary=False)
        logger.info('save fast model')
        self.fast.wv.save_word2vec_format(root_path + '/model/embedding/fast.bin',
                                          binary=False)
        logger.info('save lda model')
        self.LDAmodel.save(root_path + '/model/embedding/lda')

    def load(self):
        '''
        @description: Load all embedding models
        @param {type} None
        @return: None
        '''
        logger.info('load tfidf model')
        self.tfidf = joblib.load(root_path + '/model/embedding/tfidf')
        logger.info('load w2v model')
        self.w2v = models.KeyedVectors.load_word2vec_format(
            root_path + '/model/embedding/w2v.bin', binary=False)
        logger.info('load fast model')
        self.fast = models.KeyedVectors.load_word2vec_format(
            root_path + '/model/embedding/fast.bin', binary=False)
        logger.info('load lda model')
        self.lda = LdaModel.load(root_path + '/model/embedding/lda')
        logger.info('load autoencoder model')
        self.ae.load()
def model(n_topics, alpha=None, beta=None, saved=False, pyldavis=False,
          wordclouds=False, rep_letters=False, plots=False) -> dict:
    assert n_topics >= 2

    # aux functions to make sure we are loading the desired model
    def verify_alpha(lda_model, given):
        actual: list = lda_model.alpha
        if given == "asymmetric":
            return not np.isclose(actual[0], actual[-1])
        elif given == "symmetric":
            return np.isclose(actual[0], actual[-1])
        else:
            return np.isclose(given, actual[0]) and np.isclose(given, actual[-1])

    def verify_beta(lda_model, given):
        actual = lda_model.eta
        if type(given) == float:
            # a plain == comparison is unreliable for floats
            return np.isclose(given, actual[0]) and np.isclose(given, actual[-1])
        else:
            return False

    print(f"Building LDA model for {n_topics} topics.")
    if saved:
        lda = LdaMulticore.load(f"{TRAINED_LDA}{n_topics}")
        # if not (verify_alpha(lda, alpha) and verify_beta(lda, beta)):
        #     print("Loaded model didn't pass parameter verification; "
        #           "train it from scratch or load the correct one.")
        #     return
        print(f"Trained LDA model with {n_topics} topics loaded successfully.")
    else:
        lda = LdaMulticore(
            corpus,
            num_topics=n_topics,
            id2word=dictionary,
            passes=20,
            alpha=alpha if alpha is not None else "symmetric",  # default
            eta=beta,
            random_state=1,
            iterations=100,
            eval_every=5,
            workers=3,
            per_word_topics=True)
        lda.save(f"{TRAINED_LDA}{n_topics}")
        print(f"LDA model with {n_topics} topics trained and saved successfully.")

    # save the per-word-topics 3D matrix [!] alters a global variable
    V = len(dictionary)
    K = n_topics
    N = len(corpus)
    global pwt
    pwt = np.zeros((V, K, N))

    # save topic assignment info in dataframes [!] alters global variables
    global vw
    global vws
    vws = get_topic_dists_dataframe(lda)
    vw, vws = set_main_topics(vw, vws)

    # coherence and silhouette scores
    coherence = CoherenceModel(model=lda, texts=letters, dictionary=dictionary,
                               coherence='c_v').get_coherence()
    print(f"Coherence score: {coherence}")  # the higher the better
    avg_silhouette = plot_silhouette(vws)
    print(f"Average silhouette coefficient: {avg_silhouette}")  # the higher the better

    # other validation methods
    if pyldavis:
        vis = pyLDAvis.gensim.prepare(topic_model=lda, corpus=corpus,
                                      dictionary=dictionary, n_jobs=3)
        pyLDAvis.save_html(vis, f"{PYLDAVIS_PATH}/lda{n_topics}.html")
    if rep_letters:
        save_representative_letters(vws, 3)
    if wordclouds:
        save_topic_wordclouds(pwt)
    if plots:
        plot_topics_per_year(vw)
        plot_topics_per_recipient(vw)

    return {
        "model": lda,
        "num_topics": n_topics,
        "alpha": alpha,
        "beta": beta,
        "coherence": coherence,
        "silhouette": avg_silhouette,
        "vws": vws,
        "pwt": pwt
    }
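# A hedged usage sketch built on the function above: sweep a few topic counts
# and keep the run with the best c_v coherence. The alpha/beta values and the
# topic counts here are illustrative, not from the original code.
results = [model(k, alpha='asymmetric', beta=0.01) for k in (5, 10, 15, 20)]
best = max(results, key=lambda r: r['coherence'])
print(best['num_topics'], best['coherence'])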
# The opening of this call was missing; it is reconstructed from the trailing
# arguments and the uses of `lda_model` and `dct` below.
lda_model = LdaMulticore(corpus=corpus,
                         id2word=dct,
                         random_state=100,
                         num_topics=num_topics,
                         passes=10,
                         chunksize=1000,
                         batch=False,
                         alpha='asymmetric',
                         decay=0.5,
                         offset=64,
                         eta=None,
                         eval_every=0,
                         iterations=100,
                         gamma_threshold=0.001,
                         per_word_topics=True)
# save the model
lda_model.save('tmp/lda_model.model')
# see the topics
lda_model.print_topics(-1)

for c in lda_model[corpus[5:8]]:
    print("Document Topics      : ", c[0])      # [(Topics, Perc Contrib)]
    print("Word id, Topics      : ", c[1][:3])  # [(Word id, [Topics])]
    print("Phi Values (word id) : ", c[2][:2])  # [(Word id, [(Topic, Phi Value)])]
    print("Word, Topics         : ",
          [(dct[wd], topic) for wd, topic in c[1][:2]])  # [(Word, [Topics])]
    print("Phi Values (word)    : ",
          [(dct[wd], topic) for wd, topic in c[2][:2]])  # [(Word, [(Topic, Phi Value)])]
def train_save(n_topics, uni_or_bi, bow_corpus, dic):
    name = str(n_topics) + "-topics"
    lda_model = LDA(bow_corpus, num_topics=n_topics, id2word=dic, passes=2)
    lda_model.save(os.getcwd() + "/LDA models/{}/{}".format(uni_or_bi, name))
# The opening of this call was missing; it is reconstructed from the trailing
# arguments and from `dictionary` and `model` being used below.
model = LdaMulticore(corpus=corpus,
                     id2word=dictionary,
                     iterations=iterations,
                     num_topics=num_topics,
                     passes=passes,
                     eval_every=eval_every,
                     workers=4)

top_topics = model.top_topics(corpus)  # , num_words=20)
# Average topic coherence is the sum of topic coherences of all topics,
# divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)
pprint(top_topics)
numpy.save(os.path.join(out_path, "topics.npy"), top_topics)
model.save(os.path.join(out_path, "lda_model"))

# predict a topic for a document
important_words = docs[2]
print(important_words)
print(len(important_words))
ques_vec = dictionary.doc2bow(important_words)
print("ques_vec", ques_vec)
topic_vec = model[ques_vec]
print("topic_vec", topic_vec)
word_count_array = numpy.empty((len(topic_vec), 2), dtype=object)
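# A minimal continuation sketch (an assumption, not part of the original):
# reduce topic_vec, the (topic_id, probability) pairs from model[ques_vec]
# above, to the document's single dominant topic.
dominant_topic, prob = max(topic_vec, key=lambda pair: pair[1])
print("dominant topic: %d (p=%.3f)" % (dominant_topic, prob))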