def build_model(self, fname=None, save_to=None):
    id2word = self.id2word or self.build_id2word()
    corpus = self.corpus or self.build_corpus()
    # read model.lda file
    if not fname:
        fname = click.prompt('model file name', type=str, default='model.lda')
    fname = self.__dest(fname)
    # if there is no model file, or the user wants to rebuild, build .model
    if not os.path.isfile(fname) or click.confirm(
            '%s already exists. Do you want to re-run LDA?' % fname):
        num_procs = click.prompt('Number of processes to launch',
                                 type=int,
                                 default=multiprocessing.cpu_count())
        num_epochs = click.prompt('Number of epochs to run', type=int, default=20)
        num_topics = click.prompt('Number of topics', type=int, default=100)
        print('start building model')
        start = time()
        model = LdaMulticore(corpus,
                             id2word=id2word,
                             num_topics=num_topics,
                             workers=num_procs,
                             passes=num_epochs)
        model.save(fname)  # save
        print('building model takes: %s' %
              LdaUtils.human_readable_time(time() - start))
    self.model = LdaMulticore.load(fname)
    return self.model
def create_lda_model(self, no_topics=10, random_state=42, passes=5,
                     alpha='symmetric', eta=None, workers=None,
                     chunksize=2000):
    """
    :param no_topics: Number of topics to be explored by the LDA model
    :param random_state: Random state for reproducible results (default 42,
        gensim's default is None)
    :param passes: Number of times the whole corpus is processed.
    :param alpha: Topic-document distribution prior; "symmetric" or
        "asymmetric" (gensim's default is "symmetric"; note that "auto" is
        not supported by LdaMulticore, which is why it is not the default here)
    :param eta: Word-topic distribution prior eta (beta)
    :param workers: Number of workers to use. Defaulting to one, as there
        seems to be a bug in gensim: 1 already uses all available cores, and
        a higher number of workers results in a load bigger than the number
        of cores.
    :param chunksize: chunksize parameter of gensim
    """
    if eta is None:
        eta = 1 / no_topics
    if workers is None:
        workers = self.processes
    if self.bag_of_words is None:
        self.create_bag_of_words()
    self.lda_model = LdaMulticore(corpus=self.bag_of_words,
                                  id2word=self.id2word,
                                  num_topics=no_topics,
                                  eta=eta,
                                  workers=workers,
                                  random_state=random_state,
                                  alpha=alpha,
                                  passes=passes,
                                  chunksize=chunksize)
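# A minimal, self-contained sketch (not from the original codebase) of the eta
# default used by create_lda_model above: a flat scalar prior of 1/num_topics
# applied to every word-topic pair. All data below is toy data.
from gensim.corpora import Dictionary
from gensim.models import LdaMulticore

toy_docs = [["human", "interface", "computer"],
            ["survey", "user", "computer", "system"],
            ["eps", "user", "interface", "system"]]
toy_id2word = Dictionary(toy_docs)
toy_bow = [toy_id2word.doc2bow(d) for d in toy_docs]
n = 2
toy_lda = LdaMulticore(corpus=toy_bow, id2word=toy_id2word, num_topics=n,
                       eta=1 / n,  # same default as create_lda_model above
                       workers=1, passes=5, random_state=42)
print(toy_lda.print_topics(num_words=3))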
def __init__(self, corpora, num_topics, print_topics=True):
    self.num_topics = num_topics
    self.tokenizer = nltk.tokenize.TreebankWordTokenizer()
    self.stemmer = nltk.stem.snowball.RussianStemmer()
    corpora_tokenized = [
        self.tokenizer.tokenize(
            self._keep_only_russian_chars(str(doc).lower()))
        for doc in corpora
    ]
    corpora_stemmed = []
    for doc in corpora_tokenized:
        # drop stopwords both before and after stemming, since stemming can
        # map a token onto a stopword
        stemmed_doc = [
            self.stemmer.stem(token) for token in doc
            if token not in ru_stopwords
        ]
        stemmed_doc = [
            token for token in stemmed_doc if token not in ru_stopwords
        ]
        corpora_stemmed.append(stemmed_doc)
    self.dictionary = gensim.corpora.Dictionary(corpora_stemmed)
    corpora_bow = [self.dictionary.doc2bow(doc) for doc in corpora_stemmed]
    # self.tfidf = gensim.models.TfidfModel(corpora_bow)
    # corpora_tfidf = self.tfidf[corpora_bow]
    self.lda = LdaMulticore(num_topics=self.num_topics,
                            corpus=corpora_bow,
                            id2word=self.dictionary)
    if print_topics:
        for s in self.lda.print_topics():
            print(s)
def get_model(self, n_topics=50, n_workers=6, recalculate=False,
              from_scratch=True):
    filepath = self.paths.get_lda_filepath(n_topics)

    if not os.path.isfile(filepath) or recalculate:
        if not from_scratch:
            raise ValueError(
                'LDA model needs to be (re)built, but from_scratch is False')

        trigram_dictionary = self.get_corpus_dict()
        trigram_bow_corpus = self.get_trigram_bow_corpus(trigram_dictionary)

        print('Building LDA model...')
        lda = LdaMulticore(trigram_bow_corpus,
                           num_topics=n_topics,
                           id2word=trigram_dictionary,
                           workers=n_workers)
        lda.save(filepath)
        print('LDA model (n_topics={}) written to {}'.format(
            n_topics, filepath))
    else:
        print('Loading LDA model (n_topics={})...'.format(n_topics))
        lda = LdaMulticore.load(filepath)

    return lda
def createlda(num_topics, filename):
    dumppick(filename)
    texts, texts_tf_idf, dictionary = loadpcik()
    # LSI-based topic classification, kept for reference
    """
    print("**************LSI*************")
    lsi = models.lsimodel.LsiModel(corpus=texts, id2word=dictionary, num_topics=20)  # initialize an LSI transformation
    texts_lsi = lsi[texts_tf_idf]  # transform the corpus within the vector space
    print(lsi.print_topics(num_topics=20, num_words=10))
    """
    # LDA-based topic classification
    print("**************LDA*************")
    # perplexity sweep, kept for reference:
    # ppl = []
    # for i in range(1, 50, 1):
    #     texts = shuffle(texts)
    #     texts_train = texts[:int(24012 * 0.9)]
    #     texts_vad = texts[int(24012 * 0.9):]
    lda = LdaMulticore(corpus=texts,
                       iterations=1000,
                       id2word=dictionary,
                       num_topics=num_topics,
                       passes=200,
                       per_word_topics=True)
    # texts_lda = lda[texts_tf_idf]
    with open("./ldamd/{}tpc-tpc".format(num_topics), mode="w",
              encoding="utf8") as out:
        print(lda.print_topics(num_topics=num_topics, num_words=10), file=out)
    lda.save("./ldamd/{}tpc+{}".format(num_topics, filename[9:18]))
    # ppl.append(np.exp2(-lda.log_perplexity(texts_vad)) / i)
    return lda, texts, texts_tf_idf, dictionary
def create_LDA_dict():  # one-time use: create and save the LDA dictionary and model
    trigram_dictionary_filepath = '../Dataset/trigram_dict_all.dict'
    trigram_reviews = LineSentence(
        '../Dataset/trigram_transformed_reviews_all.txt')
    # learn the dictionary by iterating over all of the reviews
    trigram_dictionary = Dictionary(trigram_reviews)
    # filter tokens that are very rare or too common from
    # the dictionary (filter_extremes) and reassign integer ids (compactify)
    trigram_dictionary.filter_extremes(no_below=10, no_above=0.4)
    trigram_dictionary.compactify()
    trigram_dictionary.save(trigram_dictionary_filepath)
    print('LDA dict saved.')
    trigram_bow_filepath = '../Models/trigram_bow_corpus_all.mm'
    MmCorpus.serialize(
        trigram_bow_filepath,
        trigram_bow_generator(
            '../Dataset/trigram_transformed_reviews_all.txt'))
    trigram_bow_corpus = MmCorpus(trigram_bow_filepath)
    lda_model_filepath = '../Models/lda_model_all'  # also tried lda_model_all_30, lda_model_10topic
    # tried LDA models with 10, 30, and 50 topics; 30 gave the best result
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        lda = LdaMulticore(
            trigram_bow_corpus,
            num_topics=30,  # 10, 30, 50
            id2word=trigram_dictionary,
            workers=8)
    lda.save(lda_model_filepath)
    print('LDA model saved.')
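# Sketch of loading the artifacts written by create_LDA_dict above, using the
# same paths as the original snippet (assumes the files were already created):
from gensim.corpora import Dictionary, MmCorpus
from gensim.models import LdaMulticore

trigram_dictionary = Dictionary.load('../Dataset/trigram_dict_all.dict')
trigram_bow_corpus = MmCorpus('../Models/trigram_bow_corpus_all.mm')
lda = LdaMulticore.load('../Models/lda_model_all')
print(lda.show_topic(0, topn=10))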
def train_topics(datap):
    # build or load topic model
    dictp = './topic/twitter.dict'
    tmodelp = './topic/twitter.model'
    text_idx = 2  # the column idx of tweets in the tsv file
    topic_ns = [5, 10, 15, 20]  # candidate numbers of topics, to choose the best k

    # load corpus
    corpus = []
    with open(datap) as dfile:
        dfile.readline()  # skip the header line
        for line in dfile:
            line = line.strip().split('\t')
            corpus.append(line[text_idx].split())

    print('Build Dictionary for topic model...')
    if os.path.exists(dictp):
        dictionary = Dictionary.load(dictp)
    else:
        dictionary = Dictionary(corpus)
        dictionary.save(dictp)

    print('Training Topic Models......')
    if os.path.exists(tmodelp):
        best_m = LdaModel.load(tmodelp)
    else:
        # document to indices
        doc_matrix = [dictionary.doc2bow(doc) for doc in corpus]
        # find the best number of topics
        best_s = -10
        best_m = None
        for idx in range(len(topic_ns)):
            print('Trying topic number: ', topic_ns[idx])
            ldamodel = LdaMulticore(doc_matrix,
                                    id2word=dictionary,
                                    num_topics=topic_ns[idx],
                                    passes=1000,
                                    alpha='symmetric',
                                    eta=None)
            cm = CoherenceModel(
                model=ldamodel,
                corpus=doc_matrix,
                coherence='c_npmi',
                texts=corpus,
            )
            score = cm.get_coherence()  # compute once; get_coherence() is expensive
            if score > best_s:
                best_s = score
                best_m = ldamodel
                best_m.save(tmodelp)
            print('Topic number ' + str(topic_ns[idx]) +
                  ', coherence: ' + str(score))
    del corpus  # release memory
    return dictionary, best_m
def createLDAModel(docs, dictionary, num_topics=100,
                   iterations=NUM_ITERATIONS, passes=NUM_PASSES,
                   workers=3, output='lda_model'):
    """Creates the LDA model for the given documents.

    Args:
        docs (lst): List of tokenized documents
        dictionary (lst): The dictionary
        num_topics (int): The number of topics to discover
        iterations (int): The number of iterations of the LDA method
        passes (int): The number of passes of the LDA method
        workers (int): The number of workers employed in the creation of the model
        output (str): Prefix used to store the model in a set of files

    Returns:
        ldamodel: The LDA model
    """
    # convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in docs]
    # generate LDA model (use the num_topics argument, not the module-level
    # NUM_TOPICS constant, so the parameter actually takes effect)
    ldamodel = LdaMulticore(corpus,
                            id2word=dictionary,
                            num_topics=num_topics,
                            iterations=iterations,
                            passes=passes,
                            workers=workers)
    ldamodel.save(output + '_i' + str(iterations) + '_p' + str(passes) +
                  '_T' + str(num_topics) + '.lda')
    return ldamodel
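# Usage sketch for createLDAModel above with toy data (illustrative only;
# NUM_ITERATIONS and NUM_PASSES are module-level constants assumed to exist
# by the signature):
from gensim.corpora import Dictionary

toy_docs = [["graph", "trees", "minors"], ["graph", "trees"],
            ["human", "interface", "computer"]]
toy_dictionary = Dictionary(toy_docs)
toy_model = createLDAModel(toy_docs, toy_dictionary, num_topics=2,
                           iterations=50, passes=5, workers=2,
                           output='toy_lda')
print(toy_model.show_topic(0, topn=5))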
def fit_LdaMulticore(gensim_df, id2word, num_topics, alpha, workers=None,
                     passes=1, iterations=1000, chunksize=1000,
                     minimum_topic_probability=0.05, forget_weight=0.5,
                     random_state=0):
    # note: LdaMulticore has no update_every parameter (that is an
    # online-learning option of plain LdaModel), so it is not exposed here
    model = LdaMulticore(
        corpus=gensim_df,
        id2word=id2word,
        num_topics=num_topics,
        alpha=alpha,
        workers=workers,
        passes=passes,  # epochs
        iterations=iterations,
        chunksize=chunksize,  # batch size
        minimum_probability=minimum_topic_probability,
        decay=forget_weight,
        per_word_topics=True,
        random_state=random_state)
    return model
def fit(self, X, y=None):
    tokens = list(X[self.token_column].values)
    dictionary = corpora.Dictionary(tokens)
    self.dictionary = dictionary
    self.dictionary.filter_extremes(
        no_below=self.no_below,
        no_above=self.no_above,
        keep_n=1000000,
    )
    print('Number of unique tokens after filtering for LDA: %d' %
          len(dictionary))
    if not self.inplace:
        X = X.copy()
    X['bow'] = X[self.token_column].apply(dictionary.doc2bow)

    from gensim.models.ldamulticore import LdaMulticore
    eval_every = int(self.iterations / 20) + 1
    temp = dictionary[0]  # access one entry so dictionary.id2token gets populated
    id2word = dictionary.id2token
    corpus = list(X['bow'].values)
    model = LdaMulticore(corpus=corpus,
                         id2word=id2word,
                         chunksize=750,
                         eta='auto',
                         iterations=self.iterations,
                         num_topics=self.num_topics,
                         passes=self.passes,
                         eval_every=eval_every,
                         workers=self.cpus)
    self.model = model
def lda_trainer(sentences, modelPath=None, nb_topics=190, multicore=False):
    '''
    @return: lda_model: the LDA model trained by gensim,
             dictionary: dictionary of all terms in the LDA model
    '''
    # load doc2bow
    dictionary = corpora.Dictionary(sentences)
    print('finished loading dictionary!')
    corpus = [dictionary.doc2bow(text) for text in sentences]
    print('finished loading doc2bow corpus!')
    # train the LDA model
    print('training lda_model model...')
    if multicore:
        # works best on Linux; very CPU-intensive, use with caution
        lda_model = LdaMulticore(corpus=corpus,
                                 num_topics=nb_topics,
                                 id2word=dictionary)
    else:
        lda_model = LdaModel(corpus=corpus,
                             num_topics=nb_topics,
                             id2word=dictionary)
    print('finished lda_model model training, nb terms: %d' %
          lda_model.num_terms)
    # save the LDA model and dictionary on disk
    if modelPath is not None:
        lda_model.save(fname=modelPath)
        dictionary.save(fname_or_handle=modelPath.replace('.topic', '.dict'))
        print('producing lda_model & dictionary model ... ok! '
              'model stored in {0}(.dict)'.format(modelPath))
    return lda_model, dictionary
def train_lda(corpus, params, dictionary):
    """Train LDA model according to provided params"""
    # Set training parameters.
    num_topics = params.num_topics
    chunksize = params.chunksize
    passes = params.passes
    iterations = params.iterations
    decay = params.decay
    offset = params.offset

    # Make an index to word dictionary.
    logging.info("Mapping ids to words...")
    temp = dictionary[0]  # access one entry so dictionary.id2token gets populated
    id2word = dictionary.id2token
    logging.info("Done mapping ids to words.")

    logging.info("Making the LDA model...")
    lda = LdaMulticore(
        corpus=corpus,
        id2word=id2word,
        workers=3,  # number of worker processes for parallel training
        chunksize=chunksize,
        alpha='asymmetric',  # if low: each document is represented by only a few topics
        eta='auto',  # if low: each topic is represented by only a few words
        decay=decay,
        offset=offset,
        iterations=iterations,
        num_topics=num_topics,
        passes=passes,
        eval_every=None,
        random_state=230,
        per_word_topics=True)
    logging.info("Done making the LDA model.")
    return lda
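# Usage sketch for train_lda above. The params container here is hypothetical;
# any object exposing the attributes read at the top of the function will do:
from types import SimpleNamespace

params = SimpleNamespace(num_topics=20, chunksize=2000, passes=10,
                         iterations=400, decay=0.5, offset=64)
lda = train_lda(corpus, params, dictionary)  # corpus/dictionary built elsewhere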
def pipeline_lda(que: pd.DataFrame, dim: int) -> (Dictionary, TfidfModel, LdaMulticore):
    """
    Pipeline for training question embeddings via the LDA algorithm
    on question titles and bodies
    :param que: raw questions.csv dataset
    :param dim: number of LDA topics to train
    :return: trained Dictionary, TfidfModel and LdaMulticore model
    """
    lda_tokens = que['questions_whole'].apply(lambda x: x.split())
    # create Dictionary and train it on the text corpus
    lda_dic = Dictionary(lda_tokens)
    lda_dic.filter_extremes(no_below=10, no_above=0.6, keep_n=8000)
    lda_corpus = [lda_dic.doc2bow(doc) for doc in lda_tokens]
    # create TfidfModel and train it on the text corpus
    lda_tfidf = TfidfModel(lda_corpus)
    lda_corpus = lda_tfidf[lda_corpus]
    # create LDA model and train it on the text corpus
    lda_model = LdaMulticore(lda_corpus,
                             num_topics=dim,
                             id2word=lda_dic,
                             workers=4,
                             passes=20,
                             chunksize=1000,
                             random_state=0)
    return lda_dic, lda_tfidf, lda_model
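# Sketch of applying the artifacts returned by pipeline_lda above to unseen
# text (`que` is the questions DataFrame from the original signature; the
# sample sentence is made up):
lda_dic, lda_tfidf, lda_model = pipeline_lda(que, dim=20)
new_doc = "how do i prepare for a software engineering interview".split()
new_bow = lda_tfidf[lda_dic.doc2bow(new_doc)]
print(lda_model.get_document_topics(new_bow, minimum_probability=0.05))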
def test_lda_streaming():
    # materialize the stream: a generator would be exhausted after the first
    # pass, leaving bags_of_words empty
    documents = list(token_stream(NOVELS_DIRPATH))
    dictionary = Dictionary(documents)
    bags_of_words = [dictionary.doc2bow(tokens) for tokens in documents]
    lda = LdaMulticore(corpus=bags_of_words,
                       id2word=dictionary,
                       random_state=723812,
                       passes=10,
                       workers=4)
    parsed_topics = parse_topics(lda)
    assert len(parsed_topics) == 20
    # results are not repeatable unless you set the random_state param!
    assert parsed_topics[0] == {
        'doctor': 0.001,
        'companion': 0.001,
        'lucky': 0.001,
        'somewhat': 0.001,
        'ofchildhood': 0.001,
        'rub': 0.001,
        'idea': 0.001,
        'pleasure': 0.001,
        'ofexistence': 0.001,
        'disposition': 0.001
    }
def train(self): split_archives = [article.tokens for article in self.articles] # create dictionary and corpus dictionary = corpora.Dictionary(split_archives) dictionary.filter_extremes(no_above=self.words_no_above) corpus = [dictionary.doc2bow(article) for article in split_archives] logger.info('Created dictionary and corpus') # get eta to force topics eta = get_eta(self.num_topics, dictionary) # create lda model with gensim lda_progress = LDAProgress(self.passes) ldamodel = LdaMulticore(corpus, num_topics=self.num_topics, id2word=dictionary, passes=self.passes, per_word_topics=True, iterations=self.iterations, eta=eta, workers=cpu_count()) lda_progress.close() logger.info('Created Topics model') # print the topics (debug) logger.debug('Topics:') topics = ldamodel.print_topics(num_words=5) for topic in topics: logger.debug(topic) self.model = ldamodel self.dictionary = dictionary
def lda(corpus, num_topics=5, save_as=None, load=None, verbose=True):
    module_path = os.path.dirname(__file__)
    model_path = module_path + "/models"
    if verbose:
        print("prepare data")
    corpus = corpus.apply(lambda x: x.split(" "))
    dictionary = Dictionary(corpus)
    bow = [dictionary.doc2bow(doc) for doc in corpus]
    if isinstance(load, str):
        if verbose:
            print("loading lda")
        lda = LdaMulticore.load(model_path + "/" + load)
    else:
        if verbose:
            print("training lda")
        # pass id2word so topics print as words rather than raw term ids
        lda = LdaMulticore(bow, num_topics=num_topics, id2word=dictionary)
        if save_as:
            os.makedirs(model_path, exist_ok=True)
            lda.save(model_path + "/" + save_as)
    if verbose:
        print("generate visualization")
    vis = pyLDAvis.gensim.prepare(lda, bow, dictionary)
    return lda, vis
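# Follow-up sketch for lda(...) above (assumes pyLDAvis is installed and
# `texts` is a pandas Series of space-separated documents, a hypothetical
# input name): persisting the returned visualization as standalone HTML.
model, vis = lda(texts, num_topics=5, save_as='reviews.lda')
pyLDAvis.save_html(vis, 'lda_topics.html')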
def learn_lda_model(self, corpus, dictionary, k, iterations=100): """ learning LDA model :param corpus: corpus created by gensim :param dictionary: dictionary created by gensim :param k: number of topics :param iterations: number of iterations :return: """ if not self.use_mallet: lda = LdaMulticore(corpus, id2word=dictionary, workers=self.cpu_count, num_topics=k, random_state=42, iterations=iterations, per_word_topics=False, eval_every=None) else: lda = LdaMallet(self.path_to_mallet_binary, corpus=corpus, id2word=dictionary, workers=self.cpu_count, num_topics=k, random_seed=42, iterations=iterations, optimize_interval=10) cm = CoherenceModel(model=lda, corpus=corpus, coherence='u_mass') coherence = cm.get_coherence() print('{}: {}'.format(k, coherence)) return coherence, lda
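# Sketch of a driver loop for learn_lda_model above; `trainer` stands in for
# an instance of the class that defines it (hypothetical name), and corpus and
# dictionary are the gensim objects from the signature. Since u_mass coherence
# is higher-is-better, max() picks the best k:
candidates = [trainer.learn_lda_model(corpus, dictionary, k)
              for k in (5, 10, 20, 40)]
best_coherence, best_lda = max(candidates, key=lambda pair: pair[0])
print('best k by u_mass coherence:', best_lda.num_topics)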
def train_classifier(papers: list, num_topics: int) -> (LdaModel, corpora.Dictionary):
    """
    Trains the LDA model with selected documents. Training is done by
    cleaning the documents, indexing the words and training the model
    with a given number of topics

    Args:
        papers: list of papers, each item containing the corpus of a document
        num_topics: amount of topics that need to be trained

    Returns:
        Trained LDA model and its dictionary
    """
    papers_clean = [clean(paper) for paper in papers]
    dictionary = corpora.Dictionary(papers_clean)
    doc_term_matrix = [dictionary.doc2bow(paper) for paper in papers_clean]
    models = []
    print("Start generating models")
    # train candidate models; widen this range to compare several topic
    # counts (the original sweep was hard-coded to range(13, 14), which
    # ignored the num_topics argument)
    for x in range(num_topics, num_topics + 1):
        ldamodel = LdaMulticore(doc_term_matrix, num_topics=x,
                                id2word=dictionary, passes=50)
        topic_words = [w[0] for x in range(ldamodel.num_topics)
                       for w in ldamodel.show_topic(x)]
        unique_words = set(topic_words)
        models.append(ldamodel)
        print(x, len(unique_words),
              len(unique_words) / float(len(topic_words)))
    x = 1
    while True:
        try:
            x = int(input("Enter the model you want to train labels for:\n"))
        except ValueError:
            print("not an integer")
            continue
        if x > len(models) or x < 1:
            print("Model does not exist")
        else:
            break
    return models[x - 1], dictionary
def run_model(self, collection_name, num_topics, save_dir=None, save_file=None, alpha=0.1, beta=0.01, iterations=800, passes=1): model = LdaMulticore(corpus=self.corpus, id2word=self.dictionary, num_topics=num_topics, alpha=alpha, eta=beta, iterations=iterations, passes=passes) if save_dir is None: save_dir = Constants.SAVE_DIR.format( collection_name.lower().replace(' ', '_')) if not os.path.isdir(save_dir): os.makedirs(save_dir) if save_file is None: save_file = Constants.SAVE_FILE_FORMAT.format( collection_name.lower().replace(' ', '_'), num_topics, alpha, beta, iterations) logging.info(save_dir) model.save(os.path.join(save_dir, save_file)) return model
def infer_model(self, timeline: dict, exec_key, verbose: bool = False): bow, dictionary = self.prepare_data(timeline) if self.__class__.model_is_already_inferred(timeline, exec_key): logger.info('Model is already inferred') model = pickle.loads(timeline['models'][exec_key]) else: logger.info('Inferring LDA...') try: model = LdaMulticore( bow, id2word=dictionary, num_topics=self.n_topics, passes=self.n_passes, random_state=0, ) except ValueError as e: error = 'cannot compute LDA over an empty collection (no terms)' if str(e) == error: logger.error( 'Cannot compute LDA, there are no terms enough. ' 'Maybe you need to decrease LDA_MIN_DF setting') return None if verbose: self.print_terms(model) self.generate_html(model, bow, dictionary, timeline['user']) return model
def generate_tags(tokens: list) -> list:
    """Perform LDA topic modelling to acquire tags.

    Args:
        tokens (list): List of token lists, one per document

    Returns:
        tags_list (list) List of appropriate tags for the given tokens.
    """
    id2word = Dictionary(tokens)
    corpus = [id2word.doc2bow(d) for d in tokens]
    model = LdaMulticore(
        corpus=corpus,
        id2word=id2word,
        random_state=42,
        num_topics=10,
        passes=2,
        workers=1
    )
    # extract the quoted terms from the first five topics
    words = [re.findall(r'"([^"]*)"', t[1]) for t in model.print_topics()]
    wordcount = Counter(words[0] + words[1] + words[2] + words[3] + words[4])
    tags = pd.DataFrame.from_dict(
        wordcount, orient='index', columns=['number']
    )
    # keep only terms that appear in more than one topic
    tags = tags.drop(tags[tags['number'] <= 1].index)
    tags = tags.sort_values(by=['number'], ascending=False).T
    tags_list = list(tags.columns)
    return tags_list
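# Usage sketch for generate_tags above with toy tokens (real input is one
# token list per document):
toy_tokens = [["python", "pandas", "dataframe"],
              ["python", "numpy", "array"],
              ["pandas", "numpy", "python"]]
print(generate_tags(toy_tokens))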
def train(self, corpus, dictionary, num_topics=5, per_word_topics=False, passes=30, workers=4, iterations=10, chunksize=200, save=False): model = LdaMulticore(corpus=corpus, id2word=dictionary, num_topics=num_topics, per_word_topics=per_word_topics, minimum_phi_value=0.005, passes=passes, workers=workers, iterations=iterations, chunksize=chunksize, random_state=93) if save: self.save_model(lda_model=model) return model
def guidedLDA_Model(topics, cores=11):
    """
    topics is the desired number of LDA topics; cores should be the number
    of physical cores minus one. Both should be integers.
    """
    # load finished dictionary from disk
    trigram_dictionary = Dictionary.load('./models2/trigram_dict_all.dict')
    # generate bag-of-words representations for
    # all reviews and save them as a matrix
    MmCorpus.serialize('./models2/trigram_bow_corpus.nm',
                       trigram_bow_generator('./models2/trigram_transformed_reviews.txt'))
    # load finished bag-of-words corpus from disk
    trigram_bow_corpus = MmCorpus('./models2/trigram_bow_corpus.nm')
    # pass the bag-of-words matrix and Dictionary from the previous steps to
    # LdaMulticore as inputs, along with the number of topics the model
    # should learn; workers sets the parallelism and should be set to the
    # number of physical cores minus one
    lda = LdaMulticore(trigram_bow_corpus,
                       num_topics=topics,
                       id2word=trigram_dictionary,
                       workers=cores)
    lda.save('./models2/lda_model')
    # load the finished LDA model from disk
    # lda = LdaMulticore.load('./models/lda_model_neg')
    return trigram_bow_corpus, lda
def train(self, num_topics, passes, iterations, workers): ldamodel = LdaMulticore(self.corpus, num_topics=num_topics, id2word=self.dictionary, passes=passes, workers=workers, iterations=iterations) self.model = ldamodel
def train_lda(args):
    print('[LDA > n_topics: %d]' % args.dim)
    lda_reader = LDAReader(args.ds, max_sent=args.max_sent)
    ldazito = LdaMulticore(lda_reader,
                           id2word=lda_reader.idx2wrd,
                           num_topics=args.dim,
                           workers=args.workers)
    ldazito.save(args.out)
def _train_model(self): self._lda = LdaMulticore(corpus=self._corpus, id2word=self._id2word, num_topics=self.num_topics, workers=1, chunksize=10000, passes=1) self._save_model()
def fit_lda(X, id2word, num_topics=5, passes=20):
    """
    Fit LDA from a scipy CSR matrix (X) of shape (n_docs, n_terms).
    """
    print('fitting lda...')
    # the original body referenced undefined `corpus`/`dictionary` names;
    # convert the sparse matrix here and take the id-to-word mapping as a
    # parameter instead
    corpus = matutils.Sparse2Corpus(X, documents_columns=False)
    return LdaMulticore(corpus, num_topics=num_topics, id2word=id2word,
                        passes=passes, eval_every=5, workers=5)
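# Sketch of producing the CSR matrix fit_lda above expects, via scikit-learn's
# CountVectorizer (an assumption; the original snippet does not show where X
# comes from):
from sklearn.feature_extraction.text import CountVectorizer

texts = ["the cat sat on the mat", "the dog ate my homework"]
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(texts)  # shape: (n_docs, n_terms), CSR
id2word = {i: w for w, i in vectorizer.vocabulary_.items()}
lda = fit_lda(X, id2word, num_topics=2, passes=5)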
def train(self, vecs): """ Build the topic model. """ corp = Scipy2Corpus(vecs) self.m = LdaMulticore(corp, num_topics=self.n_topics, iterations=1000, workers=3)
def get_lda_model(index_tokens, num_topics=6, passes=3): print('Getting gensim LDA topic model') dictionary = gensim.corpora.Dictionary(index_tokens) corpus = [dictionary.doc2bow(text) for text in index_tokens] lda_model = LdaMulticore(corpus, num_topics=num_topics, id2word=dictionary, passes=passes) return lda_model, dictionary, corpus
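# Follow-up sketch: scoring the model returned by get_lda_model above with a
# c_v coherence score (index_tokens is the same token list passed in):
from gensim.models import CoherenceModel

lda_model, dictionary, corpus = get_lda_model(index_tokens)
cm = CoherenceModel(model=lda_model, texts=index_tokens,
                    dictionary=dictionary, coherence='c_v')
print('c_v coherence:', cm.get_coherence())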
def topics_by_lda(self, tokenized_corpus_path, num_topics=20, num_words=10,
                  max_lines=10000, split=r"\s+", max_df=100):
    """
    Read a pre-tokenized corpus file and train an LDA model on it.

    Arguments:
    tokenized_corpus_path -> string -- path to the tokenized corpus
    num_topics -> integer -- number of topics
    num_words -> integer -- number of words shown per topic
    max_lines -> integer -- maximum number of lines to read at a time
    split -> string -- separator between the words of a document
    max_df -> integer -- filter out common words above this document-frequency threshold
    """
    # holds the whole corpus
    corpus = []
    with open(tokenized_corpus_path, 'r', encoding='utf-8') as tokenized_corpus:
        flag = 0
        for document in tokenized_corpus:
            # stop once enough lines have been read
            if flag >= max_lines:
                break
            # add the tokenized document to the corpus
            corpus.append(re.split(split, document))
            flag += 1
    # build the BOW representation of the corpus
    (vocab, DTM) = self.corpus2dtm(corpus, max_df=max_df)
    # train the LDA model
    lda = LdaMulticore(matutils.Sparse2Corpus(DTM, documents_columns=False),
                       num_topics=num_topics,
                       id2word=dict(enumerate(vocab)),
                       workers=4)
    # print and return the topic data
    topics = lda.show_topics(num_topics=num_topics,
                             num_words=num_words,
                             formatted=False,
                             log=False)
    for ti, topic in enumerate(topics):
        print("Topic", ti, ":", " ".join(word[0] for word in topic[1]))
    return topics