def train(data, valid_h1, valid_h2, vocab):
    # logging.basicConfig(filename=args.save_path + 'lda.log',
    #                     format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    model = LdaModel(id2word=vocab, num_topics=args.topics, random_state=0,
                     chunksize=args.batch_size, update_every=args.batch_size,
                     alpha='auto', eta=None, decay=args.decay, offset=args.offset,
                     per_word_topics=True)
    best_perplexity = float('inf')
    for epoch in range(args.epochs):
        model.update(data, passes=1, eval_every=1, gamma_threshold=0.001)
        print("Epoch number {}".format(epoch), end=' ')
        val_perplexity = evaluate(data, valid_h1, valid_h2, model, 'valid')
        if val_perplexity < best_perplexity:
            best_perplexity = val_perplexity
            model.save(os.path.join(args.save_path, 'model.ckpt'))
class TestLdaCallback(unittest.TestCase): def setUp(self): self.corpus = MmCorpus(datapath('testcorpus.mm')) self.ch_umass = CoherenceMetric(corpus=self.corpus, coherence="u_mass", logger="visdom", title="Coherence") self.callback = [self.ch_umass] self.model = LdaModel(id2word=common_dictionary, num_topics=2, passes=10, callbacks=self.callback) self.host = "http://localhost" self.port = 8097 def testCallbackUpdateGraph(self): # Popen have no context-manager in 2.7, for this reason - try/finally. try: # spawn visdom.server proc = subprocess.Popen(['python', '-m', 'visdom.server', '-port', str(self.port)]) # wait for visdom server startup (any better way?) time.sleep(3) viz = Visdom(server=self.host, port=self.port) assert viz.check_connection() # clear screen viz.close() self.model.update(self.corpus) finally: proc.kill()
class Lda(ModelABC):
    """Represent news articles as vectors using Latent Dirichlet Allocation."""

    def __init__(self, dictionary: Dictionary, corpus=None, size: int = 100, decay=0.5,
                 lda_filename: str = None):
        """
        :param dictionary: A dictionary
        :param corpus: A corpus for training
        :param size: The length of the feature vector
        :param decay: The decay parameter
        :param lda_filename: File name of a previously trained model
        """
        super().__init__(size)

        # Check whether we have already trained the LDA model.
        if lda_filename is not None and os.path.exists(lda_filename):
            self.lda = LdaModel.load(lda_filename)
            logging.info("LDA model loaded")
        else:
            if corpus is None:
                raise ValueError("Corpus must be provided to train LDA")
            self.lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=size, passes=1,
                                decay=decay, minimum_probability=0.0)

    def update(self, documents):
        """
        Update the model with new documents.

        :param documents: The new documents used for the update
        """
        self.lda.update(documents)

    def save(self, filename: str):
        """
        Save the model to a file.

        :param filename: A model file name
        """
        self.lda.save(filename)

    def _get_vector_representation(self, items):
        """
        Represent documents as vectors.

        :param items: A list of documents
        :return: A list of feature vectors
        """
        return self.lda[items]
class LDAModel(Model, Transformer):

    def __init__(self, corpus=None, **kwargs):
        self._m = LdaModel(corpus, **kwargs)

    def fit(self, corpus):
        self._m.update(corpus)

    def transform(self, corpus):
        return self._m[corpus]

    @property
    def inst(self):
        return self._m
class LDA(GenericModel):
    """
    Wrapper for Gensim LdaModel and LdaMulticore.
    """

    def __init__(self, *args, **kwargs):
        """
        All provided arguments are passed to the LdaModel or LdaMulticore constructor
        (the latter when 'workers' is present in the keyword arguments).

        :param args: positional arguments to initialize the model with
        :param kwargs: keyword arguments to pass to the model constructor
        """
        if 'workers' in kwargs:
            self.__model__ = LdaMulticore(*args, **kwargs)
        else:
            self.__model__ = LdaModel(*args, **kwargs)

    def fit(self, data: Any, *args, **kwargs):
        # Actually, I think there is no need for this, as we can simply use
        # update() for an uninitialized model.
        self.__model__.update(corpus=data, *args, **kwargs)

    def update(self, data: Any, *args, **kwargs):
        self.__model__.update(corpus=data, *args, **kwargs)

    def get_topics(self, docs: Optional[Iterable[Any]] = None, *args, **kwargs):
        if docs is None:
            topics = self.__model__.show_topics(formatted=False, *args, **kwargs)
        else:
            topics = map(partial(self.__model__.get_document_topics, per_word_topics=True), docs)
            topics, t_copy, t_copy_1 = tee(topics, 3)
            ids = map(lambda x: x[0], topics)
            words = map(lambda x: x[1], t_copy)
            words = map(lambda x: list(zip(*x))[0], words)
            scores = map(lambda x: x[1], t_copy_1)
            scores = map(lambda x: list(zip(*x))[1], scores)
            topics = zip(ids, zip(words, scores))
        return topics
class TestLdaCallback(unittest.TestCase): def setUp(self): self.corpus = MmCorpus(datapath('testcorpus.mm')) self.ch_umass = CoherenceMetric(corpus=self.corpus, coherence="u_mass", logger="visdom", title="Coherence") self.callback = [self.ch_umass] self.model = LdaModel(id2word=common_dictionary, num_topics=2, passes=10, callbacks=self.callback) self.host = "http://localhost" self.port = 8097 def testCallbackUpdateGraph(self): with subprocess.Popen(['python', '-m', 'visdom.server', '-port', str(self.port)]) as proc: # wait for visdom server startup (any better way?) viz = Visdom(server=self.host, port=self.port) for attempt in range(5): time.sleep(1.0) # seconds if viz.check_connection(): break assert viz.check_connection() viz.close() self.model.update(self.corpus) proc.kill()
class TestLdaCallback(unittest.TestCase): def setUp(self): self.corpus = MmCorpus(datapath('testcorpus.mm')) self.ch_umass = CoherenceMetric(corpus=self.corpus, coherence="u_mass", logger="visdom", title="Coherence") self.callback = [self.ch_umass] self.model = LdaModel(id2word=common_dictionary, num_topics=2, passes=10, callbacks=self.callback) self.host = "http://localhost" self.port = 8097 def testCallbackUpdateGraph(self): # Popen have no context-manager in 2.7, for this reason - try/finally. try: # spawn visdom.server proc = subprocess.Popen( ['python', '-m', 'visdom.server', '-port', str(self.port)]) # wait for visdom server startup (any better way?) time.sleep(3) viz = Visdom(server=self.host, port=self.port) assert viz.check_connection() # clear screen viz.close() self.model.update(self.corpus) finally: proc.kill()
"""
@author: Bokkin Wang
"""
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel
from gensim.test.utils import datapath

# Create a corpus from a list of texts.
common_dictionary = Dictionary(common_texts)
common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]

# Train the model on the corpus.
lda = LdaModel(common_corpus, num_topics=10)

# Save the model to disk.
temp_file = datapath("model")
lda.save(temp_file)

# Load a potentially pretrained model from disk.
lda = LdaModel.load(temp_file)

# Create a new corpus, made of previously unseen documents.
other_texts = [['computer', 'time', 'graph'],
               ['survey', 'response', 'eps'],
               ['human', 'system', 'computer']]
other_corpus = [common_dictionary.doc2bow(text) for text in other_texts]

unseen_doc = other_corpus[0]
vector = lda[unseen_doc]  # get the topic probability distribution for a document

# Update the model with the new corpus, then re-query the unseen document.
lda.update(other_corpus)
vector = lda[unseen_doc]
class LDATagger:
    _lda_model = None
    _dictionary = None
    _lda_model_path = None
    _dictionary_path = None
    DEFAULT_MODEL_PATH = os.path.join(os.path.dirname(__file__), "model")
    DEFAULT_NUM_TOPICS = 1000

    def __init__(self, model_path=DEFAULT_MODEL_PATH, num_topics=DEFAULT_NUM_TOPICS,
                 lock=threading.Lock()):
        self.save_model_lock = lock
        if os.path.isfile(model_path):
            raise Exception("Invalid Model Path; Should Be a Directory")
        if not os.path.exists(model_path):
            os.makedirs(model_path)
        self._lda_model_path = os.path.join(model_path, "lda.model")
        self._dictionary_path = os.path.join(model_path, "tokens.dict")
        self.num_topics = num_topics
        self.model_folder_lock = FileLock(model_path)

    def topics_for_documents(self, doc_tokens_map):
        self.check_and_load_model()
        doc_topics_map = defaultdict(list)
        for document_id, document_tokens in doc_tokens_map.iteritems():
            doc_topics_map[document_id] = self.topics_for_document(document_tokens)
        return doc_topics_map

    def topics_for_document(self, tokens):
        self.check_and_load_model()
        bow_tokens = self._dictionary.doc2bow(tokens)
        topics = self._lda_model[bow_tokens]
        return topics

    def build_topics(self, tokens_list):
        self._dictionary = Dictionary(tokens_list)
        corpus = [
            self._dictionary.doc2bow(document_tokens)
            for document_tokens in tokens_list
        ]
        self._lda_model = LdaModel(corpus=corpus, id2word=self._dictionary,
                                   num_topics=self.num_topics, passes=100)
        self.save_model()

    def save_model(self, sleep_for_test=False, mock_datastruct=None):
        self.save_model_lock.acquire()
        self.model_folder_lock.acquire()
        if mock_datastruct:
            mock_datastruct.acquire()
        if sleep_for_test:
            import time
            time.sleep(1)
        print "Acquired Lock"
        try:
            self._lda_model.save(self._lda_model_path)
            self._dictionary.save(self._dictionary_path)
        finally:
            print "Released Lock"
            if mock_datastruct:
                mock_datastruct.release()
            self.model_folder_lock.release()
            self.save_model_lock.release()

    def check_and_load_model(self):
        if self._lda_model and self._dictionary:
            return
        if os.path.exists(self._lda_model_path):
            self._lda_model = LdaModel.load(self._lda_model_path)
        else:
            raise Exception("LDA Model Not found in the path")
        if os.path.exists(self._dictionary_path):
            self._dictionary = Dictionary.load(self._dictionary_path)
        else:
            raise Exception("Tokens Dictionary Not found in the path")

    def update_model(self, tokens_list):
        self.check_and_load_model()
        corpus = [
            self._dictionary.doc2bow(document_tokens)
            for document_tokens in tokens_list
        ]
        self._lda_model.update(corpus=corpus)
        self.save_model()

    def build_or_update_model(self, tokens_list):
        if not self.does_model_exist():
            self.build_topics(tokens_list)
        else:
            self.update_model(tokens_list)

    def does_model_exist(self):
        if os.path.exists(self._lda_model_path) and os.path.exists(self._dictionary_path):
            return True
        return False

    def get_model(self):
        self.check_and_load_model()
        model_hash = {
            "lda_model": cPickle.dumps(self._lda_model),
            "dictionary": cPickle.dumps(self._dictionary)
        }
        return model_hash

    def restore_model(self, model_hash):
        self._lda_model = cPickle.loads(model_hash["lda_model"].encode('utf-8'))
        self._dictionary = cPickle.loads(model_hash["dictionary"].encode('utf-8'))
        self.save_model()

    def topics_to_tokens(self):
        topics_tokens_map = defaultdict(list)
        if not self.does_model_exist():
            return []
        else:
            model = self._lda_model
            topics_to_tokens = model.show_topics(topics=self.DEFAULT_NUM_TOPICS, topn=25,
                                                 log=False, formatted=False)
            for topic_id, tokens in enumerate(topics_to_tokens):
                topics_tokens_map[topic_id] = self.list_of_tuples_to_hash(tokens)
            return topics_tokens_map

    def list_of_tuples_to_hash(self, tokens):
        tokens_hash = defaultdict(float)
        for token_probability, token in tokens:
            tokens_hash[token] = token_probability
        return tokens_hash
class LdaTest(unittest.TestCase):

    def setUp(self):
        self.lda = LdaModel(corpus=common_corpus, id2word=common_dictionary, num_topics=10)

    def test_common_dictionary(self):
        """
        Test whether the dictionary of our model is equal to common_dictionary.
        """
        dictionary = {'computer': 0, 'human': 1, 'interface': 2, 'response': 3,
                      'survey': 4, 'system': 5, 'time': 6, 'user': 7,
                      'eps': 8, 'trees': 9, 'graph': 10, 'minors': 11}
        self.assertEqual(self.lda.id2word.token2id, dictionary)

    def test_common_texts(self):
        """
        Test whether the order of common_texts changed.
        """
        texts = [['human', 'interface', 'computer'],
                 ['survey', 'user', 'computer', 'system', 'response', 'time'],
                 ['eps', 'user', 'interface', 'system'],
                 ['system', 'human', 'system', 'eps'],
                 ['user', 'response', 'time'],
                 ['trees'],
                 ['graph', 'trees'],
                 ['graph', 'minors', 'trees'],
                 ['graph', 'minors', 'survey']]
        self.assertEqual(common_texts, texts)

    def test_common_corpus(self):
        """
        Test whether the order of common_corpus changed.
        """
        corpus = [[(0, 1), (1, 1), (2, 1)],
                  [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],
                  [(2, 1), (5, 1), (7, 1), (8, 1)],
                  [(1, 1), (5, 2), (8, 1)],
                  [(3, 1), (6, 1), (7, 1)],
                  [(9, 1)],
                  [(9, 1), (10, 1)],
                  [(9, 1), (10, 1), (11, 1)],
                  [(4, 1), (10, 1), (11, 1)]]
        self.assertEqual(corpus, common_corpus)

    def test_new_corpus(self):
        """
        Test whether the bag-of-words representation of a new corpus is consistent.
        """
        other_texts_without_unseen_word = [["computer", "time", "graph"],
                                           ["survey", "response", "eps"],
                                           ["human", "system", "computer"]]
        other_corpus_without_unseen_word = [common_dictionary.doc2bow(text)
                                            for text in other_texts_without_unseen_word]
        self.assertEqual(other_corpus_without_unseen_word[0], [(0, 1), (6, 1), (10, 1)])

        other_texts_with_unseen_word = [["computer", "graph", "hardware", "time"],
                                        ["survey", "response", "eps", "administrator"]]
        other_corpus_with_unseen_word = [common_dictionary.doc2bow(text)
                                         for text in other_texts_with_unseen_word]
        self.assertEqual(other_corpus_with_unseen_word[0], [(0, 1), (6, 1), (10, 1)])

    def test_lda_update_1(self):
        """
        Update with unseen text that doesn't contain new words.
        """
        other_texts = [["computer", "time", "graph"],
                       ["survey", "response", "eps"],
                       ["human", "system", "computer"]]
        other_corpus = [common_dictionary.doc2bow(text) for text in other_texts]
        original_model = copy.deepcopy(self.lda)
        # In-place update.
        self.lda.update(other_corpus)
        self.assertNotEqual(self.lda, original_model)
        self.assertEqual(self.lda.id2word.token2id, original_model.id2word.token2id)

    def test_lda_update_2(self):
        """
        Update with unseen text that contains new words; 'hardware' and 'administrator'
        are added to test this.
        """
        other_texts = [["computer", "graph", "hardware", "time"],
                       ["survey", "response", "eps", "administrator"]]
        other_corpus = [common_dictionary.doc2bow(text) for text in other_texts]
        original_model = copy.deepcopy(self.lda)
        self.lda.update(other_corpus)
        self.assertEqual(self.lda.id2word.token2id, original_model.id2word.token2id)
class LDAModel:
    """
    Base class for an LDA model.
    """

    def __init__(self, vector_length):
        """
        Initialize the model with its parameters. The model is fit later if that has
        not been done before.

        :param vector_length: Number of topics in the model.
        """
        self.shortname = 'LDA'
        self.name = 'LDAmodel' + str(vector_length)
        self.vector_length = vector_length
        self.remove_stopwords = None
        self.word_dict = None
        self.path = None
        self.model = None
        self.doc_vecs = None

    def set_dict(self, data, remove_stopwords=False, no_below=1, no_above=1,
                 filter_most_frequent=0):
        """
        Set/make the dictionary used for bag-of-words representations.

        :param data: Which data to use for making the dictionary.
        :param remove_stopwords: Whether to remove stopwords.
        :param no_below: Minimum number of documents a word has to appear in to be included.
        :param no_above: Maximum fraction of documents a word can appear in to be included.
        :param filter_most_frequent: Remove the most frequent words.
        """
        if self.word_dict is not None:
            print("Model already has a dictionary! This function call does nothing.")
            return
        self.name = '%s_%sdict_rs%s_nb%s_na%s_fmf%s' % (
            self.name, data.name, str(remove_stopwords), str(no_below),
            str(no_above), str(filter_most_frequent))
        self.remove_stopwords = remove_stopwords
        self.word_dict = data.get_dictionary(remove_stopwords, no_below, no_above,
                                             filter_most_frequent)

    def train(self, data, passes):
        """
        Fit the LDA model to the data, set document topic vectors and calculate distances.

        :param data: Data to fit the model on.
        """
        if self.word_dict is None:
            print("A dictionary must be assigned to the model before training. "
                  "This function call does nothing.")
            return
        if self.model is None:
            self.model = LdaModel(num_topics=self.vector_length, id2word=self.word_dict,
                                  alpha='auto')  # , eta='auto')
        self.name = '%s_%strain_p%s' % (self.name, data.name, str(passes))
        self.path = Path('modelfiles/%s/%s' % (data.name, self.name))
        try:
            self.model = LdaModel.load(str(self.path / '.model'))
        except:
            self.path.mkdir(parents=True, exist_ok=True)
            print("Training model...", end='')
            time.sleep(0.1)
            datastream = GetBow(data, self.remove_stopwords, self.word_dict)
            self.model.update(datastream, passes=passes, chunksize=20000, iterations=500)
            self.model.save(str(self.path / '.model'))

    def fit(self, data):
        """
        Fit the LDA model to the data, set document topic vectors and calculate distances.
        """
        if self.model is None:
            print("Model must be trained first. This function call does nothing.")
            return
        try:
            self.doc_vecs = pd.read_csv(self.path / str('document_vectors_%s.csv' % data.name),
                                        index_col=0)
        except:
            print("Fitting model...", end='')
            time.sleep(0.1)
            # Container for document topic vectors, initialized with zeros.
            doc_vecs = np.zeros((len(data.ids), self.vector_length))
            # For each document.
            datastream = GetBow(data, self.remove_stopwords, self.word_dict)
            for i in range(len(datastream)):
                # Each element is a tuple with index and value of a nonzero vector element.
                for element in self.model[datastream[i]]:  # self.model.get_document_topics(datastream[i], minimum_probability=0.0)
                    # Set nonzero elements in the container.
                    doc_vecs[i][element[0]] = element[1]
            # Store the document topic vectors as a pandas DataFrame.
            self.doc_vecs = pd.DataFrame(doc_vecs, index=data.ids)
            self.doc_vecs.to_csv(self.path / str('document_vectors_%s.csv' % data.name))
class TweetLDA:

    def __init__(self, date):
        self.date = date
        self.documents = []
        self.tweet_ids = []

        self.bigram = None
        self.b_min = 90

        self.dictionary = None
        self.corpus = None

        # Training parameters
        self.num_topics = 6
        self.chunksize = 60000
        self.passes = 20
        self.iterations = 400
        self.eval_every = None

        self.model = None

    def compute_bigram(self):
        '''
        Find and save bigrams living among the tweets.

        :update: [covid_tweets].[token_tweets]
        '''
        print("Computing bigram.")
        cnxn = sqlite3.connect("covid_tweets.db")
        cursor = cnxn.cursor()

        count_query = '''
            SELECT count(tweet_id)
            FROM token_tweets
            WHERE date = ?'''
        cursor.execute(count_query, (self.date, ))
        num_tweets = cursor.fetchone()[0]
        print(self.date, num_tweets, "to have bigram computed.")

        query = '''
            SELECT tweet_id, tokenized_tweet
            FROM token_tweets
            WHERE date = ?'''
        cursor.execute(query, (self.date, ))
        results = cursor.fetchall()
        cnxn.close()

        retokenized_tweets = []
        for tweet_id, tokenized_tweet in results:
            tweet_tokens = tokenized_tweet.split(" ")
            retokenized_tweets.append(tweet_tokens)

        phrases = Phrases(retokenized_tweets, min_count=self.b_min)
        bigram = Phraser(phrases)
        bigram.save(f"./tmp/{self.date}_bigram_model_{self.b_min}.pkl")
        print("Bigram computed.")

    def load_bigram(self):
        '''
        Search for and load a pre-existing bigrams file.

        :update: self.bigram
        '''
        self.bigram = Phraser.load(f"./tmp/{self.date}_bigram_model_{self.b_min}.pkl")
        print("Bigram loaded.")

    def prepare_documents(self):
        '''
        Integrate bigrams into the documents so they can be used for the model.

        :update: self.documents
        '''
        print("Preparing documents.")
        cnxn = sqlite3.connect("covid_tweets.db")
        cursor = cnxn.cursor()

        query = '''
            SELECT tweet_id, tokenized_tweet
            FROM token_tweets
            WHERE date = ? AND in_model = 0
            LIMIT 50000'''
        cursor.execute(query, (self.date, ))
        results = cursor.fetchall()
        cnxn.close()

        if len(results) == 0:
            raise ValueError

        for tweet_id, tt in results:
            self.documents.append(tt.split(" "))
            self.tweet_ids.append(tweet_id)

        for i in range(len(self.documents)):
            for token in self.bigram[self.documents[i]]:
                if '_' in token:
                    self.documents[i].append(token)

        print("Documents have been prepared.")

    def update_documents(self):
        '''
        Flag that documents have been added to the LDA model.

        :update: [token_tweets]
        '''
        print("Updated relevant documents in [token_tweets]")
        cnxn = sqlite3.connect("covid_tweets.db")
        cursor = cnxn.cursor()

        update_query = '''
            UPDATE token_tweets
            SET in_model = 1
            WHERE tweet_id IN (%s)'''
        cursor.execute(update_query % ','.join('?' * len(self.tweet_ids)), self.tweet_ids)

        cnxn.commit()
        cnxn.close()

    def generate_dictionary(self):
        '''
        Create a dictionary representation of the documents, filtering extremes.

        :update: self.dictionary, gensim Dictionary object
        '''
        print("Generating dictionary.")
        cnxn = sqlite3.connect("covid_tweets.db")
        cursor = cnxn.cursor()

        query = '''
            SELECT tokenized_tweet
            FROM token_tweets
            WHERE date = ?'''
        cursor.execute(query, (self.date, ))
        results = cursor.fetchall()
        cnxn.close()

        self.documents = [tt.split(" ") for tt, in results]

        for i in range(len(self.documents)):
            for token in self.bigram[self.documents[i]]:
                if '_' in token:
                    self.documents[i].append(token)

        self.dictionary = Dictionary(self.documents)
        self.dictionary.filter_extremes(no_below=30, no_above=0.50)
        self.dictionary.save(f"./tmp/{self.date}_dictionary.pkl")
        print("Dictionary has been saved.")

    def load_dictionary(self):
        '''
        Load a dictionary of the associated documents.

        :update: self.dictionary, gensim Dictionary object
        '''
        self.dictionary = Dictionary()
        self.dictionary = self.dictionary.load(f"./tmp/{self.date}_dictionary.pkl")
        print("Dictionary loaded.")

    def generate_corpus(self):
        '''
        Create a Bag-of-Words representation of the corpus, ready to be trained on.

        :update: self.corpus, list of Bag-of-Words documents
        '''
        self.corpus = [self.dictionary.doc2bow(d) for d in self.documents]

    def generate_model(self):
        '''
        Using the python Gensim library and the prepared corpus, create a trained LDA model.

        :update: self.model, LdaModel object
        '''
        temp = self.dictionary[0]  # force the dictionary to populate id2token
        id2word = self.dictionary.id2token

        print("Model generation is beginning.")
        self.model = LdaModel(corpus=self.corpus, id2word=id2word, chunksize=self.chunksize,
                              alpha='auto', eta='auto', iterations=self.iterations,
                              num_topics=self.num_topics, passes=self.passes,
                              eval_every=self.eval_every)
        print("Model generated.")

        temp_file = datapath(f"{self.date}_model")
        print(temp_file)
        self.model.save(f"./tmp/{self.date}_model")
        print("Model has been saved.")

        self.update_documents()
        pprint(self.model.top_topics(self.corpus))

    def load_model(self):
        '''
        Load a pre-trained model to be analyzed or updated.

        :update: self.model, LdaModel object
        '''
        temp_file = datapath(f"{self.date}_model")
        print(temp_file)
        # self.model = LdaModel.load(temp_file)
        self.model = LdaModel.load(f"./tmp/{self.date}_model")

    def update_model(self):
        '''
        Update the pre-existing model with a new corpus.

        :update: self.documents
        :update: self.model
        :update: self.corpus
        '''
        self.prepare_documents()
        self.generate_corpus()

        print(f"{self.date}_model is being updated.")
        self.model.update(self.corpus, chunksize=self.chunksize)

        temp_file = datapath(f"{self.date}_model")
        print(temp_file)
        self.model.save(temp_file)
        print("Model has been saved.")

        pprint(self.model.top_topics(self.corpus))
        self.update_documents()

    def analyze_model(self):
        '''
        Examine the top topics of the model.
        '''
        cnxn = sqlite3.connect("covid_tweets.db")
        cursor = cnxn.cursor()

        query = '''
            SELECT tokenized_tweet
            FROM token_tweets
            WHERE date = ? AND in_model = 1'''
        cursor.execute(query, (self.date, ))
        results = cursor.fetchall()
        cnxn.close()

        print(len(results), "documents are in the model.")

        for tt, in results:
            self.documents.append(tt.split(" "))

        for i in range(len(self.documents)):
            for token in self.bigram[self.documents[i]]:
                if '_' in token:
                    self.documents[i].append(token)

        self.generate_corpus()
        self.top_topics = self.model.top_topics(self.corpus)
        pprint(self.top_topics)
        self.save_top_topics()

    def output_topics_json(self, values):
        '''
        Output json from a list of tuples.
        '''
        to_json = []
        for date, topic_num, word, probability in values:
            to_json.append({
                "date": date,
                "topic_num": topic_num,
                "word": word,
                "probability": str(probability)
            })

        with open(f"./tmp/{self.date}_topics.json", "w") as outfile:
            json.dump(to_json, outfile)

    def save_top_topics(self):
        '''
        Given the top topics, save them to a .json file.
        '''
        to_save = []
        for i, topic in enumerate(self.top_topics, 1):
            for probability, word in topic[0]:
                to_save.append((self.date, i, word, probability))

        self.output_topics_json(to_save)
from gensim.test.utils import common_texts, common_dictionary
from gensim.models import LdaModel

common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]

# Train the model on the corpus.
lda = LdaModel(common_corpus, num_topics=10)

'''Breaking this down step by step: first, common_texts is a list, and each of its
elements can be regarded as one document, itself also a list:'''
print(type(common_texts))
print(common_texts[0])

'''Second step: the doc2bow method converts text into bag-of-words form; one look at
an official example should make it clear:'''
from gensim.corpora import Dictionary
dct = Dictionary(["máma mele maso".split(), "ema má máma".split()])
print(dct.doc2bow(["this", "is", "máma"]))
print(dct.doc2bow(["this", "is", "máma"], return_missing=True))

'''At initialization, an id is generated for every word; when new text comes in, the id
and frequency of each of its words are returned, and you can control whether words missing
from the original dictionary are also returned. The corpus generated at this point is
exactly the input for training the LDA model; let's check it:'''
print(common_corpus[0])  # the word 'human' has id 0 and appears only once in the first document

'''Final step: we just call the LDA model, here with 10 topics specified.'''
from gensim.models import LdaModel
lda = LdaModel(common_corpus, num_topics=10)

'''Let's inspect the result (there are many other methods, see the documentation), for
example which words a topic is made of:'''
print(lda.print_topic(1, topn=2))

'''This shows the topic's word distribution; words 9 and 10 carry the largest weight
(topn controls how many words are printed, and the corresponding words can be looked up
in the dictionary generated earlier). We can also update the lda model we just built with
a new corpus:'''
'''
# update all of the parameters
lda.update(other_corpus)
# the topic distribution can also be updated on its own; the inputs are the parameters
# from before, where rho is the learning rate
lda.update_alpha(gammat, rho)
# the word distribution can also be updated on its own
lda.update_eta(lambdat, rho)
'''
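The commented-out block above only sketches the update call. Below is a minimal, self-contained example of that step, assuming nothing beyond gensim's public LdaModel API; the other_texts documents are invented purely for illustration.

from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.test.utils import common_texts

# Build a dictionary and a bag-of-words corpus from gensim's bundled toy texts.
dictionary = Dictionary(common_texts)
corpus = [dictionary.doc2bow(text) for text in common_texts]

lda = LdaModel(corpus, id2word=dictionary, num_topics=10, random_state=0)
print(lda.print_topic(1, topn=2))  # topic 1 before the update

# New documents (made up for this example), limited to words already in the dictionary.
other_texts = [['computer', 'time', 'graph'], ['survey', 'response', 'eps']]
other_corpus = [dictionary.doc2bow(text) for text in other_texts]

# Online update: folds the new corpus into the existing model in place.
lda.update(other_corpus)
print(lda.print_topic(1, topn=2))  # topic 1 after the update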