def create_dictionary(self):
    """Generate a gensim-style Dictionary directly from the corpus and vocabulary data.

    The id/token mappings are taken from ``self.id2word``; document
    frequencies and the total count of positions are accumulated in a single
    pass over the corpus (``self`` is iterated, and every document is expected
    to yield ``(word, count)`` pairs).

    Returns
    -------
    Dictionary
        Dictionary whose ``id2token`` is ``self.id2word`` (the same object,
        not a copy), whose ``token2id`` is the inverse mapping, and whose
        ``dfs``, ``num_docs``, ``num_nnz`` and ``num_pos`` are filled in.

    """
    dictionary = Dictionary()

    # replace dfs with defaultdict to avoid downstream KeyErrors
    # uci vocabularies may contain terms that are not used in the document data
    dictionary.dfs = defaultdict(int)

    dictionary.id2token = self.id2word
    dictionary.token2id = {token: token_id for token_id, token in iteritems(self.id2word)}

    dictionary.num_docs = self.num_docs
    dictionary.num_nnz = self.num_nnz

    for docno, doc in enumerate(self):
        if docno % 10000 == 0:
            # Hand the arguments to the logger instead of pre-formatting, so the
            # message is only built when the INFO level is actually enabled.
            logger.info('PROGRESS: processing document %i of %i', docno, self.num_docs)

        for word, count in doc:
            # Each (word, count) pair contributes one document occurrence for
            # the word and `count` positions to the corpus total.
            dictionary.dfs[word] += 1
            dictionary.num_pos += count

    return dictionary
def create_dictionary(self):
    """Build a :class:`gensim.corpora.dictionary.Dictionary` from this corpus and its vocabulary.

    Returns
    -------
    :class:`gensim.corpora.dictionary.Dictionary`
        Dictionary, based on corpus.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.corpora.ucicorpus import UciCorpus
        >>> from gensim.test.utils import datapath
        >>> ucc = UciCorpus(datapath('testcorpus.uci'))
        >>> dictionary = ucc.create_dictionary()

    """
    result = Dictionary()

    # A defaultdict keeps later lookups from raising KeyError: the uci
    # vocabulary may list terms that never occur in the document data.
    result.dfs = defaultdict(int)

    # Vocabulary mappings come straight from the corpus.
    result.id2token = self.id2word
    result.token2id = utils.revdict(self.id2word)

    # Corpus-level statistics that are already known.
    result.num_docs = self.num_docs
    result.num_nnz = self.num_nnz

    # One pass over the corpus fills in document frequencies and positions.
    for doc_index, document in enumerate(self):
        if not doc_index % 10000:
            logger.info('PROGRESS: processing document %i of %i', doc_index, self.num_docs)

        for term_id, term_count in document:
            result.dfs[term_id] += 1
            result.num_pos += term_count

    return result
# Reuse previously saved vocabularies when the word dictionary file exists;
# otherwise start fresh dictionaries that hold only the special tokens.
vocab_dir = config_dic.get("vocab_dir")
word_dic_path = os.path.join(vocab_dir, f"{args.config}.word.dic")

if os.path.exists(word_dic_path):
    word_dic = Dictionary.load(word_dic_path)
    #char_dic = Dictionary.load(os.path.join(config_dic.get("vocab_dir"), f"{args.config}.char.dic"))
    sw_dicts = {}
    for sp_key in sps:
        sw_dicts[sp_key] = Dictionary.load(
            os.path.join(vocab_dir, f"{args.config}.{sp_key}.dic"))
else:
    special_token_dict = {PADDING: 0, UNKNOWN: 1, START: 2, END: 3}

    # Every dictionary gets its OWN copy of the special-token mapping.
    # Assigning the same dict object to several dictionaries would make any
    # in-place vocabulary update on one of them show up in all the others.
    word_dic = Dictionary()
    word_dic.token2id = dict(special_token_dict)
    #char_dic = Dictionary()
    #char_dic.token2id = special_token_dict
    sw_dicts = {}
    for sp_key in sps:
        _dic = Dictionary()
        _dic.token2id = dict(special_token_dict)
        sw_dicts[sp_key] = _dic

# The label dictionary is rebuilt from the training labels, with PADDING
# pinned to id 0, and its id -> label view is refreshed to match token2id.
label_dic = Dictionary(train_label_documents)
label_dic.patch_with_special_tokens({PADDING: 0})
label_dic.id2token = {
    _id: label
    for label, _id in label_dic.token2id.items()
}
# add vocabulary
## Vectorize the corpus cv = CountVectorizer(stop_words="english", min_df=5, max_df=0.4, max_features=5000, ngram_range=(1, 1)) dtm = cv.fit_transform(corpus) features = np.array(cv.get_feature_names()) id2token = dict(zip(range(len(features)), features)) token2id = dict(zip(features, range(len(features)))) ## Create a gensim dictionary dictionary = Dictionary() dictionary.id2token = id2token dictionary.token2id = token2id ## Train LDA models with different count of topics topic_counts = [20, 30, 40, 50, 70, 100, 120, 150] def get_topn_words(lda_model, features, topn=20): topics = lda_model.components_ topic_words = [] for topic_num, topic_weights in enumerate(topics): top_words = topic_weights.argsort()[::-1][:topn] topic_words.append(list(features[top_words])) return topic_words def get_coherence_lda_models(topic_counts, dtm, features, corpus, dictionary):