import os

import numpy as np
import pandas as pd
from gensim.corpora import Dictionary


def make_item_descriptions(max_sentence_length=None):
    descriptions = pd.read_csv(os.path.join('data', 'descriptions.csv')).rename(
        columns={'movie': 'item'})
    texts = descriptions.description
    texts = texts.apply(lambda x: x.strip().split())
    dictionary = Dictionary(texts.values)
    dictionary.filter_extremes()
    eos_id = len(dictionary.keys())
    # convert each description to a list of token ids, dropping unknown words
    texts = texts.apply(
        lambda x: dictionary.doc2idx(x, unknown_word_index=eos_id))
    texts = texts.apply(lambda x: np.array([a for a in x if a != eos_id]))
    max_sentence_length = max(
        texts.apply(len)) if max_sentence_length is None else min(
            max(texts.apply(len)), max_sentence_length)
    # truncate and pad every description to max_sentence_length with the EOS id
    texts = texts.apply(lambda x: x[:max_sentence_length])
    texts = texts.apply(lambda x: np.pad(x, (0, max_sentence_length - len(x)),
                                         'constant', constant_values=(0, eos_id)))
    # change types
    texts = texts.apply(lambda x: x.astype(np.int32))
    descriptions.id = descriptions.id.astype(np.int32)
    return descriptions.id.values, texts.values, len(dictionary.keys()) + 1
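# A minimal usage sketch for make_item_descriptions. It assumes a data/descriptions.csv
# with 'id', 'movie' and 'description' columns; that layout is inferred from the code
# above, not confirmed by the original source.
item_ids, item_texts, n_word = make_item_descriptions(max_sentence_length=50)
print(item_ids.shape)   # one id per item
print(item_texts[0])    # first description as a padded array of token ids
print(n_word)           # vocabulary size including the EOS/padding id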
import numpy as np
import pandas as pd
from gensim.corpora import Dictionary
from sklearn.model_selection import train_test_split


class DataGenerator(object):
    def __init__(self,
                 positive_dataset: pd.DataFrame,
                 negative_dataset: pd.DataFrame,
                 test_size: float,
                 random_state: int = 123,
                 max_sentence_length: int = None):
        self.dataset = pd.concat([positive_dataset, negative_dataset], axis=0)
        self.dataset['review'] = self.dataset['review'].apply(
            lambda x: x.strip().split())
        self.dictionary = Dictionary(self.dataset['review'].values)
        self.dataset['review'] = self.dataset['review'].apply(
            self.dictionary.doc2idx)
        self.max_sentence_length = max_sentence_length
        if self.max_sentence_length is not None:
            self.dataset['review'] = self.dataset['review'].apply(
                lambda x: x[:self.max_sentence_length])
        else:
            self.max_sentence_length = max(self.dataset['review'].apply(len))
        # pad every review to max_sentence_length with the EOS id
        eos_id = len(self.dictionary.keys())
        self.dataset['review'] = self.dataset['review'].apply(
            lambda x: np.pad(x, (0, self.max_sentence_length - len(x)),
                             'constant', constant_values=(0, eos_id)))
        # change types
        self.dataset['review'] = self.dataset['review'].apply(
            lambda x: x.astype(np.int32))
        self.dataset['label'] = self.dataset['label'].astype(np.int32)
        # train/test split
        self.train, self.test = train_test_split(self.dataset,
                                                 test_size=test_size,
                                                 random_state=random_state)

    def get_train_dataset(self):
        return list(
            zip(self.train['review'].values, self.train['label'].values))

    def get_test_dataset(self):
        return list(zip(self.test['review'].values, self.test['label'].values))

    def get_max_sentence_length(self):
        return self.max_sentence_length

    def get_n_word(self):
        return len(self.dictionary.keys()) + 1
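# A minimal usage sketch for DataGenerator. It assumes two DataFrames that each carry a
# raw-text 'review' column and an integer 'label' column; the toy data below is built
# purely for illustration and is not from the original project.
positive = pd.DataFrame({'review': ['a great movie', 'loved every minute'],
                         'label': [1, 1]})
negative = pd.DataFrame({'review': ['a dull movie', 'waste of time'],
                         'label': [0, 0]})

gen = DataGenerator(positive, negative, test_size=0.5, max_sentence_length=10)
train = gen.get_train_dataset()        # list of (padded token-id array, label) pairs
print(gen.get_max_sentence_length())   # effective length after truncation/padding
print(gen.get_n_word())                # vocabulary size including the padding id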
import gensim
import numpy as np
from gensim.corpora import Dictionary

# fit an LDA model, n_topics = 5
# `documents` is assumed to be a list of tokenized texts
news_dictionary = Dictionary(documents)
news_dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=5000,
                                keep_tokens=None)
corpus = [news_dictionary.doc2bow(text) for text in documents]
lda = gensim.models.LdaModel(corpus, num_topics=5, id2word=news_dictionary)
lda.show_topics()

# convert the gensim corpus to a sparse document-term matrix for the coherence measure
corpus_dense = gensim.matutils.corpus2csc(corpus,
                                          num_terms=len(news_dictionary.keys()))
corpus_dense = corpus_dense.astype(int)
corpus_dense = corpus_dense.transpose()  # rows become documents, columns terms
print(corpus_dense.shape)


# implements the UMass coherence in Mimno et al. 2011 - Optimizing Semantic Coherence in Topic Models
def cooccur_df_ws(w1, w2, corpus_dense, w2ids):
    """Returns the co-document frequency of two words."""
    w1_id, w2_id = w2ids.token2id.get(w1), w2ids.token2id.get(w2)
    co_freq_array = (corpus_dense[:, [w1_id, w2_id]] > 0).sum(axis=1).A1
    return np.count_nonzero(co_freq_array == 2)
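# A sketch of how cooccur_df_ws might be combined into the UMass coherence of one topic,
# following Mimno et al. 2011: sum over ordered top-word pairs of
# log((co-document frequency + 1) / document frequency of the earlier word).
# The doc_freq helper and the per-topic loop below are illustrative assumptions,
# not part of the original snippet.
def doc_freq(w, corpus_dense, w2ids):
    """Number of documents containing word w."""
    w_id = w2ids.token2id.get(w)
    return np.count_nonzero((corpus_dense[:, w_id] > 0).toarray())


def umass_coherence(topic_words, corpus_dense, w2ids):
    score = 0.0
    for i in range(1, len(topic_words)):
        for j in range(i):
            co_df = cooccur_df_ws(topic_words[i], topic_words[j],
                                  corpus_dense, w2ids)
            score += np.log((co_df + 1) /
                            doc_freq(topic_words[j], corpus_dense, w2ids))
    return score


# Example: score the top 10 words of each fitted topic
for topic_id in range(lda.num_topics):
    top_words = [news_dictionary[w_id]
                 for w_id, _ in lda.get_topic_terms(topic_id, topn=10)]
    print(topic_id, umass_coherence(top_words, corpus_dense, news_dictionary))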
from gensim.corpora import Dictionary


class InfoGain(object):
    """
    Computes the information gain of tags (features). The input is a corpus in which
    every sample is labelled with a category. We first compute the global entropy,
    then the conditional entropy of each tag; the difference is the information gain.

    Example:
    >>> ig = InfoGain(corpus_file)
    >>> ig.compute()
    >>> ig.save(ig_file)
    >>> print(ig['word'])  # look up the information gain of one word
    """

    def __init__(self, corpus_file):
        """
        Args:
            corpus_file -- corpus file; the first column of each line is the category,
                           the remaining columns are tags
        """
        corpus = []
        categories = []
        self._category_distribution = {}  # number of samples per category
        self._words_cate = {}  # per-word (tag/feature) counts of samples in each category
        self._words_sample_count = {}
        self._info_gain = {}
        with open(corpus_file, 'r') as documents:
            for line in documents:
                words = line.strip().split()
                if len(words) <= 1:
                    continue
                categories.append(words[0])
                corpus.append(words[1:])
                if words[0] not in self._category_distribution:
                    self._category_distribution[words[0]] = 0
                self._category_distribution[words[0]] += 1
                # count word (tag/feature) / category co-occurrences for the conditional entropy
                for word in set(words[1:]):
                    if word not in self._words_cate:
                        self._words_cate[word] = {}
                        self._words_sample_count[word] = 0
                    if words[0] not in self._words_cate[word]:
                        self._words_cate[word][words[0]] = 0
                    self._words_cate[word][words[0]] += 1
                    self._words_sample_count[word] += 1
        self._common_dictionary = Dictionary(corpus)
        self._corpus = corpus
        self._categories = categories

    def compute(self):
        """
        Compute the information gain of every word (tag/feature),
        starting from the global entropy.
        """
        system_entropy = compute_entropy(len(self._corpus),
                                         self._category_distribution)
        # conditional entropy of each word
        # note: iterate over the tokens themselves (token2id keys), not the integer ids
        for word in self._common_dictionary.token2id:
            category_distribution = {}
            if word not in self._words_cate:
                continue
            # entropy of the category distribution over samples containing the word
            entropy1 = compute_entropy(self._words_sample_count[word],
                                       self._words_cate[word])
            for cate in self._category_distribution:
                category_distribution[cate] = self._category_distribution[cate]
                if cate in self._words_cate[word]:
                    category_distribution[cate] -= self._words_cate[word][cate]
            # entropy of the category distribution over samples not containing the word
            entropy2 = compute_entropy(
                len(self._corpus) - self._words_sample_count[word],
                category_distribution)
            # conditional entropy of the word
            condition_entropy = (
                self._words_sample_count[word] * entropy1 / len(self._corpus) +
                (len(self._corpus) - self._words_sample_count[word]) *
                entropy2 / len(self._corpus))
            # information gain
            self._info_gain[word] = system_entropy - condition_entropy

    def save(self, ig_file_name, sort=False):
        """
        Save to file in the format: word information_gain

        Args:
            ig_file_name -- output file path
            sort -- sort by information gain in descending order before writing
                    (default: unsorted)
        """
        with open(ig_file_name, 'w') as ig_file:
            if not sort:
                for word in self._info_gain:
                    ig_file.write("%s %.2f\n" % (word, self._info_gain[word]))
            else:
                for item in sorted(self._info_gain.items(),
                                   key=lambda x: x[1], reverse=True):
                    ig_file.write("%s %.2f\n" % (item[0], item[1]))

    def __getitem__(self, word):
        if word not in self._info_gain:
            return 0.0
        return self._info_gain[word]
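# InfoGain calls a module-level compute_entropy helper that is not shown in this snippet.
# Given how it is used (a total sample count plus a {category: count} mapping), a
# plausible implementation is the Shannon entropy below; this is an assumption for
# illustration, not the original author's code.
import math


def compute_entropy(total, category_counts):
    """Shannon entropy (base 2) of a categorical distribution given raw counts."""
    if total <= 0:
        return 0.0
    entropy = 0.0
    for count in category_counts.values():
        if count <= 0:
            continue
        p = count / total
        entropy -= p * math.log(p, 2)
    return entropy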
                # ... continuation of a loop over the rows of an existing CSV
                # (the start of the loop is not shown in this fragment)
                continue
            if row[0] not in eids:
                eids.add(row[0])
                if len(row) == 8:
                    row.append(",".join(topic_word_list_result[row[0]]))
                else:
                    row[8] = ",".join(topic_word_list_result[row[0]])
                csvData.append(row)
    with open(filename, 'w', encoding='utf-8') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerows(csvData)
    print("Write to database successfully")


print("corpus length:", len(pubs_corpus))
print("dict length:", len(pubs_dictionary.keys()))

topic_word_list_result = dict()
topic_dict = topic_to_lemmatized_word_list(lda)
for id in pubs_eids:
    cur_corpus = id_to_corpus.get(id, None)
    if cur_corpus is not None:
        # list all topic indices for this document
        candidate_topics = lda.get_document_topics(cur_corpus)
        # select the index with the highest probability
        best_topic_index = select_highest_prob_topic(candidate_topics)
        if best_topic_index == -1:
            print("no topic document:", id)
            topic_word_list_result[id] = ["unknown"]
        else:
            topic_word_list_result[id] = topic_dict[best_topic_index]
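# select_highest_prob_topic is called above but not defined in this snippet. A plausible
# version (an assumption for illustration, not the original code) picks the topic id with
# the highest probability from lda.get_document_topics output and returns -1 when the
# list is empty, matching the -1 check above.
def select_highest_prob_topic(candidate_topics):
    """candidate_topics: list of (topic_id, probability) pairs."""
    if not candidate_topics:
        return -1
    best_topic, _ = max(candidate_topics, key=lambda pair: pair[1])
    return best_topic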
    docs_per_topic = defaultdict(list)
    # Create a dictionary with docs per topic
    for topic_id in topics:
        # Uncomment the following line if you want to return the actual docs
        # docs_per_topic[topic_id].append(X[np.where(argmax == topic_id)])
        docs_per_topic[topic_id].append(np.where(argmax == topic_id))
    return docs_per_topic


# Initialization of the term dictionary; in this case for the 9 two- and three-word documents
print(common_texts)
common_dictionary = Dictionary(common_texts)
print(common_dictionary)         # prints 12 unique tokens
print(common_dictionary.keys())  # prints the 12 token ids

# Create a corpus from a list of texts
# BoW: Bag-of-Words representation for each document: (token_id[int], token_count[float]) tuples
X = [common_dictionary.doc2bow(text) for text in common_texts]
print("X: ", X)
# X can also be a sparse matrix with shape (n_docs, n_terms); that may be handier
lda = LdaModel(corpus=X, num_topics=10, alpha='symmetric')

# Full topic-term matrix
print(lda.get_topics())

# TODO figure out how to use LdaState; need to initialize it first I guess
# Get posterior probabilities over topics
# print(lda.LdaState().get_lambda())
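# The function body at the top of this snippet assumes `topics`, `argmax`, and `X`
# already exist. One way to build the per-document topic assignments from the fitted
# model is shown below; this is an illustration under that assumption, not necessarily
# the original pipeline.
import numpy as np

doc_topics = [lda.get_document_topics(bow, minimum_probability=0.0) for bow in X]
argmax = np.array([max(dist, key=lambda pair: pair[1])[0] for dist in doc_topics])
topics = np.unique(argmax)
print(argmax)  # most probable topic id for each document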
])
# Bigrams and trigrams are joined by underscores

# -
# ### Remove rare and common tokens, and limit vocabulary

# +
dictionary = Dictionary(all_tokens)
dictionary.filter_extremes(no_below=30, no_above=0.5, keep_n=20000)

# Look at the top 100 and bottom 100 tokens
temp = dictionary[0]  # Accessing one item initializes dictionary.id2token
token_counts = pd.DataFrame(
    np.array([[token_id, dictionary.id2token[token_id], dictionary.cfs[token_id]]
              for token_id in dictionary.keys()
              if token_id in dictionary.cfs.keys()
              and token_id in dictionary.id2token.keys()]),
    columns=['id', 'token', 'count'])
token_counts['count'] = token_counts['count'].astype('int')
token_counts['count'].describe()
token_counts = token_counts.sort_values('count')

plt.rcParams.update({'figure.figsize': (5, 3.5), 'figure.dpi': 200})
token_counts['count'].head(5000).hist(bins=100)
plt.suptitle("Counts for 5,000 least frequent included words")
plt.show()

display(token_counts.head(50))

plt.rcParams.update({'figure.figsize': (5, 3.5), 'figure.dpi': 200})
token_counts['count'].tail(1000).hist(bins=100)