def _compute_coherence(self, model, k, test_data, log_terms=False):
    """Compute NPMI coherence and topic redundancy of `model` over `test_data`."""
    num_topics = model.n_latent
    sorted_ids = model.get_top_k_terms(k)
    num_topics = min(num_topics, sorted_ids.shape[-1])
    top_k_words_per_topic = [[int(i) for i in list(sorted_ids[:k, t])]
                             for t in range(num_topics)]
    npmi_eval = EvaluateNPMI(top_k_words_per_topic)
    npmi = npmi_eval.evaluate_csr_mat(test_data)
    unique_term_ids = set()
    unique_limit = 5  ## only consider the top 5 terms for each topic when looking at degree of redundancy
    for i in range(num_topics):
        topic_ids = list(top_k_words_per_topic[i][:unique_limit])
        for j in range(len(topic_ids)):
            unique_term_ids.add(topic_ids[j])
    redundancy = (1.0 - (float(len(unique_term_ids)) / num_topics / unique_limit))**2.0
    logging.info("Test Coherence: {}".format(npmi))
    if log_terms:
        top_k_tokens = [
            list(map(lambda x: self.vocabulary.idx_to_token[x], list(li)))
            for li in top_k_words_per_topic
        ]
        for i in range(num_topics):
            logging.info("Topic {}: {}".format(i, top_k_tokens[i]))
    return npmi, redundancy
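# A minimal, self-contained sketch of the redundancy score computed above
# (the helper name and toy data are hypothetical, not part of the library).
# The score is 0.0 when every topic's top-5 terms are unique across topics
# and grows toward 1.0 as topics increasingly share the same terms.

def redundancy_score(top_k_words_per_topic, unique_limit=5):
    num_topics = len(top_k_words_per_topic)
    unique_term_ids = set()
    for topic_ids in top_k_words_per_topic:
        unique_term_ids.update(topic_ids[:unique_limit])
    return (1.0 - (float(len(unique_term_ids)) / num_topics / unique_limit))**2.0

# Two topics with disjoint top-5 terms -> 10 unique ids -> redundancy 0.0
print(redundancy_score([[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]))  # 0.0
# Two identical topics -> 5 unique ids -> (1 - 5/10)^2 = 0.25
print(redundancy_score([[0, 1, 2, 3, 4], [0, 1, 2, 3, 4]]))  # 0.25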
def _npmi(self, X, y, k=10):
    """
    Calculate NPMI (Normalized Pointwise Mutual Information) for data X.

    Parameters:
        X (array-like or sparse matrix): Document-word matrix, shape [n_samples, vocab_size].
        y: Unused; retained for API compatibility.
        k (int): Number of top terms per topic used to compute NPMI. Optional (default=10).

    Returns:
        tuple: (npmi, redundancy), where npmi (float) is the NPMI score and
        redundancy (float) is the topic redundancy score.
    """
    sorted_ids = self.model.get_ordered_terms()
    num_topics = min(self.n_latent, sorted_ids.shape[-1])
    top_k_words_per_topic = [[int(i) for i in list(sorted_ids[:k, t])]
                             for t in range(num_topics)]
    npmi_eval = EvaluateNPMI(top_k_words_per_topic)
    npmi = npmi_eval.evaluate_csr_mat(X)
    unique_term_ids = set()
    unique_limit = 5  ## only consider the top 5 terms for each topic when looking at degree of redundancy
    for i in range(num_topics):
        topic_ids = list(top_k_words_per_topic[i][:unique_limit])
        for j in range(len(topic_ids)):
            unique_term_ids.add(topic_ids[j])
    redundancy = (1.0 - (float(len(unique_term_ids)) / num_topics / unique_limit))**2
    return npmi, redundancy
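# A minimal sketch of the NPMI statistic that EvaluateNPMI aggregates
# (the helper and toy matrix are hypothetical; topic coherence is typically
# the average of this quantity over all pairs of a topic's top-k terms):
#   NPMI(w_i, w_j) = log(p(w_i, w_j) / (p(w_i) p(w_j))) / -log(p(w_i, w_j)),
# with probabilities estimated from document frequencies.

import numpy as np

def npmi_pair(X_binary, i, j, eps=1e-12):
    n_docs = X_binary.shape[0]
    p_i = X_binary[:, i].sum() / n_docs
    p_j = X_binary[:, j].sum() / n_docs
    p_ij = (X_binary[:, i] * X_binary[:, j]).sum() / n_docs
    if p_ij == 0.0:
        return -1.0  # terms never co-occur: minimum NPMI
    return np.log(p_ij / (p_i * p_j + eps)) / -np.log(p_ij + eps)

# Toy corpus of 4 documents over a 3-word vocabulary:
X = np.array([[1, 1, 0],
              [1, 1, 0],
              [1, 0, 1],
              [0, 0, 1]])
print(npmi_pair(X, 0, 1))  # words 0 and 1 co-occur often -> positive NPMI (~0.41)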
def _npmi_with_dataloader(self, dataloader, k=10):
    sorted_ids = self.model.get_ordered_terms_encoder(dataloader) \
        if self.coherence_via_encoder else self.model.get_ordered_terms()
    num_topics = min(self.n_latent, sorted_ids.shape[-1])
    top_k_words_per_topic = [[int(i) for i in list(sorted_ids[:k, t])]
                             for t in range(num_topics)]
    npmi_eval = EvaluateNPMI(top_k_words_per_topic)
    npmi = npmi_eval.evaluate_csr_loader(dataloader)
    unique_term_ids = set()
    unique_limit = 5  ## only consider the top 5 terms for each topic when looking at degree of redundancy
    for i in range(num_topics):
        topic_ids = list(top_k_words_per_topic[i][:unique_limit])
        for j in range(len(topic_ids)):
            unique_term_ids.add(topic_ids[j])
    redundancy = (1.0 - (float(len(unique_term_ids)) / num_topics / unique_limit))**2
    return npmi, redundancy
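# A minimal sketch of streaming one ingredient NPMI needs (per-term document
# frequencies) from a batched loader rather than a single matrix, assuming
# the loader yields scipy CSR batches; the helper and toy loader are
# hypothetical, not the library's evaluate_csr_loader implementation.

import numpy as np
import scipy.sparse as sp

def doc_freqs_from_loader(loader, vocab_size):
    df = np.zeros(vocab_size)
    n_docs = 0
    for batch in loader:                  # each batch: CSR [batch_size, vocab_size]
        df += (batch > 0).sum(axis=0).A1  # count docs containing each term
        n_docs += batch.shape[0]
    return df, n_docs

batches = [sp.random(8, 100, density=0.1, format='csr') for _ in range(3)]
df, n = doc_freqs_from_loader(iter(batches), 100)
print(n, df.max())  # 24 documents total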
def _npmi_per_covariate(self, X, y, k=10):
    """
    Calculate NPMI (Normalized Pointwise Mutual Information) for each covariate in data X.

    Parameters:
        X (array-like or sparse matrix): Document-word matrix, shape [n_samples, vocab_size].
        y (array-like or sparse matrix): Covariate matrix, shape [n_samples, n_covars].
        k (int): Number of top terms per topic used to compute NPMI. Optional (default=10).

    Returns:
        (float): Average NPMI score over all covariates.
    """
    X_train = X.toarray()
    y_train = y
    covars = np.unique(y_train, axis=0)
    covar_npmi = {}  # per-covariate scores (computed but not returned)
    npmi_total = 0
    for covar in covars:
        mask = (y_train == covar).all(axis=1)
        X_covar = mx.nd.array(X_train[mask], dtype=np.float32)
        y_covar = mx.nd.array(y_train[mask], dtype=np.float32)
        sorted_ids = self.model.get_ordered_terms_with_covar_at_data(X_covar, k, y_covar)
        top_k_words_per_topic = [[int(i) for i in list(sorted_ids[:k, t].asnumpy())]
                                 for t in range(self.n_latent)]
        npmi_eval = EvaluateNPMI(top_k_words_per_topic)
        npmi = npmi_eval.evaluate_csr_mat(X_covar)
        covar_key = covar[0] if self.label_map else np.where(covar)[0][0]
        covar_npmi[covar_key] = npmi
        npmi_total += npmi
    return npmi_total / len(covars)
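# A small numpy illustration of the per-covariate masking used above
# (the one-hot covariate rows are toy data, not from the library). Each
# distinct covariate row selects the subset of documents it labels.

import numpy as np

y_train = np.array([[1, 0],
                    [0, 1],
                    [1, 0]])
covars = np.unique(y_train, axis=0)        # [[0, 1], [1, 0]]
for covar in covars:
    mask = (y_train == covar).all(axis=1)  # rows matching this covariate
    print(covar, "->", np.where(mask)[0])  # [0 1] -> [1], [1 0] -> [0 2]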
os.environ["MXNET_STORAGE_FALLBACK_LOG_VERBOSE"] = "0"

if __name__ == "__main__":
    parser = setup_parser()
    args = parser.parse_args()
    verbose = False  ### XXX - add as argument
    inference_model = BowVAEInferencer.from_saved(
        model_dir=args.model_dir,
        ctx=mx.cpu() if args.gpu < 0 else mx.gpu(args.gpu))
    if args.override_top_k_terms:
        top_k_words_per_topic = get_top_k_terms_from_file(args.override_top_k_terms)
        tst_csr, _, _, _ = file_to_data(args.test_file, len(inference_model.vocab))
        top_k_words_per_topic_ids = [[inference_model.vocab[t] for t in t_set]
                                     for t_set in top_k_words_per_topic]
        npmi_eval = EvaluateNPMI(top_k_words_per_topic_ids)
        test_npmi = npmi_eval.evaluate_csr_mat(tst_csr)
        print("**** Test NPMI = {} *******".format(test_npmi))
        exit(0)
    if args.plot_file:  # get UMAP embedding visualization
        import matplotlib.pyplot as plt
        encoded, labels = inference_model.encode_vec_file(args.test_file)
        encodings = np.array([doc.asnumpy() for doc in encoded])
        print("There are {0} labels and {1} encodings".format(len(labels), len(encodings)))
        umap_model = umap.UMAP(n_neighbors=4, min_dist=0.5, metric='euclidean')
        embeddings = umap_model.fit_transform(encodings)
        plt.scatter(*embeddings.T, c=labels, s=0.2, alpha=0.7, cmap='coolwarm')
        plt.savefig(args.plot_file, dpi=1000)
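# A self-contained sketch of the UMAP projection step above, run on random
# encodings so it executes without a trained model (the array shapes, label
# values, and output filename are hypothetical). Small n_neighbors emphasizes
# local structure; a larger value would favor the global layout.

import numpy as np
import umap
import matplotlib.pyplot as plt

encodings = np.random.randn(200, 20)        # 200 docs, 20 latent dimensions
labels = np.random.randint(0, 4, size=200)  # 4 hypothetical classes
embeddings = umap.UMAP(n_neighbors=4, min_dist=0.5,
                       metric='euclidean').fit_transform(encodings)
plt.scatter(*embeddings.T, c=labels, s=4, alpha=0.7, cmap='coolwarm')
plt.savefig("umap_plot.png", dpi=300)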