def _kmeans_to_prepared_data_pyldavis_score(x, index2word, centers, labels,
                                            embedding_method='tsne', radius=3.5,
                                            n_candidate_words=50, n_printed_words=30,
                                            lambda_step=0.01):
    """Build pyLDAvis prepared data from a k-means result, one topic per cluster.

    Original author's note: dont use pyLDAvis embedding method. It shows
    unstable training results.

    Parameters
    ----------
    x : matrix supporting ``sum(axis=...)``
        Document-term matrix; row sums are used as document lengths and
        column sums as term frequencies.
    index2word : array-like
        Vocabulary, passed to ``pyLDAvis.prepare`` as ``vocab``.
    centers : array of shape (n_clusters, n_terms)
        Cluster centroids; L1-normalised rows become the topic-term
        distributions.
    labels : iterable of int
        Cluster index assigned to each row of ``x``.
    embedding_method : str
        ``'tsne'`` labels the plot axes as t-SNE and sorts topics; any other
        value uses the ``pyLDAvis.prepare`` defaults.
    radius : number
        Forwarded to ``pyLDAvis.prepare`` as ``R``.
    n_candidate_words, n_printed_words : int
        Unused; kept for interface compatibility.
    lambda_step : float
        Forwarded to ``pyLDAvis.prepare``.

    Returns
    -------
    The object returned by ``pyLDAvis.prepare``.
    """
    topic_term_dists = normalize(centers, norm='l1')

    # An all-zero centroid stays all-zero after normalisation; replace it with
    # a uniform distribution. 1.0 keeps the division a true division under
    # Python 2 as well.
    empty_clusters = np.where(topic_term_dists.sum(axis=1) == 0)[0]
    topic_term_dists[empty_clusters, :] = 1.0 / centers.shape[1]

    # Hard assignment: each document has weight 1 on its own cluster.
    doc_topic_dists = np.zeros((x.shape[0], centers.shape[0]))
    for d, label in enumerate(labels):
        doc_topic_dists[d, label] = 1

    doc_lengths = np.asarray(x.sum(axis=1)).ravel()
    # Cast to float first: writing 0.01 into an integer array would truncate
    # to 0 and defeat the zero guard below.
    term_frequency = np.asarray(x.sum(axis=0)).ravel().astype(float)
    term_frequency[term_frequency == 0] = 0.01  # preventing zeros

    prepare_kwargs = {'R': radius, 'lambda_step': lambda_step}
    if embedding_method == 'tsne':
        # NOTE(review): only the axis labels and topic sorting change here;
        # no `mds` argument is passed, so the layout method itself is the
        # pyLDAvis default -- confirm this is intended.
        prepare_kwargs['sort_topics'] = True
        prepare_kwargs['plot_opts'] = {'xlab': 't-SNE1', 'ylab': 't-SNE2'}

    return pyLDAvis.prepare(
        topic_term_dists,
        doc_topic_dists,
        doc_lengths,
        index2word,
        term_frequency,
        **prepare_kwargs
    )
def visualize_lda_mallet(self, **kwargs):
    """Prepare a pyLDAvis visualisation from a MALLET state file.

    The state file location is hard-coded below. Extra keyword arguments are
    forwarded to ``pyLDAvis.prepare``; ``sort_topics`` is always passed as
    False, so supplying it in ``kwargs`` raises a duplicate-keyword TypeError.

    Returns the object produced by ``pyLDAvis.prepare``.
    """
    mallet_dir = "../data/mallet_files"  # update this if needed
    state_name = 'state.mallet.gz'
    model_inputs = get_LDA_data(mallet_dir, state_name)
    return pyLDAvis.prepare(sort_topics=False, **model_inputs, **kwargs)
def prepare(lda_model, dtm, id2term, **kwargs):
    """Create Prepared Data from a fitted LDA model, its document-term matrix
    and an id-to-term mapping.

    Parameters
    ----------
    lda_model : sklearn.decomposition.LatentDirichletAllocation.
        Latent Dirichlet Allocation model from sklearn fitted with `dtm`
    dtm : array-like or sparse matrix, shape=(n_samples, n_features)
        Document-term matrix used to fit on LatentDirichletAllocation model
        (`lda_model`)
    id2term : dict
        The <feature id>:<term word> dictionary
    **kwargs :
        Keyword arguments merged over the extracted data before the call to
        ``lda.prepare``; on a key clash the caller's value is used.

    Returns
    -------
    prepared_data : PreparedData
        the data structures used in the visualization

    Example
    --------
    For example usage please see this notebook:
    http://nbviewer.ipython.org/github/bmabey/pyLDAvis/blob/master/notebooks/sklearn.ipynb

    See
    ------
    See `pyLDAvis.prepare` for **kwargs.
    """
    extracted = _extract_data(lda_model, dtm, id2term)
    return lda.prepare(**fp.merge(extracted, kwargs))
def generate_ldavis_data_v1(data_path, run_name, model, idx_to_word, freqs, vocab_size):
    """Launch a locally hosted pyLDAvis session showing the results of the model.

    The three embeddings are evaluated through ``model.sess``, document
    lengths are read from ``data_path / run_name / 'doc_lengths.npy'``, and
    the result of ``prepare_topics`` is passed to ``pyLDAvis.prepare``.
    """
    session = model.sess
    doc_embed = session.run(model.doc_embedding)
    topic_embed = session.run(model.topic_embedding)
    word_embed = session.run(model.word_embedding)

    # Words for indices 1..vocab_size, in index order.
    # NOTE! Keras Tokenizer indexes from 1, 0 is reserved for PAD token
    vocabulary = [idx_to_word[i] for i in range(1, vocab_size + 1)]

    # Read document lengths
    doc_lengths = np.load(data_path / run_name / 'doc_lengths.npy')

    # The `prepare_topics` function is a direct copy from Chris Moody
    vis_data = prepare_topics(
        doc_embed,
        topic_embed,
        word_embed,
        np.array(vocabulary),
        doc_lengths=doc_lengths,
        term_frequency=freqs,
        normalize=True,
    )
    pyLDAvis.show(pyLDAvis.prepare(**vis_data))
def prepare(model_data_path, ignore_topics=None, ignore_terms=None, **kwargs):
    """Create Prepared Data from the output stored at `model_data_path`.

    Parameters
    ----------
    model_data_path :
        Path where TwitterLDA stored its data output
    ignore_topics : list, optional
        Forwarded to ``_extract_data``; defaults to an empty list.
    ignore_terms : list, optional
        Forwarded to ``_extract_data``; defaults to an empty list.
    **kwargs :
        Keyword arguments merged over the extracted data before the call to
        ``pyLDAvis.prepare``. ``sort_topics`` is always forced to False,
        overriding any value supplied here.

    Returns
    -------
    prepared_data : PreparedData
        the data structures used in the visualization

    See
    ------
    See `pyLDAvis.prepare` for **kwargs.
    """
    # None sentinels replace the former `[]` defaults so that no list object
    # is shared between calls.
    ignore_topics = [] if ignore_topics is None else ignore_topics
    ignore_terms = [] if ignore_terms is None else ignore_terms

    opts = fp.merge(
        _extract_data(model_data_path, ignore_topics, ignore_terms), kwargs)
    opts['sort_topics'] = False
    return pyLDAvis.prepare(**opts)
def ldavis_show(metagenome, sample_probs, otu_probs, output=None):
    """Prepare pyLDAvis data for a metagenome, pickle it and save it as HTML.

    The prepared data is written to ``<outdir>/ldavis_prep.pkl``, read back
    from that file, and rendered to ``<figdir>/<output>``.

    NOTE(review): with the default ``output=None`` the HTML path ends in the
    literal text "None" -- confirm callers always pass a file name.
    """
    import pyLDAvis

    # One "Class;Genus" label per abundance column.
    taxonomy_rows = metagenome.taxonomy.data.loc[
        metagenome.abundance.columns, ['Class', 'Genus']]
    taxa_info = taxonomy_rows.apply(';'.join, axis=1)

    LDAvis_prepared = pyLDAvis.prepare(
        otu_probs.values,                               # (topics x otus)
        sample_probs,                                   # (samples x topics)
        metagenome.abundance.data.sum(axis=1),          # (samples)
        taxa_info,                                      # (otus)
        metagenome.abundance.data.sum(axis=0).values,   # (otus)
    )

    LDAvis_data_filepath = '{}/ldavis_prep.pkl'.format(str(metagenome.outdir))
    with open(LDAvis_data_filepath, 'wb') as sink:
        pickle.dump(LDAvis_prepared, sink)

    # load the pre-prepared pyLDAvis data from disk
    with open(LDAvis_data_filepath, 'rb') as source:
        LDAvis_prepared = pickle.load(source)

    html_path = '{}/{}'.format(metagenome.figdir, output)
    pyLDAvis.save_html(LDAvis_prepared, html_path)
def prepare(topic_model, docs, **kargs):
    """Transforms the GraphLab TopicModel and related corpus data into
    the data structures needed for the visualization.

    Parameters
    ----------
    topic_model : graphlab.toolkits.topic_model.topic_model.TopicModel
        An already trained GraphLab topic model.
    docs : SArray of dicts
        The corpus in bag of word form, the same docs used to train the model.
    **kargs :
        Additional keyword arguments, merged over the extracted data and
        passed through to :func:`pyLDAvis.prepare`.

    Returns
    -------
    prepared_data : PreparedData
        the data structures used in the visualization

    Example
    --------
    For example usage please see this notebook:
    http://nbviewer.ipython.org/github/bmabey/pyLDAvis/blob/master/notebooks/GraphLab.ipynb
    """
    extracted = _extract_data(topic_model, docs)
    return pyLDAvis.prepare(**fp.merge(extracted, kargs))
def lda_viz(docs, lengths, n_features, n_topics, n_top_words):
    """Fit an LDA model on raw documents and build pyLDAvis prepared data.

    Parameters
    ----------
    docs : iterable of str
        Raw documents, vectorised with ``CountVectorizer``.
    lengths : array-like
        Passed to ``pyLDAvis.prepare`` as ``doc_lengths``.
    n_features : int
        ``max_features`` for the vectorizer.
    n_topics : int
        Number of topics for ``LatentDirichletAllocation``.
    n_top_words : int
        Unused; kept for interface compatibility.

    Returns
    -------
    (prepared, doc_topic_dists)
        The pyLDAvis prepared data and the row-normalised document-topic
        matrix.
    """
    def _row_normalize(data):
        # Divide every row by its own sum.
        return pandas.DataFrame(data).div(data.sum(1), axis=0).values

    vect = CountVectorizer(max_df=0.95, min_df=2,
                           max_features=n_features,
                           stop_words='english')
    vected = vect.fit_transform(docs)

    # NOTE(review): `n_topics=` and `get_feature_names()` belong to older
    # scikit-learn APIs -- confirm against the pinned scikit-learn version.
    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0)
    doc_topic_dists = _row_normalize(lda.fit_transform(vected))

    prepared = pyLDAvis.prepare(
        doc_lengths=lengths,
        vocab=vect.get_feature_names(),
        term_frequency=vected.sum(axis=0).tolist()[0],
        topic_term_dists=_row_normalize(lda.components_),
        doc_topic_dists=doc_topic_dists,
    )
    return prepared, doc_topic_dists
def generate_ldavis_data(data_path, run_name, model, idx_to_word, freqs, vocab_size):
    """Launch a locally hosted pyLDAvis session that visualizes the results of
    the model.

    The three embeddings are evaluated through ``model.sesh`` and document
    lengths are read from ``<data_path>/<run_name>/doc_lengths.npy``.
    """
    run = model.sesh.run
    doc_embed = run(model.doc_embedding)
    topic_embed = run(model.topic_embedding)
    word_embed = run(model.word_embedding)

    # Extract all unique words in order of index 0-vocab_size
    vocabulary = [idx_to_word[i] for i in range(vocab_size)]

    # Read in document lengths
    lengths_file = "/".join((data_path, run_name, "doc_lengths.npy"))
    doc_lengths = np.load(lengths_file)

    # The prepare_topics function is a direct copy from Chris Moody
    vis_data = prepare_topics(
        doc_embed,
        topic_embed,
        word_embed,
        np.array(vocabulary),
        doc_lengths=doc_lengths,
        term_frequency=freqs,
        normalize=True,
    )
    pyLDAvis.show(pyLDAvis.prepare(**vis_data))
def prepare(lda_model, dtm, vectorizer, **kwargs):
    """Create Prepared Data from sklearn's LatentDirichletAllocation and
    CountVectorizer.

    Parameters
    ----------
    lda_model : sklearn.decomposition.LatentDirichletAllocation.
        Latent Dirichlet Allocation model from sklearn fitted with `dtm`
    dtm : array-like or sparse matrix, shape=(n_samples, n_features)
        Document-term matrix used to fit on LatentDirichletAllocation model
        (`lda_model`)
    vectorizer : sklearn.feature_extraction.text.(CountVectorizer, TfIdfVectorizer).
        vectorizer used to convert raw documents to document-term matrix
        (`dtm`)
    **kwargs :
        Keyword arguments merged over the extracted data and passed to
        pyLDAvis.prepare()

    Returns
    -------
    prepared_data : PreparedData
        the data structures used in the visualization

    Example
    --------
    For example usage please see this notebook:
    http://nbviewer.ipython.org/github/bmabey/pyLDAvis/blob/master/notebooks/sklearn.ipynb

    See
    ------
    See `pyLDAvis.prepare` for **kwargs.
    """
    extracted = _extract_data(lda_model, dtm, vectorizer)
    return pyLDAvis.prepare(**fp.merge(extracted, kwargs))
def plot_pyldavis(topic_model, document_topic_matrix, document_term_matrix, file=None, **kwargs):
    """
    Generate a pyLDAvis visualization of the given topic model. For more
    information about the visualization read the
    `original paper <http://www.aclweb.org/anthology/W14-3110>`_ by Sievert and Shirley.

    Note that pyLDAvis only supports LDA models, passing a nmf model will
    cause an exception.

    :param topic_model: Topic model exposing ``model_name``,
        ``get_topic_token_matrix(normalize=True)`` and ``id2token``.
    :param document_topic_matrix: A document-topic matrix as returned by
        calling get_document_topic_matrix() on a topic model.
    :param document_term_matrix: Term count weighted document-term matrix of
        the documents used to infer the document_topic_matrix. Its axis sums
        must provide ``getA1()``.
    :param file: Name of the HTML output, resolved under
        ``<module dir>/../reports/figures/pyLDAvis``. If no file is passed
        the plot is visualized in the browser.
    :type file: str
    :param kwargs: Further parameters passed directly to pyLDAvis's prepare
        function. See the
        `documentation <http://pyldavis.readthedocs.io/en/latest/modules/API.html#pyLDAvis.prepare>`_
        for options. Note, that sort_topics=False is already set.
    :raises Exception: if ``topic_model.model_name`` is not ``'lda'``.
    """
    model_name = topic_model.model_name
    if model_name != 'lda':
        raise Exception('pyLDAvis only supports LDA. {} not supported'.format(model_name))

    topic_terms = topic_model.get_topic_token_matrix(normalize=True)
    vocabulary = topic_model.id2token
    doc_sizes = np.sum(document_term_matrix, axis=1).getA1()
    term_totals = np.sum(document_term_matrix, axis=0).getA1()

    prepared_data = pyLDAvis.prepare(topic_terms, document_topic_matrix, doc_sizes,
                                     vocabulary, term_totals,
                                     sort_topics=False, **kwargs)

    root_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..')
    report_dir = os.path.join(root_dir, 'reports')

    if not file:
        pyLDAvis.show(prepared_data)
        return

    target = os.path.join(report_dir, 'figures/pyLDAvis', file)
    with open(target, 'w') as handle:
        pyLDAvis.save_html(prepared_data, handle)
def generate_ldavis_data(data_path, model, idx_to_word, freqs, vocab_size):
    """
    Launch a locally hosted session of pyLDAvis to visualize the results of
    the model.

    :param data_path: (PosixPath) data location; ``doc_lengths.npy`` is read
        from it
    :param model: TensorFlow model
    :param idx_to_word: (dict) index-to-word mapping
    :param freqs: (list) frequency counts of each token
    :param vocab_size: (int) size of vocabulary
    :return: None
    """
    session = model.sess
    doc_embed = session.run(model.doc_embedding)
    topic_embed = session.run(model.topic_embedding)
    word_embed = session.run(model.word_embedding)

    # Slot 0 is the PAD token, followed by the words for indices
    # 1..vocab_size-1 in index order.
    # NOTE! Keras Tokenizer indexes from 1, 0 is reserved for PAD token
    vocabulary = ['<PAD>'] + [idx_to_word[i] for i in range(1, vocab_size)]

    # Read document lengths
    doc_lengths = np.load(data_path / 'doc_lengths.npy')

    # The `prepare_topics` function is a direct copy from Chris Moody
    vis_data = prepare_topics(
        doc_embed,
        topic_embed,
        word_embed,
        np.array(vocabulary),
        doc_lengths=doc_lengths,
        term_frequency=freqs,
        normalize=True,
    )
    pyLDAvis.show(pyLDAvis.prepare(**vis_data))
def genSTTMHtml(data, uid):
    """Fit a short-text topic model on `data` and save a pyLDAvis HTML page.

    Each entry of `data` is tokenised with ``chinese_word_cut``; null results
    and token lists of length 2 or less are dropped. With fewer than two
    remaining documents the function prints a message and returns None.
    The number of topics is ``len(docs) // 100`` clamped to the range 2..10.

    Returns the path ``/assets/html/<uid>.html`` after writing the page to
    ``save_html_dir``.
    """
    print('数据预处理...')
    tokenised = Series(data).apply(chinese_word_cut)  # word segmentation
    non_null = tokenised[tokenised.notnull()]
    docs = [tokens for tokens in non_null if len(tokens) > 2]
    if len(docs) < 2:
        print('数据量过少')
        return

    K = min(max(len(docs) // 100, 2), 10)

    mgp = gen_mgp(K)
    vocab = {token for doc in docs for token in doc}
    mgp.fit(docs, len(vocab))
    showResult(mgp)

    print('模型可视化...')
    vis_inputs = prepare_data(mgp, docs, vocab)
    movies_vis_data = pyLDAvis.prepare(**vis_inputs)

    filename = '{}.html'.format(uid)
    pyLDAvis.save_html(movies_vis_data, os.path.join(save_html_dir, filename))
    return os.path.join('/assets', 'html', filename)
def pylda_vis(args, model, corpus, time_slices, pre):
    """
    Visualize the model with pyLDAvis, writing one HTML file per time slice.

    input:
        args (argparse object): input arguments (not used here)
        model: LDA model to visualize; must provide ``dtm_vis``
        corpus: corpus to run LDA over
        time_slices (list): list containing number of files per time slice
        pre (str): path prefix to save all results to
    returns: None
    """
    print(timestamp() + " About to visualize...", file=sys.stderr)

    for slice_idx in range(len(time_slices)):
        (doc_topic, topic_term, doc_lengths,
         term_frequency, vocab) = model.dtm_vis(time=slice_idx, corpus=corpus)

        vis_wrapper = pyLDAvis.prepare(
            topic_term_dists=topic_term,
            doc_topic_dists=doc_topic,
            doc_lengths=doc_lengths,
            vocab=vocab,
            term_frequency=term_frequency,
            sort_topics=True,
        )
        out_file = pre + "time_slice_" + str(slice_idx) + ".html"
        pyLDAvis.save_html(vis_wrapper, out_file)

        print(timestamp() + " Prepared time slice", slice_idx,
              "for pyLDAvis...", file=sys.stderr)
def plot_lda_vis(model_data, mode='show', filename=None):
    """Designed to work with to_py_lda_vis() in the model classes.

    `model_data` is unpacked as keyword arguments to ``pyLDAvis.prepare``.
    The result is saved to `filename` when ``mode == 'save_html'`` and a
    filename is given; otherwise it is shown.
    """
    from pyLDAvis import prepare, save_html, show

    model_vis_data = prepare(**model_data)

    if mode != 'save_html' or not filename:
        show(model_vis_data)
        return
    save_html(model_vis_data, filename)
def learn_topic_model(X, vocab, graphlets, config, dbg=False):
    """Fit an LDA topic model on `X` and, if pyLDAvis is importable, save a
    visualisation and inspect the objects behind each topic.

    Parameters
    ----------
    X : matrix
        Input passed to ``lda.LDA.fit``. Counts of non-zero entries along
        each axis are used as term frequencies and document lengths.
    vocab : sequence
        Vocabulary aligned with the columns of the topic-word matrix; its
        entries are used as keys into `graphlets`.
    graphlets : dict
        Mapping from vocabulary entry to a graphlet; its keys are passed to
        pyLDAvis as the vocabulary.
    config : dict
        Must contain ``'dirichlet_params'`` (with ``'alpha'`` and ``'eta'``),
        ``'n_topics'`` and ``'n_iters'``.
    dbg : bool
        When true, print the distinct objects collected for each topic.

    Returns
    -------
    (doc_topic, topic_word)
        ``model.doc_topic_`` and ``model.topic_word_`` of the fitted model.
    """
    dirichlet = config['dirichlet_params']
    alpha = dirichlet['alpha']
    eta = dirichlet['eta']
    model = lda.LDA(n_topics=config['n_topics'], n_iter=config['n_iters'],
                    random_state=1, alpha=alpha, eta=eta)

    model.fit(X)  # model.fit_transform(X) is also available
    topic_word = model.topic_word_  # model.components_ also works
    n_top_words = 30

    # Number of non-zero entries per column and per row.
    feature_freq = (X != 0).sum(axis=0)
    doc_lengths = (X != 0).sum(axis=1)

    # Single-argument print() behaves the same on Python 2 and Python 3.
    print("phi: %s. theta: %s. nd: %s. vocab: %s. Mw: %s" % (
        model.topic_word_.shape, model.doc_topic_.shape, doc_lengths.shape,
        len(graphlets.keys()), len(feature_freq)))

    # Only the optional import is guarded, so an ImportError raised by the
    # visualisation code itself is no longer reported as "pyLDAvis missing".
    try:
        import pyLDAvis
    except ImportError:
        print("No module pyLDAvis. Cannot visualise topic model")
    else:
        vis_data = pyLDAvis.prepare(model.topic_word_, model.doc_topic_,
                                    doc_lengths, graphlets.keys(), feature_freq)

        # NOTE(review): `id` is not defined in this function. Unless the
        # module defines a global string named `id`, this concatenates the
        # builtin function and raises TypeError -- confirm.
        html_file = "../LDAvis/Learnt_Models/topic_model_" + id + ".html"
        pyLDAvis.save_html(vis_data, html_file)
        print("PyLDAVis ran. output: %s" % html_file)

        # Investigate the objects used in the topics.
        print("\ntype(topic_word): {}".format(type(topic_word)))
        print("shape: {}".format(topic_word.shape))
        topics = {}
        for i, topic_dist in enumerate(topic_word):
            objs = []
            # The n_top_words highest-weighted vocabulary entries, best first.
            topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
            for j in [graphlets[k] for k in topic_words]:
                objs.extend(object_nodes(j)[0])
            topics[i] = objs
            if dbg:
                print('Topic {}: {}'.format(i, list(set(objs))))

    # Investigate the highly probable topics in each document.
    # Each document's most probable topic is not derived here because the
    # document UUIDs are not available.
    doc_topic = model.doc_topic_
    return doc_topic, topic_word
def lda_vis(modeled_corpus, mode='show', filename=None):
    """Designed to work with to_py_lda_vis() in the model classes.

    Converts `modeled_corpus` with ``_to_py_lda_vis`` and prepares it with
    pyLDAvis. The result is saved (and the target logged) when
    ``mode == 'save_html'`` and a filename is given; otherwise it is served
    on 0.0.0.0:8888.
    """
    from pyLDAvis import prepare, show, save_html

    prepared_model_vis_data = prepare(**_to_py_lda_vis(modeled_corpus))

    if mode != 'save_html' or not filename:
        show(prepared_model_vis_data, ip="0.0.0.0", port=8888)
        return

    logging.info("Saving pyLDAVis to {}".format(filename))
    save_html(prepared_model_vis_data, filename)
def lda_vis(modeled_corpus, mode='show', filename=None):
    """Designed to work with to_py_lda_vis() in the model classes.

    Converts `modeled_corpus` with ``_to_py_lda_vis`` and prepares it with
    pyLDAvis. The result is saved to `filename` when ``mode == 'save_html'``
    and a filename is given; otherwise it is shown.
    """
    from pyLDAvis import prepare, show, save_html

    prepared_model_vis_data = prepare(**_to_py_lda_vis(modeled_corpus))

    if mode != 'save_html' or not filename:
        show(prepared_model_vis_data)
        return
    save_html(prepared_model_vis_data, filename)
def lda_viz(topic_2_term, topic_2_doc, doc_lengths, vocab_, term_frequency):
    """Prepare pyLDAvis data from the given distributions and write it to
    ``ldaviz.html`` in the current working directory.

    Topics are numbered from 0 (``start_index=0``) and kept in the given
    order (``sort_topics=False``).
    """
    prepare_args = dict(
        topic_term_dists=topic_2_term,
        doc_topic_dists=topic_2_doc,
        doc_lengths=doc_lengths,
        vocab=vocab_,
        term_frequency=term_frequency,
        start_index=0,
        sort_topics=False,
    )
    # create pyLDAvis object
    prepared_data = pyLDAvis.prepare(**prepare_args)
    pyLDAvis.save_html(prepared_data, "ldaviz.html")
def ldavis_create(lda, corpus, gensim_dict,
                  LDAvis_data_filepath=fpathroot + fpathappend + '_lda_vis',
                  return_ldavis=False):
    """Prepare pyLDAvis data, pickle it, then return or display it.

    Parameters
    ----------
    lda, corpus, gensim_dict :
        Passed positionally to ``pyLDAvis.prepare``.
    LDAvis_data_filepath : str
        Where the prepared data is pickled. The default is built once, at
        definition time, from the module-level ``fpathroot`` and
        ``fpathappend``.
    return_ldavis : bool
        When true the prepared data is returned; otherwise it is passed to
        ``pyLDAvis.display``.
    """
    # NOTE(review): these three arguments look like the gensim wrapper's
    # (model, corpus, dictionary) signature rather than the five arrays the
    # other `pyLDAvis.prepare` calls in this file pass -- confirm which
    # entry point is intended.
    LDAvis_prepared = pyLDAvis.prepare(lda, corpus, gensim_dict)

    # pickle writes bytes, so the file must be opened in binary mode.
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)

    if return_ldavis:
        return LDAvis_prepared
    pyLDAvis.display(LDAvis_prepared)
def visualise_ldamallet_topics(dataset, alpha, num_topic):
    '''
    Extracts relevant information from ldamallet's LDA model and prepares
    the topics for visualisation with pyLDAvis.

    The model is read from
    ``data/topic_models/basic/<dataset>_alpha<alpha>_<num_topic>/ldamallet``
    (e.g. Semeval_alpha50_20). Notebook mode is enabled as a side effect.

    :return: visualisation (the prepared pyLDAvis data)
    '''
    dir_template = 'data/topic_models/basic/{}_alpha{}_{}/ldamallet'
    ldamallet_dir = dir_template.format(dataset, alpha, num_topic)

    convertedLDAmallet = convertLDAmallet(dataDir=ldamallet_dir,
                                          filename='state.mallet.gz')
    pyLDAvis.enable_notebook()
    return pyLDAvis.prepare(**convertedLDAmallet)
def new(cls, name: str, dataset: Dataset, model: TopicModel, **kwargs) -> "Visualizer":
    """Prepare pyLDAvis data for `model` on `dataset`, save it as JSON and
    return an instance pointing at that file.

    The JSON is written to ``common.PROJDIR / "<name>.LDAvis.json"``. Extra
    keyword arguments are forwarded to ``pyLDAvis.prepare``.
    """
    path = common.PROJDIR / (name + ".LDAvis.json")

    topic_word = model.get_topic_word_matrix(normalize=True)
    doc_topic = model.get_doc_topic_matrix(normalize=True)
    # Row sums of the count matrix are the document lengths, column sums the
    # term frequencies.
    doc_lengths = dataset.get_count_matrix().sum(axis=1).squeeze()
    vocab = [word.decode() for word in dataset.get_vocab()]
    term_frequency = dataset.get_count_matrix().sum(axis=0).squeeze()

    prepared = pyLDAvis.prepare(topic_word, doc_topic, doc_lengths,
                                vocab, term_frequency, **kwargs)
    pyLDAvis.save_json(prepared, str(path))
    return cls(path)
def make_pyLDAVis(self, mdl, visualization_file='./visualization.html'):
    """Build pyLDAvis data from the topic model `mdl` and save it as HTML.

    `mdl` must expose ``k``, ``get_topic_word_dist``, ``docs``,
    ``used_vocabs`` and ``used_vocab_freq``. The page is written to
    `visualization_file`.
    """
    import pyLDAvis

    per_topic = [mdl.get_topic_word_dist(k) for k in range(mdl.k)]
    topic_term_dists = np.stack(per_topic)

    per_doc = [doc.get_topic_dist() for doc in mdl.docs]
    doc_topic_dists = np.stack(per_doc)
    # Normalise each document's row in place so it sums to 1.
    doc_topic_dists /= doc_topic_dists.sum(axis=1, keepdims=True)

    doc_lengths = np.array([len(doc.words) for doc in mdl.docs])
    vocab = list(mdl.used_vocabs)
    term_frequency = mdl.used_vocab_freq

    prepared_data = pyLDAvis.prepare(topic_term_dists, doc_topic_dists,
                                     doc_lengths, vocab, term_frequency)
    pyLDAvis.save_html(prepared_data, visualization_file)
def prepare_visualization(self, documents: List[List[str]]) -> pyLDAvis:
    """
    Prepare documents for visualization from trained model

    The vocabulary is the set of all words in ``self.cluster_word_distribution``.
    Document-topic scores come from ``self.predict_proba``; NaN scores are
    replaced by ``1 / self.n_clusters`` and rows that do not sum to a positive
    value become uniform. Each cluster's word frequencies are normalised into
    a row of the topic-term matrix (uniform when the cluster is empty).

    :param documents: List[List[str]]
        Tokenized documents
    :return: pyLDAvis
        Prepared data returned by ``pyLDAvis.prepare`` (R=30,
        lambda_step=0.01, sort_topics=False)
    """
    _voc: List[str] = [term
                       for cluster in self.cluster_word_distribution
                       for term in list(cluster.keys())]
    _vocabulary: List[str] = list(set(_voc))

    _doc_topic_distances: List[List[float]] = [self.predict_proba(doc) for doc in documents]
    for scores in _doc_topic_distances:
        for score in scores:
            assert not isinstance(score, complex)

    _doc_len = [len(doc) for doc in documents]

    # Corpus-wide count of every word occurring in the documents.
    _word_counts_map: dict = {}
    for doc in documents:
        for word in doc:
            _word_counts_map[word] = _word_counts_map.get(word, 0) + 1
    _word_counts: list = [_word_counts_map[term] for term in _vocabulary]

    # Replace NaN scores with the uniform weight.
    _doc_topic_distances_ext: list = []
    for scores in _doc_topic_distances:
        _doc_topic_distances_ext.append(
            [1 / self.n_clusters if math.isnan(v) else v for v in scores])
    # Rows without positive mass become a uniform distribution.
    _doc_topic_distances_ext = [
        d if sum(d) > 0 else [1 / self.n_clusters] * self.n_clusters
        for d in _doc_topic_distances_ext]

    for scores in _doc_topic_distances_ext:
        for score in scores:
            assert not isinstance(score, complex)

    assert (pd.DataFrame(_doc_topic_distances_ext).sum(axis=1) < 0.999).sum() == 0

    _word_matrix: list = []
    for cluster in self.cluster_word_distribution:
        _total: float = sum([frequency for word, frequency in cluster.items()])
        assert not math.isnan(_total)

        if _total == 0:
            _row: list = [(1 / len(_vocabulary))] * len(_vocabulary)
        else:
            _row: list = [cluster.get(word, 0) / _total for word in _vocabulary]

        for weight in _row:
            assert not isinstance(weight, complex)
        _word_matrix.append(_row)

    return pyLDAvis.prepare(
        topic_term_dists=_word_matrix,
        doc_topic_dists=_doc_topic_distances_ext,
        doc_lengths=_doc_len,
        vocab=_vocabulary,
        term_frequency=_word_counts,
        R=30,
        lambda_step=0.01,
        sort_topics=False,
    )
def tpc_vis(doc_wds_mat, tpc_wds_mat, doc_tpc_mat, vocab, outfile):
    """Prepare pyLDAvis data from topic-model matrices and save it as HTML.

    Parameters
    ----------
    doc_wds_mat : matrix
        Document-word matrix. Its axis sums must expose ``.A``; row sums
        give the document lengths and column sums the term frequencies.
    tpc_wds_mat : array-like
        Topic-word distributions (``topic_term_dists``).
    doc_tpc_mat : array-like
        Document-topic distributions (``doc_topic_dists``).
    vocab : array-like
        Vocabulary.
    outfile : str
        Path of the HTML file to write.
    """
    # The arguments are used directly. The previous version wrapped them in a
    # one-element list and then indexed positions 1-3 of it, which raised
    # IndexError.
    data = {
        'topic_term_dists': tpc_wds_mat,
        'doc_topic_dists': doc_tpc_mat,
        'doc_lengths': doc_wds_mat.sum(axis=1).A.squeeze(),
        'vocab': vocab,
        'term_frequency': doc_wds_mat.sum(axis=0).A.squeeze(),
    }
    vis_data = pyLDAvis.prepare(**data)
    pyLDAvis.save_html(vis_data, outfile)
def single_pyLDAvis(N, fin_tmpl, fout_tmpl, mds):
    """Load one pickled model and write its pyLDAvis HTML page.

    `fin_tmpl` is formatted with ``n=N`` to locate the pickle; `fout_tmpl`
    is formatted with ``n=N`` and ``kind=mds`` to name the HTML output.

    Returns ``(model_data, html_out)``.
    """
    model_path = fin_tmpl.format(n=N)
    model_data = LDAp.load_model_from_pkl(model_path)

    # pyLDAvis offers three 2D dimension-reduction methods for `mds`:
    # 'PCOA', 'TSNE' and 'MMDS'.
    # Source: https://pyldavis.readthedocs.io/en/latest/modules/API.html#pyLDAvis.prepared_data_to_html
    vis_data = pyLDAvis.prepare(**model_data, mds=mds)

    html_out = fout_tmpl.format(n=N, kind=mds)

    # The d3, ldavis and ldavis_css resources must be downloaded beforehand
    # and placed in the matching directory, relative to the html_out directory.
    local_assets = {
        'd3_url': "js/d3.min.js",
        'ldavis_url': 'js/ldavis.v1.0.0.js',
        'ldavis_css_url': 'js/ldavis.v1.0.0.css',
    }
    pyLDAvis.save_html(vis_data, html_out, **local_assets)
    return (model_data, html_out)
def lda_vis(self, n_words=30, name='model'):
    '''
    DESC: Creates pyLDAvis figure. Requires LDA topic_analysis model
    --Input--
        n_words = number of words to display in the barcharts of figure
        name    = suffix of the saved file name ('pyLDAvis_' + name)
    ----------------------------------
    --Output--
        Stores the prepared data on self.ldavis, saves it with
        pyLDAvis.save_html and shows the pyLDAvis figure in html browser
    '''
    doc_lengths = [len(doc) for doc in self.corpus]
    vocab_lst = self.vectorizer.feature_names
    term_freq = textacy.vsm.get_doc_freqs(self.tfidf, normalized=False)

    # Request every vocabulary term for every topic, with weights.
    topic_terms_tups = list(
        self.model.top_topic_terms(self.vectorizer.feature_names,
                                   topics=-1,
                                   top_n=len(vocab_lst),
                                   weights=True))

    # Re-order each topic's (term, weight) pairs into vocabulary order.
    # A dict lookup replaces a linear scan of the topic for every vocabulary
    # word; terms missing from a topic are skipped, as before.
    # Assumes each term appears at most once per topic -- TODO confirm.
    topic_weight = []
    for topic in topic_terms_tups:
        weight_by_term = dict(topic[1])
        topic_weight.append(
            [weight_by_term[word] for word in vocab_lst if word in weight_by_term])
    topic_term = np.array(topic_weight)

    self.ldavis = pyLDAvis.prepare(topic_term,
                                   self.topic_matrix,
                                   doc_lengths,
                                   vocab_lst,
                                   term_freq,
                                   R=n_words,
                                   mds='mmds',
                                   sort_topics=False)
    pyLDAvis.save_html(self.ldavis, 'pyLDAvis_' + name)
    print('plotting...')
    pyLDAvis.show(self.ldavis)
def visualize_topics(self, notebook_mode = False, mds = 'pcoa'):
    """
    Print important topics based on decomposition.

    Parameters
    ----------
    notebook_mode : bool, optional (default=False)
        If True, enable pyLDAvis notebook mode and return the prepared data;
        otherwise show the visualization.
    mds : str, optional (default='pcoa')
        2D Decomposition. Allowed values:

        * ``'pcoa'`` - Dimension reduction via Jensen-Shannon Divergence & Principal Coordinate Analysis (aka Classical Multidimensional Scaling)
        * ``'mmds'`` - Dimension reduction via Multidimensional scaling
        * ``'tsne'`` - Dimension reduction via t-distributed stochastic neighbor embedding

        NOTE(review): `mds` is validated but never passed to
        ``pyLDAvis.prepare`` -- confirm whether it should be.

    Returns
    -------
    The prepared pyLDAvis data when ``notebook_mode`` is True, else None.

    Raises
    ------
    ValueError
        If `mds` is not a string or `notebook_mode` is not a boolean.
    Exception
        If pyLDAvis cannot be imported.
    """
    if not isinstance(mds, str):
        raise ValueError('mds must be a string')
    if not isinstance(notebook_mode, bool):
        raise ValueError('notebook_mode must be a boolean')

    try:
        import pyLDAvis
        import pyLDAvis.sklearn
    except Exception:
        # `except Exception` rather than a bare `except`, so that
        # KeyboardInterrupt and SystemExit are not converted into a
        # "not installed" error.
        raise Exception(
            'pyldavis not installed. Please install it and try again.'
        )

    if notebook_mode:
        pyLDAvis.enable_notebook()

    vis_data = _prepare_topics(
        self._doc_embed,
        self._topic_embed,
        self._word_embed,
        np.array(self._features),
        doc_lengths = self._doc_len,
        term_frequency = self._freqs,
        normalize = True,
    )
    prepared_vis_data = pyLDAvis.prepare(**vis_data)

    if notebook_mode:
        return prepared_vis_data
    pyLDAvis.show(prepared_vis_data)
def generate_ldavis_data(data_path, model, idx_to_word, freqs, vocab_size):
    """Prepare the results of the model with pyLDAvis and pass them to
    ``pyLDAvis.display``.

    Parameters
    ----------
    data_path : str
        Location where your data is stored; ``doc_lengths.npy`` is read
        from it.
    model : Lda2Vec
        Loaded lda2vec tensorflow model.
    idx_to_word : dict
        index to word mapping dictionary
    freqs : list
        Frequencies of each token.
    vocab_size : int
        Total size of your vocabulary (not used here)
    """
    run = model.sesh.run
    doc_embed = run(model.mixture.doc_embedding)
    topic_embed = run(model.mixture.topic_embedding)
    word_embed = run(model.w_embed.embedding)

    # All words, in the iteration order of the idx_to_word mapping.
    vocabulary = list(idx_to_word.values())

    # Read in document lengths
    doc_lengths = np.load(data_path + "/doc_lengths.npy")

    # The prepare_topics function is a direct copy from Chris Moody
    vis_data = prepare_topics(
        doc_embed,
        topic_embed,
        word_embed,
        np.array(vocabulary),
        doc_lengths=doc_lengths,
        term_frequency=freqs,
        normalize=True,
    )
    pyLDAvis.display(pyLDAvis.prepare(**vis_data))
def pyLDA(topic_term_dists, doc_topic_dists, doc_lengths, vocab, term_frequency):
    """
    Use pyLDAvis to show results in the browser. Notebook mode is enabled
    first, then the prepared data is shown.

    topic_term_dists : array-like, shape (`n_topics`, `n_terms`)
        Matrix of topic-term probabilities. Where `n_terms` is `len(vocab)`.
    doc_topic_dists : array-like, shape (`n_docs`, `n_topics`)
        Matrix of document-topic probabilities.
    doc_lengths : array-like, shape `n_docs`
        The length of each document, i.e. the number of words in each
        document. The order of the numbers should be consistent with the
        ordering of the docs in `doc_topic_dists`.
    vocab : array-like, shape `n_terms`
        List of all the words in the corpus used to train the model.
    term_frequency : array-like, shape `n_terms`
        The count of each particular term over the entire corpus. The
        ordering of these counts should correspond with `vocab` and
        `topic_term_dists`.
    """
    pyLDAvis.enable_notebook(True)
    prepare_inputs = (topic_term_dists, doc_topic_dists, doc_lengths,
                      vocab, term_frequency)
    pyLDAvis.show(pyLDAvis.prepare(*prepare_inputs))
def visualize(self):
    """Build concept-word distributions from the training data and save a
    pyLDAvis page to ``<self.out_dir>/visualization.html``.

    Runs without gradient tracking.
    """
    with torch.no_grad():
        doc_concept_probs = self.get_train_doc_concept_probs()

        # [n_concepts, vocab_size] weighted word counts of each concept
        weighted_counts = torch.matmul(doc_concept_probs.transpose(0, 1),
                                       self.bow_train)

        # normalize word counts to word distribution of each concept
        row_totals = weighted_counts.sum(1, True)
        concept_word_dists = weighted_counts / row_totals

        # fill NaN with 1/vocab_size in case a concept has all zero word
        # distribution (NaN is the only value that differs from itself)
        nan_mask = concept_word_dists != concept_word_dists
        concept_word_dists[nan_mask] = 1.0 / concept_word_dists.shape[1]

        vis_data = pyLDAvis.prepare(
            topic_term_dists=concept_word_dists.data.cpu().numpy(),
            doc_topic_dists=doc_concept_probs.data.cpu().numpy(),
            doc_lengths=self.doc_lens,
            vocab=self.vocab,
            term_frequency=self.word_counts,
        )

        html_path = os.path.join(self.out_dir, "visualization.html")
        pyLDAvis.save_html(vis_data, html_path)
def generate_ldavis_data(clean_data_dir, model, idx_to_word, freqs, vocab_size):
    """Prepare the results of the model with pyLDAvis and show them.

    The embeddings are evaluated through ``model.sesh`` and document lengths
    are read from ``<clean_data_dir>/doc_lengths.npy``. `vocab_size` is not
    used here.
    """
    run = model.sesh.run
    doc_embed = run(model.mixture.doc_embedding)
    topic_embed = run(model.mixture.topic_embedding)
    word_embed = run(model.w_embed.embedding)

    # All words, in the iteration order of the idx_to_word mapping.
    vocabulary = list(idx_to_word.values())

    doc_lengths = np.load(clean_data_dir + "/doc_lengths.npy")

    vis_data = prepare_topics(
        doc_embed,
        topic_embed,
        word_embed,
        np.array(vocabulary),
        doc_lengths=doc_lengths,
        term_frequency=freqs,
    )
    pyLDAvis.show(pyLDAvis.prepare(**vis_data))
def lda_viz(docs, lengths, n_features, n_topics, n_top_words):
    """Fit an LDA model on raw documents and build pyLDAvis prepared data.

    Parameters
    ----------
    docs : iterable of str
        Raw documents, vectorised with ``CountVectorizer``.
    lengths : array-like
        Passed to ``pyLDAvis.prepare`` as ``doc_lengths``.
    n_features : int
        ``max_features`` for the vectorizer.
    n_topics : int
        Number of topics for ``LatentDirichletAllocation``.
    n_top_words : int
        Unused; kept for interface compatibility.

    Returns
    -------
    (prepared, doc_topic_dists)
        The pyLDAvis prepared data and the row-normalised document-topic
        matrix.
    """
    def _row_normalize(data):
        # Divide every row by its own sum.
        return pandas.DataFrame(data).div(data.sum(1), axis=0).values

    vect = CountVectorizer(max_df=0.95, min_df=2,
                           max_features=n_features,
                           stop_words='english')
    vected = vect.fit_transform(docs)

    # NOTE(review): `n_topics=` and `get_feature_names()` belong to older
    # scikit-learn APIs -- confirm against the pinned scikit-learn version.
    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0)
    doc_topic_dists = _row_normalize(lda.fit_transform(vected))

    prepared = pyLDAvis.prepare(
        doc_lengths=lengths,
        vocab=vect.get_feature_names(),
        term_frequency=vected.sum(axis=0).tolist()[0],
        topic_term_dists=_row_normalize(lda.components_),
        doc_topic_dists=doc_topic_dists,
    )
    return prepared, doc_topic_dists
def main(start, end, increment):
    """Train one biterm topic model per topic count and save a pyLDAvis page
    for each.

    Tweets are read from ``Processed.json`` under a hard-coded data
    directory; item 1 of every record is joined into one text. For every
    ``k`` in ``range(start, end, increment)`` a model with ``k`` topics is
    fitted chunk by chunk (module-level ``chunksize`` and ``iterations``) and
    its visualization written to ``Visualizations/BTMVisualization<k>.html``.
    """
    path = Path('C:/Data/Python/JobLoss')

    with open(path / 'Processed.json') as f:
        data = json.load(f)
    data_words = [' '.join(tweet[1]) for tweet in data]

    vec = CountVectorizer()
    X = vec.fit_transform(data_words).toarray()
    vocab = np.array(vec.get_feature_names())
    biterms = vec_to_biterms(X)

    for k in range(start, end, increment):
        print('Model %s' % k)
        btm = oBTM(num_topics=k, V=vocab)

        # Fit incrementally, one chunk of biterms at a time.
        for offset in range(0, len(biterms), chunksize):
            print('%s / %s' % (offset, len(biterms)))
            btm.fit(biterms[offset:offset + chunksize], iterations=iterations)

        topics = btm.transform(biterms)

        doc_lengths = np.count_nonzero(X, axis=1)
        term_frequency = np.sum(X, axis=0)
        vis = pyLDAvis.prepare(btm.phi_wz.T, topics, doc_lengths,
                               vocab, term_frequency)

        out_file = path / ('Visualizations/BTMVisualization%s.html' % k)
        pyLDAvis.save_html(vis, str(out_file))