def embedding_fig(w2v_model, kmeans_model, n=500, embed_style='tsne', savefn=None, show=False):
    """Save or show a figure of 2-d word embeddings colored by cluster.

    The ``n`` most frequent words of ``w2v_model`` are projected to two
    dimensions and drawn as a scatter plot. Every point is annotated with
    its word and colored by the cluster ``kmeans_model`` assigns to the
    word's vector.

    Args:
        w2v_model: word-vector model exposing ``vocab`` (word -> entry with
            a ``count`` attribute), ``index2word`` (index -> word) and
            ``syn0`` (index -> vector).
        kmeans_model: clustering model whose ``predict`` takes a 2-d array
            of vectors and returns one label per row.
            NOTE(review): assumes a scikit-learn style estimator -- confirm.
        n: number of words to show, in decreasing order of frequency in
            the dataset.
        embed_style: ``'pca'`` or ``'tsne'``; selects the 2-d projection.
        savefn: if not None, the figure is saved to this path (dpi=120).
        show: if true, the figure is displayed with ``plt.show()``.

    Raises:
        ValueError: if ``embed_style`` is neither ``'pca'`` nor ``'tsne'``.
    """
    # Fail fast: an unknown style would otherwise only surface after all the
    # expensive work, as an unbound-variable error.
    if embed_style not in ('pca', 'tsne'):
        raise ValueError(
            "embed_style must be 'pca' or 'tsne', got {!r}".format(embed_style))

    words = w2v_model.index2word
    vocab = w2v_model.vocab

    # Rank vocabulary indices by frequency (most frequent first) and keep only
    # the top n, so that just those rows of the embedding matrix are copied.
    top_indices = sorted(
        range(len(words)),
        key=lambda i: vocab[words[i]].count,
        reverse=True)[:n]
    subset_words = [words[i] for i in top_indices]
    subset_vecs = np.asarray(w2v_model.syn0)[np.array(top_indices, dtype=int)]

    # Map every vector to its integer cluster id with a single batch call.
    subset_clusters = np.asarray(
        kmeans_model.predict(subset_vecs), dtype=int).tolist()
    # Sorted so that the cluster -> color assignment is reproducible.
    unique_clusters = sorted(set(subset_clusters))
    print("number of unique clusters in top {} words: {}.".format(
        n, len(unique_clusters)))

    colormap = discrete_color_scheme(len(unique_clusters))
    cluster_to_color = {
        cluster: colormap[i] for i, cluster in enumerate(unique_clusters)
    }
    subset_colors = [cluster_to_color[cluster] for cluster in subset_clusters]

    if embed_style == 'pca':
        embeddings = PCA(n_components=2).fit_transform(subset_vecs)
    else:  # 'tsne'
        # bh_sne is fed float64 input, hence the explicit conversion.
        embeddings = bh_sne(
            np.asarray(subset_vecs, dtype=np.float64), d=2, perplexity=10)
    embed_xs, embed_ys = embeddings.transpose()

    fig = plt.figure()
    fig.set_size_inches(50, 28.4)
    ax = fig.add_subplot(111)
    ax.scatter(embed_xs, embed_ys, color=subset_colors, s=100)
    for x, y, word in zip(embed_xs, embed_ys, subset_words):
        # xytext must be given explicitly: with textcoords='offset points'
        # it defaults to xy, which would shift each label by its own data
        # coordinates interpreted as points.
        ax.annotate(word, xy=(x, y), xytext=(6, 4),
                    textcoords='offset points', fontsize=20)
    ax.set_title('{} 2d word embeddings'.format(embed_style))

    if savefn is not None:
        fig.savefig(savefn, dpi=120)
    if show:
        plt.show()
def plot_gpe(gpe_data, show=False, savefn=None):
    """Plot the three GPE terms of every stem over time.

    Draws a 3x1 figure whose panels share the time axis. Each panel holds
    one line per stem: panel I uses element 0 of every value, panel II
    element 1 and panel III element 2.

    Args:
        gpe_data: non-empty mapping ``date -> {stem: values}`` where
            ``values`` is indexable at 0, 1 and 2. The stems of an arbitrary
            entry define the plotted series, so every entry is expected to
            carry the same stems.
            NOTE(review): dates are passed to ``plot_date`` -- presumably
            ``datetime`` objects or matplotlib date numbers; confirm.
        show: if true, the figure is displayed with ``plt.show()``.
        savefn: if not None, the figure is saved to this path (dpi=100).

    Raises:
        ValueError: if ``gpe_data`` is empty.
    """
    if not gpe_data:
        raise ValueError("gpe_data must contain at least one date")

    # The stems of one entry define the series that are plotted.
    terms = list(next(iter(gpe_data.values())))
    scheme = discrete_color_scheme(n=len(terms))
    colormap = {term: scheme[i] for i, term in enumerate(terms)}
    term_serieses = {term: [] for term in terms}

    # Walk the dates in sorted order so every series lines up with `times`.
    times = []
    for date in sorted(gpe_data):
        times.append(date)
        for stem, val in gpe_data[date].items():
            term_serieses[stem].append(val)

    # Now for each stem, we have a list of three-tuples corresponding to its
    # values over time. Get a 3x1 figure, each panel sharing the x (time) axis.
    f, axes = plt.subplots(3, 1, sharex=True)
    f.set_size_inches(18.5, 10.5)

    for stem, serieses in term_serieses.items():
        for term_idx, ax in enumerate(axes):
            ys = [vals[term_idx] for vals in serieses]
            # The color comes from the keyword argument; the format string
            # only selects a solid line.
            ax.plot_date(times, ys, fmt='-', color=colormap[stem], label=stem)

    panels = (
        ('Differential Fecundity', 'GPE Term I'),
        ('Differential Mutation', 'GPE Term II'),
        ('Differential Convergence', 'GPE Term III'),
    )
    for ax, (ylabel, title) in zip(axes, panels):
        ax.set_xlabel('Time')
        ax.set_ylabel(ylabel)
        ax.set_title(title)

    # All panels carry the same stems, so one figure-level legend suffices.
    handles, labels = axes[0].get_legend_handles_labels()
    f.legend(handles, labels, loc='upper right')

    if savefn is not None:
        f.savefig(savefn, dpi=100)
    if show:
        plt.show()