Exemplo n.º 1
0
def embedding_fig(w2v_model,
                  kmeans_model,
                  n=500,
                  embed_style='tsne',
                  savefn=None,
                  show=False):
    """
    Save or show a matplotlib figure of 2-d word embeddings colored by cluster.

    The ``n`` most frequent words of ``w2v_model`` are projected to two
    dimensions and drawn as a scatter plot. Each point is annotated with its
    word and colored according to the cluster ``kmeans_model`` predicts for
    the word's vector.

    Args:
        w2v_model: model exposing ``vocab`` (word -> entry with a ``count``
            attribute), ``index2word`` (list of words) and ``syn0`` (vectors
            indexed in the same order as ``index2word``).
        kmeans_model: fitted clustering model. ``predict`` is called once on
            the 2-d array of selected vectors and must return one integer
            label per row (assumes the scikit-learn estimator convention --
            confirm if a custom model is passed).
        n: number of words to show, in decreasing order of frequency.
        embed_style: projection used to reduce the vectors to 2-d, either
            ``'pca'`` or ``'tsne'``.
        savefn: if not None, path the figure is saved to (dpi=120).
        show: if true, display the figure with ``plt.show()``.

    Raises:
        ValueError: if ``embed_style`` is not ``'pca'`` or ``'tsne'``, or if
            there are no words to plot.
    """
    # Validate before doing any work; previously an unknown style only failed
    # after clustering, with an unbound-variable error.
    if embed_style not in ('pca', 'tsne'):
        raise ValueError(
            "embed_style must be 'pca' or 'tsne', got {!r}".format(embed_style))

    vocab = w2v_model.vocab
    words, word_vecs = w2v_model.index2word, w2v_model.syn0

    # sorted() is stable, so equally frequent words keep their index2word
    # order. Slice the indices first so only the needed vectors are copied.
    top_indices = sorted(
        range(len(words)),
        key=lambda i: vocab[words[i]].count,
        reverse=True)[:n]
    if not top_indices:
        raise ValueError('no words to plot')
    subset_words = [words[i] for i in top_indices]
    subset_vecs = np.array([word_vecs[i] for i in top_indices])

    # Map each selected vector to its integer cluster id in one batched call.
    subset_clusters = np.asarray(
        kmeans_model.predict(subset_vecs), dtype=int).ravel().tolist()
    # Sorted so the cluster -> color assignment is deterministic.
    unique_clusters = sorted(set(subset_clusters))
    print("number of unique clusters in top {} words: {}.".format(
        n, len(unique_clusters)))

    colormap = discrete_color_scheme(len(unique_clusters))
    int_to_color = {
        cluster: colormap[i]
        for (i, cluster) in enumerate(unique_clusters)
    }
    subset_colors = [int_to_color[cluster] for cluster in subset_clusters]

    if embed_style == 'pca':
        embeddings = PCA(n_components=2).fit_transform(subset_vecs)
    else:  # 'tsne' (validated above)
        embeddings = bh_sne(np.asarray(subset_vecs, dtype=np.float64),
                            d=2,
                            perplexity=10)
    embed_xs, embed_ys = embeddings.transpose()

    fig = plt.figure()
    fig.set_size_inches(50, 28.4)
    ax = fig.add_subplot(111)
    ax.scatter(embed_xs, embed_ys, color=subset_colors, s=100)
    for (x, y, word) in zip(embed_xs, embed_ys, subset_words):
        # With textcoords='offset points', xytext is an offset in points from
        # xy. It defaults to xy itself, which would displace every label by
        # its own data coordinates, so give a small fixed offset instead.
        ax.annotate(word,
                    xy=(x, y),
                    xytext=(5, 2),
                    textcoords='offset points',
                    fontsize=20)
    ax.set_title('{} 2d word embeddings'.format(embed_style))
    if savefn is not None:
        fig.savefig(savefn, dpi=120)
    if show:
        plt.show()
Exemplo n.º 2
0
def plot_gpe(gpe_data, show=False, savefn=None):
    """
    Plot the three GPE terms of every stem over time, one subplot per term.

    Args:
        gpe_data: mapping of date -> {stem: values}, where ``values`` is
            indexable at positions 0, 1 and 2 (GPE terms I, II and III).
            The set of stems is taken from one entry of the mapping; a stem
            that appears only at other dates raises ``KeyError``. Dates must
            be sortable (presumably values ``plot_date`` accepts -- confirm
            against the caller).
        show: if true, display the figure with ``plt.show()``.
        savefn: if not None, path the figure is saved to (dpi=100).

    Raises:
        ValueError: if ``gpe_data`` is empty.
    """
    if not gpe_data:
        raise ValueError('gpe_data is empty; nothing to plot')

    # next(iter(...)) instead of .values()[0]: dict views are not indexable
    # on Python 3.
    terms = list(next(iter(gpe_data.values())))
    term_serieses = {t: [] for t in terms}
    scheme = discrete_color_scheme(n=len(terms))
    colormap = {t: scheme[i] for i, t in enumerate(terms)}

    times = []
    for date in sorted(gpe_data):
        for stem, val in gpe_data[date].items():
            term_serieses[stem].append(val)
        times.append(date)
    # Now for each stem, we have a list of three-tuples corresponding to
    # their values over time.

    # Get a 3x1 figure, each subplot sharing the x (time) axis.
    f, axes = plt.subplots(3, 1, sharex=True)
    f.set_size_inches(18.5, 10.5)
    for stem, serieses in term_serieses.items():
        for term_idx, ax in enumerate(axes):
            # Real list (not a map object) so this also works on Python 3.
            ys = [vals[term_idx] for vals in serieses]
            ax.plot_date(times, ys, color=colormap[stem], label=stem,
                         fmt='b-')

    axis_text = [
        ('Differential Fecundity', 'GPE Term I'),
        ('Differential Mutation', 'GPE Term II'),
        ('Differential Convergence', 'GPE Term III'),
    ]
    for ax, (ylabel, title) in zip(axes, axis_text):
        ax.set_xlabel('Time')
        ax.set_ylabel(ylabel)
        ax.set_title(title)

    # Every subplot carries the same stems, so one legend built from the
    # first axis covers the whole figure.
    handles, labels = axes[0].get_legend_handles_labels()
    f.legend(handles, labels, loc='upper right')
    if savefn is not None:
        f.savefig(savefn, dpi=100)
    # The show flag was previously accepted but ignored.
    if show:
        plt.show()