Exemplo n.º 1
0
def mean_vs_pooled(rank_dist, freq_dist, save_dir):
    """Plot every pooled (rank, frequency) observation with the mean on top.

    The pooled layer is built from ``merge_to_joint(rank_dist, freq_dist)``:
    each value is unpacked as a pair of parallel sequences (ranks,
    frequencies), and only pairs whose frequency is strictly positive are
    plotted. The mean layer is built from ``reduce_pooled`` applied to each
    distribution and drawn in red with the legend label "mean".

    Args:
        rank_dist: Rank observations, as accepted by ``merge_to_joint`` and
            ``reduce_pooled``.
        freq_dist: Frequency observations, same form as ``rank_dist``.
        save_dir: Prefix that is concatenated directly with the file name, so
            it needs its own trailing path separator.

    Raises:
        ValueError: If no observation with positive frequency exists (the
            pair list cannot be unpacked into xs and ys).
    """
    axis_labels = {"xlbl": r"$\log$ $r(w)$", "ylbl": r"$\log$ $f(w)$"}

    # Layer 1: every individual observation with f > 0.
    pooled = merge_to_joint(rank_dist, freq_dist)
    positive_pairs = []
    for rank_ls, freq_ls in pooled.values():
        for rank, freq in zip(rank_ls, freq_ls):
            if freq > 0:
                positive_pairs.append((rank, freq))
    pooled_xs, pooled_ys = tuple(zip(*positive_pairs))

    hexbin_plot(pooled_xs, pooled_ys, min_y=1, **axis_labels)

    # Layer 2: one averaged point per key, drawn in red over the pooled data.
    avg_ranks = reduce_pooled(rank_dist)
    avg_freqs = reduce_pooled(freq_dist)
    avg_pairs = sorted(merge_to_joint(avg_ranks, avg_freqs).values())
    avg_xs, avg_ys = tuple(zip(*avg_pairs))

    hexbin_plot(avg_xs,
                avg_ys,
                label="mean",
                color="red",
                edgecolors="red",
                cmap="Reds_r",
                cbar=False,
                min_y=1,
                **axis_labels)

    plt.legend()
    plt.savefig(save_dir + "rank_freq_mean_vs_var.png", dpi=300)
    plt.close()
Exemplo n.º 2
0
def heap_main(wiki, rng_params, m, save_dir="./"):
    """Plot vocabulary growth V(n) for repeated runs together with their mean.

    Vocabulary sizes are loaded with ``heap_from_file`` when a matching file
    exists; otherwise ``heap(wiki, rng)`` is evaluated ``m`` times. The runs
    are passed to ``do_mles``, plotted pooled and as a per-``n`` mean, and
    finally pickled.

    Args:
        wiki: Corpus handed to ``heap``.
        rng_params: Arguments for ``range``; indices 0 and 1 are also used in
            the pickle file name, so at least (start, stop) is required.
        m: Number of runs to compute when nothing can be loaded from disk.
        save_dir: Prefix concatenated directly with the output file names
            (needs its own trailing path separator).
    """
    rng = list(range(*rng_params))

    try:
        vocab_sizes = heap_from_file(save_dir,
                                     (rng_params[0], rng_params[1], len(rng)))
    except FileNotFoundError:
        vocab_sizes = [heap(wiki, rng) for _ in range(m)]

    do_mles(rng, vocab_sizes, save_dir)

    all_sizes = [v_n for size_ls in vocab_sizes for v_n in size_ls]

    # Diagnostic output: lengths of the pooled y and x data and the run count.
    print(len(all_sizes))

    # Repeat the x values once per run that is actually present. Runs loaded
    # from disk need not number ``m``; tiling by ``m`` would then produce x
    # and y sequences of different lengths.
    long_rng = np.tile(rng, len(vocab_sizes))

    print(len(long_rng))

    print(len(vocab_sizes))
    hexbin_plot(long_rng,
                all_sizes,
                xlbl="$n$",
                ylbl="$V(n)$",
                log=False,
                ignore_zeros=False,
                gridsize=100)

    # Average over runs (axis 0) to get one mean V(n) per value of n.
    mean_vs = np.mean(vocab_sizes, axis=0)

    hexbin_plot(rng,
                mean_vs,
                xlbl="$n$",
                ylbl="$V(n)$",
                log=False,
                ignore_zeros=False,
                label="mean",
                color="red",
                edgecolors="red",
                cmap="Reds_r",
                cbar=False,
                gridsize=100,
                linewidths=0.5)

    plt.legend(loc="upper left")
    plt.savefig(save_dir + "vocab_growth_" + str(min(rng)) + "_" +
                str(max(rng)) + "_" + str(len(rng)) + ".png",
                dpi=300)
    plt.close()

    # The pickle name uses the raw range parameters (start, stop), matching
    # the tuple handed to heap_from_file above.
    with open(
            save_dir + "vocab_growth_" + str(rng_params[0]) + "_" +
            str(rng_params[1]) + "_" + str(len(rng)) + ".pkl", "wb") as handle:
        pickle.dump(vocab_sizes, handle)
Exemplo n.º 3
0
def convergence_main(wiki, rng, m, save_dir="./"):
    """Plot how the mean rank-frequency relationship changes with sample size.

    Two figures are produced, each overlaying one series per ``n`` in ``rng``:
    one built with ``compute_freqs`` (y label ``f(w)``) and one built with
    ``compute_normalised_freqs`` (y label ``P(w)``). For the first figure,
    ``do_mle`` is additionally called per ``n`` with an open text file handle.

    Args:
        wiki: Corpus handed to ``get_mean_relationship``.
        rng: Sample sizes. It is iterated several times (file names and both
            loops), so it must be a re-iterable sequence; ``colour_palette``
            is indexed by position in ``rng``.
        m: Passed through to ``get_mean_relationship``.
        save_dir: Prefix concatenated directly with the output file names
            (needs its own trailing path separator).
    """
    rng_tag = "_".join(map(str, rng))

    # The context manager guarantees the results file is closed even when
    # plotting or fitting raises part-way through the loop.
    with open(save_dir + "mle_mandelbrot_convergence_" + rng_tag + ".txt",
              "w") as handle:
        for i, n in enumerate(rng):
            mean_ranks, mean_freqs = get_mean_relationship(
                wiki, n, m, compute_freqs)
            joints = merge_to_joint(mean_ranks, mean_freqs)
            xs, ys = list(zip(*joints.values()))

            hexbin_plot(xs,
                        ys,
                        xlbl=r"$\log$ $r(w)$",
                        ylbl=r"$\log$ $f(w)$",
                        edgecolors=colour_palette[i],
                        color=colour_palette[i],
                        label=format_scientific(n),
                        # Later series are drawn progressively more
                        # transparent.
                        alpha=1 / (i + 1)**.3,
                        linewidths=1.0,
                        # Only the first series requests a colour bar.
                        cbar=(i == 0),
                        min_y=1)

            do_mle(xs, ys, n, handle)

    plt.legend()
    plt.savefig(save_dir + "convergence_" + rng_tag + ".png", dpi=300)
    plt.close()

    for i, n in enumerate(rng):
        mean_ranks, mean_freqs = get_mean_relationship(
            wiki, n, m, compute_normalised_freqs)
        joints = merge_to_joint(mean_ranks, mean_freqs)
        xs, ys = list(zip(*joints.values()))

        hexbin_plot(xs,
                    ys,
                    xlbl=r"$\log$ $r(w)$",
                    ylbl=r"$\log$ $P(w)$",
                    edgecolors=colour_palette[i],
                    color=colour_palette[i],
                    label=format_scientific(n),
                    alpha=1 / (i + 1)**.3,
                    linewidths=1.0,
                    cbar=(i == 0),
                    # The lower y bound scales with the sample size here.
                    min_y=1 / n)

    plt.legend()
    plt.savefig(save_dir + "convergence_probs_" + rng_tag + ".png", dpi=300)
    plt.close()
Exemplo n.º 4
0
def zipf_wrong(wiki, n, d):
    """Plot rank against frequency with both computed from the same subsample.

    A single ``Articles.subsample(wiki, n)`` is used for ``compute_ranks``
    and ``compute_freqs`` alike; the figure is written to
    ``d + "rank_freq_<n>_wrong.png"``.

    Args:
        wiki: Corpus handed to ``Articles.subsample``.
        n: Subsample size; also embedded in the output file name.
        d: Prefix concatenated directly with the file name (needs its own
            trailing path separator).
    """
    subcorp = Articles.subsample(wiki, n)

    ranks, freqs = compute_ranks(subcorp), compute_freqs(subcorp)

    joints = merge_to_joint(ranks, freqs)
    xs, ys = list(zip(*sorted(joints.values())))

    # Raw strings: "\l" is not a valid escape sequence in a normal literal.
    hexbin_plot(xs, ys, xlbl=r"$\log$ $r(w)$", ylbl=r"$\log$ $f(w)$")
    plt.savefig(d + "rank_freq_" + str(n) + "_wrong.png", dpi=300)
    plt.close()
Exemplo n.º 5
0
def zipf_piantadosi(wiki, n, d):
    """Plot rank against frequency using two separately drawn subsamples.

    Ranks come from one ``Words.subsample(wiki, n)`` call and frequencies
    from a second, separate call; the figure is written to
    ``d + "rank_freq_<n>_piantadosi.png"``.

    Args:
        wiki: Corpus handed to ``Words.subsample``.
        n: Subsample size; also embedded in the output file name.
        d: Prefix concatenated directly with the file name (needs its own
            trailing path separator).
    """
    subcorp1 = Words.subsample(wiki, n)
    subcorp2 = Words.subsample(wiki, n)

    ranks = compute_ranks(subcorp1)
    freqs = compute_freqs(subcorp2)

    joints = merge_to_joint(ranks, freqs)
    xs, ys = list(zip(*sorted(joints.values())))

    # Raw strings: "\l" is not a valid escape sequence in a normal literal.
    hexbin_plot(xs, ys, xlbl=r"$\log$ $r(w)$", ylbl=r"$\log$ $f(w)$")
    plt.savefig(d + "rank_freq_" + str(n) + "_piantadosi.png", dpi=300)
    plt.close()
Exemplo n.º 6
0
def within_filter_plots(sample_dict, show=True, mle_dict=None):
    """Overlay the mean rank-frequency series of each parameter setting.

    For every ``(param, samples)`` entry one series is drawn in the colour
    ``colour_palette[i]`` (``i`` = position in ``sample_dict``). When
    ``mle_dict`` holds an entry for ``param``, its predictions are drawn with
    ``plot_preds`` in the same colour. Axis limits are set to the combination
    (via ``get_greater_lims``) of the limits returned by each ``hexbin_plot``
    call.

    Args:
        sample_dict: Mapping from parameter to the samples accepted by
            ``mean_rank_freq_from_samples``.
        show: Call ``plt.show()`` at the end when true.
        mle_dict: Optional mapping from parameter to a fitted model that
            ``plot_preds`` can draw.

    Returns:
        The combined plot limits, or ``None`` when ``sample_dict`` is empty.
    """
    plot_lims = None
    for i, (param, sample_ls) in enumerate(sample_dict.items()):
        mean_ranks, mean_freqs = mean_rank_freq_from_samples(sample_ls)
        joints = merge_to_joint(mean_ranks, mean_freqs)
        xs, ys = list(zip(*sorted(joints.values())))

        # Raw strings: "\l" is not a valid escape sequence otherwise.
        cur_plot_lims = hexbin_plot(xs,
                                    ys,
                                    xlbl=r"$\log$ $r(w)$",
                                    ylbl=r"$\log$ $f(w)$",
                                    label=str(param),
                                    color=colour_palette[i],
                                    edgecolors=colour_palette[i],
                                    linewidths=1.0,
                                    lims=None,
                                    min_y=1,
                                    cbar=False)
        if mle_dict and param in mle_dict:
            mandelbrot = mle_dict[param]
            plot_preds(mandelbrot, np.asarray(xs), color=colour_palette[i])

        plot_lims = get_greater_lims(plot_lims, cur_plot_lims)
        print(plot_lims)

    # With an empty sample_dict nothing was plotted and there are no limits
    # to apply; indexing None here used to raise a TypeError.
    if plot_lims is not None:
        plt.xlim(plot_lims[0])
        plt.ylim(plot_lims[1])
    plt.legend()
    if show:
        plt.show()

    return plot_lims
Exemplo n.º 7
0
def vocab_growth_plot(tf_means, srf_means, uni_mean, rng, save_dir):
    """Compare mean vocabulary growth curves of the TF, SRF and UNIF samples.

    Every entry of ``tf_means`` and then of ``srf_means`` is drawn as its own
    series (labelled "TF <param>" / "SRF <param>"), coloured by its running
    position across both mappings. ``uni_mean`` is drawn last in black with
    the axis labels and a colour bar. The figure is saved as
    ``save_dir + "vocab_growth_comparison.png"``.

    Args:
        tf_means: Mapping from parameter to mean vocabulary sizes.
        srf_means: Mapping from parameter to mean vocabulary sizes.
        uni_mean: Mean vocabulary sizes of the "UNIF" series.
        rng: x values shared by all series.
        save_dir: Prefix concatenated directly with the file name (needs its
            own trailing path separator).
    """
    # Lazily flatten both mappings into one stream of (label, curve) so a
    # single enumerate supplies the palette index.
    labelled_curves = ((prefix + str(param), curve)
                       for prefix, means in (("TF ", tf_means),
                                             ("SRF ", srf_means))
                       for param, curve in means.items())

    lims = None
    for colour_idx, (series_label, curve) in enumerate(labelled_curves):
        series_colour = colour_palette[colour_idx]
        series_lims = hexbin_plot(rng,
                                  curve,
                                  log=False,
                                  ignore_zeros=False,
                                  label=series_label,
                                  color=series_colour,
                                  edgecolors=series_colour,
                                  cmap="Blues_r",
                                  cbar=False,
                                  gridsize=100,
                                  linewidths=1.0)
        lims = get_greater_lims(lims, series_lims)

    # The UNIF series goes last; it carries the axis labels and colour bar.
    uni_lims = hexbin_plot(rng,
                           uni_mean,
                           xlbl="$n$",
                           ylbl="$V(n)$",
                           log=False,
                           ignore_zeros=False,
                           label="UNIF",
                           color="black",
                           edgecolors="black",
                           cmap="gray",
                           cbar=True,
                           gridsize=100,
                           linewidths=1.0)
    lims = get_greater_lims(lims, uni_lims)

    x_lims, y_lims = lims[0], lims[1]
    plt.xlim(x_lims)
    plt.ylim(y_lims)
    plt.legend(loc="lower right")
    plt.savefig(save_dir + "vocab_growth_comparison.png", dpi=300)
    plt.close()
Exemplo n.º 8
0
def covariance_across_words(rank_dist, freq_dist, save_dir):
    """Plot a per-word dispersion measure against the word's mean rank.

    For each word, the rank and frequency observations are truncated to a
    common length and their covariance matrix is computed with ``np.cov``.
    The plotted value (y label ``D(w)``) is the rank-frequency covariance
    (off-diagonal entry) divided by the word's mean rank. The figure is saved
    as ``save_dir + "dispersion.png"``.

    Args:
        rank_dist: Rank observations, as accepted by ``merge_to_joint`` and
            ``reduce_pooled``.
        freq_dist: Frequency observations, as accepted by ``merge_to_joint``.
        save_dir: Prefix concatenated directly with the file name (needs its
            own trailing path separator).
    """
    joints = merge_to_joint(rank_dist, freq_dist)
    mean_ranks = reduce_pooled(rank_dist)

    def equalize_len(ls1, ls2):
        """Truncate both sequences to the length of the shorter one."""
        shared_len = min(len(ls1), len(ls2))
        return ls1[:shared_len], ls2[:shared_len]

    cov_dict = {
        w: np.cov(*equalize_len(r_ls, f_ls))
        for w, (r_ls, f_ls) in joints.items()
    }

    # cov_mat[0][1] is the covariance between the two inputs of np.cov,
    # i.e. between a word's ranks and its frequencies.
    fano_factor_dict = {
        w: cov_mat[0][1] / mean_ranks[w]
        for w, cov_mat in cov_dict.items()
    }

    # (word, mean rank) pairs in ascending order of mean rank.
    words_sorted = sorted(mean_ranks.items(), key=lambda tup: tup[1])

    xs, ys = list(
        zip(*[(r, fano_factor_dict[w]) for w, r in words_sorted
              if w in cov_dict]))

    hexbin_plot(xs,
                ys,
                log=False,
                xscale="log",
                bins="log",
                # Raw string: "\o" is not a valid escape sequence otherwise.
                xlbl=r"$\overline{r}(w)$",
                ylbl="$D(w)$",
                ignore_zeros=False,
                gridsize=100)

    plt.savefig(save_dir + "dispersion.png", dpi=300)
    plt.close()
Exemplo n.º 9
0
def across_filter_plots(tf_samples,
                        srf_samples,
                        f,
                        h,
                        uni_samples,
                        show=False):
    """Overlay the mean rank-frequency series of the UNIF, TF and SRF samples.

    The UNIF series is drawn first in black with axis labels and a colour
    bar, followed by the TF series (``colour_palette[0]``, labelled
    "TF <f>") and the SRF series (``colour_palette[1]``, labelled "SRF <h>").
    Axis limits are set to the combination (via ``get_greater_lims``) of the
    limits returned by the three ``hexbin_plot`` calls.

    Args:
        tf_samples: Samples accepted by ``mean_rank_freq_from_samples``.
        srf_samples: Samples accepted by ``mean_rank_freq_from_samples``.
        f: Value shown in the TF legend label.
        h: Value shown in the SRF legend label.
        uni_samples: Samples accepted by ``mean_rank_freq_from_samples``.
        show: Call ``plt.show()`` at the end when true.
    """
    # Each mean relationship is computed exactly once; the earlier version
    # computed every one twice and discarded the first result.
    tf_mean_rf = mean_rank_freq_from_samples(tf_samples)
    srf_mean_rf = mean_rank_freq_from_samples(srf_samples)
    uni_mean_rf = mean_rank_freq_from_samples(uni_samples)

    plot_lims = None

    joints = merge_to_joint(*uni_mean_rf)
    xs, ys = list(zip(*sorted(joints.values())))

    # Raw strings: "\l" is not a valid escape sequence otherwise.
    cur_plot_lims = hexbin_plot(xs,
                                ys,
                                xlbl=r"$\log$ $r(w)$",
                                ylbl=r"$\log$ $f(w)$",
                                label="UNIF",
                                color="black",
                                edgecolors="black",
                                cmap="gray",
                                linewidths=1.0,
                                lims=None,
                                min_y=1,
                                cbar=True)
    plot_lims = get_greater_lims(plot_lims, cur_plot_lims)

    joints = merge_to_joint(*tf_mean_rf)
    xs, ys = list(zip(*sorted(joints.values())))

    cur_plot_lims = hexbin_plot(xs,
                                ys,
                                label="TF " + str(f),
                                color=colour_palette[0],
                                edgecolors=colour_palette[0],
                                linewidths=1.0,
                                lims=None,
                                min_y=1,
                                cbar=False)
    plot_lims = get_greater_lims(plot_lims, cur_plot_lims)

    joints = merge_to_joint(*srf_mean_rf)
    xs, ys = list(zip(*sorted(joints.values())))

    cur_plot_lims = hexbin_plot(xs,
                                ys,
                                label="SRF " + str(h),
                                color=colour_palette[1],
                                edgecolors=colour_palette[1],
                                linewidths=1.0,
                                lims=None,
                                min_y=1,
                                cbar=False)
    plot_lims = get_greater_lims(plot_lims, cur_plot_lims)

    plt.xlim(plot_lims[0])
    plt.ylim(plot_lims[1])
    plt.legend()
    if show:
        plt.show()
Exemplo n.º 10
0
            handle.write(mandel.print_result(string=True))
            handle.write("\n\n")

        handle.write("\nUNI\n")
        handle.write(uni_mandel.print_result(string=True))
        handle.write("\n\n")

    print("MLEs done")

    # within filter comparisons
    # TF
    uni_plot_lims = hexbin_plot(uni_xs,
                                uni_ys,
                                label="UNIF",
                                linewidths=1.0,
                                color="black",
                                edgecolors="black",
                                cmap="gray",
                                alpha=0.5,
                                cbar=True,
                                min_y=1)
    plot_lims = within_filter_plots(three_tfs, show=False)
    plot_lims = get_greater_lims(uni_plot_lims, plot_lims)
    plt.xlim(plot_lims[0])
    plt.ylim(plot_lims[1])
    plt.savefig(results_d + "TF_within_comp_rank_freq.png", dpi=300)
    plt.close()

    # SRF
    uni_plot_lims = hexbin_plot(uni_xs,
                                uni_ys,
                                label="UNIF",
Exemplo n.º 11
0
def _sampling_level_xy(first, second):
    """Merge two per-word mappings and return their sorted pairs as (xs, ys)."""
    joint = merge_to_joint(first, second)
    xs, ys = list(zip(*sorted(joint.values())))
    return xs, ys


def _write_level_correlations(path, labelled_corrs):
    """Write one tab-separated "label, correlation, p-value" line per entry."""
    with open(path, "w") as handle:
        for label, corr in labelled_corrs:
            handle.write("\t".join(
                [label, str(corr.correlation),
                 str(corr.pvalue)]))
            handle.write("\n")


def _compare_sampling_levels(art_means, sent_means, word_means, kind, symbol,
                             n, save_dir):
    """Plot texts vs. words for one quantity and report all pairwise Spearman
    correlations.

    ``kind`` ("freq" or "rank") selects the output file names; ``symbol``
    ("f" or "r") selects the axis labels.
    """
    xs, ys = _sampling_level_xy(art_means, word_means)

    hexbin_plot(xs,
                ys,
                xlbl=r"$\log$ $" + symbol + r"(w)$ from texts",
                ylbl=r"$\log$ $" + symbol + r"(w)$ from words")
    plt.savefig(save_dir + kind + "_correl_word_vs_article_" + str(n) +
                ".png",
                dpi=300)
    plt.close()

    art_word_corr = scistats.spearmanr(xs, ys)
    art_sent_corr = scistats.spearmanr(
        *_sampling_level_xy(art_means, sent_means))
    sent_word_corr = scistats.spearmanr(
        *_sampling_level_xy(sent_means, word_means))

    _write_level_correlations(
        save_dir + kind + "_sampling_level_correlations.txt",
        [("Articles-Words:", art_word_corr),
         ("Articles-Sentences:", art_sent_corr),
         ("Sentences-Words:", sent_word_corr)])


def sampling_levels_main(wiki, n, m, save_dir="./"):
    """Compare rank-frequency statistics across article, sentence and word
    level sampling.

    Steps, in order:
      1. Compute the mean rank/frequency relationship for ``Articles``,
         ``Sentences`` and ``Words`` via ``get_mean_relationship`` and call
         ``do_mle`` for each; the article ("texts") and word ("words") series
         are drawn into one figure.
      2. For frequencies and then for ranks, plot the article-level values
         against the word-level values and write the Spearman correlations of
         all three level pairs to a text file.

    Args:
        wiki: Corpus handed to ``get_mean_relationship``.
        n: Passed to ``get_mean_relationship``; also embedded in the figure
            file names.
        m: Passed to ``get_mean_relationship``.
        save_dir: Prefix concatenated directly with the output file names
            (needs its own trailing path separator); also passed to
            ``do_mle``.
    """
    art_mean_ranks, art_mean_freqs = get_mean_relationship(
        Articles, wiki, n, m)
    art_xs, art_ys = _sampling_level_xy(art_mean_ranks, art_mean_freqs)

    hexbin_plot(art_xs,
                art_ys,
                xlbl=r"$\log$ $r(w)$",
                ylbl=r"$\log$ $f(w)$",
                label="texts",
                min_y=1)

    do_mle(art_xs, art_ys, Articles, save_dir)

    # The sentence-level series is fitted but not drawn into the figure.
    sent_mean_ranks, sent_mean_freqs = get_mean_relationship(
        Sentences, wiki, n, m)
    sent_xs, sent_ys = _sampling_level_xy(sent_mean_ranks, sent_mean_freqs)

    do_mle(sent_xs, sent_ys, Sentences, save_dir)

    word_mean_ranks, word_mean_freqs = get_mean_relationship(Words, wiki, n, m)
    word_xs, word_ys = _sampling_level_xy(word_mean_ranks, word_mean_freqs)

    hexbin_plot(word_xs,
                word_ys,
                xlbl=r"$\log$ $r(w)$",
                ylbl=r"$\log$ $f(w)$",
                color="red",
                edgecolors="red",
                cmap="Reds_r",
                label="words",
                cbar=False,
                min_y=1)

    do_mle(word_xs, word_ys, Words, save_dir)

    plt.legend()
    plt.savefig(save_dir + "rank_freq_word_vs_article_" + str(n) + ".png",
                dpi=300)
    plt.close()

    # Frequencies first, then ranks: same procedure, different inputs.
    _compare_sampling_levels(art_mean_freqs, sent_mean_freqs, word_mean_freqs,
                             "freq", "f", n, save_dir)
    _compare_sampling_levels(art_mean_ranks, sent_mean_ranks, word_mean_ranks,
                             "rank", "r", n, save_dir)
Exemplo n.º 12
0
    subsamples2 = (Sentences.subsample(wiki, n) for _ in range(m))

    ranks = [compute_ranks(sub) for sub in subsamples1]
    ranks_joined = pool_ranks(ranks)
    mean_ranks = reduce_pooled(ranks_joined)

    freqs = [compute_freqs(sub) for sub in subsamples2]
    freqs_joined = pool_freqs(freqs)
    mean_freqs = reduce_pooled(freqs_joined)

    print("subsampling done")

    joints = merge_to_joint(mean_ranks, mean_freqs)
    xs, ys = list(zip(*sorted(joints.values())))

    hexbin_plot(xs, ys, xlbl="$\log$ $r(w)$", ylbl="$\log$ $f(w)$", min_y=1)

    mandelbrot = Mandelbrot(ys, xs)
    mandelbrot_fit = mandelbrot.fit(
        start_params=np.asarray([10.0, 1000.0]),  # [1.0, 1.0]
        method="powell",
        full_output=True)
    mandelbrot.register_fit(mandelbrot_fit)
    mandelbrot.print_result()
    with open(d + "mle_mandelbrot_" + str(n) + "_" + str(m) + ".txt",
              "w") as handle:
        handle.write(mandelbrot.print_result(string=True))
    plot_preds(mandelbrot, np.asarray(xs))
    plt.savefig(d + "rank_freq_" + str(n) + "_" + str(m) + ".png", dpi=300)
    plt.close()