def mean_vs_pooled(rank_dist, freq_dist, save_dir):
    """Plot the pooled rank-frequency cloud against the mean relationship.

    Draws every positive (rank, frequency) pair from the pooled
    distributions as a hexbin background, overlays the curve of mean rank
    vs. mean frequency in red, and saves the figure into ``save_dir``.
    """
    pooled = merge_to_joint(rank_dist, freq_dist)
    pairs = [(r, f)
             for rank_ls, freq_ls in pooled.values()
             for r, f in zip(rank_ls, freq_ls)
             if f > 0]
    all_xs, all_ys = zip(*pairs)

    hexbin_plot(all_xs, all_ys,
                xlbl=r"$\log$ $r(w)$", ylbl=r"$\log$ $f(w)$", min_y=1)

    # collapse each word's pooled values to a single mean and overlay
    joint_of_means = merge_to_joint(reduce_pooled(rank_dist),
                                    reduce_pooled(freq_dist))
    mean_xs, mean_ys = zip(*sorted(joint_of_means.values()))

    hexbin_plot(mean_xs, mean_ys,
                xlbl=r"$\log$ $r(w)$", ylbl=r"$\log$ $f(w)$",
                color="red", edgecolors="red", cmap="Reds_r",
                cbar=False, min_y=1, label="mean")

    plt.legend()
    plt.savefig(save_dir + "rank_freq_mean_vs_var.png", dpi=300)
    plt.close()
def heap_main(wiki, rng_params, m, save_dir="./"):
    """Estimate vocabulary growth (Heaps' law) over ``m`` repetitions.

    Loads cached vocabulary sizes if a matching file exists, otherwise
    recomputes them with ``heap``; fits MLEs, plots the full scatter and
    the mean growth curve, and pickles the raw sizes into ``save_dir``.
    """
    rng = list(range(*rng_params))
    try:
        vocab_sizes = heap_from_file(
            save_dir, (rng_params[0], rng_params[1], len(rng)))
    except FileNotFoundError:
        vocab_sizes = [heap(wiki, rng) for _ in range(m)]

    do_mles(rng, vocab_sizes, save_dir)

    flat_sizes = [v_n for size_ls in vocab_sizes for v_n in size_ls]
    print(len(flat_sizes))
    repeated_rng = np.tile(rng, m)
    print(len(repeated_rng))
    print(len(vocab_sizes))

    hexbin_plot(repeated_rng, flat_sizes, xlbl="$n$", ylbl="$V(n)$",
                log=False, ignore_zeros=False, gridsize=100)

    mean_sizes = np.mean(vocab_sizes, axis=0)
    hexbin_plot(rng, mean_sizes, xlbl="$n$", ylbl="$V(n)$",
                log=False, ignore_zeros=False, label="mean",
                color="red", edgecolors="red", cmap="Reds_r",
                cbar=False, gridsize=100, linewidths=0.5)

    plt.legend(loc="upper left")
    plot_name = ("vocab_growth_" + str(min(rng)) + "_" + str(max(rng))
                 + "_" + str(len(rng)) + ".png")
    plt.savefig(save_dir + plot_name, dpi=300)
    plt.close()

    pkl_name = ("vocab_growth_" + str(rng_params[0]) + "_"
                + str(rng_params[1]) + "_" + str(len(rng)) + ".pkl")
    with open(save_dir + pkl_name, "wb") as handle:
        pickle.dump(vocab_sizes, handle)
def convergence_main(wiki, rng, m, save_dir="./"):
    """Plot convergence of the mean rank-frequency relationship.

    For each sample size ``n`` in ``rng``, overlays the mean relationship
    (raw frequencies first, normalised probabilities second) in a shared
    figure per pass, fitting an MLE per size and logging results to a
    text file in ``save_dir``.
    """
    out_path = (save_dir + "mle_mandelbrot_convergence_"
                + "_".join(map(str, rng)) + ".txt")
    handle = open(out_path, "w")

    # first pass: raw frequencies
    for i, n in enumerate(rng):
        mean_ranks, mean_freqs = get_mean_relationship(wiki, n, m,
                                                       compute_freqs)
        joints = merge_to_joint(mean_ranks, mean_freqs)
        xs, ys = zip(*joints.values())
        hexbin_plot(xs, ys,
                    xlbl=r"$\log$ $r(w)$", ylbl=r"$\log$ $f(w)$",
                    edgecolors=colour_palette[i], color=colour_palette[i],
                    label=format_scientific(n),
                    alpha=1 / (i + 1)**.3, linewidths=1.0,
                    cbar=(i == 0), min_y=1)
        do_mle(xs, ys, n, handle)
    handle.close()

    plt.legend()
    plt.savefig(save_dir + "convergence_" + "_".join(map(str, rng))
                + ".png", dpi=300)
    plt.close()

    # second pass: normalised frequencies (probabilities)
    for i, n in enumerate(rng):
        mean_ranks, mean_freqs = get_mean_relationship(
            wiki, n, m, compute_normalised_freqs)
        joints = merge_to_joint(mean_ranks, mean_freqs)
        xs, ys = zip(*joints.values())
        hexbin_plot(xs, ys,
                    xlbl=r"$\log$ $r(w)$", ylbl=r"$\log$ $P(w)$",
                    edgecolors=colour_palette[i], color=colour_palette[i],
                    label=format_scientific(n),
                    alpha=1 / (i + 1)**.3, linewidths=1.0,
                    cbar=(i == 0), min_y=1 / n)

    plt.legend()
    plt.savefig(save_dir + "convergence_probs_" + "_".join(map(str, rng))
                + ".png", dpi=300)
    plt.close()
def zipf_wrong(wiki, n, d):
    """Plot the rank-frequency relationship estimated the 'wrong' way:
    ranks and frequencies computed from the same article subsample of
    size ``n``, saved as a hexbin plot into directory ``d``.
    """
    subcorp = Articles.subsample(wiki, n)
    ranks, freqs = compute_ranks(subcorp), compute_freqs(subcorp)
    joints = merge_to_joint(ranks, freqs)
    xs, ys = list(zip(*sorted(joints.values())))
    # raw strings: "\l" in a non-raw literal is an invalid escape sequence
    # (DeprecationWarning); matches the raw-string labels used elsewhere
    hexbin_plot(xs, ys, xlbl=r"$\log$ $r(w)$", ylbl=r"$\log$ $f(w)$")
    plt.savefig(d + "rank_freq_" + str(n) + "_wrong.png", dpi=300)
    plt.close()
def zipf_piantadosi(wiki, n, d):
    """Plot the rank-frequency relationship following Piantadosi:
    ranks and frequencies come from two *independent* word subsamples of
    size ``n``, avoiding the correlated-estimation artefact. Saves the
    hexbin plot into directory ``d``.
    """
    subcorp1 = Words.subsample(wiki, n)
    subcorp2 = Words.subsample(wiki, n)
    ranks = compute_ranks(subcorp1)
    freqs = compute_freqs(subcorp2)
    joints = merge_to_joint(ranks, freqs)
    xs, ys = list(zip(*sorted(joints.values())))
    # raw strings: "\l" in a non-raw literal is an invalid escape sequence
    # (DeprecationWarning); matches the raw-string labels used elsewhere
    hexbin_plot(xs, ys, xlbl=r"$\log$ $r(w)$", ylbl=r"$\log$ $f(w)$")
    plt.savefig(d + "rank_freq_" + str(n) + "_piantadosi.png", dpi=300)
    plt.close()
def within_filter_plots(sample_dict, show=True, mle_dict=None):
    """Overlay mean rank-frequency curves for each filter parameter.

    Parameters
    ----------
    sample_dict : dict mapping a filter parameter to its list of samples.
    show : if True, display the figure interactively.
    mle_dict : optional dict mapping a parameter to a fitted Mandelbrot
        model whose predictions are overlaid on that parameter's curve.

    Returns
    -------
    The joint (x, y) plot limits covering all plotted curves.
    """
    plot_lims = None
    for i, (param, sample_ls) in enumerate(sample_dict.items()):
        mean_ranks, mean_freqs = mean_rank_freq_from_samples(sample_ls)
        joints = merge_to_joint(mean_ranks, mean_freqs)
        xs, ys = list(zip(*sorted(joints.values())))

        # raw strings fix the invalid "\l" escape in the axis labels
        cur_plot_lims = hexbin_plot(
            xs, ys, xlbl=r"$\log$ $r(w)$", ylbl=r"$\log$ $f(w)$",
            label=str(param), color=colour_palette[i],
            edgecolors=colour_palette[i], linewidths=1.0,
            lims=None, min_y=1, cbar=False)

        if mle_dict and param in mle_dict:
            mandelbrot = mle_dict[param]
            plot_preds(mandelbrot, np.asarray(xs), color=colour_palette[i])

        # grow the limits so every curve stays fully visible
        plot_lims = get_greater_lims(plot_lims, cur_plot_lims)

    print(plot_lims)
    plt.xlim(plot_lims[0])
    plt.ylim(plot_lims[1])
    plt.legend()
    if show:
        plt.show()
    return plot_lims
def vocab_growth_plot(tf_means, srf_means, uni_mean, rng, save_dir):
    """Compare mean vocabulary-growth curves of TF and SRF filters with
    the uniform baseline and save the combined figure to ``save_dir``.
    """
    plot_lims = None
    colour_idx = 0
    # one curve per (filter type, parameter) combination
    for name, sample_dict in zip(["TF ", "SRF "], [tf_means, srf_means]):
        for param, mean_vs in sample_dict.items():
            cur_lims = hexbin_plot(rng, mean_vs, log=False,
                                   ignore_zeros=False,
                                   label=name + str(param),
                                   color=colour_palette[colour_idx],
                                   edgecolors=colour_palette[colour_idx],
                                   cmap="Blues_r", cbar=False,
                                   gridsize=100, linewidths=1.0)
            plot_lims = get_greater_lims(plot_lims, cur_lims)
            colour_idx += 1

    # uniform baseline drawn last, in black, and carries the axis labels
    cur_lims = hexbin_plot(rng, uni_mean, xlbl="$n$", ylbl="$V(n)$",
                           log=False, ignore_zeros=False, label="UNIF",
                           color="black", edgecolors="black",
                           cmap="gray", cbar=True,
                           gridsize=100, linewidths=1.0)
    plot_lims = get_greater_lims(plot_lims, cur_lims)

    plt.xlim(plot_lims[0])
    plt.ylim(plot_lims[1])
    plt.legend(loc="lower right")
    plt.savefig(save_dir + "vocab_growth_comparison.png", dpi=300)
    plt.close()
def covariance_across_words(rank_dist, freq_dist, save_dir):
    """Plot each word's rank-frequency dispersion statistic D(w).

    D(w) is the rank-frequency covariance divided by the word's mean
    rank (a Fano-factor-like measure), plotted against the mean rank on a
    log-scaled x axis; the figure is saved into ``save_dir``.
    """
    joints = merge_to_joint(rank_dist, freq_dist)
    mean_ranks = reduce_pooled(rank_dist)

    def truncate_to_common_len(ls1, ls2):
        # a word's rank and frequency series may differ in length;
        # np.cov needs equal-length inputs
        n = min(len(ls1), len(ls2))
        return ls1[:n], ls2[:n]

    cov_dict = {w: np.cov(*truncate_to_common_len(r_ls, f_ls))
                for w, (r_ls, f_ls) in joints.items()}
    fano_factor_dict = {w: cov_mat[0][1] / mean_ranks[w]
                        for w, cov_mat in cov_dict.items()}

    # words ordered by mean rank; only those with a covariance estimate
    words_sorted = sorted(mean_ranks.items(), key=lambda tup: tup[1])
    xs, ys = zip(*[(r, fano_factor_dict[w])
                   for w, r in words_sorted if w in cov_dict])

    # raw string: "\o" in a non-raw literal is an invalid escape sequence
    hexbin_plot(xs, ys, log=False, xscale="log", bins="log",
                xlbl=r"$\overline{r}(w)$", ylbl="$D(w)$",
                ignore_zeros=False, gridsize=100)

    plt.savefig(save_dir + "dispersion.png", dpi=300)
    plt.close()
def across_filter_plots(tf_samples, srf_samples, f, h, uni_samples,
                        show=False):
    """Compare mean rank-frequency relationships of TF-filtered,
    SRF-filtered and uniformly sampled corpora in one hexbin figure.

    ``f`` and ``h`` are the TF and SRF filter parameters, used only for
    the legend labels. Displays the figure when ``show`` is True.
    """
    # NOTE: the original computed each mean relationship twice (the
    # *_mean_ranks/*_mean_freqs results were never used); once suffices.
    tf_mean_rf = mean_rank_freq_from_samples(tf_samples)
    srf_mean_rf = mean_rank_freq_from_samples(srf_samples)
    uni_mean_rf = mean_rank_freq_from_samples(uni_samples)

    plot_lims = None

    # uniform baseline first: it carries the axis labels and colour bar
    joints = merge_to_joint(*uni_mean_rf)
    xs, ys = list(zip(*sorted(joints.values())))
    cur_plot_lims = hexbin_plot(xs, ys,
                                xlbl=r"$\log$ $r(w)$",
                                ylbl=r"$\log$ $f(w)$",
                                label="UNIF", color="black",
                                edgecolors="black", cmap="gray",
                                linewidths=1.0, lims=None, min_y=1,
                                cbar=True)
    plot_lims = get_greater_lims(plot_lims, cur_plot_lims)

    joints = merge_to_joint(*tf_mean_rf)
    xs, ys = list(zip(*sorted(joints.values())))
    cur_plot_lims = hexbin_plot(xs, ys, label="TF " + str(f),
                                color=colour_palette[0],
                                edgecolors=colour_palette[0],
                                linewidths=1.0, lims=None, min_y=1,
                                cbar=False)
    plot_lims = get_greater_lims(plot_lims, cur_plot_lims)

    joints = merge_to_joint(*srf_mean_rf)
    xs, ys = list(zip(*sorted(joints.values())))
    cur_plot_lims = hexbin_plot(xs, ys, label="SRF " + str(h),
                                color=colour_palette[1],
                                edgecolors=colour_palette[1],
                                linewidths=1.0, lims=None, min_y=1,
                                cbar=False)
    plot_lims = get_greater_lims(plot_lims, cur_plot_lims)

    plt.xlim(plot_lims[0])
    plt.ylim(plot_lims[1])
    plt.legend()
    if show:
        plt.show()
handle.write(mandel.print_result(string=True)) handle.write("\n\n") handle.write("\nUNI\n") handle.write(uni_mandel.print_result(string=True)) handle.write("\n\n") print("MLEs done") # within filter comparisons # TF uni_plot_lims = hexbin_plot(uni_xs, uni_ys, label="UNIF", linewidths=1.0, color="black", edgecolors="black", cmap="gray", alpha=0.5, cbar=True, min_y=1) plot_lims = within_filter_plots(three_tfs, show=False) plot_lims = get_greater_lims(uni_plot_lims, plot_lims) plt.xlim(plot_lims[0]) plt.ylim(plot_lims[1]) plt.savefig(results_d + "TF_within_comp_rank_freq.png", dpi=300) plt.close() # SRF uni_plot_lims = hexbin_plot(uni_xs, uni_ys, label="UNIF",
def sampling_levels_main(wiki, n, m, save_dir="./"):
    """Compare mean rank/frequency estimates across sampling levels.

    Subsamples ``wiki`` at the article, sentence and word level (``m``
    repetitions of size ``n`` each), fits an MLE per level, plots texts
    vs. words rank-frequency curves, and writes Spearman correlations of
    frequencies and ranks between all level pairs into ``save_dir``.
    """

    def sorted_joint_xy(dist1, dist2):
        # pair the two distributions word-by-word, return sorted columns
        joint = merge_to_joint(dist1, dist2)
        return list(zip(*sorted(joint.values())))

    def write_correlations(path, labelled_pairs):
        # one tab-separated "label  rho  p-value" line per pair of
        # distributions; refactors the six near-identical blocks the
        # original repeated inline
        with open(path, "w") as handle:
            for label, (d1, d2) in labelled_pairs:
                xs, ys = sorted_joint_xy(d1, d2)
                corr = scistats.spearmanr(xs, ys)
                handle.write("\t".join([label,
                                        str(corr.correlation),
                                        str(corr.pvalue)]))
                handle.write("\n")

    art_mean_ranks, art_mean_freqs = get_mean_relationship(
        Articles, wiki, n, m)
    art_xs, art_ys = sorted_joint_xy(art_mean_ranks, art_mean_freqs)
    hexbin_plot(art_xs, art_ys,
                xlbl=r"$\log$ $r(w)$", ylbl=r"$\log$ $f(w)$",
                label="texts", min_y=1)
    do_mle(art_xs, art_ys, Articles, save_dir)

    sent_mean_ranks, sent_mean_freqs = get_mean_relationship(
        Sentences, wiki, n, m)
    sent_xs, sent_ys = sorted_joint_xy(sent_mean_ranks, sent_mean_freqs)
    # sentences are fitted but deliberately not plotted: the figure
    # contrasts only the extreme levels (texts vs. words)
    do_mle(sent_xs, sent_ys, Sentences, save_dir)

    word_mean_ranks, word_mean_freqs = get_mean_relationship(
        Words, wiki, n, m)
    word_xs, word_ys = sorted_joint_xy(word_mean_ranks, word_mean_freqs)
    hexbin_plot(word_xs, word_ys,
                xlbl=r"$\log$ $r(w)$", ylbl=r"$\log$ $f(w)$",
                color="red", edgecolors="red", cmap="Reds_r",
                label="words", cbar=False, min_y=1)
    do_mle(word_xs, word_ys, Words, save_dir)

    plt.legend()
    plt.savefig(save_dir + "rank_freq_word_vs_article_" + str(n) + ".png",
                dpi=300)
    plt.close()

    # frequency agreement between the two extreme levels
    xs, ys = sorted_joint_xy(art_mean_freqs, word_mean_freqs)
    hexbin_plot(xs, ys,
                xlbl=r"$\log$ $f(w)$ from texts",
                ylbl=r"$\log$ $f(w)$ from words")
    plt.savefig(save_dir + "freq_correl_word_vs_article_" + str(n)
                + ".png", dpi=300)
    plt.close()

    write_correlations(
        save_dir + "freq_sampling_level_correlations.txt",
        [("Articles-Words:", (art_mean_freqs, word_mean_freqs)),
         ("Articles-Sentences:", (art_mean_freqs, sent_mean_freqs)),
         ("Sentences-Words:", (sent_mean_freqs, word_mean_freqs))])

    # rank agreement between the two extreme levels
    xs, ys = sorted_joint_xy(art_mean_ranks, word_mean_ranks)
    hexbin_plot(xs, ys,
                xlbl=r"$\log$ $r(w)$ from texts",
                ylbl=r"$\log$ $r(w)$ from words")
    plt.savefig(save_dir + "rank_correl_word_vs_article_" + str(n)
                + ".png", dpi=300)
    plt.close()

    write_correlations(
        save_dir + "rank_sampling_level_correlations.txt",
        [("Articles-Words:", (art_mean_ranks, word_mean_ranks)),
         ("Articles-Sentences:", (art_mean_ranks, sent_mean_ranks)),
         ("Sentences-Words:", (sent_mean_ranks, word_mean_ranks))])
subsamples2 = (Sentences.subsample(wiki, n) for _ in range(m)) ranks = [compute_ranks(sub) for sub in subsamples1] ranks_joined = pool_ranks(ranks) mean_ranks = reduce_pooled(ranks_joined) freqs = [compute_freqs(sub) for sub in subsamples2] freqs_joined = pool_freqs(freqs) mean_freqs = reduce_pooled(freqs_joined) print("subsampling done") joints = merge_to_joint(mean_ranks, mean_freqs) xs, ys = list(zip(*sorted(joints.values()))) hexbin_plot(xs, ys, xlbl="$\log$ $r(w)$", ylbl="$\log$ $f(w)$", min_y=1) mandelbrot = Mandelbrot(ys, xs) mandelbrot_fit = mandelbrot.fit( start_params=np.asarray([10.0, 1000.0]), # [1.0, 1.0] method="powell", full_output=True) mandelbrot.register_fit(mandelbrot_fit) mandelbrot.print_result() with open(d + "mle_mandelbrot_" + str(n) + "_" + str(m) + ".txt", "w") as handle: handle.write(mandelbrot.print_result(string=True)) plot_preds(mandelbrot, np.asarray(xs)) plt.savefig(d + "rank_freq_" + str(n) + "_" + str(m) + ".png", dpi=300) plt.close()