def get_mean_relationship(wiki, n, m, freq_func):
    subsamples1 = (Sentences.subsample(wiki, n) for _ in range(m))
    subsamples2 = (Sentences.subsample(wiki, n) for _ in range(m))

    ranks = [compute_ranks(sub) for sub in subsamples1]
    ranks_joined = pool_ranks(ranks)
    mean_ranks = reduce_pooled(ranks_joined)

    freqs = [freq_func(sub) for sub in subsamples2]
    freqs_joined = pool_freqs(freqs)
    mean_freqs = reduce_pooled(freqs_joined)

    return mean_ranks, mean_freqs
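# Usage sketch (assumed, not part of the original module): average ranks and
# frequencies over m subsamples, then merge them into sorted (rank, frequency)
# pairs ready for plotting or model fitting. `wiki` is a corpus as loaded by
# wiki_from_pickles; compute_freqs and merge_to_joint are imported from
# stats.stat_functions as in the other scripts in this repository.
def _example_mean_relationship(wiki):
    mean_ranks, mean_freqs = get_mean_relationship(wiki, n=int(1e6), m=10,
                                                   freq_func=compute_freqs)
    joints = merge_to_joint(mean_ranks, mean_freqs)
    xs, ys = list(zip(*sorted(joints.values())))
    return xs, ys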
def variance_main(wiki, n, m, save_dir="./"):
    subsamples1 = (Sentences.subsample(wiki, n) for _ in range(m))
    subsamples2 = (Sentences.subsample(wiki, n) for _ in range(m))

    ranks = [compute_ranks(sub) for sub in subsamples1]
    ranks_joined = pool_ranks(ranks)

    freqs = [compute_freqs(sub) for sub in subsamples2]
    freqs_joined = pool_freqs(freqs)

    mean_vs_pooled(ranks_joined, freqs_joined, save_dir)
    do_mles(ranks, freqs, save_dir)
    covariance_across_words(ranks_joined, freqs_joined, save_dir)
def heap(corp, rng):
    vocab_sizes = []
    for ntoks in rng:
        subsample = Sentences(sent_subsample(corp, ntoks))
        vocab_size = compute_vocab_size(subsample)
        vocab_sizes.append(vocab_size)
    return vocab_sizes
def get_model(corpus, n):
    # Fit a Zipf-Mandelbrot model to the rank-frequency relationship of an
    # n-token subsample and compute that subsample's own (auto-)typicality.
    big_ranks = compute_ranks(Sentences.subsample(corpus, n))
    freqs = compute_freqs(Sentences.subsample(corpus, n))

    joint = merge_to_joint(big_ranks, freqs)
    xs, ys = list(zip(*sorted(joint.values())))

    mandelbrot = Mandelbrot(ys, xs)
    mandelbrot_fit = mandelbrot.fit(start_params=np.asarray([1.0, 1.0]),
                                    method="powell", full_output=True)
    mandelbrot.register_fit(mandelbrot_fit)
    mandelbrot.print_result()

    auto_typ = typicality(mandelbrot, joint)
    return big_ranks, mandelbrot, auto_typ
def heap(corp, rng):
    vocab_sizes = []
    for i, ntoks in enumerate(rng):
        if i % 10 == 0:
            print(i, ntoks)
        subsample = Sentences.subsample(corp, ntoks)
        vocab_size = compute_vocab_size(subsample)
        vocab_sizes.append(vocab_size)
    return vocab_sizes
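# Usage sketch (assumed, not part of the original module): trace vocabulary
# growth in the spirit of Heaps' law on subsamples of increasing size, here in
# steps of 100,000 tokens up to two million.
def _example_heap_curve(corp):
    sizes = range(int(1e5), int(2e6) + 1, int(1e5))
    return list(zip(sizes, heap(corp, sizes)))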
def get_filters(filter_dir, k, names, param_name, param_ls):
    filters_dict = {}
    for param in param_ls:
        all_samples = corpora_from_pickles(filter_dir, names=names)
        cur_param_filters = [Sentences(c) for name_d, c in all_samples
                             if name_d["k"] == k and name_d[param_name] == param]
        filters_dict[param] = cur_param_filters
    return filters_dict
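# Usage sketch (assumed): load the typicality-filtered (TF) subcorpora keyed by
# filtering factor, mirroring the call made in the evaluation script below;
# `lang` and `factors` would come from that script's command-line arguments.
def _example_load_tf_filters(lang, factors):
    return get_filters("results/" + lang + "/TF/", k=1000000,
                       names=["k", "f", "i"], param_name="f", param_ls=factors)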
def filter_worker(i):
    # Note: relies on module-level names set up by the enclosing script
    # (mp_array, zipf_model, rank_dict, auto_typ, n, factor, epsilon_f_minus, lt, lang).
    print("started ", i)
    cur_seed = int.from_bytes(os.urandom(4), byteorder='little')
    rand.seed(cur_seed)

    filtered = list(filter_typicality_incremental(mp_array, zipf_model, rank_dict,
                                                  auto_typ, n, factor*epsilon_f_minus, lt))
    filtered_freqs = compute_freqs(Sentences(filtered))
    print("filtered ", i, " typicality: ",
          typicality(zipf_model, merge_to_joint(rank_dict, filtered_freqs)))

    name = "_".join((str(n), str(factor), str(i)))
    corpus_to_pickle(filtered, "results/" + lang + "/TF", name)
def establish_typical_set(corpus, rank_dict, zipf_model, n, m):
    typicalities = []
    for i in range(m):
        sub = Sentences.subsample(corpus, n)
        sub_freqs = compute_freqs(sub)
        sub_joints = merge_to_joint(rank_dict, sub_freqs)

        sub_typicality = typicality(zipf_model, sub_joints)
        typicalities.append(sub_typicality)

    mean_typ, std_typ = np.mean(typicalities), np.var(typicalities)**.5
    return mean_typ, std_typ
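# Usage sketch (assumed): turn the subsample typicality statistics into the
# acceptance band used by the filtering script below, where `factor` controls
# how many standard deviations of slack are allowed around the corrected mean.
def _example_epsilon_band(corpus, rank_dict, zipf_model, auto_typ, n, factor):
    mean_typ, std_typ = establish_typical_set(corpus, rank_dict, zipf_model, n, m=100)
    epsilon_f_plus = abs(mean_typ - auto_typ) + std_typ * factor
    epsilon_f_minus = -epsilon_f_plus
    return epsilon_f_minus, epsilon_f_plus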
def get_reference_dist(wiki):
    n = int(10e6)
    m = 10

    wiki_ls = list(wiki)
    subsamples = [Sentences.subsample(wiki_ls, n) for _ in range(m)]
    mean_ranks, mean_freqs = mean_rank_freq_from_samples(subsamples)

    joints = merge_to_joint(mean_ranks, mean_freqs)
    xs, ys = list(zip(*sorted(joints.values())))

    mandelbrot = Mandelbrot(ys, xs)
    mandelbrot_fit = mandelbrot.fit(start_params=np.asarray([1.0, 1.0]),
                                    method="powell", full_output=True)
    mandelbrot.register_fit(mandelbrot_fit)
    mandelbrot.print_result()

    return mandelbrot, mean_ranks
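# Usage sketch (assumed): fit the reference Zipf-Mandelbrot distribution once,
# then score a fresh subsample against it, following the same pattern as
# establish_typical_set above.
def _example_score_subsample(wiki, n=int(1e6)):
    zipf_model, rank_dict = get_reference_dist(wiki)
    sub_freqs = compute_freqs(Sentences.subsample(list(wiki), n))
    return typicality(zipf_model, merge_to_joint(rank_dict, sub_freqs))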
setup_m = 100
m = 10

wiki = list(wiki_from_pickles("data/" + lang + "_pkl"))
sents = [s for a in wiki for s in a]

zipf_model, rank_dict, mean_typ, std_typ, auto_typ = setup_filtering(wiki,
                                                                     big_n(wiki),
                                                                     n, setup_m)

mean_corrected = abs(mean_typ - auto_typ)
epsilon_f_plus = mean_corrected + std_typ*factor
epsilon_f_minus = -epsilon_f_plus

print("\nModel and Epsilon established")
print(auto_typ, mean_typ, std_typ)
print(epsilon_f_minus, epsilon_f_plus)

for m_i in range(m):
    print("started ", m_i)
    filtered = list(filter_typicality_incremental(sents, zipf_model, rank_dict,
                                                  auto_typ, n, epsilon_f_minus, lt))
    filtered_freqs = compute_freqs(Sentences(filtered))
    print("filtered ", m_i, " typicality: ",
          typicality(zipf_model, merge_to_joint(rank_dict, filtered_freqs)))

    name = "_".join((str(n), str(factor), str(m_i)))
    corpus_to_pickle(filtered, "results/" + lang + "/TF", name)
print("ARGS: ", lang, factors, hist_lens, "\n") d = "results/" + lang + "/" results_d = d + "evaluation/" k = 1000000 srfs = get_filters(d + "SRF/", k, ["k", "h", "i"], "h", hist_lens) tfs = get_filters(d + "TF/", k, ["k", "f", "i"], "f", factors) highest_three_factors = factors[-3:] three_tfs = {k: tfs[k] for k in highest_three_factors} highest_three_hist_lens = hist_lens[-3:] three_srfs = {k: srfs[k] for k in highest_three_hist_lens} unis = [ Sentences(c) for _, c in corpora_from_pickles(d + "UNI", names=["k", "i"]) ] uni_mean_ranks, uni_mean_freqs = mean_rank_freq_from_samples(unis) uni_joints = merge_to_joint(uni_mean_ranks, uni_mean_freqs) uni_xs, uni_ys = list(zip(*sorted(uni_joints.values()))) print("filters loaded", flush=True) # MLEs tf_mles, srf_mles, uni_mandel = do_mles(tfs, srfs, unis) with open(results_d + "mle_mandelbrot.txt", "w") as handle: for param, mandel in tf_mles.items(): handle.write("TF " + str(param))
if __name__ == "__main__": n = 100000 d = "results/ALS/" # GET UNIVERSE wiki = list(wiki_from_pickles("data/ALS_pkl")) sent_d, label_f = number_sents((s for a in wiki for s in a)) word_d, word_label_f = number_words((w for a in wiki for s in a for w in s)) ## LOAD CORPORA # SRFs srf_samples = list(corpora_from_pickles(d + "SRF", names=["n", "h", "i"])) srf_10 = [ Sentences(c) for name_d, c in srf_samples if name_d["n"] == n and name_d["h"] == 10 ] srf_20 = [ Sentences(c) for name_d, c in srf_samples if name_d["n"] == n and name_d["h"] == 20 ] srf_30 = [ Sentences(c) for name_d, c in srf_samples if name_d["n"] == n and name_d["h"] == 30 ] #TFs tf_samples = list(corpora_from_pickles(d + "TF", names=["n", "f", "i"])) tf_50 = [ Sentences(c) for name_d, c in tf_samples if name_d["n"] == n and name_d["f"] == 50
# -*- coding: utf-8 -*-

import argparse

from data.reader import wiki_from_pickles, corpus_to_pickle
from data.corpus import Sentences
from stats.stat_functions import compute_freqs, merge_to_joint


def parse_args():
    p = argparse.ArgumentParser()
    p.add_argument("--lang", type=str)
    p.add_argument("--n_tokens", type=int)
    args = p.parse_args()
    return args.lang, args.n_tokens


if __name__ == "__main__":
    lang, n = parse_args()
    m = 10

    wiki = list(wiki_from_pickles("data/" + lang + "_pkl"))

    for i in range(m):
        sampled = Sentences.subsample(wiki, n)
        sampled_sents = list(sampled.sentences())

        name = "_".join((str(n), str(i)))
        corpus_to_pickle(sampled_sents, "results/" + lang + "/UNI", name)
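# Example invocation (the script name is a placeholder; the language code ALS
# and token count mirror the other scripts in this repository):
#   python3 subsample_uni.py --lang ALS --n_tokens 100000
# This writes ten n-token subsamples to results/ALS/UNI/.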
    hexbin_plot(xs, ys, xlbl=r"$\log$ $r(w)$", ylbl=r"$\log$ $f(w)$")
    plt.savefig(d + "rank_freq_" + str(n) + "_piantadosi.png", dpi=300)
    plt.close()


if __name__ == "__main__":
    lang = parse_args()
    d = "results/" + lang + "/plots/"

    wiki = list(wiki_from_pickles("data/" + lang + "_pkl"))
    n = int(10e6)
    m = 10

    zipf_wrong(wiki, n, d)
    zipf_piantadosi(wiki, n, d)

    subsamples1 = (Sentences.subsample(wiki, n) for _ in range(m))
    subsamples2 = (Sentences.subsample(wiki, n) for _ in range(m))

    ranks = [compute_ranks(sub) for sub in subsamples1]
    ranks_joined = pool_ranks(ranks)
    mean_ranks = reduce_pooled(ranks_joined)

    freqs = [compute_freqs(sub) for sub in subsamples2]
    freqs_joined = pool_freqs(freqs)
    mean_freqs = reduce_pooled(freqs_joined)

    print("subsampling done")

    joints = merge_to_joint(mean_ranks, mean_freqs)
    xs, ys = list(zip(*sorted(joints.values())))
    c_vec1, c_vec2 = ([cs1[x] for x in sorted(universe)],
                      [cs2[x] for x in sorted(universe)])
    return (sum(min(one, two) for one, two in zip(c_vec1, c_vec2)) /
            sum(max(one, two) for one, two in zip(c_vec1, c_vec2)))


if __name__ == "__main__":
    n = 100000
    d = "results/ALS/"

    wiki = list(wiki_from_pickles("data/ALS_pkl"))
    print("Total num sents", len([s for a in wiki for s in a]))

    srf_samples = corpora_from_pickles(d + "SRF", names=["n", "h", "i"])
    srf_30 = [Sentences(c) for name_d, c in srf_samples
              if name_d["n"] == n and name_d["h"] == 30]
    tf_samples = corpora_from_pickles(d + "TF", names=["n", "f", "i"])
    tf_100 = [Sentences(c) for name_d, c in tf_samples
              if name_d["n"] == n and name_d["f"] == 100]
    uni_samples = corpora_from_pickles(d + "UNI", names=["n", "i"])
    uni = [Sentences(c) for name_d, c in uni_samples if name_d["n"] == n]

    for subcorp_set, name in zip([srf_30, tf_100, uni], ["SRF", "TF", "UNI"]):
        print("\n", name)
        shuffled_sents = rand.permutation([s for subcorp in subcorp_set