def zipf_wrong(wiki, n, d):
    """Plot the naive rank-frequency relationship where ranks and
    frequencies are estimated from the SAME subsample (the statistically
    biased version — hence "_wrong" in the output filename).

    Parameters
    ----------
    wiki : corpus to subsample from
    n    : int, subsample size passed to Articles.subsample
    d    : str, directory prefix for the saved PNG
    """
    subcorp = Articles.subsample(wiki, n)
    ranks, freqs = compute_ranks(subcorp), compute_freqs(subcorp)
    joints = merge_to_joint(ranks, freqs)
    xs, ys = list(zip(*sorted(joints.values())))
    # Raw strings: "\l" is an invalid escape sequence and triggers a
    # SyntaxWarning on modern Python; the rendered label is unchanged.
    hexbin_plot(xs, ys, xlbl=r"$\log$ $r(w)$", ylbl=r"$\log$ $f(w)$")
    plt.savefig(d + "rank_freq_" + str(n) + "_wrong.png", dpi=300)
    plt.close()
def filter_worker(i):
    """Worker run i: build one typicality-filtered corpus and pickle it.

    Reads module-level state (mp_array, zipf_model, rank_dict, auto_typ,
    n, factor, epsilon_f_minus, lt, lang) set up by the parent process.
    """
    print("started ", i)
    # Reseed from OS entropy so every worker draws an independent stream.
    seed_bytes = os.urandom(4)
    rand.seed(int.from_bytes(seed_bytes, byteorder='little'))
    filtered = list(filter_typicality_incremental(
            mp_array, zipf_model, rank_dict, auto_typ, n,
            factor * epsilon_f_minus, lt))
    freq_dict = compute_freqs(Sentences(filtered))
    joint = merge_to_joint(rank_dict, freq_dict)
    print("filtered ", i, " typicality: ", typicality(zipf_model, joint))
    name = "_".join((str(n), str(factor), str(i)))
    corpus_to_pickle(filtered, "results/" + lang + "/TF", name)
def mean_rank_freq_from_samples(sample_ls):
    """Split the samples into two random halves; estimate mean ranks from
    one half and mean frequencies from the other, so the two statistics
    come from independent data.

    Parameters
    ----------
    sample_ls : list of corpus samples

    Returns
    -------
    (mean_ranks, mean_freqs) : the reduced pooled rank and freq estimates
    """
    # Permute INDICES rather than the sample objects themselves:
    # np.random.permutation(sample_ls) coerces the samples into an ndarray,
    # and equal-length sequence samples would silently become rows of a
    # 2-D array instead of the original objects.
    perm = np.random.permutation(len(sample_ls))
    half = len(sample_ls) // 2
    samples1 = [sample_ls[j] for j in perm[:half]]
    samples2 = [sample_ls[j] for j in perm[half:]]
    ranks = [compute_ranks(sub) for sub in samples1]
    mean_ranks = reduce_pooled(pool_ranks(ranks))
    freqs = [compute_freqs(sub) for sub in samples2]
    mean_freqs = reduce_pooled(pool_freqs(freqs))
    return mean_ranks, mean_freqs
def get_mean_relationship(sampling_level, wiki, n, m):
    """Estimate mean ranks and mean frequencies, each pooled over m
    independent size-n subsamples drawn at the given sampling level.

    Returns (mean_ranks, mean_freqs).
    """
    # First m draws feed the rank estimate, the next m the freq estimate
    # (same draw order as two lazily-consumed generators).
    rank_dicts = [compute_ranks(sampling_level.subsample(wiki, n))
                  for _ in range(m)]
    freq_dicts = [compute_freqs(sampling_level.subsample(wiki, n))
                  for _ in range(m)]
    mean_ranks = reduce_pooled(pool_ranks(rank_dicts))
    mean_freqs = reduce_pooled(pool_freqs(freq_dicts))
    return mean_ranks, mean_freqs
def zipf_piantadosi(wiki, n, d):
    """Plot the rank-frequency relationship with ranks and frequencies
    estimated from two INDEPENDENT subsamples (the bias-corrected setup,
    cf. the "_wrong" single-sample variant elsewhere in this file).

    Parameters
    ----------
    wiki : corpus to subsample from
    n    : int, subsample size passed to Words.subsample
    d    : str, directory prefix for the saved PNG
    """
    subcorp1 = Words.subsample(wiki, n)
    subcorp2 = Words.subsample(wiki, n)
    ranks = compute_ranks(subcorp1)
    freqs = compute_freqs(subcorp2)
    joints = merge_to_joint(ranks, freqs)
    xs, ys = list(zip(*sorted(joints.values())))
    # Raw strings: "\l" is an invalid escape sequence and triggers a
    # SyntaxWarning on modern Python; the rendered label is unchanged.
    hexbin_plot(xs, ys, xlbl=r"$\log$ $r(w)$", ylbl=r"$\log$ $f(w)$")
    plt.savefig(d + "rank_freq_" + str(n) + "_piantadosi.png", dpi=300)
    plt.close()
def establish_typical_set(corpus, rank_dict, zipf_model, n, m):
    """Draw m size-n subsamples, score each one's typicality under
    zipf_model, and return (mean, standard deviation) of those scores."""
    typicalities = []
    for _ in range(m):
        sub_freqs = compute_freqs(Sentences.subsample(corpus, n))
        joint = merge_to_joint(rank_dict, sub_freqs)
        typicalities.append(typicality(zipf_model, joint))
    # std as sqrt of the (population) variance, as in the original
    return np.mean(typicalities), np.var(typicalities) ** 0.5
def variance_main(wiki, n, m, save_dir="./"):
    """Run the variance analyses: pooled-vs-mean comparison, MLE fits, and
    across-word covariance, over m rank- and m freq-subsamples of size n."""
    # First m draws feed the rank dicts, the next m the freq dicts
    # (same draw order as two lazily-consumed generators).
    rank_dicts = [compute_ranks(Sentences.subsample(wiki, n))
                  for _ in range(m)]
    freq_dicts = [compute_freqs(Sentences.subsample(wiki, n))
                  for _ in range(m)]
    pooled_ranks = pool_ranks(rank_dicts)
    pooled_freqs = pool_freqs(freq_dicts)
    mean_vs_pooled(pooled_ranks, pooled_freqs, save_dir)
    do_mles(rank_dicts, freq_dicts, save_dir)
    covariance_across_words(pooled_ranks, pooled_freqs, save_dir)
def get_model(corpus, n):
    """Fit a Mandelbrot model to a size-n subsample of the corpus.

    Ranks and frequencies come from two independent subsamples; the model
    is fit with Powell's method and its auto-typicality (typicality of the
    model against its own fitting data) is computed.

    Returns (big_ranks, fitted_mandelbrot, auto_typicality).
    """
    big_ranks = compute_ranks(Sentences.subsample(corpus, n))
    freqs = compute_freqs(Sentences.subsample(corpus, n))
    joint = merge_to_joint(big_ranks, freqs)
    xs, ys = list(zip(*sorted(joint.values())))
    mandelbrot = Mandelbrot(ys, xs)
    fit_result = mandelbrot.fit(start_params=np.asarray([1.0, 1.0]),
                                method="powell", full_output=True)
    mandelbrot.register_fit(fit_result)
    mandelbrot.print_result()
    auto_typ = typicality(mandelbrot, joint)
    return big_ranks, mandelbrot, auto_typ
# ---------------------------------------------------------------------------
# Typicality-filtering driver (module-level script).
# NOTE(review): relies on names defined elsewhere in this file or its
# imports — lang, n, factor, lt, wiki_from_pickles, setup_filtering, big_n,
# filter_typicality_incremental, Sentences, compute_freqs, typicality,
# merge_to_joint, corpus_to_pickle — confirm they are in scope before this runs.
# ---------------------------------------------------------------------------
setup_m = 100  # number of subsamples used by setup_filtering to calibrate
m = 10         # number of filtered corpora to generate
wiki = list(wiki_from_pickles("data/"+lang+"_pkl"))
# flatten articles into one list of sentences
sents = [s for a in wiki for s in a]
zipf_model, rank_dict, mean_typ, std_typ, auto_typ = \
        setup_filtering(wiki, big_n(wiki), n, setup_m)
# epsilon band: |mean - auto| widened by `factor` standard deviations;
# the minus bound is its negation
mean_corrected = abs(mean_typ - auto_typ)
epsilon_f_plus = mean_corrected + std_typ*factor
epsilon_f_minus = - epsilon_f_plus
print("\nModel and Epsilon established")
print(auto_typ, mean_typ, std_typ)
print(epsilon_f_minus, epsilon_f_plus)
for m_i in range(m):
    print("started ", m_i)
    # incrementally filter sentences against the model; presumably keeps the
    # running typicality within epsilon_f_minus of auto_typ (direction via lt)
    # — TODO confirm against filter_typicality_incremental's definition
    filtered = list(filter_typicality_incremental(sents, zipf_model,
            rank_dict, auto_typ, n, epsilon_f_minus, lt))
    filtered_freqs = compute_freqs(Sentences(filtered))
    print("filtered ", m_i, " typicality: ",
          typicality(zipf_model, merge_to_joint(rank_dict, filtered_freqs)))
    # output name encodes subsample size, factor, and run index
    name = "_".join((str(n), str(factor), str(m_i)))
    corpus_to_pickle(filtered, "results/" + lang + "/TF", name)
def samples_to_typicality(samples, ref_dist, rank_dict):
    """Score each sample's typicality against the reference distribution.

    Returns a list with one typicality value per sample, in order.
    """
    scores = []
    for sample in samples:
        joint = merge_to_joint(rank_dict, compute_freqs(sample))
        scores.append(typicality(ref_dist, joint))
    return scores
# ---------------------------------------------------------------------------
# Rank-frequency plotting driver (module-level script).
# NOTE(review): relies on names defined elsewhere — lang, Sentences,
# compute_ranks/compute_freqs, pool_ranks/pool_freqs, reduce_pooled,
# merge_to_joint, hexbin_plot, Mandelbrot — confirm in scope at run time.
# ---------------------------------------------------------------------------
d = "results/" + lang + "/plots/"
wiki = list(wiki_from_pickles("data/" + lang + "_pkl"))
n = int(10e6)  # 10e6 == 1e7, i.e. ten million tokens per subsample
m = 10         # number of subsamples per estimate
zipf_wrong(wiki, n, d)
zipf_piantadosi(wiki, n, d)
# m independent draws for ranks and m for freqs, so the two estimates
# come from disjoint data
subsamples1 = (Sentences.subsample(wiki, n) for _ in range(m))
subsamples2 = (Sentences.subsample(wiki, n) for _ in range(m))
ranks = [compute_ranks(sub) for sub in subsamples1]
ranks_joined = pool_ranks(ranks)
mean_ranks = reduce_pooled(ranks_joined)
freqs = [compute_freqs(sub) for sub in subsamples2]
freqs_joined = pool_freqs(freqs)
mean_freqs = reduce_pooled(freqs_joined)
print("subsampling done")
joints = merge_to_joint(mean_ranks, mean_freqs)
xs, ys = list(zip(*sorted(joints.values())))
# Raw strings: "\l" is an invalid escape sequence and triggers a
# SyntaxWarning on modern Python; the rendered labels are unchanged.
hexbin_plot(xs, ys, xlbl=r"$\log$ $r(w)$", ylbl=r"$\log$ $f(w)$", min_y=1)
mandelbrot = Mandelbrot(ys, xs)
mandelbrot_fit = mandelbrot.fit(
        start_params=np.asarray([10.0, 1000.0]),  # [1.0, 1.0]
        method="powell", full_output=True)