# NOTE(review): fragment of a larger analysis script — names such as plt,
# results_d, tfs, srfs, unis, factors, hist_lens, lang, plot_lims,
# across_filter_plots, wiki_from_pickles, get_reference_dist and
# typicality_distributions are defined earlier in the file (not visible here).

# Finish the within-filter rank/frequency plot: fix the y-axis range,
# save the figure, and release it.
plt.ylim(plot_lims[1])  # presumably plot_lims = (xlims, ylims) — TODO confirm
plt.savefig(results_d + "SRF_within_comp_rank_freq_MLE.png", dpi=300)
plt.close()
print("compared within", flush=True)

# Across-filter comparison: contrast the TF sample with the largest factor
# against the SRF sample with the largest history length.
max_f, max_h = max(factors), max(hist_lens)
across_filter_plots(tfs[max_f], srfs[max_h], max_f, max_h, unis)
plt.savefig(results_d + "across_comp_rank_freq.png", dpi=300)
plt.close()
print("compared across", flush=True)

# Typicality distributions: build a reference distribution from the full
# wiki corpus, then compute/plot per-filter typicality statistics.
wiki_iter = wiki_from_pickles("data/" + lang + "_pkl")
ref_dist, big_mean_ranks = get_reference_dist(wiki_iter)

tf_means, srf_means, (uni_mean, uni_std) = typicality_distributions(
    tfs, srfs, unis, ref_dist, big_mean_ranks)
plt.savefig(results_d + "typicality_distribution.png", dpi=300)
plt.close()

# Persist mean/stddev of the typicality scores, one tab-separated row
# per filter parameter (TF = typicality filter, SRF = speaker-restricted
# filter — naming assumed from surrounding file, verify).
with open(results_d + "typicality_mean_stddev.txt", "w") as handle:
    for param, (mean_typ, std_typ) in tf_means.items():
        handle.write("\nTF " + str(param))
        handle.write("\t" + str(round(mean_typ, 3)) +
                     "\t" + str(round(std_typ, 3)))

    for param, (mean_typ, std_typ) in srf_means.items():
        handle.write("\nSRF " + str(param))
        handle.write("\t" + str(round(mean_typ, 3)) +
                     "\t" + str(round(std_typ, 3)))
# NOTE(review): fragment — the enclosing `def parse_args():` header and the
# ArgumentParser construction (`p = argparse.ArgumentParser(...)`) lie before
# this chunk; the statements below are therefore function-body statements.
    p.add_argument("--lang", type=str)
    p.add_argument("--n_tokens", type=int)
    p.add_argument("--factor", type=float,
                   help="The factor to multiply epsilon with; determines"
                   "the degree of atypicality.")
    args = p.parse_args()
    # Returns (lang, n_tokens, factor) as parsed from the command line.
    return args.lang, args.n_tokens, args.factor


if __name__ == "__main__":
    lang, n, factor = parse_args()

    # Token-count estimate for the whole corpus, scaled by .49 —
    # magic constant, reason not visible in this chunk; TODO confirm.
    big_n = lambda wiki: len([w for a in wiki for s in a for w in s])*.49

    setup_m = 100  # number of samples drawn while establishing the model
    m = 10         # presumably samples per filter run — verify against callers

    wiki = list(wiki_from_pickles("data/"+lang+"_pkl"))
    sents = [s for a in wiki for s in a]

    # setup_filtering is defined elsewhere in the file; it yields the fitted
    # Zipf model plus typicality statistics of the reference corpus.
    zipf_model, rank_dict, mean_typ, std_typ, auto_typ = setup_filtering(
        wiki, big_n(wiki), n, setup_m)

    # Centre the acceptance band on the model's own typicality (auto_typ),
    # then widen it by `factor` standard deviations; the resulting band
    # [epsilon_f_minus, epsilon_f_plus] is symmetric around zero.
    mean_corrected = abs(mean_typ - auto_typ)
    epsilon_f_plus = mean_corrected + std_typ*factor
    epsilon_f_minus = - epsilon_f_plus

    print("\nModel and Epsilon established")
    print(auto_typ, mean_typ, std_typ)
    print(epsilon_f_minus, epsilon_f_plus)
    # NOTE(review): fragment — tail of an overlap/similarity function whose
    # `def` header lies before this chunk. cs1/cs2 are presumably
    # collections.Counter instances (missing keys default to 0) — TODO
    # confirm; plain dicts would raise KeyError for keys present in only
    # one of the two counters.
    if not universe:
        # Default universe: union of all keys observed in either counter.
        universe = cs1.keys() | cs2.keys()

    # Align both counters on the same sorted key order so the vectors are
    # positionally comparable.
    c_vec1, c_vec2 = [cs1[x] for x in sorted(universe)],\
                     [cs2[x] for x in sorted(universe)]

    # Weighted Jaccard similarity: sum of element-wise minima divided by
    # sum of element-wise maxima.
    return (sum(min(one, two) for one, two in zip(c_vec1, c_vec2)) /
            sum(max(one, two) for one, two in zip(c_vec1, c_vec2)))


if __name__ == "__main__":
    n = 100000          # sub-corpus size selector used in the filters below
    d = "results/ALS/"  # result directory for the ALS language

    # GET UNIVERSE: enumerate sentences and words of the full ALS corpus.
    # number_sents / number_words are defined elsewhere in the file.
    wiki = list(wiki_from_pickles("data/ALS_pkl"))
    sent_d, label_f = number_sents((s for a in wiki for s in a))
    word_d, word_label_f = number_words((w for a in wiki for s in a for w in s))

    ## LOAD CORPORA
    # SRFs: pickled sub-corpora named by (n, h, i); keep only those with the
    # requested size n and history length h = 10 resp. 20.
    srf_samples = list(corpora_from_pickles(d + "SRF", names=["n", "h", "i"]))
    srf_10 = [Sentences(c) for name_d, c in srf_samples
              if name_d["n"] == n and name_d["h"] == 10]
    srf_20 = [Sentences(c) for name_d, c in srf_samples
              if name_d["n"] == n and name_d["h"] == 20]