Пример #1
0
    plt.ylim(plot_lims[1])
    plt.savefig(results_d + "SRF_within_comp_rank_freq_MLE.png", dpi=300)
    plt.close()

    print("compared within", flush=True)

    # across filter comparisons
    max_f, max_h = max(factors), max(hist_lens)
    across_filter_plots(tfs[max_f], srfs[max_h], max_f, max_h, unis)
    plt.savefig(results_d + "across_comp_rank_freq.png", dpi=300)
    plt.close()

    print("compared across", flush=True)

    # typicality distributions
    wiki_iter = wiki_from_pickles("data/" + lang + "_pkl")
    ref_dist, big_mean_ranks = get_reference_dist(wiki_iter)
    tf_means, srf_means, (uni_mean, uni_std) = typicality_distributions(
        tfs, srfs, unis, ref_dist, big_mean_ranks)
    plt.savefig(results_d + "typicality_distribution.png", dpi=300)
    plt.close()

    with open(results_d + "typicality_mean_stddev.txt", "w") as handle:
        for param, (mean_typ, std_typ) in tf_means.items():
            handle.write("\nTF " + str(param))
            handle.write("\t" + str(round(mean_typ, 3)) + "\t" +
                         str(round(std_typ, 3)))
        for param, (mean_typ, std_typ) in srf_means.items():
            handle.write("\nSRF " + str(param))
            handle.write("\t" + str(round(mean_typ, 3)) + "\t" +
                         str(round(std_typ, 3)))
Пример #2
0
    p.add_argument("--lang", type=str)
    p.add_argument("--n_tokens", type=int)
    p.add_argument("--factor", type=float,
                   help="The factor to multiply epsilon with; determines"
                   "the degree of atypicality.")
    
    args = p.parse_args()
    return args.lang, args.n_tokens, args.factor

if __name__ == "__main__":
    # Command-line parameters: language code, token count and the
    # multiplier applied to the standard deviation when building epsilon.
    lang, n, factor = parse_args()

    def big_n(wiki):
        """Return 0.49 times the number of words in *wiki*.

        *wiki* is traversed as articles -> sentences -> words.
        """
        # Count lazily instead of materialising a list of every word only
        # to take its length.
        return sum(1 for a in wiki for s in a for w in s) * .49

    setup_m = 100
    # NOTE(review): `m` and `sents` are not read in the statements shown
    # here; kept in case later code relies on them.
    m = 10

    # Materialise the corpus once: it is traversed several times below.
    wiki = list(wiki_from_pickles("data/" + lang + "_pkl"))
    sents = [s for a in wiki for s in a]

    # setup_filtering is defined elsewhere; it yields five values which
    # are unpacked in this order.
    zipf_model, rank_dict, mean_typ, std_typ, auto_typ = setup_filtering(
        wiki, big_n(wiki), n, setup_m)

    # Epsilon bounds are symmetric around zero: the absolute gap between
    # mean_typ and auto_typ, widened by `factor` times std_typ.
    mean_corrected = abs(mean_typ - auto_typ)
    epsilon_f_plus = mean_corrected + std_typ * factor
    epsilon_f_minus = -epsilon_f_plus

    print("\nModel and Epsilon established")
    print(auto_typ, mean_typ, std_typ)
    print(epsilon_f_minus, epsilon_f_plus)
    
Пример #3
0
    if not universe:
        universe = cs1.keys() | cs2.keys()

    c_vec1, c_vec2 = [cs1[x] for x in sorted(universe)],\
                    [cs2[x] for x in sorted(universe)]
    return (sum(min(one, two) for one, two in zip(c_vec1, c_vec2)) /
            sum(max(one, two) for one, two in zip(c_vec1, c_vec2)))


if __name__ == "__main__":
    # Corpus size used to select samples, and the directory holding them.
    n = 100000
    d = "results/ALS/"

    # Build the universe from the full wiki.
    # The articles are materialised once because they are traversed twice.
    wiki = list(wiki_from_pickles("data/ALS_pkl"))
    # number_sents / number_words are defined elsewhere; each returns a
    # pair that is unpacked here (exact semantics not visible in this block).
    sent_d, label_f = number_sents(
        (sentence for article in wiki for sentence in article)
    )
    word_d, word_label_f = number_words(
        (word for article in wiki for sentence in article for word in sentence)
    )

    # Load the sampled corpora.
    # SRF samples: each item pairs a dict of parsed name fields
    # ("n", "h", "i") with the corpus itself.
    srf_samples = list(
        corpora_from_pickles(d + "SRF", names=["n", "h", "i"])
    )
    # Keep only samples of size n, split by their "h" value (10 and 20).
    srf_10 = [
        Sentences(corpus)
        for fields, corpus in srf_samples
        if fields["n"] == n and fields["h"] == 10
    ]
    srf_20 = [
        Sentences(corpus)
        for fields, corpus in srf_samples
        if fields["n"] == n and fields["h"] == 20
    ]