Example #1
def get_mean_relationship(wiki, n, m, freq_func):
    subsamples1 = (Sentences.subsample(wiki, n) for _ in range(m))
    subsamples2 = (Sentences.subsample(wiki, n) for _ in range(m))

    ranks = [compute_ranks(sub) for sub in subsamples1]
    ranks_joined = pool_ranks(ranks)
    mean_ranks = reduce_pooled(ranks_joined)

    freqs = [freq_func(sub) for sub in subsamples2]
    freqs_joined = pool_freqs(freqs)
    mean_freqs = reduce_pooled(freqs_joined)

    return mean_ranks, mean_freqs
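
A minimal call sketch for Example #1 (hypothetical values; passing compute_freqs as freq_func mirrors Examples #2 and #14 and is an assumption here):

# Hypothetical usage: wiki loaded as in Example #13; n and m are placeholders.
mean_ranks, mean_freqs = get_mean_relationship(wiki, n=100000, m=10,
                                               freq_func=compute_freqs)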
Example #2
def variance_main(wiki, n, m, save_dir="./"):
    subsamples1 = (Sentences.subsample(wiki, n) for _ in range(m))
    subsamples2 = (Sentences.subsample(wiki, n) for _ in range(m))

    ranks = [compute_ranks(sub) for sub in subsamples1]
    ranks_joined = pool_ranks(ranks)

    freqs = [compute_freqs(sub) for sub in subsamples2]
    freqs_joined = pool_freqs(freqs)

    mean_vs_pooled(ranks_joined, freqs_joined, save_dir)

    do_mles(ranks, freqs, save_dir)

    covariance_across_words(ranks_joined, freqs_joined, save_dir)
Example #3
def heap(corp, rng):
    vocab_sizes = []
    for ntoks in rng:
        subsample = Sentences(sent_subsample(corp, ntoks))
        vocab_size = compute_vocab_size(subsample)
        vocab_sizes.append(vocab_size)
    return vocab_sizes
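
A hypothetical call for Example #3, assuming rng is an iterable of target token counts and corp is a corpus in the same format as the other examples:

# Hypothetical usage; the range bounds and step are placeholders.
rng = range(10000, 1000010, 10000)
vocab_sizes = heap(corp, rng)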
Example #4
def get_model(corpus, n):
    big_ranks = compute_ranks(Sentences.subsample(corpus, n))
    freqs = compute_freqs(Sentences.subsample(corpus, n))

    joint = merge_to_joint(big_ranks, freqs)

    xs, ys = list(zip(*sorted(joint.values())))

    mandelbrot = Mandelbrot(ys, xs)
    mandelbrot_fit = mandelbrot.fit(start_params=np.asarray([1.0, 1.0]),
                                    method="powell",
                                    full_output=True)
    mandelbrot.register_fit(mandelbrot_fit)
    mandelbrot.print_result()
    auto_typ = typicality(mandelbrot, joint)

    return big_ranks, mandelbrot, auto_typ
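
A hypothetical call for Example #4; the names on the left follow the variables that Examples #7 and #10 appear to use for the fitted model and its automatic typicality (an inference, not stated in this snippet):

# Hypothetical usage; n is a placeholder sample size.
big_ranks, zipf_model, auto_typ = get_model(corpus, n=100000)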
Example #5
def heap(corp, rng):
    vocab_sizes = []
    for i, ntoks in enumerate(rng):
        if i % 10 == 0:
            print(i, ntoks)
        subsample = Sentences.subsample(corp, ntoks)
        vocab_size = compute_vocab_size(subsample)
        vocab_sizes.append(vocab_size)

    return vocab_sizes
Example #6
def get_filters(filter_dir, k, names, param_name, param_ls):
    filters_dict = {}
    
    for param in param_ls:
        all_samples = corpora_from_pickles(filter_dir, names=names)
        cur_param_filters = [Sentences(c) for name_d, c in all_samples if 
                             name_d["k"] == k and name_d[param_name] == param]
        filters_dict[param] = cur_param_filters
        
    return filters_dict
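
Example #11 below shows how this helper is called from the evaluation script:

srfs = get_filters(d + "SRF/", k, ["k", "h", "i"], "h", hist_lens)
tfs = get_filters(d + "TF/", k, ["k", "f", "i"], "f", factors)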
Example #7
def filter_worker(i):
    print("started ", i)
    cur_seed = int.from_bytes(os.urandom(4), byteorder='little')
    rand.seed(cur_seed)
    filtered = list(filter_typicality_incremental(mp_array, zipf_model,
                    rank_dict, auto_typ, n, factor*epsilon_f_minus, lt))
    filtered_freqs = compute_freqs(Sentences(filtered))
    print("filtered ", i, " typicality: ",
          typicality(zipf_model, merge_to_joint(rank_dict, filtered_freqs)))

    name = "_".join((str(n), str(factor), str(i)))
    corpus_to_pickle(filtered, "results/" + lang + "/TF", name)
Example #8
def establish_typical_set(corpus, rank_dict, zipf_model, n, m):
    typicalities = []

    for i in range(m):
        sub = Sentences.subsample(corpus, n)

        sub_freqs = compute_freqs(sub)
        sub_joints = merge_to_joint(rank_dict, sub_freqs)

        sub_typicality = typicality(zipf_model, sub_joints)
        typicalities.append(sub_typicality)

    mean_typ, std_typ = np.mean(typicalities), np.var(typicalities)**.5
    return mean_typ, std_typ
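
A hypothetical call for Example #8; Example #10 suggests such mean/standard-deviation estimates are later turned into an epsilon band around auto_typ, with setup_m = 100 subsamples used at setup time:

# Hypothetical usage; n and m are placeholders.
mean_typ, std_typ = establish_typical_set(corpus, rank_dict, zipf_model,
                                          n=100000, m=100)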
Example #9
def get_reference_dist(wiki):
    n = int(10e6)
    m = 10

    wiki_ls = list(wiki)

    subsamples = [Sentences.subsample(wiki_ls, n) for _ in range(m)]
    mean_ranks, mean_freqs = mean_rank_freq_from_samples(subsamples)
    joints = merge_to_joint(mean_ranks, mean_freqs)
    xs, ys = list(zip(*sorted(joints.values())))
    mandelbrot = Mandelbrot(ys, xs)
    mandelbrot_fit = mandelbrot.fit(start_params=np.asarray([1.0, 1.0]),
                                    method="powell",
                                    full_output=True)
    mandelbrot.register_fit(mandelbrot_fit)
    mandelbrot.print_result()
    return mandelbrot, mean_ranks
Example #10
    setup_m = 100
    m = 10
    
    wiki = list(wiki_from_pickles("data/"+lang+"_pkl"))
    sents = [s for a in wiki for s in a]

    zipf_model, rank_dict, mean_typ, std_typ, auto_typ = setup_filtering(wiki, 
                                                                         big_n(wiki), 
                                                                         n, 
                                                                         setup_m)
    
    mean_corrected = abs(mean_typ - auto_typ)
    epsilon_f_plus = mean_corrected + std_typ*factor
    epsilon_f_minus = - epsilon_f_plus
    
    print("\nModel and Epsilon established")
    print(auto_typ, mean_typ, std_typ)
    print(epsilon_f_minus, epsilon_f_plus)
    
    
    for m_i in range(m):
        print("started ", m_i)        
        filtered = list(filter_typicality_incremental(sents, zipf_model, 
                        rank_dict, auto_typ, n, epsilon_f_minus, lt))
        filtered_freqs = compute_freqs(Sentences(filtered))
        print("filtered ", m_i, " typicality: ", 
              typicality(zipf_model, merge_to_joint(rank_dict, filtered_freqs)))

        
        name = "_".join((str(n), str(factor), str(m_i)))
        corpus_to_pickle(filtered, "results/" + lang + "/TF", name)
Example #11
    print("ARGS: ", lang, factors, hist_lens, "\n")
    d = "results/" + lang + "/"
    results_d = d + "evaluation/"

    k = 1000000

    srfs = get_filters(d + "SRF/", k, ["k", "h", "i"], "h", hist_lens)
    tfs = get_filters(d + "TF/", k, ["k", "f", "i"], "f", factors)

    highest_three_factors = factors[-3:]
    three_tfs = {k: tfs[k] for k in highest_three_factors}
    highest_three_hist_lens = hist_lens[-3:]
    three_srfs = {k: srfs[k] for k in highest_three_hist_lens}

    unis = [
        Sentences(c)
        for _, c in corpora_from_pickles(d + "UNI", names=["k", "i"])
    ]

    uni_mean_ranks, uni_mean_freqs = mean_rank_freq_from_samples(unis)
    uni_joints = merge_to_joint(uni_mean_ranks, uni_mean_freqs)
    uni_xs, uni_ys = list(zip(*sorted(uni_joints.values())))

    print("filters loaded", flush=True)

    # MLEs
    tf_mles, srf_mles, uni_mandel = do_mles(tfs, srfs, unis)

    with open(results_d + "mle_mandelbrot.txt", "w") as handle:
        for param, mandel in tf_mles.items():
            handle.write("TF " + str(param))
Example #12
if __name__ == "__main__":
    n = 100000
    d = "results/ALS/"

    # GET UNIVERSE
    wiki = list(wiki_from_pickles("data/ALS_pkl"))
    sent_d, label_f = number_sents((s for a in wiki for s in a))
    word_d, word_label_f = number_words((w for a in wiki for s in a
                                         for w in s))

    ## LOAD CORPORA
    # SRFs
    srf_samples = list(corpora_from_pickles(d + "SRF", names=["n", "h", "i"]))
    srf_10 = [
        Sentences(c) for name_d, c in srf_samples
        if name_d["n"] == n and name_d["h"] == 10
    ]
    srf_20 = [
        Sentences(c) for name_d, c in srf_samples
        if name_d["n"] == n and name_d["h"] == 20
    ]
    srf_30 = [
        Sentences(c) for name_d, c in srf_samples
        if name_d["n"] == n and name_d["h"] == 30
    ]
    #TFs
    tf_samples = list(corpora_from_pickles(d + "TF", names=["n", "f", "i"]))
    tf_50 = [
        Sentences(c) for name_d, c in tf_samples
        if name_d["n"] == n and name_d["f"] == 50
Example #13
# -*- coding: utf-8 -*-

from data.reader import wiki_from_pickles, corpus_to_pickle
from data.corpus import Sentences
from stats.stat_functions import compute_freqs, merge_to_joint

import argparse


def parse_args():
    p = argparse.ArgumentParser()

    p.add_argument("--lang", type=str)
    p.add_argument("--n_tokens", type=int)

    args = p.parse_args()
    return args.lang, args.n_tokens


if __name__ == "__main__":
    lang, n = parse_args()
    m = 10

    wiki = list(wiki_from_pickles("data/" + lang + "_pkl"))

    for i in range(m):
        sampled = Sentences.subsample(wiki, n)
        sampled_sents = list(sampled.sentences())
        name = "_".join((str(n), str(i)))
        corpus_to_pickle(sampled_sents, "results/" + lang + "/UNI", name)
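
Example #13 is the only complete module in this listing; a hypothetical invocation (the script filename is a placeholder, while the language code and token count follow the values seen in Examples #12 and #15):

python subsample_uni.py --lang ALS --n_tokens 100000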
Example #14
    hexbin_plot(xs, ys, xlbl=r"$\log$ $r(w)$", ylbl=r"$\log$ $f(w)$")
    plt.savefig(d + "rank_freq_" + str(n) + "_piantadosi.png", dpi=300)
    plt.close()


if __name__ == "__main__":
    lang = parse_args()
    d = "results/" + lang + "/plots/"
    wiki = list(wiki_from_pickles("data/" + lang + "_pkl"))
    n = int(10e6)
    m = 10

    zipf_wrong(wiki, n, d)
    zipf_piantadosi(wiki, n, d)

    subsamples1 = (Sentences.subsample(wiki, n) for _ in range(m))
    subsamples2 = (Sentences.subsample(wiki, n) for _ in range(m))

    ranks = [compute_ranks(sub) for sub in subsamples1]
    ranks_joined = pool_ranks(ranks)
    mean_ranks = reduce_pooled(ranks_joined)

    freqs = [compute_freqs(sub) for sub in subsamples2]
    freqs_joined = pool_freqs(freqs)
    mean_freqs = reduce_pooled(freqs_joined)

    print("subsampling done")

    joints = merge_to_joint(mean_ranks, mean_freqs)
    xs, ys = list(zip(*sorted(joints.values())))
Example #15
    c_vec1 = [cs1[x] for x in sorted(universe)]
    c_vec2 = [cs2[x] for x in sorted(universe)]
    return (sum(min(one, two) for one, two in zip(c_vec1, c_vec2)) /
            sum(max(one, two) for one, two in zip(c_vec1, c_vec2)))




if __name__ == "__main__":
    n = 100000
    d = "results/ALS/"
    
    wiki = list(wiki_from_pickles("data/ALS_pkl"))
    print("Total num sents", len([s for a in wiki for s in a]))
    
    
    srf_samples = corpora_from_pickles(d + "SRF", names=["n", "h", "i"])
    srf_30 = [Sentences(c) for name_d, c in srf_samples if name_d["n"] == n and 
                                                  name_d["h"] == 30]
    
    
    tf_samples = corpora_from_pickles(d + "TF", names=["n", "f", "i"])
    tf_100 = [Sentences(c) for name_d, c in tf_samples if name_d["n"] == n and 
                                                  name_d["f"] == 100]    
    
    uni_samples = corpora_from_pickles(d + "UNI", names=["n", "i"])
    uni = [Sentences(c) for name_d, c in uni_samples if name_d["n"] == n]
    
    
    for subcorp_set, name in zip([srf_30, tf_100, uni], ["SRF", "TF", "UNI"]):
        print("\n", name)
        
        shuffled_sents = rand.permutation([s for subcorp in subcorp_set