示例#1
0
def zipf_wrong(wiki, n, d):
    """Plot the naive rank-frequency relationship and save it to disk.

    Ranks and frequencies are estimated from the SAME subsample, which
    induces a spurious correlation between them (hence "wrong").

    Parameters:
        wiki: the full corpus to subsample from (iterable of articles).
        n: subsample size passed to Articles.subsample.
        d: output directory prefix (string; expected to end with a separator).
    """
    subcorp = Articles.subsample(wiki, n)

    # Both statistics come from one subsample -- deliberately correlated.
    ranks, freqs = compute_ranks(subcorp), compute_freqs(subcorp)

    joints = merge_to_joint(ranks, freqs)
    xs, ys = list(zip(*sorted(joints.values())))

    # Raw strings: "\l" is an invalid escape sequence (SyntaxWarning on
    # Python 3.12+, slated to become an error); the string value is unchanged.
    hexbin_plot(xs, ys, xlbl=r"$\log$ $r(w)$", ylbl=r"$\log$ $f(w)$")
    plt.savefig(d + "rank_freq_" + str(n) + "_wrong.png", dpi=300)
    plt.close()
示例#2
0
 def filter_worker(i):
     """Worker i: build one typicality-filtered corpus and pickle it.

     Reads mp_array, zipf_model, rank_dict, auto_typ, n, factor,
     epsilon_f_minus, lt and lang from the enclosing scope.
     """
     print("started ", i)
     # Give each worker an independent RNG state drawn from OS entropy.
     seed = int.from_bytes(os.urandom(4), byteorder='little')
     rand.seed(seed)
     kept = list(filter_typicality_incremental(mp_array, zipf_model,
                                               rank_dict, auto_typ, n,
                                               factor*epsilon_f_minus, lt))
     kept_freqs = compute_freqs(Sentences(kept))
     print("filtered ", i, " typicality: ",
           typicality(zipf_model, merge_to_joint(rank_dict, kept_freqs)))

     name = "_".join((str(n), str(factor), str(i)))
     corpus_to_pickle(kept, "results/" + lang + "/TF", name)
示例#3
0
def mean_rank_freq_from_samples(sample_ls):
    """Estimate mean word ranks and mean word frequencies from disjoint halves.

    The samples are randomly permuted and split in two; ranks are pooled
    from one half and frequencies from the other, so the two estimates
    are statistically independent.
    """
    shuffled = np.random.permutation(sample_ls)
    mid = len(sample_ls) // 2
    rank_half, freq_half = shuffled[:mid], shuffled[mid:]

    # Pool per-sample rank dicts, then reduce to a single mean per word.
    pooled_ranks = pool_ranks([compute_ranks(s) for s in rank_half])
    mean_ranks = reduce_pooled(pooled_ranks)

    # Same pooling/reduction for frequencies on the other half.
    pooled_freqs = pool_freqs([compute_freqs(s) for s in freq_half])
    mean_freqs = reduce_pooled(pooled_freqs)

    return mean_ranks, mean_freqs
示例#4
0
def get_mean_relationship(sampling_level, wiki, n, m):
    """Mean rank and mean frequency per word from m independent subsamples each.

    Two separate sets of m subsamples are drawn so that the rank and
    frequency estimates do not share data.
    """
    def draw():
        # Fresh lazy stream of m subsamples at the requested sampling level.
        return (sampling_level.subsample(wiki, n) for _ in range(m))

    rank_dicts = [compute_ranks(sub) for sub in draw()]
    mean_ranks = reduce_pooled(pool_ranks(rank_dicts))

    freq_dicts = [compute_freqs(sub) for sub in draw()]
    mean_freqs = reduce_pooled(pool_freqs(freq_dicts))

    return mean_ranks, mean_freqs
示例#5
0
def zipf_piantadosi(wiki, n, d):
    """Plot the rank-frequency relationship from two independent subsamples.

    Ranks and frequencies are estimated on separate subsamples (following
    Piantadosi's correction), avoiding the spurious correlation of the
    naive single-sample estimate.

    Parameters:
        wiki: the full corpus to subsample from (iterable of articles).
        n: subsample size passed to Words.subsample.
        d: output directory prefix (string; expected to end with a separator).
    """
    subcorp1 = Words.subsample(wiki, n)
    subcorp2 = Words.subsample(wiki, n)

    # Independent estimates: ranks from one subsample, freqs from the other.
    ranks = compute_ranks(subcorp1)
    freqs = compute_freqs(subcorp2)

    joints = merge_to_joint(ranks, freqs)
    xs, ys = list(zip(*sorted(joints.values())))

    # Raw strings: "\l" is an invalid escape sequence (SyntaxWarning on
    # Python 3.12+, slated to become an error); the string value is unchanged.
    hexbin_plot(xs, ys, xlbl=r"$\log$ $r(w)$", ylbl=r"$\log$ $f(w)$")
    plt.savefig(d + "rank_freq_" + str(n) + "_piantadosi.png", dpi=300)
    plt.close()
示例#6
0
def establish_typical_set(corpus, rank_dict, zipf_model, n, m):
    """Estimate the mean and standard deviation of subsample typicality.

    Draws m subsamples of size n, scores each one's typicality against
    zipf_model (using the shared rank_dict), and returns (mean, std).
    """
    typicalities = []
    for _ in range(m):
        sub_freqs = compute_freqs(Sentences.subsample(corpus, n))
        joint = merge_to_joint(rank_dict, sub_freqs)
        typicalities.append(typicality(zipf_model, joint))

    # Population standard deviation via sqrt of the variance.
    return np.mean(typicalities), np.var(typicalities)**.5
示例#7
0
def variance_main(wiki, n, m, save_dir="./"):
    """Run the variance analyses on pooled rank/frequency estimates.

    Draws two independent sets of m sentence-level subsamples of size n,
    pools ranks from one set and frequencies from the other, then produces
    the mean-vs-pooled comparison, the MLE fits, and the across-word
    covariance analysis, writing results under save_dir.
    """
    rank_samples = (Sentences.subsample(wiki, n) for _ in range(m))
    freq_samples = (Sentences.subsample(wiki, n) for _ in range(m))

    rank_dicts = [compute_ranks(sub) for sub in rank_samples]
    pooled_ranks = pool_ranks(rank_dicts)

    freq_dicts = [compute_freqs(sub) for sub in freq_samples]
    pooled_freqs = pool_freqs(freq_dicts)

    mean_vs_pooled(pooled_ranks, pooled_freqs, save_dir)
    # The MLE step consumes the per-sample dicts, not the pooled views.
    do_mles(rank_dicts, freq_dicts, save_dir)
    covariance_across_words(pooled_ranks, pooled_freqs, save_dir)
示例#8
0
def get_model(corpus, n):
    """Fit a Zipf-Mandelbrot model from two independent subsamples.

    Ranks and frequencies come from separate subsamples of size n. Returns
    the rank dict, the fitted Mandelbrot model, and the corpus's own
    typicality under that model.
    """
    big_ranks = compute_ranks(Sentences.subsample(corpus, n))
    freqs = compute_freqs(Sentences.subsample(corpus, n))
    joint = merge_to_joint(big_ranks, freqs)

    # Sorted (rank, freq) pairs, unzipped into parallel tuples.
    xs, ys = zip(*sorted(joint.values()))

    model = Mandelbrot(ys, xs)
    fit_result = model.fit(start_params=np.asarray([1.0, 1.0]),
                           method="powell",
                           full_output=True)
    model.register_fit(fit_result)
    model.print_result()
    auto_typ = typicality(model, joint)

    return big_ranks, model, auto_typ
示例#9
0
    # setup_m: number of calibration subsamples for the typicality band;
    # m: number of filtered corpora to generate below.
    setup_m = 100
    m = 10
    
    # Load the pickled corpus and flatten articles into one sentence list.
    wiki = list(wiki_from_pickles("data/"+lang+"_pkl"))
    sents = [s for a in wiki for s in a]

    # Calibrate the Zipf model and the typicality statistics on the corpus.
    # (lang, n, factor, lt come from the enclosing scope, not visible here.)
    zipf_model, rank_dict, mean_typ, std_typ, auto_typ = setup_filtering(wiki, 
                                                                         big_n(wiki), 
                                                                         n, 
                                                                         setup_m)
    
    # Center the band on the model's own typicality, width = factor std devs.
    mean_corrected = abs(mean_typ - auto_typ)
    epsilon_f_plus = mean_corrected + std_typ*factor
    epsilon_f_minus = - epsilon_f_plus
    
    print("\nModel and Epsilon established")
    print(auto_typ, mean_typ, std_typ)
    print(epsilon_f_minus, epsilon_f_plus)
    
    
    # Generate m filtered corpora; only the lower bound epsilon_f_minus is
    # passed to the filter here (epsilon_f_plus is just reported above).
    for m_i in range(m):
        print("started ", m_i)        
        filtered = list(filter_typicality_incremental(sents, zipf_model, 
                        rank_dict, auto_typ, n, epsilon_f_minus, lt))
        filtered_freqs = compute_freqs(Sentences(filtered))
        print("filtered ", m_i, " typicality: ", 
              typicality(zipf_model, merge_to_joint(rank_dict, filtered_freqs)))

        
        # Persist the filtered corpus as n_factor_i under results/<lang>/TF.
        name = "_".join((str(n), str(factor), str(m_i)))
        corpus_to_pickle(filtered, "results/" + lang + "/TF", name)
示例#10
0
def samples_to_typicality(samples, ref_dist, rank_dict):
    """Score each sample's typicality against ref_dist using a shared rank dict."""
    return [typicality(ref_dist, merge_to_joint(rank_dict, compute_freqs(s)))
            for s in samples]
示例#11
0
    # Output directory for plots; lang comes from the enclosing scope.
    d = "results/" + lang + "/plots/"
    wiki = list(wiki_from_pickles("data/" + lang + "_pkl"))
    # NOTE(review): 10e6 == 1e7 (ten million) — confirm the intended size.
    n = int(10e6)
    m = 10

    # Single-sample (biased) vs two-sample (Piantadosi) rank-frequency plots.
    zipf_wrong(wiki, n, d)
    zipf_piantadosi(wiki, n, d)

    # Two independent sets of m subsamples: ranks from one, freqs from the other.
    subsamples1 = (Sentences.subsample(wiki, n) for _ in range(m))
    subsamples2 = (Sentences.subsample(wiki, n) for _ in range(m))

    ranks = [compute_ranks(sub) for sub in subsamples1]
    ranks_joined = pool_ranks(ranks)
    mean_ranks = reduce_pooled(ranks_joined)

    freqs = [compute_freqs(sub) for sub in subsamples2]
    freqs_joined = pool_freqs(freqs)
    mean_freqs = reduce_pooled(freqs_joined)

    print("subsampling done")

    # Pair each word's mean rank with its mean frequency and plot log-log.
    joints = merge_to_joint(mean_ranks, mean_freqs)
    xs, ys = list(zip(*sorted(joints.values())))

    hexbin_plot(xs, ys, xlbl="$\log$ $r(w)$", ylbl="$\log$ $f(w)$", min_y=1)

    # Fit the Zipf-Mandelbrot model; start_params was hand-tuned from [1.0, 1.0].
    mandelbrot = Mandelbrot(ys, xs)
    mandelbrot_fit = mandelbrot.fit(
        start_params=np.asarray([10.0, 1000.0]),  # [1.0, 1.0]
        method="powell",
        full_output=True)