Пример #1
0
def zipf_wrong(wiki, n, d):
    """Plot rank against frequency, both estimated from one subsample.

    Draws a single subsample with ``Articles.subsample``, computes ranks and
    frequencies from that same subsample, and saves a hexbin plot of the
    merged result as ``d + "rank_freq_<n>_wrong.png"``.

    Args:
        wiki: Corpus forwarded to ``Articles.subsample``.
        n: Size argument forwarded to ``Articles.subsample``; also embedded
            in the output file name.
        d: Output path prefix. It is concatenated directly with the file
            name, so a directory must already end with a path separator.
    """
    subcorp = Articles.subsample(wiki, n)

    # Ranks and frequencies deliberately share the same subsample here.
    ranks, freqs = compute_ranks(subcorp), compute_freqs(subcorp)

    joints = merge_to_joint(ranks, freqs)
    # NOTE(review): assumes every joint value is a (rank, freq) pair, so that
    # transposing the sorted values yields exactly two sequences -- confirm
    # against merge_to_joint.
    xs, ys = zip(*sorted(joints.values()))

    # Raw strings: "\l" is not a valid escape sequence, so the non-raw form
    # triggers an invalid-escape warning on modern Python versions.
    hexbin_plot(xs, ys, xlbl=r"$\log$ $r(w)$", ylbl=r"$\log$ $f(w)$")
    plt.savefig(d + "rank_freq_" + str(n) + "_wrong.png", dpi=300)
    plt.close()
Пример #2
0
def mean_rank_freq_from_samples(sample_ls):
    """Estimate mean ranks and mean frequencies from disjoint sample halves.

    The samples are randomly split into two halves. Ranks are computed on the
    first half and frequencies on the second; each set is pooled and then
    reduced with ``reduce_pooled``.

    Args:
        sample_ls: Collection of samples accepted by ``compute_ranks`` and
            ``compute_freqs``. With an odd number of samples the second
            (frequency) half receives the extra one.

    Returns:
        Tuple ``(mean_ranks, mean_freqs)`` as produced by ``reduce_pooled``.
    """
    samples = list(sample_ls)

    # Shuffle indices rather than the samples themselves: passing a list of
    # sequence-like samples to np.random.permutation makes NumPy coerce them
    # into one array, which unpacks their contents (or fails outright when
    # the samples have different lengths).
    order = np.random.permutation(len(samples))
    half = len(samples) // 2
    samples1 = [samples[i] for i in order[:half]]
    samples2 = [samples[i] for i in order[half:]]

    ranks = [compute_ranks(sub) for sub in samples1]
    ranks_joined = pool_ranks(ranks)
    mean_ranks = reduce_pooled(ranks_joined)

    freqs = [compute_freqs(sub) for sub in samples2]
    freqs_joined = pool_freqs(freqs)
    mean_freqs = reduce_pooled(freqs_joined)
    return mean_ranks, mean_freqs
Пример #3
0
def get_mean_relationship(sampling_level, wiki, n, m):
    """Return mean ranks and mean frequencies over independent subsamples.

    Two separate series of ``m`` subsamples are drawn with
    ``sampling_level.subsample(wiki, n)``: the first feeds the rank
    computation, the second the frequency computation. Each series is
    pooled and then reduced with ``reduce_pooled``.

    Args:
        sampling_level: Object exposing ``subsample(wiki, n)``.
        wiki: Corpus forwarded to ``subsample``.
        n: Size argument forwarded to ``subsample``.
        m: Number of subsamples drawn for each of the two series.

    Returns:
        Tuple ``(mean_ranks, mean_freqs)``.
    """
    # Lazy generators: each subsample is drawn only when it is consumed.
    rank_draws = (sampling_level.subsample(wiki, n) for _ in range(m))
    freq_draws = (sampling_level.subsample(wiki, n) for _ in range(m))

    per_sample_ranks = list(map(compute_ranks, rank_draws))
    mean_ranks = reduce_pooled(pool_ranks(per_sample_ranks))

    per_sample_freqs = list(map(compute_freqs, freq_draws))
    mean_freqs = reduce_pooled(pool_freqs(per_sample_freqs))

    return mean_ranks, mean_freqs
Пример #4
0
def get_mean_relationship(wiki, n, m, freq_func):
    """Return mean ranks and mean ``freq_func`` values over subsamples.

    Draws two independent series of ``m`` subsamples with
    ``Sentences.subsample(wiki, n)``. Ranks come from the first series and
    ``freq_func`` is applied to the second; each series is pooled and then
    reduced with ``reduce_pooled``.

    Args:
        wiki: Corpus forwarded to ``Sentences.subsample``.
        n: Size argument forwarded to ``Sentences.subsample``.
        m: Number of subsamples drawn for each of the two series.
        freq_func: Callable applied to every subsample of the second series;
            its results are handed to ``pool_freqs``.

    Returns:
        Tuple ``(mean_ranks, mean_freqs)``.
    """
    # First series: one rank computation per freshly drawn subsample.
    rank_results = []
    for _ in range(m):
        rank_results.append(compute_ranks(Sentences.subsample(wiki, n)))
    mean_ranks = reduce_pooled(pool_ranks(rank_results))

    # Second series: drawn only after the rank series has been fully reduced.
    freq_results = []
    for _ in range(m):
        freq_results.append(freq_func(Sentences.subsample(wiki, n)))
    mean_freqs = reduce_pooled(pool_freqs(freq_results))

    return mean_ranks, mean_freqs
Пример #5
0
def zipf_piantadosi(wiki, n, d):
    """Plot rank against frequency estimated from two separate subsamples.

    Draws two subsamples with ``Words.subsample``; ranks are computed from
    the first and frequencies from the second. The merged result is saved as
    a hexbin plot at ``d + "rank_freq_<n>_piantadosi.png"``.

    Args:
        wiki: Corpus forwarded to ``Words.subsample``.
        n: Size argument forwarded to ``Words.subsample``; also embedded in
            the output file name.
        d: Output path prefix. It is concatenated directly with the file
            name, so a directory must already end with a path separator.
    """
    subcorp1 = Words.subsample(wiki, n)
    subcorp2 = Words.subsample(wiki, n)

    # Ranks and frequencies come from different subsamples.
    ranks = compute_ranks(subcorp1)
    freqs = compute_freqs(subcorp2)

    joints = merge_to_joint(ranks, freqs)
    # NOTE(review): assumes every joint value is a (rank, freq) pair, so that
    # transposing the sorted values yields exactly two sequences -- confirm
    # against merge_to_joint.
    xs, ys = zip(*sorted(joints.values()))

    # Raw strings: "\l" is not a valid escape sequence, so the non-raw form
    # triggers an invalid-escape warning on modern Python versions.
    hexbin_plot(xs, ys, xlbl=r"$\log$ $r(w)$", ylbl=r"$\log$ $f(w)$")
    plt.savefig(d + "rank_freq_" + str(n) + "_piantadosi.png", dpi=300)
    plt.close()
Пример #6
0
def variance_main(wiki, n, m, save_dir="./"):
    """Run the variance analyses on ranks and frequencies of subsamples.

    Draws two independent series of ``m`` subsamples with
    ``Sentences.subsample(wiki, n)``, computes ranks on the first and
    frequencies on the second, pools each, and then calls
    ``mean_vs_pooled``, ``do_mles`` and ``covariance_across_words`` in that
    order.

    Args:
        wiki: Corpus forwarded to ``Sentences.subsample``.
        n: Size argument forwarded to ``Sentences.subsample``.
        m: Number of subsamples drawn for each of the two series.
        save_dir: Directory argument forwarded to the three analysis
            functions. Defaults to ``"./"``.
    """
    # Lazy generators: each subsample is drawn only when it is consumed.
    rank_draws = (Sentences.subsample(wiki, n) for _ in range(m))
    freq_draws = (Sentences.subsample(wiki, n) for _ in range(m))

    rank_list = list(map(compute_ranks, rank_draws))
    pooled_ranks = pool_ranks(rank_list)

    freq_list = list(map(compute_freqs, freq_draws))
    pooled_freqs = pool_freqs(freq_list)

    mean_vs_pooled(pooled_ranks, pooled_freqs, save_dir)
    # do_mles receives the per-subsample results, not the pooled ones.
    do_mles(rank_list, freq_list, save_dir)
    covariance_across_words(pooled_ranks, pooled_freqs, save_dir)
Пример #7
0
def get_model(corpus, n):
    """Fit a ``Mandelbrot`` model to rank/frequency data from ``corpus``.

    Ranks and frequencies are computed from two separate
    ``Sentences.subsample(corpus, n)`` draws, merged, and passed to
    ``Mandelbrot``. The model is fitted, the fit is registered and printed,
    and ``typicality`` is evaluated on the merged data.

    Args:
        corpus: Corpus forwarded to ``Sentences.subsample``.
        n: Size argument forwarded to ``Sentences.subsample``.

    Returns:
        Tuple ``(big_ranks, mandelbrot, auto_typ)``: the computed ranks, the
        fitted model, and the value returned by ``typicality``.
    """
    big_ranks = compute_ranks(Sentences.subsample(corpus, n))
    frequencies = compute_freqs(Sentences.subsample(corpus, n))
    joint = merge_to_joint(big_ranks, frequencies)

    # NOTE(review): assumes every joint value is a two-element pair, so the
    # transpose yields exactly two sequences -- confirm against
    # merge_to_joint.
    xs, ys = zip(*sorted(joint.values()))

    # The second element of each pair is passed as the first constructor
    # argument.
    mandelbrot = Mandelbrot(ys, xs)
    initial_params = np.asarray([1.0, 1.0])
    fit_result = mandelbrot.fit(
        start_params=initial_params,
        method="powell",
        full_output=True,
    )
    mandelbrot.register_fit(fit_result)
    mandelbrot.print_result()

    auto_typ = typicality(mandelbrot, joint)
    return big_ranks, mandelbrot, auto_typ
Пример #8
0

if __name__ == "__main__":
    lang = parse_args()
    d = "results/" + lang + "/plots/"
    wiki = list(wiki_from_pickles("data/" + lang + "_pkl"))
    n = int(10e6)
    m = 10

    zipf_wrong(wiki, n, d)
    zipf_piantadosi(wiki, n, d)

    subsamples1 = (Sentences.subsample(wiki, n) for _ in range(m))
    subsamples2 = (Sentences.subsample(wiki, n) for _ in range(m))

    ranks = [compute_ranks(sub) for sub in subsamples1]
    ranks_joined = pool_ranks(ranks)
    mean_ranks = reduce_pooled(ranks_joined)

    freqs = [compute_freqs(sub) for sub in subsamples2]
    freqs_joined = pool_freqs(freqs)
    mean_freqs = reduce_pooled(freqs_joined)

    print("subsampling done")

    joints = merge_to_joint(mean_ranks, mean_freqs)
    xs, ys = list(zip(*sorted(joints.values())))

    hexbin_plot(xs, ys, xlbl="$\log$ $r(w)$", ylbl="$\log$ $f(w)$", min_y=1)

    mandelbrot = Mandelbrot(ys, xs)