示例#1
0
def get_mean_relationship(wiki, n, m, freq_func):
    """Return mean ranks and mean frequencies over repeated subsamples.

    ``m`` subsamples are drawn for the rank estimate and a further,
    separately drawn ``m`` subsamples for the frequency estimate; every
    subsample comes from ``Sentences.subsample(wiki, n)``.

    Args:
        wiki: Corpus handed unchanged to ``Sentences.subsample``.
        n: Second argument of ``Sentences.subsample`` (presumably the
            subsample size in tokens -- confirm against ``Sentences``).
        m: Number of subsamples drawn for each of the two estimates.
        freq_func: Callable applied to each frequency subsample.

    Returns:
        tuple: ``(mean_ranks, mean_freqs)``, i.e. ``reduce_pooled`` applied
        to the pooled ranks and to the pooled frequencies respectively.
    """
    # Rank side: draw a subsample, rank it, repeat m times, then pool.
    rank_tables = []
    for _ in range(m):
        rank_tables.append(compute_ranks(Sentences.subsample(wiki, n)))
    mean_ranks = reduce_pooled(pool_ranks(rank_tables))

    # Frequency side: uses its own fresh draws, not the rank subsamples.
    freq_tables = []
    for _ in range(m):
        freq_tables.append(freq_func(Sentences.subsample(wiki, n)))
    mean_freqs = reduce_pooled(pool_freqs(freq_tables))

    return mean_ranks, mean_freqs
示例#2
0
def variance_main(wiki, n, m, save_dir="./"):
    """Run the variance analyses on ranks and frequencies of subsamples.

    Draws ``m`` subsamples for ranks and another ``m`` for frequencies,
    pools each group, and passes the results to ``mean_vs_pooled``,
    ``do_mles`` and ``covariance_across_words`` (in that order).

    Args:
        wiki: Corpus handed unchanged to ``Sentences.subsample``.
        n: Second argument of ``Sentences.subsample`` (presumably the
            subsample size in tokens -- confirm against ``Sentences``).
        m: Number of subsamples drawn per group.
        save_dir: Forwarded to the three analysis routines; presumably the
            directory their output is written to -- confirm in the callees.
    """
    def draw():
        # One fresh subsample per call; nothing is cached or reused.
        return Sentences.subsample(wiki, n)

    ranks = []
    for _ in range(m):
        ranks.append(compute_ranks(draw()))
    ranks_joined = pool_ranks(ranks)

    freqs = []
    for _ in range(m):
        freqs.append(compute_freqs(draw()))
    freqs_joined = pool_freqs(freqs)

    mean_vs_pooled(ranks_joined, freqs_joined, save_dir)

    # do_mles receives the per-subsample lists, not the pooled structures.
    do_mles(ranks, freqs, save_dir)

    covariance_across_words(ranks_joined, freqs_joined, save_dir)
示例#3
0
def get_model(corpus, n):
    """Fit a Mandelbrot model on ranks and frequencies of two subsamples.

    Ranks come from one subsample of ``corpus`` and frequencies from a
    second, separately drawn one. The merged values are sorted and fitted
    with the Powell method starting from ``[1.0, 1.0]``.

    Args:
        corpus: Corpus handed unchanged to ``Sentences.subsample``.
        n: Second argument of ``Sentences.subsample`` (presumably the
            subsample size in tokens -- confirm against ``Sentences``).

    Returns:
        tuple: ``(big_ranks, mandelbrot, auto_typ)`` -- the computed ranks,
        the ``Mandelbrot`` instance with its fit registered, and the result
        of ``typicality`` for that model on the merged data.
    """
    rank_sample = Sentences.subsample(corpus, n)
    big_ranks = compute_ranks(rank_sample)
    freq_sample = Sentences.subsample(corpus, n)
    freqs = compute_freqs(freq_sample)

    joint = merge_to_joint(big_ranks, freqs)

    # Sort the joint values and split them column-wise; the values are
    # presumably (rank, frequency) pairs -- confirm in merge_to_joint.
    ordered_pairs = sorted(joint.values())
    xs, ys = zip(*ordered_pairs)

    # Note the argument order: second components first, then first ones.
    model = Mandelbrot(ys, xs)
    initial_guess = np.asarray([1.0, 1.0])
    fit_result = model.fit(start_params=initial_guess,
                           method="powell",
                           full_output=True)
    model.register_fit(fit_result)
    model.print_result()

    return big_ranks, model, typicality(model, joint)
示例#4
0
def heap(corp, rng):
    """Compute the vocabulary size of one subsample per entry of ``rng``.

    Args:
        corp: Corpus handed unchanged to ``Sentences.subsample``.
        rng: Iterable of values, each used as the second argument of
            ``Sentences.subsample`` for one subsample.

    Returns:
        list: ``compute_vocab_size`` results, in the iteration order of
        ``rng``.
    """
    sizes = []
    for idx, ntoks in enumerate(rng):
        # Progress output on every tenth iteration (including the first).
        if not idx % 10:
            print(idx, ntoks)
        sizes.append(compute_vocab_size(Sentences.subsample(corp, ntoks)))

    return sizes
示例#5
0
def establish_typical_set(corpus, rank_dict, zipf_model, n, m):
    """Return mean and standard deviation of typicality over ``m`` subsamples.

    For each of ``m`` draws, frequencies of a fresh subsample are merged
    with the fixed ``rank_dict`` and scored with ``typicality`` against
    ``zipf_model``.

    Args:
        corpus: Corpus handed unchanged to ``Sentences.subsample``.
        rank_dict: Ranks merged with each subsample's frequencies via
            ``merge_to_joint``.
        zipf_model: Model passed as first argument to ``typicality``.
        n: Second argument of ``Sentences.subsample`` (presumably the
            subsample size in tokens -- confirm against ``Sentences``).
        m: Number of subsamples to score.

    Returns:
        tuple: ``(mean_typ, std_typ)`` where ``std_typ`` is the square root
        of ``np.var`` of the collected scores.
    """
    def score_one_draw():
        # Only the frequencies change per draw; the ranks stay fixed.
        drawn_freqs = compute_freqs(Sentences.subsample(corpus, n))
        return typicality(zipf_model, merge_to_joint(rank_dict, drawn_freqs))

    scores = [score_one_draw() for _ in range(m)]

    mean_typ = np.mean(scores)
    std_typ = np.var(scores) ** .5
    return mean_typ, std_typ
示例#6
0
def get_reference_dist(wiki):
    """Fit a reference Mandelbrot model on averaged subsample statistics.

    Materialises ``wiki`` as a list, draws 10 subsamples with
    ``Sentences.subsample(..., int(10e6))``, averages their ranks and
    frequencies via ``mean_rank_freq_from_samples`` and fits a
    ``Mandelbrot`` model (Powell method, start parameters ``[1.0, 1.0]``).

    Args:
        wiki: Iterable corpus; it is consumed once to build a list.

    Returns:
        tuple: ``(mandelbrot, mean_ranks)`` -- the model with its fit
        registered and the averaged ranks.
    """
    sample_size = int(10e6)
    n_samples = 10

    # Build the list once so every subsample draws from the same data.
    wiki_ls = list(wiki)

    drawn = []
    for _ in range(n_samples):
        drawn.append(Sentences.subsample(wiki_ls, sample_size))

    mean_ranks, mean_freqs = mean_rank_freq_from_samples(drawn)
    joints = merge_to_joint(mean_ranks, mean_freqs)

    # Sort the joint values and split them column-wise.
    xs, ys = zip(*sorted(joints.values()))

    # Note the argument order: second components first, then first ones.
    model = Mandelbrot(ys, xs)
    fit_result = model.fit(start_params=np.asarray([1.0, 1.0]),
                           method="powell",
                           full_output=True)
    model.register_fit(fit_result)
    model.print_result()
    return model, mean_ranks
示例#7
0
# -*- coding: utf-8 -*-

from data.reader import wiki_from_pickles, corpus_to_pickle
from data.corpus import Sentences
from stats.stat_functions import compute_freqs, merge_to_joint

import argparse


def parse_args():
    """Parse the command-line options of this script.

    Both options are mandatory. Previously they were optional, so omitting
    ``--lang`` returned ``None`` and the script failed later with a
    ``TypeError`` while concatenating the data path; argparse now reports
    the missing option directly.

    Returns:
        tuple: ``(lang, n_tokens)`` -- the ``--lang`` string and the
        ``--n_tokens`` integer.

    Raises:
        SystemExit: Raised by argparse when an option is missing or
            ``--n_tokens`` is not an integer.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--lang", type=str, required=True,
        help="language code used to build the data/<lang>_pkl input path "
             "and the results/<lang>/UNI output path")
    parser.add_argument(
        "--n_tokens", type=int, required=True,
        help="size argument passed to Sentences.subsample for each sample")

    args = parser.parse_args()
    return args.lang, args.n_tokens


if __name__ == "__main__":
    # Draw m subsamples of the pickled wiki corpus for the requested
    # language and store the sentences of each one as its own pickle.
    lang, n = parse_args()
    m = 10

    source_dir = "data/" + lang + "_pkl"
    # Materialise the corpus once; every subsample draws from this list.
    wiki = list(wiki_from_pickles(source_dir))

    target_dir = "results/" + lang + "/UNI"

    for i in range(m):
        sentences = list(Sentences.subsample(wiki, n).sentences())
        # Output name is "<n>_<i>", i.e. sample size then sample index.
        corpus_to_pickle(sentences, target_dir, str(n) + "_" + str(i))
示例#8
0
    hexbin_plot(xs, ys, xlbl="$\log$ $r(w)$", ylbl="$\log$ $f(w)$")
    plt.savefig(d + "rank_freq_" + str(n) + "_piantadosi.png", dpi=300)
    plt.close()


if __name__ == "__main__":
    lang = parse_args()
    d = "results/" + lang + "/plots/"
    wiki = list(wiki_from_pickles("data/" + lang + "_pkl"))
    n = int(10e6)
    m = 10

    zipf_wrong(wiki, n, d)
    zipf_piantadosi(wiki, n, d)

    subsamples1 = (Sentences.subsample(wiki, n) for _ in range(m))
    subsamples2 = (Sentences.subsample(wiki, n) for _ in range(m))

    ranks = [compute_ranks(sub) for sub in subsamples1]
    ranks_joined = pool_ranks(ranks)
    mean_ranks = reduce_pooled(ranks_joined)

    freqs = [compute_freqs(sub) for sub in subsamples2]
    freqs_joined = pool_freqs(freqs)
    mean_freqs = reduce_pooled(freqs_joined)

    print("subsampling done")

    joints = merge_to_joint(mean_ranks, mean_freqs)
    xs, ys = list(zip(*sorted(joints.values())))