def align_years(years, rep_type, in_dir, out_dir, count_dir, min_count,
                **rep_args):
    """Align each year's embedding to the previous year's and save it.

    Years are handled in iteration order. The first year is written as
    loaded (after vocabulary filtering); each later year is passed to
    ``alignment.smart_procrustes_align`` with the previously written
    embedding as the base, so alignment is chained from year to year.

    For every year two files are written: ``<out_dir><year>-w.npy`` holding
    the embedding's ``.m`` matrix and ``<out_dir><year>-vocab.pkl`` holding
    its ``.iw`` word list.

    Args:
        years: Iterable of years to process, in order.
        rep_type: Representation type, forwarded to ``create_representation``.
        in_dir: Prefix to which ``str(year)`` is appended when loading.
        out_dir: Prefix to which ``str(year)`` is appended when writing.
        count_dir: Count-data location, forwarded to ``words_above_count``.
        min_count: Count threshold, forwarded to ``words_above_count``.
        **rep_args: Extra keyword arguments for ``create_representation``.
    """
    base_embed = None
    for year in years:
        # Single-argument print calls emit the same text on Python 2 and 3.
        print("Loading year: %s" % (year,))
        year_embed = create_representation(rep_type, in_dir + str(year),
                                           **rep_args)
        year_words = words_above_count(count_dir, year, min_count)
        # Keep the returned sub-embedding: discarding it (as the code used to)
        # leaves the min_count filter without any effect.
        # NOTE(review): assumes get_subembed returns a new embedding for every
        # representation type create_representation can produce - confirm.
        year_embed = year_embed.get_subembed(year_words)
        print("Aligning year: %s" % (year,))
        if base_embed is None:
            # Nothing to align the first year against; it becomes the base.
            aligned_embed = year_embed
        else:
            aligned_embed = alignment.smart_procrustes_align(
                base_embed, year_embed)
        base_embed = aligned_embed
        print("Writing year: %s" % (year,))
        foutname = out_dir + str(year)
        np.save(foutname + "-w.npy", aligned_embed.m)
        write_pickle(aligned_embed.iw, foutname + "-vocab.pkl")
Пример #2
0
def worker(proc_num,
           queue,
           out_dir,
           in_dir,
           count_dir,
           words,
           dim,
           num_words,
           min_count=100):
    """Run a truncated SVD for every year pulled from ``queue``.

    For each year the explicit embedding is loaded from
    ``(in_dir + INPUT_FORMAT).format(year=year)``, restricted to the words
    that are both above ``min_count`` and among the first ``num_words``
    entries of ``words[year]``, and factorised with ``randomized_svd``.
    The factors are saved under ``(out_dir + OUT_FORMAT).format(...)`` with
    the suffixes ``-u.npy``, ``-v.npy`` and ``-s.npy``; the vocabulary is
    pickled with the suffix ``-vocab.pkl``.

    Args:
        proc_num: Worker index (not used by this function).
        queue: Queue of years; the worker returns once it is drained.
        out_dir: Output path prefix.
        in_dir: Input path prefix.
        count_dir: Count-data location, forwarded to ``words_above_count``.
        words: Mapping from year to a sequence of candidate words.
        dim: Number of SVD components.
        num_words: How many leading entries of ``words[year]`` to consider.
        min_count: Count threshold, forwarded to ``words_above_count``.
    """
    # Local import keeps the block self-contained on either Python version.
    try:
        from queue import Empty
    except ImportError:  # Python 2
        from Queue import Empty

    while True:
        # A non-blocking get avoids the check-then-act race of
        # ``queue.empty()`` followed by a blocking ``queue.get()``, which can
        # hang when another worker takes the last item in between.
        try:
            year = queue.get(block=False)
        except Empty:
            break
        print("Loading embeddings for year", year)
        # Random delay of up to two minutes before touching the data.
        # NOTE(review): presumably staggers the workers' loads - confirm.
        time.sleep(random.random() * 120)
        valid_words = set(words_above_count(count_dir, year, min_count))
        print(len(valid_words))
        # Use a fresh name: rebinding ``words`` here would replace the
        # per-year mapping and break the lookup for the next year.
        year_words = list(valid_words.intersection(words[year][:num_words]))
        print(len(year_words))
        base_embed = Explicit.load((in_dir + INPUT_FORMAT).format(year=year),
                                   normalize=False)
        base_embed = base_embed.get_subembed(year_words, restrict_context=True)
        print("SVD for year", year)
        u, s, v = randomized_svd(base_embed.m, n_components=dim, n_iter=5)
        print("Saving year", year)
        out_prefix = (out_dir + OUT_FORMAT).format(year=year, dim=dim)
        np.save(out_prefix + "-u.npy", u)
        np.save(out_prefix + "-v.npy", v)
        np.save(out_prefix + "-s.npy", s)
        write_pickle(base_embed.iw, out_prefix + "-vocab.pkl")
Пример #3
0
def worker(proc_num,
           queue,
           out_dir,
           in_dir,
           count_dir,
           valid_words,
           num_words,
           min_count,
           sample=1e-5):
    while True:
        try:
            year = queue.get(block=False)
        except Empty:
            break
        print proc_num, "Getting counts and matrix year", year
        embed = Explicit.load(in_dir + str(year) + ".bin", normalize=False)
        year_words = valid_words[year][:num_words]
        count_words = set(ioutils.words_above_count(count_dir, year,
                                                    min_count))
        freq = CachedFreqDist(
            ioutils.load_pickle(count_dir + str(year) + "-counts.pkl"))
        use_words = list(count_words.intersection(year_words))
        embed = embed.get_subembed(use_words, restrict_context=True)
        sample_corr = min(SAMPLE_MAX / freq.N(), 1.0)
        print "Sample correction..", sample_corr
        embed.m = embed.m * sample_corr
        mat = embed.m.tocoo()
        print proc_num, "Outputing pairs for year", year
        with open(out_dir + str(year) + ".tmp.txt", "w") as fp:
            for i in xrange(len(mat.data)):
                if i % 10000 == 0:
                    print "Done ", i, "of", len(mat.data)
                word = embed.iw[mat.row[i]]
                context = embed.ic[mat.col[i]]
                if sample != 0:
                    prop_keep = min(np.sqrt(sample / freq.freq(word)), 1.0)
                    prop_keep *= min(np.sqrt(sample / freq.freq(context)), 1.0)
                else:
                    prop_keep = 1.0
                word = word.encode("utf-8")
                context = context.encode("utf-8")
                line = word + " " + context + "\n"
                for j in xrange(int(mat.data[i] * prop_keep)):
                    fp.write(line)
        mat = mat.tocsr()
        print proc_num, "Outputing vocab for year", year
        with open(out_dir + str(year) + ".vocab", "w") as fp:
            for word in year_words:
                if not word in count_words:
                    print >> fp, word.encode("utf-8"), 1
                else:
                    print >> fp, word.encode("utf-8"), int(
                        mat[embed.wi[word], :].sum())
        print "shuf " + out_dir + str(year) + ".tmp.txt" " > " + out_dir + str(
            year) + ".txt"
        os.system("shuf " + out_dir + str(year) + ".tmp.txt" + " > " +
                  out_dir + str(year) + ".txt")
        os.remove(out_dir + str(year) + ".tmp.txt")
Пример #4
0
def worker(proc_num, queue, out_dir, in_dir, count_dir, valid_words, num_words, min_count, sample=1e-5):
    while True:
        try:
            year = queue.get(block=False)
        except Empty:
            break
        print proc_num, "Getting counts and matrix year", year
        embed = Explicit.load(in_dir + str(year) + ".bin", normalize=False)
        year_words = valid_words[year][:num_words]
        count_words = set(ioutils.words_above_count(count_dir, year, min_count))
        freq = CachedFreqDist(ioutils.load_pickle(count_dir + str(year) + "-counts.pkl"))
        use_words = list(count_words.intersection(year_words)) 
        embed = embed.get_subembed(use_words, restrict_context=True)
        sample_corr = min(SAMPLE_MAX / freq.N(), 1.0)
        print "Sample correction..", sample_corr
        embed.m = embed.m * sample_corr
        mat = embed.m.tocoo()
        print proc_num, "Outputing pairs for year", year
        with open(out_dir + str(year) + ".tmp.txt", "w") as fp:
            for i in xrange(len(mat.data)): 
                if i % 10000 == 0:
                    print "Done ", i, "of", len(mat.data)
                word = embed.iw[mat.row[i]]
                context = embed.ic[mat.col[i]]
                if sample != 0:
                    prop_keep = min(np.sqrt(sample / freq.freq(word)), 1.0) 
                    prop_keep *= min(np.sqrt(sample / freq.freq(context)), 1.0) 
                else:
                    prop_keep = 1.0
                word = word.encode("utf-8")
                context = context.encode("utf-8")
                line = word + " " + context + "\n"
                for j in xrange(int(mat.data[i] * prop_keep)):
                    fp.write(line)
        mat = mat.tocsr()
        print proc_num, "Outputing vocab for year", year
        with open(out_dir + str(year) + ".vocab", "w") as fp:
            for word in year_words:
                if not word in count_words:
                    print >>fp, word.encode("utf-8"), 1
                else:
                    print >>fp, word.encode("utf-8"), int(mat[embed.wi[word], :].sum())
        print "shuf " + out_dir + str(year) + ".tmp.txt" " > " + out_dir + str(year) + ".txt" 
        os.system("shuf " + out_dir + str(year) + ".tmp.txt" + " > " + out_dir + str(year) + ".txt")
        os.remove(out_dir + str(year) + ".tmp.txt")
Пример #5
0
def align_years(years, rep_type, in_dir, out_dir, count_dir, min_count, **rep_args):
    """Chain-align yearly embeddings and write each aligned result to disk.

    The first year in ``years`` is saved as loaded (after vocabulary
    filtering). Each later year is aligned to the embedding written for the
    year before it via ``alignment.smart_procrustes_align``.

    Output per year: ``<out_dir><year>-w.npy`` (the ``.m`` matrix) and
    ``<out_dir><year>-vocab.pkl`` (the ``.iw`` word list).

    Args:
        years: Iterable of years, processed in iteration order.
        rep_type: Representation type for ``create_representation``.
        in_dir: Prefix to which ``str(year)`` is appended when loading.
        out_dir: Prefix to which ``str(year)`` is appended when writing.
        count_dir: Count-data location for ``words_above_count``.
        min_count: Count threshold for ``words_above_count``.
        **rep_args: Extra keyword arguments for ``create_representation``.
    """
    base_embed = None
    for year in years:
        print("Loading year:", year)
        year_embed = create_representation(rep_type, in_dir + str(year), **rep_args)
        year_words = words_above_count(count_dir, year, min_count)
        # The filtered embedding is the return value; without rebinding it
        # the min_count restriction is silently dropped.
        # NOTE(review): assumes get_subembed returns a new embedding for
        # every representation type - confirm.
        year_embed = year_embed.get_subembed(year_words)
        print("Aligning year:", year)
        if base_embed is None:
            # First year: nothing to align against yet.
            aligned_embed = year_embed
        else:
            aligned_embed = alignment.smart_procrustes_align(base_embed, year_embed)
        base_embed = aligned_embed
        print("Writing year:", year)
        foutname = out_dir + str(year)
        np.save(foutname + "-w.npy", aligned_embed.m)
        write_pickle(aligned_embed.iw, foutname + "-vocab.pkl")
Пример #6
0
def align_years(years, rep_type, in_dir, out_dir, count_dir, min_count, **rep_args):
    """Sequentially align yearly embeddings and save them.

    Each year is loaded with ``create_representation``, restricted to the
    words returned by ``words_above_count`` and, except for the first year,
    aligned to the previously saved embedding with
    ``alignment.smart_procrustes_align``. The aligned matrix (``.m``) goes
    to ``<out_dir><year>-w.npy`` and the word list (``.iw``) to
    ``<out_dir><year>-vocab.pkl``.

    Args:
        years: Iterable of years, processed in iteration order.
        rep_type: Representation type for ``create_representation``.
        in_dir: Prefix to which ``str(year)`` is appended when loading.
        out_dir: Prefix to which ``str(year)`` is appended when writing.
        count_dir: Count-data location for ``words_above_count``.
        min_count: Count threshold for ``words_above_count``.
        **rep_args: Extra keyword arguments for ``create_representation``.
    """
    previous = None
    for year in years:
        # One pre-formatted argument per print: same output on Python 2,
        # and valid syntax on Python 3 as well.
        print("Loading year: %s" % (year,))
        loaded = create_representation(rep_type, in_dir + str(year), **rep_args)
        frequent_words = words_above_count(count_dir, year, min_count)
        # Bind the returned sub-embedding; calling get_subembed and ignoring
        # its result left the vocabulary unfiltered.
        # NOTE(review): assumes get_subembed returns a new embedding for
        # every representation type - confirm.
        year_embed = loaded.get_subembed(frequent_words)
        print("Aligning year: %s" % (year,))
        if previous is None:
            # The first year serves as the base for the chain.
            aligned_embed = year_embed
        else:
            aligned_embed = alignment.smart_procrustes_align(previous, year_embed)
        previous = aligned_embed
        print("Writing year: %s" % (year,))
        foutname = out_dir + str(year)
        np.save(foutname + "-w.npy", aligned_embed.m)
        write_pickle(aligned_embed.iw, foutname + "-vocab.pkl")
Пример #7
0
def worker(proc_num, queue, out_dir, in_dir, count_dir, words, dim, num_words, min_count=100):
    """Compute and save a truncated SVD for each year taken from ``queue``.

    Per year: load the explicit embedding from
    ``(in_dir + INPUT_FORMAT).format(year=year)``, keep only the words that
    are above ``min_count`` and within the first ``num_words`` entries of
    ``words[year]``, run ``randomized_svd`` with ``dim`` components, and
    save ``-u.npy``, ``-v.npy``, ``-s.npy`` and ``-vocab.pkl`` files under
    ``(out_dir + OUT_FORMAT).format(year=year, dim=dim)``.

    Args:
        proc_num: Worker index (not used by this function).
        queue: Queue of years; the worker returns once it is drained.
        out_dir: Output path prefix.
        in_dir: Input path prefix.
        count_dir: Count-data location for ``words_above_count``.
        words: Mapping from year to a sequence of candidate words.
        dim: Number of SVD components.
        num_words: How many leading entries of ``words[year]`` to consider.
        min_count: Count threshold for ``words_above_count``.
    """
    # Local import so the block does not rely on a module-level name.
    try:
        from queue import Empty
    except ImportError:  # Python 2
        from Queue import Empty

    while True:
        # ``empty()`` followed by a blocking ``get()`` can hang if another
        # worker drains the queue in between; a non-blocking get cannot.
        try:
            year = queue.get(block=False)
        except Empty:
            break
        # Single-argument prints give identical output on Python 2 and 3.
        print("Loading embeddings for year %s" % (year,))
        # Random delay of up to two minutes before touching the data.
        # NOTE(review): presumably staggers the workers' loads - confirm.
        time.sleep(random.random() * 120)
        valid_words = set(words_above_count(count_dir, year, min_count))
        print(len(valid_words))
        # Do not rebind ``words``: it is the per-year mapping and is needed
        # again for the next year pulled from the queue.
        year_words = list(valid_words.intersection(words[year][:num_words]))
        print(len(year_words))
        base_embed = Explicit.load((in_dir + INPUT_FORMAT).format(year=year), normalize=False)
        base_embed = base_embed.get_subembed(year_words, restrict_context=True)
        print("SVD for year %s" % (year,))
        u, s, v = randomized_svd(base_embed.m, n_components=dim, n_iter=5)
        print("Saving year %s" % (year,))
        out_prefix = (out_dir + OUT_FORMAT).format(year=year, dim=dim)
        np.save(out_prefix + "-u.npy", u)
        np.save(out_prefix + "-v.npy", v)
        np.save(out_prefix + "-s.npy", s)
        write_pickle(base_embed.iw, out_prefix + "-vocab.pkl")