def worker(proc_num, queue, out_pref, in_dir, target_lists, context_lists,
           displacement_base, thresh, year_inc, type):
    time.sleep(10 * random.random())
    while True:
        if queue.empty():
            print proc_num, "Finished"
            break
        year = queue.get()
        print proc_num, "Loading matrices..."
        base = create_representation(type, in_dir + str(year - year_inc),
                                     thresh=thresh, restricted_context=context_lists[year],
                                     normalize=True, add_context=False)
        delta = create_representation(type, in_dir + str(year),
                                      thresh=thresh, restricted_context=context_lists[year],
                                      normalize=True, add_context=False)
        print proc_num, "Getting deltas..."
        year_vols = get_cosine_deltas(base, delta, target_lists[year], type)
        year_disp = get_cosine_deltas(displacement_base, delta, target_lists[year], type)
        print proc_num, "Writing results..."
        ioutils.write_pickle(year_vols, out_pref + str(year) + "-vols.pkl")
        ioutils.write_pickle(year_disp, out_pref + str(year) + "-disps.pkl")
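# `get_cosine_deltas` is defined elsewhere in the project. Below is a minimal,
# hypothetical sketch of what such a routine computes: the cosine distance of
# each target word's vector between two time slices. It assumes dense row
# vectors and the `.wi` word-to-index lookup used elsewhere in this code; the
# `represent(word)` accessor is an assumption, not the project's actual API.
import numpy as np

def cosine_deltas_sketch(base, delta, targets):
    # Cosine *distance* per target word; NaN for out-of-vocabulary words.
    deltas = {}
    for word in targets:
        if word not in base.wi or word not in delta.wi:
            deltas[word] = float('nan')
            continue
        v1, v2 = base.represent(word), delta.represent(word)
        norm = np.linalg.norm(v1) * np.linalg.norm(v2)
        deltas[word] = (1.0 - v1.dot(v2) / norm) if norm > 0 else float('nan')
    return deltas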
def align_cloud(year, rep_type, main_dir, num, dim, wordlist, **rep_args):
    print "Aligning cloud year:", year
    avg_embed_mat = np.zeros((len(wordlist), dim))
    for i in range(1, num + 1):  # Iterate through the embeddings
        print i
        finname = main_dir + "/embedding_" + str(i) + "/noinit/" + str(dim) + "/" + str(year)
        foutname = main_dir + "/embedding_" + str(i) + "/noinit/" + str(dim) + "/aligned/" + str(year)
        other_embed = create_representation(rep_type, finname, **rep_args)  # Load the individual embedding
        keep_indices = [other_embed.wi[word] for word in wordlist]
        other_embed = Embedding(other_embed.m[keep_indices, :], wordlist,
                                normalize=False)  # Synchronize the order of words
        if i == 1:
            base_embed = other_embed
            ortho = np.eye(dim)
        else:
            ortho = alignment.get_procrustes_mat(base_embed, other_embed)
        aligned_embed_mat = (other_embed.m).dot(ortho)  # Rotate the embedding onto the reference
        avg_embed_mat += aligned_embed_mat / num  # Accumulate the average embedding
        np.save(foutname + "-w.npy", aligned_embed_mat)
        write_pickle(other_embed.iw, foutname + "-vocab.pkl")
    foutname = main_dir + "/embedding_avg/" + str(year)
    np.save(foutname + "-w.npy", avg_embed_mat)
    write_pickle(base_embed.iw, foutname + "-vocab.pkl")
def align_years(years, rep_type, in_dir, out_dir, count_dir, min_count, **rep_args):
    first_iter = True
    base_embed = None
    for year in years:
        print "Loading year:", year
        year_embed = create_representation(rep_type, in_dir + str(year),
                                           **rep_args)  # load embedding pkl for this year
        year_words = words_above_count(count_dir, year,
                                       min_count)  # load count pkl; keep only words above min_count
        # get_subembed returns a new embedding, so the result must be assigned;
        # this keeps only the (in-vocabulary) words in year_words
        year_embed = year_embed.get_subembed(year_words)
        print "Aligning year:", year
        if first_iter:
            # On the first iteration the aligned embedding is the base embedding itself
            aligned_embed = year_embed
            first_iter = False
        else:
            aligned_embed = alignment.smart_procrustes_align(base_embed, year_embed)
        base_embed = aligned_embed
        print "Writing year:", year
        foutname = out_dir + str(year)
        np.save(foutname + "-w.npy", aligned_embed.m)
        write_pickle(aligned_embed.iw, foutname + "-vocab.pkl")
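# The alignment step solves an orthogonal Procrustes problem: find the rotation
# that maps one year's embedding onto the previous year's. A minimal numpy
# sketch of that core computation follows; the vocabulary intersection that
# smart_procrustes_align presumably also performs is omitted here.
import numpy as np

def procrustes_rotation_sketch(base_mat, other_mat):
    # The orthogonal Q minimizing ||other_mat.dot(Q) - base_mat||_F is
    # U.dot(Vt), where other_mat.T.dot(base_mat) = U S Vt (SVD).
    # Rows of the two matrices must already be aligned to the same vocabulary.
    u, _, vt = np.linalg.svd(other_mat.T.dot(base_mat))
    return u.dot(vt)

# Usage: rotated = year_mat.dot(procrustes_rotation_sketch(base_mat, year_mat))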
def worker(proc_num, queue):
    while True:
        time.sleep(random.random() * 10)
        try:
            year = queue.get(block=False)
        except Empty:
            print(proc_num, "Finished")
            return
        positive_seeds, negative_seeds = seeds.hist_seeds()
        year = str(year)
        print(proc_num, "On year", year)
        words = vocab.top_words(year, 5100)
        stop_words = vocab.top_words(year, 100)
        words = words.difference(stop_words)
        embed = create_representation("SVD", constants.COHA_EMBEDDINGS + year)
        embed_words = set(embed.iw)
        words = words.intersection(embed_words)
        polarities = polarity_induction_methods.bootstrap(
            embed.get_subembed(words.union(positive_seeds).union(negative_seeds)),
            positive_seeds, negative_seeds,
            score_method=polarity_induction_methods.random_walk,
            num_boots=50, n_procs=20, return_all=True,
            beta=0.9, nn=25)
        util.write_pickle(polarities, constants.POLARITIES + year + '-coha-freq-boot.pkl')
def run(count_path, out_path, smooth=0, cds=True, normalize=False, neg=1):
    counts = create_representation("Explicit", count_path, normalize=False)
    old_mat = counts.m
    index = counts.wi
    smooth = old_mat.sum() * smooth

    # getting marginal probs
    row_probs = old_mat.sum(1) + smooth
    col_probs = old_mat.sum(0) + smooth
    if cds:
        col_probs = np.power(col_probs, 0.75)
    row_probs = row_probs / row_probs.sum()
    col_probs = col_probs / col_probs.sum()

    # building PPMI matrix
    ppmi_mat = make_ppmi_mat(old_mat, row_probs, col_probs, smooth,
                             neg=neg, normalize=normalize)
    import pyximport
    pyximport.install(setup_args={"include_dirs": np.get_include()})
    from representations import sparse_io
    sparse_io.export_mat_eff(ppmi_mat.row, ppmi_mat.col, ppmi_mat.data,
                             out_path + ".bin")
    util.write_pickle(index, out_path + "-index.pkl")
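# `make_ppmi_mat` is defined elsewhere. Below is a minimal sketch of the
# standard shifted positive PMI computation it presumably implements:
#   ppmi(w, c) = max(log[p(w, c) / (p(w) p(c))] - log(neg), 0)
# Context-distribution smoothing is already applied to col_probs above; the
# exact smoothing bookkeeping in this sketch is an assumption.
import numpy as np
from scipy.sparse import coo_matrix

def make_ppmi_mat_sketch(old_mat, row_probs, col_probs, smooth, neg=1):
    # Compute shifted positive PMI over the nonzero co-occurrence pairs only.
    old_mat = coo_matrix(old_mat)
    prob_norm = old_mat.sum() + smooth
    row_probs = np.asarray(row_probs).ravel()
    col_probs = np.asarray(col_probs).ravel()
    data = np.zeros(len(old_mat.data))
    for k in range(len(old_mat.data)):
        joint = (old_mat.data[k] + smooth) / prob_norm
        indep = row_probs[old_mat.row[k]] * col_probs[old_mat.col[k]]
        data[k] = max(np.log(joint / indep) - np.log(neg), 0.0)
    return coo_matrix((data, (old_mat.row, old_mat.col)), shape=old_mat.shape)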
def worker(proc_num, queue):
    while True:
        # time.sleep(random.random()*10)
        try:
            name = queue.get(block=False)
        except Empty:
            print proc_num, "Finished"
            return
        if name + ".pkl" in os.listdir(POLARITIES):
            continue
        print proc_num, "Running", name
        subredditgen.main(name)
        word_dict = util.load_pickle(DICTS.format(name))
        word_dict.filter_extremes(no_above=0.1, no_below=100)
        to_keep = sorted(word_dict.dfs, key=lambda w: word_dict.dfs[w],
                         reverse=True)[:5000]
        word_dict.filter_tokens(good_ids=to_keep)
        sub_vecs = create_representation(
            "SVD", constants.SUBREDDIT_EMBEDDINGS.format(name))
        pos_seeds, neg_seeds = seeds.twitter_seeds()
        sub_vecs = sub_vecs.get_subembed(
            set(word_dict.token2id.keys()).union(pos_seeds).union(neg_seeds))
        pols = polarity_induction_methods.bootstrap(
            sub_vecs, pos_seeds, neg_seeds, return_all=True,
            nn=25, beta=0.9, num_boots=50, n_procs=10)
        util.write_pickle(pols, POLARITIES + name + ".pkl")
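# `polarity_induction_methods.bootstrap` is not shown here. A minimal,
# hypothetical sketch of seed bootstrapping as it is commonly done, consistent
# with the num_boots/return_all arguments above: rescore random subsets of the
# seed sets and return every run. This is a stand-in, not the project's
# implementation; sample_frac is an assumed parameter.
import random

def bootstrap_sketch(embeddings, pos_seeds, neg_seeds, score_method,
                     num_boots=50, sample_frac=0.7, **method_args):
    # Run the scorer num_boots times on random seed subsets so the caller
    # can average polarity scores and estimate their variance.
    pos_seeds, neg_seeds = list(pos_seeds), list(neg_seeds)
    boots = []
    for _ in range(num_boots):
        pos = random.sample(pos_seeds, int(sample_frac * len(pos_seeds)))
        neg = random.sample(neg_seeds, int(sample_frac * len(neg_seeds)))
        boots.append(score_method(embeddings, pos, neg, **method_args))
    return boots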
def align_years(years, rep_type, main_dir, num, dim, **rep_args):
    print "Aligning years to each other"
    first_iter = True
    base_embed = None
    for year in years:  # Iterate through the years
        print year
        year_embed = create_representation(
            rep_type, main_dir + "/embedding_avg/" + str(year),
            **rep_args)  # Load the average embedding for this year
        if first_iter:
            aligned_embed = year_embed
            first_iter = False
        else:
            ortho = alignment.get_procrustes_mat(base_embed, year_embed)
            aligned_embed = Embedding(
                (year_embed.m).dot(ortho), year_embed.iw,
                normalize=False)  # Rotate onto the previous year's embedding
            for i in range(1, num + 1):  # Align every cloud embedding the same way as the average
                fname = main_dir + "/embedding_" + str(i) + "/noinit/" + str(dim) + "/aligned/" + str(year)
                mat = np.load(fname + "-w.npy")
                mat = mat.dot(ortho)
                np.save(fname + "-w.npy", mat)  # Rewritten in place
        base_embed = aligned_embed
        foutname = main_dir + "/embedding_avg/aligned/" + str(year)
        np.save(foutname + "-w.npy", aligned_embed.m)
        write_pickle(aligned_embed.iw, foutname + "-vocab.pkl")
def main(): args = docopt(""" Usage: eval_reliability.py [options] <representation> <file_name> <folders>... Options: --words FILE Use FILE with list of words (1 per line) to measure reliabilty --ws FILES Testsets for word similarity evaluation, use "," as separator! --ana FILES Testsets for analogy evaluation, use "," as separator! --closest N Use N closest neighbors to measure reliability [default: 10] """) folders = args["<folders>"] closest = int(args["--closest"]) word_list = args["--words"] ws_test_sets = [read_ws_test_set(path) for path in args["--ws"].split(",")] as_test_sets = [ read_as_test_set(path) for path in args["--ana"].split(",") ] as_xi_and_ix = [get_vocab_as(test_set) for test_set in as_test_sets] words = words_to_evaluate_file( word_list) if word_list else argswords_to_evaluate(representations) #good default parameter for svd args["--eig"] = 0 args["--w+c"] = False #not used args["--neg"] = 1 representations = [] for file in folders: if os.path.isfile(file + "/" + args["<file_name>"] + ".words.vocab"): x = copy.deepcopy(args) x["<representation_path>"] = file + "/" + args["<file_name>"] representations.append(create_representation(x)) else: print("Could not find " + file + "/" + args["<file_name>"] + ".words.vocab", file=sys.stderr) #comparisson over all subsets if len(representations) < 2: raise Exception("Need multiple models for evaluation") evaluated = [ " ".join([str(evaluate_ws(r, w)) for r in representations]) for w in ws_test_sets ] for i, test_set in enumerate(as_test_sets): evaluated.append(" ".join([ str( evaluate_as(r, test_set, as_xi_and_ix[i][0], as_xi_and_ix[i][1])) for r in representations ])) evaluated.append(reliability(representations, words, closest)) print("\t".join(evaluated))
def main(): args = docopt(""" Usage: analogy_eval.py [options] <representation> <representation_path> <task_path> Options: --neg NUM Number of negative samples; subtracts its log from PMI (only applicable to PPMI) [default: 1] --w+c Use ensemble of word and context vectors (not applicable to PPMI) --eig NUM Weighted exponent of the eigenvalue matrix (only applicable to SVD) [default: 0.5] --normalize Use row-normalized word vectors """) print args['--normalize'] representation = create_representation(args) args['--normalize'] = True print args['--normalize'] representation_sim = create_representation(args) data = read_test_set(representation, args['<task_path>']) xi, ix = get_vocab(data) accuracy_add, accuracy_mul = evaluate(representation, representation_sim, data, xi, ix) print args['<representation>'], args[ '<representation_path>'], '\t%0.3f' % accuracy_add, '\t%0.3f' % accuracy_mul
def main(): args = docopt(""" Usage: ws_eval.py [options] <representation> <representation_path> <task_path> Options: --neg NUM Number of negative samples; subtracts its log from PMI (only applicable to PPMI) [default: 1] --w+c Use ensemble of word and context vectors (not applicable to PPMI) --eig NUM Weighted exponent of the eigenvalue matrix (only applicable to SVD) [default: 0.5] """) data = read_test_set(args['<task_path>']) representation = create_representation(args) correlation = evaluate(representation, data) print (args['<representation>'] + " " + args['<task_path>'] + '\t%0.3f' % correlation)
def main(): args = docopt(""" Usage: ws_eval.py [options] <representation> <representation_path> <task_path> Options: --neg NUM Number of negative samples; subtracts its log from PMI (only applicable to PPMI) [default: 1] --w+c Use ensemble of word and context vectors (not applicable to PPMI) --eig NUM Weighted exponent of the eigenvalue matrix (only applicable to SVD) [default: 0.5] """) data = read_test_set(args['<task_path>']) representation = create_representation(args) correlation = evaluate(representation, data) print 'Word Similarity', '\t%0.3f' % correlation
def main(): args = docopt(""" Usage: get_most_similar.py [options] <representation> <representation_path> <test_words> Options: --neg NUM Number of negative samples; subtracts its log from PMI (only applicable to PPMI) [default: 1] --w+c Use ensemble of word and context vectors (not applicable to PPMI) --eig NUM Weighted exponent of the eigenvalue matrix (only applicable to SVD) [default: 0.5] """) words = args['<test_words>'].split('_') representation = create_representation(args) for w in words: print w print representation.closest(w)
def main(): args = docopt(""" Usage: analogy_eval.py [options] <representation> <representation_path> <task_path> Options: --neg NUM Number of negative samples; subtracts its log from PMI (only applicable to PPMI) [default: 1] --w+c Use ensemble of word and context vectors (not applicable to PPMI) --eig NUM Weighted exponent of the eigenvalue matrix (only applicable to SVD) [default: 0.5] """) data = read_test_set(args['<task_path>']) xi, ix = get_vocab(data) representation = create_representation(args) accuracy_add, accuracy_mul = evaluate(representation, data, xi, ix) print args['<representation>'], args['<representation_path>'], '\t%0.3f' % accuracy_add, '\t%0.3f' % accuracy_mul
def main(): args = docopt(""" Usage: analogy_eval.py [options] <representation> <representation_path> <task_path> Options: --neg NUM Number of negative samples; subtracts its log from PMI (only applicable to PPMI) [default: 1] --w+c Use ensemble of word and context vectors (not applicable to PPMI) --concatenate Concatenate left vector and right vector together --eig NUM Weighted exponent of the eigenvalue matrix (only applicable to SVD) [default: 0.5] --contexts Use context embedding as word embedding """) data = read_test_set(args['<task_path>']) xi, ix = get_vocab(data) representation = create_representation(args) accuracy_add, accuracy_mul = evaluate(representation, data, xi, ix) print args['<representation>'], args[ '<representation_path>'], '\t%0.3f' % accuracy_add, '\t%0.3f' % accuracy_mul
def align_years(years, rep_type, in_dir, out_dir, **rep_args):
    first_iter = True
    base_embed = None
    for year in years:  # Iterate through the years
        print "Loading year:", year
        year_embed = create_representation(
            rep_type, in_dir + str(year), **rep_args)  # Load the individual embedding
        print "Aligning year:", year
        if first_iter:
            aligned_embed = year_embed
            first_iter = False
        else:
            aligned_embed = alignment.smart_procrustes_align(
                base_embed, year_embed,
                post_normalize=False)  # Rotate onto the previous year's embedding
        base_embed = aligned_embed
        print "Writing year:", year
        foutname = out_dir + str(year)
        np.save(foutname + "-w.npy", aligned_embed.m)
        write_pickle(aligned_embed.iw, foutname + "-vocab.pkl")
def main(): args = docopt(""" Usage: ws_eval.py [options] <representation> <representation_path> <task_path> Options: --neg NUM Number of negative samples; subtracts its log from PMI (only applicable to PPMI) [default: 1] --w+c Use ensemble of word and context vectors (not applicable to PPMI) --eig NUM Weighted exponent of the eigenvalue matrix (only applicable to SVD) [default: 0.5] --vocab FILE Optional: use vocabulary file to determine what is difficult for the embeddings --cutoff NUM Optional: Cutoff proportion for reporting rank mismatches --verbose NUM Specify 1 for bonus output for analysis """) data = read_test_set(args['<task_path>']) representation = create_representation(args) #print dir(representation), representation.iw[:3] correlation, actual, expected = evaluate(representation, data) top_n = 50 print args['<representation>'], args['<representation_path>'], '\t%0.6f' % correlation #print args['--verbose'] verbose = 1 if args['--verbose'] is not None and args['--verbose'] == '1' else 0 if args['--vocab'] is not None: reconstruct_spearmanr(actual, expected, representation, data, args['--vocab'], cutoff=args['--cutoff'], verbose=verbose)
print "Merging" full_word_set = set([]) for year_words in target_lists.itervalues(): full_word_set = full_word_set.union(set(year_words)) merge(out_pref, years, list(full_word_set)) if __name__ == '__main__': parser = argparse.ArgumentParser(description="Computes semantic change statistics for words.") parser.add_argument("dir", help="path to word vectors") parser.add_argument("num_procs", type=int, help="number of processes to spawn") parser.add_argument("word_file", help="path to sorted word file") parser.add_argument("out_dir", help="output path") parser.add_argument("--target-words", type=int, help="Number of words (of decreasing average frequency) to analyze", default=-1) parser.add_argument("--context-words", type=int, help="Number of words (of decreasing average frequency) to include in context. -2 means all regardless of word list", default=-1) parser.add_argument("--context-word-file") parser.add_argument("--start-year", type=int, help="start year (inclusive)", default=1800) parser.add_argument("--year-inc", type=int, help="year increment", default=10) parser.add_argument("--type", default="PPMI") parser.add_argument("--end-year", type=int, help="end year (inclusive)", default=2000) parser.add_argument("--disp-year", type=int, help="year to measure displacement from", default=2000) args = parser.parse_args() years = range(args.start_year, args.end_year + 1, args.year_inc) target_lists, context_lists = ioutils.load_target_context_words(years, args.word_file, args.target_words, -1) if args.context_word_file != None: print "Loading context words.." _ , context_lists = ioutils.load_target_context_words(years, args.word_file, -1, args.context_words) target_lists, context_lists = ioutils.load_target_context_words(years, args.word_file, args.target_words, args.context_words) ioutils.mkdir(args.out_dir) displacement_base = create_representation(args.type, args.dir + "/" + str(args.disp_year), restricted_context=context_lists[args.disp_year], normalize=True, add_context=False) run_parallel(args.num_procs, args.out_dir, args.dir + "/", years[1:], target_lists, context_lists, displacement_base, 0, args.year_inc, args.type)
print("Merging") full_word_set = set([]) for year_words in target_lists.values(): full_word_set = full_word_set.union(set(year_words)) merge(out_pref, years, list(full_word_set)) if __name__ == '__main__': parser = argparse.ArgumentParser(description="Computes semantic change statistics for words.") parser.add_argument("dir", help="path to word vectors") parser.add_argument("num_procs", type=int, help="number of processes to spawn") parser.add_argument("word_file", help="path to sorted word file") parser.add_argument("out_dir", help="output path") parser.add_argument("--target-words", type=int, help="Number of words (of decreasing average frequency) to analyze", default=-1) parser.add_argument("--context-words", type=int, help="Number of words (of decreasing average frequency) to include in context. -2 means all regardless of word list", default=-1) parser.add_argument("--context-word-file") parser.add_argument("--start-year", type=int, help="start year (inclusive)", default=1800) parser.add_argument("--year-inc", type=int, help="year increment", default=10) parser.add_argument("--type", default="PPMI") parser.add_argument("--end-year", type=int, help="end year (inclusive)", default=2000) parser.add_argument("--disp-year", type=int, help="year to measure displacement from", default=2000) args = parser.parse_args() years = list(range(args.start_year, args.end_year + 1, args.year_inc)) target_lists, context_lists = ioutils.load_target_context_words(years, args.word_file, args.target_words, -1) if args.context_word_file != None: print("Loading context words..") _ , context_lists = ioutils.load_target_context_words(years, args.word_file, -1, args.context_words) target_lists, context_lists = ioutils.load_target_context_words(years, args.word_file, args.target_words, args.context_words) ioutils.mkdir(args.out_dir) displacement_base = create_representation(args.type, args.dir + "/" + str(args.disp_year), restricted_context=context_lists[args.disp_year], normalize=True, add_context=False) run_parallel(args.num_procs, args.out_dir, args.dir + "/", years[1:], target_lists, context_lists, displacement_base, 0, args.year_inc, args.type)