def worker(proc_num, queue, out_pref, in_dir, word_list, displacement_base, thresh): while True: try: year = queue.get(block=False) except Empty: print proc_num, "Finished" break print proc_num, "Loading matrices..." base = simple_create_representation(REP_TYPE, in_dir + str(year - 1) + ".bin", restricted_context=word_list[year - 1], thresh=thresh) delta = simple_create_representation( REP_TYPE, in_dir + str(year) + ".bin", restricted_context=word_list[year], thresh=thresh) print proc_num, "Getting deltas..." year_vols = get_cosine_deltas(base, delta, word_list[year]) year_disp = get_cosine_deltas(displacement_base, delta, word_list[year]) print proc_num, "Writing results..." ioutils.write_pickle(year_vols, out_pref + str(year) + "-vols.pkl") ioutils.write_pickle(year_disp, out_pref + str(year) + "-disps.pkl")
def worker(proc_num, queue, out_pref, in_dir, year_index_infos, knn, thresh): random.shuffle(years) print proc_num, "Start loop" while True: try: year = queue.get(block=False) except Empty: print proc_num, "Finished" break print proc_num, "Making second orders for year", year old_embed = simple_create_representation(REP_TYPE, in_dir + str(year) + ".bin", thresh=thresh) old_embed = old_embed.get_subembed(year_index_infos[year]["list"]) old_mat = old_embed.m.tocoo() row_d, col_d, data_d, keep_rows = make_secondorder_mat( old_mat, thresh=thresh, min_cooccurs=0, shrink_mat=False) second_mat = coo_matrix((data_d, (row_d, col_d))) if knn != None: row_d, col_d, data_d = make_knn_mat(second_mat, knn) second_mat = coo_matrix((data_d, (row_d, col_d))) year_stats = get_year_stats(second_mat, old_embed.wi, old_embed.iw, stats=STATS) print proc_num, "Writing stats for year", year ioutils.write_pickle(year_stats, out_pref + str(year) + "-tmp.pkl")
def worker(proc_num, queue, out_pref, in_dir, word_list, displacement_base, thresh): while True: try: year = queue.get(block=False) except Empty: print proc_num, "Finished" break print proc_num, "Loading matrices..." base = simple_create_representation(REP_TYPE, in_dir + str(year-1) + ".bin", restricted_context=word_list[year-1], thresh=thresh) delta = simple_create_representation(REP_TYPE, in_dir + str(year) + ".bin", restricted_context=word_list[year], thresh=thresh) print proc_num, "Getting deltas..." year_vols = get_cosine_deltas(base, delta, word_list[year]) year_disp = get_cosine_deltas(displacement_base, delta, word_list[year]) print proc_num, "Writing results..." ioutils.write_pickle(year_vols, out_pref + str(year) + "-vols.pkl") ioutils.write_pickle(year_disp, out_pref + str(year) + "-disps.pkl")
def worker(proc_num, queue, out_pref, in_dir, year_index_infos, knn, thresh): random.shuffle(years) print proc_num, "Start loop" while True: try: year = queue.get(block=False) except Empty: print proc_num, "Finished" break print proc_num, "Making second orders for year", year old_embed = simple_create_representation(REP_TYPE, in_dir + str(year) + ".bin", thresh=thresh) old_embed = old_embed.get_subembed(year_index_infos[year]["list"]) old_mat = old_embed.m.tocoo() row_d, col_d, data_d, keep_rows = make_secondorder_mat(old_mat, thresh=thresh, min_cooccurs=0, shrink_mat=False) second_mat = coo_matrix((data_d, (row_d, col_d))) if knn != None: row_d, col_d, data_d = make_knn_mat(second_mat, knn) second_mat = coo_matrix((data_d, (row_d, col_d))) year_stats = get_year_stats(second_mat, old_embed.wi, old_embed.iw, stats=STATS) print proc_num, "Writing stats for year", year ioutils.write_pickle(year_stats, out_pref + str(year) + "-tmp.pkl")
# Tail of run_parallel (its `def` is outside this view): joins all worker
# processes, unions every year's word list, and merges the per-year pickles.
# Then the CLI entry point: parses arguments, truncates word lists to
# --num-words, builds the displacement base from the --disp-year binary,
# and launches run_parallel over years[1:] (volatility needs year-1).
# NOTE(review): restricted_context uses word_lists[args.end_year] even when
# --disp-year differs from --end-year -- confirm that is intentional.
for p in procs: p.join() print "Merging" full_word_set = set([]) for year_words in word_list.itervalues(): full_word_set = full_word_set.union(set(year_words)) merge(out_pref, years, list(full_word_set)) if __name__ == '__main__': parser = argparse.ArgumentParser(description="Computes semantic change statistics for words.") parser.add_argument("dir", help="path to network data (also where output goes)") parser.add_argument("word_file", help="path to sorted word file") parser.add_argument("num_procs", type=int, help="number of processes to spawn") parser.add_argument("--num-words", type=int, help="Number of words (of decreasing average frequency) to include", default=-1) parser.add_argument("--start-year", type=int, help="start year (inclusive)", default=START_YEAR) parser.add_argument("--end-year", type=int, help="end year (inclusive)", default=END_YEAR) parser.add_argument("--disp-year", type=int, help="year to measure displacement from", default=END_YEAR) parser.add_argument("--thresh", type=float, help="relevance threshold", default=THRESH) args = parser.parse_args() years = range(args.start_year, args.end_year + 1) word_lists = ioutils.load_year_words(args.word_file, years) if args.num_words != -1: for year in years: word_lists[year] = word_lists[year][:args.num_words] ioutils.mkdir(args.dir + "/volstats") outpref ="/volstats/" + args.word_file.split("/")[-1].split(".")[0] + "-" + str(args.thresh) if args.num_words != -1: outpref += "-top" + str(args.num_words) displacement_base = simple_create_representation(REP_TYPE, args.dir + "/" + str(args.disp_year) + ".bin", restricted_context=word_lists[args.end_year], thresh=args.thresh) run_parallel(args.num_procs, args.dir + outpref, args.dir + "/", years[1:], word_lists, displacement_base, args.thresh)
# Partial duplicate of the CLI entry point, starting mid-argparse (the
# `parser` object is created before this span): registers the remaining
# year/threshold flags, loads and optionally truncates the per-year word
# lists, derives the output prefix under <dir>/volstats, builds the
# displacement base representation, and kicks off run_parallel over
# years[1:] (each worker also needs year-1 as its volatility baseline).
# NOTE(review): restricted_context uses word_lists[args.end_year] even when
# --disp-year differs from --end-year -- confirm that is intentional.
parser.add_argument("--end-year", type=int, help="end year (inclusive)", default=END_YEAR) parser.add_argument("--disp-year", type=int, help="year to measure displacement from", default=END_YEAR) parser.add_argument("--thresh", type=float, help="relevance threshold", default=THRESH) args = parser.parse_args() years = range(args.start_year, args.end_year + 1) word_lists = ioutils.load_year_words(args.word_file, years) if args.num_words != -1: for year in years: word_lists[year] = word_lists[year][:args.num_words] ioutils.mkdir(args.dir + "/volstats") outpref = "/volstats/" + args.word_file.split("/")[-1].split( ".")[0] + "-" + str(args.thresh) if args.num_words != -1: outpref += "-top" + str(args.num_words) displacement_base = simple_create_representation( REP_TYPE, args.dir + "/" + str(args.disp_year) + ".bin", restricted_context=word_lists[args.end_year], thresh=args.thresh) run_parallel(args.num_procs, args.dir + outpref, args.dir + "/", years[1:], word_lists, displacement_base, args.thresh)