def main(proc_num, lock, out_pref, tmp_out_pref, in_dir, years, word_list, word_indices, displacement_base, thresh): while True: lock.acquire() work_left = False for year in years: dirs = set(os.listdir(in_dir + "/volstats/")) if tmp_out_pref.split("/")[-1] + str(year) + "-jvols.pkl" in dirs: continue work_left = True print proc_num, "year", year fname = tmp_out_pref + str(year) + "-jvols.pkl" with open(fname, "w") as fp: fp.write("") fp.close() break lock.release() if not work_left: print proc_num, "Finished" break print proc_num, "Loading matrices..." base = matstore.retrieve_mat_as_binary_coo_thresh(in_dir + "/" + str(year - 1) + ".bin", args.thresh, min_size=MIN_SIZE) base = base.tocsr() delta = matstore.retrieve_mat_as_binary_coo_thresh(in_dir + "/" + str(year) + ".bin", args.thresh, min_size=MIN_SIZE) delta = delta.tocsr() print proc_num, "Getting deltas..." year_vols = get_jaccard_deltas(base, delta, word_list, word_indices) year_disp = get_jaccard_deltas(displacement_base, delta, word_list, word_indices) print proc_num, "Writing results..." ioutils.write_pickle(year_vols, tmp_out_pref + str(year) + "-jvols.pkl") ioutils.write_pickle(year_disp, tmp_out_pref + str(year) + "-jdisps.pkl")
def main(proc_num, lock, out_pref, tmp_out_pref, in_dir, years, word_list, word_indices, displacement_base, thresh): while True: lock.acquire() work_left = False for year in years: dirs = set(os.listdir(in_dir + "/volstats/")) if tmp_out_pref.split("/")[-1] + str(year) + "-jvols.pkl" in dirs: continue work_left = True print proc_num, "year", year fname = tmp_out_pref + str(year) + "-jvols.pkl" with open(fname, "w") as fp: fp.write("") fp.close() break lock.release() if not work_left: print proc_num, "Finished" break print proc_num, "Loading matrices..." base = matstore.retrieve_mat_as_binary_coo_thresh( in_dir + "/" + str(year - 1) + ".bin", args.thresh, min_size=MIN_SIZE) base = base.tocsr() delta = matstore.retrieve_mat_as_binary_coo_thresh(in_dir + "/" + str(year) + ".bin", args.thresh, min_size=MIN_SIZE) delta = delta.tocsr() print proc_num, "Getting deltas..." year_vols = get_jaccard_deltas(base, delta, word_list, word_indices) year_disp = get_jaccard_deltas(displacement_base, delta, word_list, word_indices) print proc_num, "Writing results..." ioutils.write_pickle(year_vols, tmp_out_pref + str(year) + "-jvols.pkl") ioutils.write_pickle(year_disp, tmp_out_pref + str(year) + "-jdisps.pkl")
p.start() for p in procs: p.join() print "Merging" merge(out_pref, tmp_out_pref, years, word_list) if __name__ == '__main__': parser = argparse.ArgumentParser(description="Merges years of raw 5gram data.") parser.add_argument("dir", help="path to network data (also where output goes)") parser.add_argument("word_file", help="path to sorted word file") parser.add_argument("index_file", help="path to word index file") parser.add_argument("num_procs", type=int, help="number of processes to spawn") parser.add_argument("--num-words", type=int, help="Number of words (of decreasing average frequency) to include", default=-1) parser.add_argument("--start-year", type=int, help="start year (inclusive)", default=START_YEAR) parser.add_argument("--end-year", type=int, help="end year (inclusive)", default=END_YEAR) parser.add_argument("--thresh", type=float, help="relevance threshold", default=THRESH) args = parser.parse_args() years = range(args.start_year+1, args.end_year + 1) word_list = ioutils.load_pickle(args.word_file) index = ioutils.load_pickle(args.index_file) if args.num_words != -1: word_list = word_list[:args.num_words] ioutils.mkdir(args.dir + "/volstats") word_list, word_indices = get_word_indices(word_list, index) outpref ="/volstats/" + args.word_file.split("/")[-1].split(".")[0] + "-" + str(args.thresh) if args.num_words != -1: outpref += "-top" + str(args.num_words) displacement_base = matstore.retrieve_mat_as_binary_coo_thresh(args.dir + "/" + str(args.end_year) + ".bin", args.thresh, min_size=MIN_SIZE) displacement_base = displacement_base.tocsr() run_parallel(args.num_procs, args.dir + outpref, args.dir + outpref + "-tmp", args.dir + "/", years, word_list, word_indices, displacement_base, args.thresh)
# NOTE(review): this chunk duplicates the tail of the __main__ block above
# (from the --start-year argument onward, re-wrapped) and begins mid-statement
# — `help=...` is the continuation of a parser.add_argument call whose opening
# is not in this chunk.  It looks like an accidental paste/merge artifact;
# confirm against version control before removing.
help="start year (inclusive)", default=START_YEAR) parser.add_argument("--end-year", type=int, help="end year (inclusive)", default=END_YEAR) parser.add_argument("--thresh", type=float, help="relevance threshold", default=THRESH) args = parser.parse_args() years = range(args.start_year + 1, args.end_year + 1) word_list = ioutils.load_pickle(args.word_file) index = ioutils.load_pickle(args.index_file) if args.num_words != -1: word_list = word_list[:args.num_words] ioutils.mkdir(args.dir + "/volstats") word_list, word_indices = get_word_indices(word_list, index) outpref = "/volstats/" + args.word_file.split("/")[-1].split( ".")[0] + "-" + str(args.thresh) if args.num_words != -1: outpref += "-top" + str(args.num_words) displacement_base = matstore.retrieve_mat_as_binary_coo_thresh( args.dir + "/" + str(args.end_year) + ".bin", args.thresh, min_size=MIN_SIZE) displacement_base = displacement_base.tocsr() run_parallel(args.num_procs, args.dir + outpref, args.dir + outpref + "-tmp", args.dir + "/", years, word_list, word_indices, displacement_base, args.thresh)