# NOTE(review): the statement below appears to be the tail of a definition
# that starts before this chunk; it is kept verbatim.
os.remove(out_dir + str(year) + ".tmp.txt")


def run_parallel(num_procs, out_dir, in_dir, count_dir, years, words,
                 num_words, min_count, sample):
    """Process every year in ``years`` using ``num_procs`` worker processes.

    Each year is put on a shared queue, ``num_procs`` processes are started
    with ``worker`` as their target, and the call blocks until all of them
    have exited.

    Args:
        num_procs: Number of processes to start.
        out_dir: Forwarded unchanged to ``worker``.
        in_dir: Forwarded unchanged to ``worker``.
        count_dir: Forwarded unchanged to ``worker``.
        years: Iterable of years to enqueue.
        words: Forwarded unchanged to ``worker``.
        num_words: Forwarded unchanged to ``worker``.
        min_count: Forwarded unchanged to ``worker``.
        sample: Forwarded unchanged to ``worker``.
    """
    queue = Queue()
    for year in years:
        queue.put(year)
    # Every process gets its own index plus the same queue and settings.
    procs = [
        Process(
            target=worker,
            args=[i, queue, out_dir, in_dir, count_dir, words, num_words,
                  min_count, sample],
        )
        for i in range(num_procs)
    ]
    for p in procs:
        p.start()
    for p in procs:
        p.join()


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Computes various frequency statistics.")
    parser.add_argument("out_dir")
    parser.add_argument("in_dir")
    parser.add_argument("count_dir")
    parser.add_argument("word_file")
    parser.add_argument("--workers", type=int, default=10)
    parser.add_argument("--num-words", type=int, default=None)
    parser.add_argument("--start-year", type=int,
                        help="start year (inclusive)", default=1800)
    parser.add_argument("--end-year", type=int,
                        help="end year (inclusive)", default=2000)
    # The value is the step of the year range below, so describe it as such
    # (the help text used to repeat the --end-year description).
    parser.add_argument("--year-inc", type=int,
                        help="year increment", default=1)
    parser.add_argument("--min-count", type=int, default=100)
    parser.add_argument("--sample", type=float, default=1e-5)
    args = parser.parse_args()

    # end_year + 1 makes the end year inclusive.
    years = range(args.start_year, args.end_year + 1, args.year_inc)
    words = ioutils.load_year_words(args.word_file, years)
    ioutils.mkdir(args.out_dir)
    # A trailing "/" is appended to each directory before it is handed on.
    run_parallel(args.workers, args.out_dir + "/", args.in_dir + "/",
                 args.count_dir + "/", years, words, args.num_words,
                 args.min_count, args.sample)
# NOTE(review): the three statements below appear to be the tail of a
# definition that starts before this chunk; they are kept verbatim.
np.save((out_dir + OUT_FORMAT).format(year=year, dim=dim) + "-v.npy", v)
np.save((out_dir + OUT_FORMAT).format(year=year, dim=dim) + "-s.npy", s)
write_pickle(base_embed.iw, (out_dir + OUT_FORMAT).format(year=year, dim=dim) + "-vocab.pkl")


if __name__ == '__main__':
    # Pass the text as ``description``: the first positional parameter of
    # ArgumentParser is ``prog``, so a positional string would replace the
    # program name in the usage line instead of describing the script.
    parser = ArgumentParser(
        description="Run SVD on historical co-occurrence matrices")
    parser.add_argument("in_dir", help="Directory with PPMI data")
    # NOTE(review): this help text duplicates in_dir's; confirm what
    # count_dir actually holds before rewording it.
    parser.add_argument("count_dir", help="Directory with PPMI data")
    parser.add_argument(
        "word_file",
        help="File containing sorted list of words to potentially include")
    parser.add_argument("--num-words", type=int,
                        help="Number of words to include", default=1000000)
    parser.add_argument("--dim", type=int, default=300)
    parser.add_argument("--workers", type=int, default=50)
    parser.add_argument("--start-year", type=int, default=1800)
    parser.add_argument("--end-year", type=int, default=1990)
    parser.add_argument("--year-inc", type=int, default=10)
    parser.add_argument("--min-count", type=int, default=100)
    args = parser.parse_args()

    queue = Queue()
    # Materialize the range before reversing: on Python 3 a ``range`` object
    # has no ``reverse()`` method. The years are queued in descending order.
    years = list(range(args.start_year, args.end_year + 1, args.year_inc))
    years.reverse()
    for year in years:
        queue.put(year)

    # Output goes to <in_dir>/svd/<dim>/<num_words>/<min_count>/.
    out_dir = (args.in_dir + "/svd/" + str(args.dim) + "/"
               + str(args.num_words) + "/" + str(args.min_count) + "/")
    mkdir(out_dir)
    words = load_year_words(args.word_file, years)

    # Every process gets its own index plus the same queue and settings.
    procs = [
        Process(
            target=worker,
            args=[i, queue, out_dir, args.in_dir, args.count_dir, words,
                  args.dim, args.num_words, args.min_count],
        )
        for i in range(args.workers)
    ]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
description="Computes various frequency statistics.") parser.add_argument("out_dir") parser.add_argument("in_dir") parser.add_argument("count_dir") parser.add_argument( "word_file", help="file maps from year to word list (the output of freqperyear)") parser.add_argument("--workers", type=int, default=10) parser.add_argument("--num-words", type=int, default=None) parser.add_argument("--start-year", type=int, help="start year (inclusive)", default=1800) parser.add_argument("--end-year", type=int, help="end year (inclusive)", default=2000) parser.add_argument("--year-inc", type=int, help="end year (inclusive)", default=1) parser.add_argument("--min-count", type=int, default=100) parser.add_argument("--sample", type=float, default=1e-5) args = parser.parse_args() years = range(args.start_year, args.end_year + 1, args.year_inc) words = ioutils.load_year_words(args.word_file, years) ioutils.mkdir(args.out_dir) run_parallel(args.workers, args.out_dir + "/", args.in_dir + "/", args.count_dir + "/", years, words, args.num_words, args.min_count, args.sample)
type=int, help="Number of words to include", default=1000000) parser.add_argument("--dim", type=int, default=300) parser.add_argument("--workers", type=int, default=50) parser.add_argument("--start-year", type=int, default=1800) parser.add_argument("--end-year", type=int, default=1990) parser.add_argument("--year-inc", type=int, default=10) parser.add_argument("--min-count", type=int, default=100) args = parser.parse_args() queue = Queue() years = list(range(args.start_year, args.end_year + 1, args.year_inc)) years.reverse() for year in years: queue.put(year) out_dir = args.in_dir + "/svd/" + str(args.dim) + "/" + str( args.num_words) + "/" + str(args.min_count) + "/" mkdir(out_dir) words = load_year_words(args.word_file, years) procs = [ Process(target=worker, args=[ i, queue, out_dir, args.in_dir, args.count_dir, words, args.dim, args.num_words, args.min_count ]) for i in range(args.workers) ] for p in procs: p.start() for p in procs: p.join()