def load_year_index_infos_common(common_index, years, word_file, num_words=-1):
    """Build a per-year info dict, reusing one shared index for every year.

    For each year in the word file, records the common index, the word list
    (optionally truncated to num_words), and the words' positions in the index.

    :param common_index: index shared by all years (stored under "index")
    :param years: years to load word lists for
    :param word_file: path passed through to load_year_words
    :param num_words: if not -1, keep only the first num_words words
    :return: defaultdict mapping year -> {"index", "list", "indices"}
    """
    infos = collections.defaultdict(dict)
    for year, words in load_year_words(word_file, years).iteritems():
        if num_words != -1:
            words = words[:num_words]
        # get_word_indices filters the list to words present in the index
        # and returns their index positions — presumably; confirm in helper.
        words, indices = get_word_indices(words, common_index)
        infos[year]["index"] = common_index
        infos[year]["list"] = words
        infos[year]["indices"] = indices
    return infos
def load_year_index_infos(index_dir, years, word_file, num_words=-1):
    """Build a per-year info dict, loading a separate index pickle per year.

    Like load_year_index_infos_common, but each year's index is read from
    "<index_dir>/<year>-index.pkl" instead of being shared.

    :param index_dir: directory containing per-year "<year>-index.pkl" files
    :param years: years to load word lists for
    :param word_file: path passed through to load_year_words
    :param num_words: if not -1, keep only the first num_words words
    :return: defaultdict mapping year -> {"index", "list", "indices"}
    """
    infos = collections.defaultdict(dict)
    for year, words in load_year_words(word_file, years).iteritems():
        year_index = load_pickle(index_dir + "/" + str(year) + "-index.pkl")
        if num_words != -1:
            words = words[:num_words]
        words, indices = get_word_indices(words, year_index)
        infos[year]["index"] = year_index
        infos[year]["list"] = words
        infos[year]["indices"] = indices
    return infos
if args.word_file != None:
    if args.index_dir == None:
        print >> sys.stderr, "Must specify index dir with word file!"
        sys.exit()
    word_pickle = ioutils.load_pickle(args.word_file)
    if not args.start_year in word_pickle:
        # Pickle is a flat word list (not keyed by year): reuse it for every year.
        word_lists = {}
        for year in years:
            word_lists[year] = word_pickle
    else:
        word_lists = word_pickle
    # BUG FIX: this dict was previously built under the name `word_infos`,
    # but `run_parallel` below is passed `word_info`, which was only bound in
    # the else branch — so giving --word-file raised NameError at the call.
    word_info = {}
    for year, word_list in word_lists.iteritems():
        year_index = ioutils.load_pickle(args.index_dir + "/" + str(year) + "-index.pkl")
        if args.num_words != -1:
            word_list = word_list[:args.num_words]
        word_list, word_indices = get_word_indices(word_list, year_index)
        word_info[year] = (word_list, word_indices)
    # Output prefix derived from the word file's basename, e.g. "foo.pkl" -> "/netstats/foo".
    outpref = "/netstats/" + args.word_file.split("/")[-1].split(".")[0]
    if args.num_words != -1:
        outpref += "-top" + str(args.num_words)
else:
    word_info = None
    outpref = "/netstats/net"
if args.thresh != None:
    outpref += "-" + str(args.thresh)
ioutils.mkdir(args.dir + "/netstats")
run_parallel(args.num_procs, args.dir + outpref, args.dir + "/netstats/",
             args.dir + "/", years, word_info, args.thresh)
merge(word_list, years, in_dir, out_file)  # NOTE(review): tail of a definition whose start is outside this chunk

if __name__ == '__main__':
    # CLI entry point: fan merging of per-year 5gram network data out over
    # num_procs processes via run_parallel (defined elsewhere in this file).
    parser = argparse.ArgumentParser(
        description="Merges years of raw 5gram data.")
    parser.add_argument("out_file", help="path to network data (also where output goes)")
    # NOTE(review): in_dir shares out_file's help text — possibly copy-pasted; verify intent.
    parser.add_argument("in_dir", help="path to network data (also where output goes)")
    parser.add_argument("word_file", help="path to sorted word file")
    # NOTE(review): index_file shares word_file's help text — possibly copy-pasted; verify intent.
    parser.add_argument("index_file", help="path to sorted word file")
    parser.add_argument("num_procs", type=int, help="number of processes to spawn")
    parser.add_argument("--start-year", type=int, help="start year (inclusive)", default=START_YEAR)
    parser.add_argument("--end-year", type=int, help="end year (inclusive)", default=END_YEAR)
    args = parser.parse_args()
    years = range(args.start_year, args.end_year + 1)
    index = ioutils.load_pickle(args.index_file)
    word_list = ioutils.load_pickle(args.word_file)
    # Keep only the list half of get_word_indices; the indices are recomputed by workers.
    word_list, _ = get_word_indices(word_list, index)
    run_parallel(args.num_procs, args.in_dir + "/", years, word_list, index, args.out_file)
print proc_num, "Writing stats for year", year ioutils.write_pickle(word_stats, in_dir + str(year) + "-freqs.pkl") def run_parallel(num_procs, in_dir, years, word_list, index, out_file): lock = Lock() procs = [Process(target=main, args=[i, lock, in_dir, years, word_list, index]) for i in range(num_procs)] for p in procs: p.start() for p in procs: p.join() print "Merging" merge(word_list, years, in_dir, out_file) if __name__ == '__main__': parser = argparse.ArgumentParser(description="Merges years of raw 5gram data.") parser.add_argument("out_file", help="path to network data (also where output goes)") parser.add_argument("in_dir", help="path to network data (also where output goes)") parser.add_argument("word_file", help="path to sorted word file") parser.add_argument("index_file", help="path to sorted word file") parser.add_argument("num_procs", type=int, help="number of processes to spawn") parser.add_argument("--start-year", type=int, help="start year (inclusive)", default=START_YEAR) parser.add_argument("--end-year", type=int, help="end year (inclusive)", default=END_YEAR) args = parser.parse_args() years = range(args.start_year, args.end_year + 1) index = ioutils.load_pickle(args.index_file) word_list = ioutils.load_pickle(args.word_file) word_list, _ = get_word_indices(word_list, index) run_parallel(args.num_procs, args.in_dir + "/", years, word_list, index, args.out_file)
indices = year_indices[year]  # NOTE(review): these lines are the tail of a loop/function whose start is outside this chunk
mat = matstore.retrieve_mat_as_coo(in_dir + str(year) + ".bin")
mat = mat.tocsr()
# Restrict both rows and columns to the tracked word indices, then sum all
# remaining entries to get the year's sample size.
mat = mat[indices, :]
mat = mat[:, indices]
samplesizes[year] = mat.sum()
ioutils.write_pickle(samplesizes, out_file)


if __name__ == '__main__':
    # CLI entry point: compute per-year sample sizes restricted to a word set.
    parser = argparse.ArgumentParser(description="get sample sizes")
    parser.add_argument("out_file", help="output file")
    parser.add_argument("in_dir", help="input directory")
    parser.add_argument("--word-file", help="path to sorted word file(s). Must also specify index.", default=None)
    # FIX: "specifiy" typo in user-facing help text.
    parser.add_argument("--num-words", type=int, help="Number of words (of decreasing average frequency) to include. Must also specify word file and index.", default=-1)
    parser.add_argument("--start-year", type=int, help="start year (inclusive)", default=START_YEAR)
    # FIX: help previously said "start year (inclusive)" — copy-paste error.
    parser.add_argument("--end-year", type=int, help="end year (inclusive)", default=END_YEAR)
    args = parser.parse_args()
    years = range(args.start_year, args.end_year + 1)
    index = ioutils.load_pickle(INDEX_FILE)
    # NOTE(review): args.word_file defaults to None but is loaded
    # unconditionally — running without --word-file fails inside load_pickle.
    word_pickle = ioutils.load_pickle(args.word_file)
    word_info = {}
    if not args.start_year in word_pickle:
        # Flat word list (not keyed by year): one shared index set for all years.
        # BUG FIX: the list was sliced unconditionally, so the default
        # num_words of -1 silently dropped the last word ([: -1]). Guard the
        # truncation as the sibling scripts in this project do.
        if args.num_words != -1:
            word_pickle = word_pickle[:args.num_words]
        year_word_info = get_word_indices(word_pickle, index)[1]
        for year in years:
            word_info[year] = year_word_info
    else:
        for year in years:
            word_list = word_pickle[year]
            # BUG FIX: same unconditional [:-1] truncation as above.
            if args.num_words != -1:
                word_list = word_list[:args.num_words]
            word_info[year] = get_word_indices(word_list, index)[1]
    run(args.out_file, args.in_dir + "/", years, word_info)
default=None) parser.add_argument( "--num-words", type=int, help= "Number of words (of decreasing average frequency) to include. Must also specifiy word file and index.", default=-1) parser.add_argument("--start-year", type=int, help="start year (inclusive)", default=START_YEAR) parser.add_argument("--end-year", type=int, help="start year (inclusive)", default=END_YEAR) args = parser.parse_args() years = range(args.start_year, args.end_year + 1) index = ioutils.load_pickle(INDEX_FILE) word_pickle = ioutils.load_pickle(args.word_file) word_info = {} if not args.start_year in word_pickle: word_pickle = word_pickle[:args.num_words] year_word_info = get_word_indices(word_pickle, index)[1] for year in years: word_info[year] = year_word_info else: for year in years: word_info[year] = get_word_indices( word_pickle[year][:args.num_words], index)[1] run(args.out_file, args.in_dir + "/", years, word_info)