Пример #1
0
def main(proc_num, lock, out_pref, tmp_out_pref, in_dir, years, word_list, word_indices, displacement_base, thresh):
    """Worker loop: claim one unprocessed year at a time and write its stats.

    A year counts as claimed once a file named
    ``<basename of tmp_out_pref><year>-jvols.pkl`` exists in
    ``in_dir + "/volstats/"``. Claiming is done under ``lock`` by creating an
    empty placeholder file at ``tmp_out_pref + str(year) + "-jvols.pkl"``,
    which is later overwritten with the real results.

    Args:
        proc_num: Identifier of this worker; only used in progress messages.
        lock: Lock shared by all workers; guards the claim step.
        out_pref: Unused here; kept for interface compatibility.
        tmp_out_pref: Path prefix of the per-year output files.
        in_dir: Directory holding the ``<year>.bin`` matrices and the
            ``volstats`` sub-directory.
        years: Iterable of years to process.
        word_list: Passed through to ``get_jaccard_deltas``.
        word_indices: Passed through to ``get_jaccard_deltas``.
        displacement_base: Matrix every year is compared against to produce
            the ``-jdisps.pkl`` output.
        thresh: Threshold passed to
            ``matstore.retrieve_mat_as_binary_coo_thresh``.
    """
    # Bare file-name prefix used to recognise already-claimed years.
    tmp_name_pref = tmp_out_pref.split("/")[-1]
    while True:
        year = None
        lock.acquire()
        try:
            # The listing cannot gain new placeholders while we hold the
            # lock, so one snapshot per claim attempt is enough.
            existing = set(os.listdir(in_dir + "/volstats/"))
            for candidate in years:
                if tmp_name_pref + str(candidate) + "-jvols.pkl" in existing:
                    continue
                year = candidate
                print("%s year %s" % (proc_num, year))
                # Empty placeholder marks the year as taken.
                with open(tmp_out_pref + str(year) + "-jvols.pkl", "w") as fp:
                    fp.write("")
                break
        finally:
            # Always release, otherwise a failure here would block every
            # other worker forever.
            lock.release()
        if year is None:
            print("%s Finished" % (proc_num,))
            break

        print("%s Loading matrices..." % (proc_num,))
        # Use the `thresh` argument rather than the script-level `args`
        # global, which does not exist when main is called from elsewhere.
        base = matstore.retrieve_mat_as_binary_coo_thresh(in_dir + "/" + str(year - 1) + ".bin", thresh, min_size=MIN_SIZE)
        base = base.tocsr()
        delta = matstore.retrieve_mat_as_binary_coo_thresh(in_dir + "/" + str(year) + ".bin", thresh, min_size=MIN_SIZE)
        delta = delta.tocsr()
        print("%s Getting deltas..." % (proc_num,))
        # Year-over-year comparison, then comparison against the fixed base.
        year_vols = get_jaccard_deltas(base, delta, word_list, word_indices)
        year_disp = get_jaccard_deltas(displacement_base, delta, word_list, word_indices)
        print("%s Writing results..." % (proc_num,))
        ioutils.write_pickle(year_vols, tmp_out_pref + str(year) + "-jvols.pkl")
        ioutils.write_pickle(year_disp, tmp_out_pref + str(year) + "-jdisps.pkl")
Пример #2
0
def main(proc_num, lock, out_pref, tmp_out_pref, in_dir, years, word_list,
         word_indices, displacement_base, thresh):
    """Worker loop: claim one unprocessed year at a time and write its stats.

    A year counts as claimed once a file named
    ``<basename of tmp_out_pref><year>-jvols.pkl`` exists in
    ``in_dir + "/volstats/"``. Claiming is done under ``lock`` by creating an
    empty placeholder file at ``tmp_out_pref + str(year) + "-jvols.pkl"``,
    which is later overwritten with the real results.

    Args:
        proc_num: Identifier of this worker; only used in progress messages.
        lock: Lock shared by all workers; guards the claim step.
        out_pref: Unused here; kept for interface compatibility.
        tmp_out_pref: Path prefix of the per-year output files.
        in_dir: Directory holding the ``<year>.bin`` matrices and the
            ``volstats`` sub-directory.
        years: Iterable of years to process.
        word_list: Passed through to ``get_jaccard_deltas``.
        word_indices: Passed through to ``get_jaccard_deltas``.
        displacement_base: Matrix every year is compared against to produce
            the ``-jdisps.pkl`` output.
        thresh: Threshold passed to
            ``matstore.retrieve_mat_as_binary_coo_thresh``.
    """
    # Bare file-name prefix used to recognise already-claimed years.
    tmp_name_pref = tmp_out_pref.split("/")[-1]
    while True:
        year = None
        lock.acquire()
        try:
            # The listing cannot gain new placeholders while we hold the
            # lock, so one snapshot per claim attempt is enough.
            existing = set(os.listdir(in_dir + "/volstats/"))
            for candidate in years:
                if tmp_name_pref + str(candidate) + "-jvols.pkl" in existing:
                    continue
                year = candidate
                print("%s year %s" % (proc_num, year))
                # Empty placeholder marks the year as taken.
                with open(tmp_out_pref + str(year) + "-jvols.pkl",
                          "w") as fp:
                    fp.write("")
                break
        finally:
            # Always release, otherwise a failure here would block every
            # other worker forever.
            lock.release()
        if year is None:
            print("%s Finished" % (proc_num,))
            break

        print("%s Loading matrices..." % (proc_num,))
        # Use the `thresh` argument rather than the script-level `args`
        # global, which does not exist when main is called from elsewhere.
        base = matstore.retrieve_mat_as_binary_coo_thresh(
            in_dir + "/" + str(year - 1) + ".bin",
            thresh,
            min_size=MIN_SIZE)
        base = base.tocsr()
        delta = matstore.retrieve_mat_as_binary_coo_thresh(
            in_dir + "/" + str(year) + ".bin",
            thresh,
            min_size=MIN_SIZE)
        delta = delta.tocsr()
        print("%s Getting deltas..." % (proc_num,))
        # Year-over-year comparison, then comparison against the fixed base.
        year_vols = get_jaccard_deltas(base, delta, word_list, word_indices)
        year_disp = get_jaccard_deltas(displacement_base, delta, word_list,
                                       word_indices)
        print("%s Writing results..." % (proc_num,))
        ioutils.write_pickle(year_vols,
                             tmp_out_pref + str(year) + "-jvols.pkl")
        ioutils.write_pickle(year_disp,
                             tmp_out_pref + str(year) + "-jdisps.pkl")
Пример #3
0
        p.start()
    for p in procs:
        p.join()
    print "Merging"
    merge(out_pref, tmp_out_pref, years, word_list)

if __name__ == '__main__':
    # Script entry point: parse the command line, load the word list and
    # index, then hand the per-year work to run_parallel.
    parser = argparse.ArgumentParser(
        description="Merges years of raw 5gram data.")
    parser.add_argument(
        "dir", help="path to network data (also where output goes)")
    parser.add_argument("word_file", help="path to sorted word file")
    parser.add_argument("index_file", help="path to word index file")
    parser.add_argument(
        "num_procs", type=int, help="number of processes to spawn")
    parser.add_argument(
        "--num-words",
        type=int,
        help="Number of words (of decreasing average frequency) to include",
        default=-1)
    parser.add_argument(
        "--start-year",
        type=int,
        help="start year (inclusive)",
        default=START_YEAR)
    parser.add_argument(
        "--end-year",
        type=int,
        help="end year (inclusive)",
        default=END_YEAR)
    parser.add_argument(
        "--thresh",
        type=float,
        help="relevance threshold",
        default=THRESH)
    args = parser.parse_args()

    # -1 is the sentinel for "use the whole word list".
    limit_vocab = args.num_words != -1
    # Processing starts one year after start_year.
    years = range(args.start_year + 1, args.end_year + 1)

    word_list = ioutils.load_pickle(args.word_file)
    index = ioutils.load_pickle(args.index_file)
    if limit_vocab:
        word_list = word_list[:args.num_words]
    ioutils.mkdir(args.dir + "/volstats")
    word_list, word_indices = get_word_indices(word_list, index)

    # Output prefix: /volstats/<word file name up to first dot>-<thresh>[-top<N>]
    word_file_stem = args.word_file.split("/")[-1].split(".")[0]
    outpref = "/volstats/" + word_file_stem + "-" + str(args.thresh)
    if limit_vocab:
        outpref += "-top" + str(args.num_words)

    # The end year's matrix is the fixed base passed to every worker.
    displacement_base = matstore.retrieve_mat_as_binary_coo_thresh(
        args.dir + "/" + str(args.end_year) + ".bin",
        args.thresh,
        min_size=MIN_SIZE)
    displacement_base = displacement_base.tocsr()

    run_parallel(
        args.num_procs,
        args.dir + outpref,
        args.dir + outpref + "-tmp",
        args.dir + "/",
        years,
        word_list,
        word_indices,
        displacement_base,
        args.thresh)
Пример #4
0
                        help="start year (inclusive)",
                        default=START_YEAR)
    parser.add_argument("--end-year",
                        type=int,
                        help="end year (inclusive)",
                        default=END_YEAR)
    parser.add_argument("--thresh",
                        type=float,
                        help="relevance threshold",
                        default=THRESH)
    args = parser.parse_args()
    years = range(args.start_year + 1, args.end_year + 1)
    word_list = ioutils.load_pickle(args.word_file)
    index = ioutils.load_pickle(args.index_file)
    if args.num_words != -1:
        word_list = word_list[:args.num_words]
    ioutils.mkdir(args.dir + "/volstats")
    word_list, word_indices = get_word_indices(word_list, index)
    outpref = "/volstats/" + args.word_file.split("/")[-1].split(
        ".")[0] + "-" + str(args.thresh)
    if args.num_words != -1:
        outpref += "-top" + str(args.num_words)
    displacement_base = matstore.retrieve_mat_as_binary_coo_thresh(
        args.dir + "/" + str(args.end_year) + ".bin",
        args.thresh,
        min_size=MIN_SIZE)
    displacement_base = displacement_base.tocsr()
    run_parallel(args.num_procs, args.dir + outpref,
                 args.dir + outpref + "-tmp", args.dir + "/", years, word_list,
                 word_indices, displacement_base, args.thresh)