示例#1
0
def main():
    """Write term counts for a small sample of volumes to a bz2 TSV file.

    Takes at most the first four paths matching ``data/*.json.bz2``, runs
    ``get_term_volume_counts`` over them via ``FeatureReader.multiprocessing``
    and writes one tab-separated line per (volume, term) pair to
    ``term_volume_counts.bz2``.

    Each item produced by the reader is unpacked as ``(vol, result)``: the
    first two elements of ``vol`` and every ``(term, count)`` pair of the
    ``result`` mapping make up the four output columns.
    """
    # Get a list of json.bz2 files to read
    paths = glob.glob('data/*.json.bz2')
    paths = paths[0:4]  # Truncate list for example

    # Open file for writing results; the with-block guarantees the handle is
    # closed even if reading or writing raises part-way through.
    with bz2.BZ2File('term_volume_counts.bz2', "w") as f:
        # Start a feature reader with the paths and pass the mapping function
        feature_reader = FeatureReader(paths)
        results = feature_reader.multiprocessing(get_term_volume_counts)

        # Save the results
        for vol, result in results:
            # .items() exists on both Python 2 and 3 (iteritems is 2-only)
            for t, c in result.items():
                s = "{0}\t{1}\t{2}\t{3}\n".format(vol[0], vol[1], t, c)
                # BZ2File is a binary stream, so encode explicitly; this is
                # what Python 3 requires as well.
                f.write(s.encode('UTF-8'))
def old():
    """Dump per-volume term counts for up to four volumes as bz2-compressed TSV.

    Globs ``data/*.json.bz2``, keeps the first four matches, maps
    ``get_term_volume_counts`` over them with ``FeatureReader.multiprocessing``
    and writes the output to ``term_volume_counts.bz2``. Every output line
    holds ``vol[0]``, ``vol[1]``, a term and its count, separated by tabs,
    where ``(vol, result)`` is one item yielded by the reader.
    """
    # Get a list of json.bz2 files to read
    paths = glob.glob('data/*.json.bz2')
    paths = paths[0:4]  # Truncate list for example

    # Open file for writing results. Using a context manager closes the file
    # on every exit path, including exceptions raised while processing.
    with bz2.BZ2File('term_volume_counts.bz2', "w") as f:
        # Start a feature reader with the paths and pass the mapping function
        feature_reader = FeatureReader(paths)
        results = feature_reader.multiprocessing(get_term_volume_counts)

        # Save the results
        for vol, result in results:
            # items() works on Python 2 and 3; iteritems() is Python 2 only
            for t, c in result.items():
                s = "{0}\t{1}\t{2}\t{3}\n".format(vol[0], vol[1], t, c)
                # The bz2 stream is binary: write encoded bytes (needed on
                # Python 3, harmless for the text produced here on Python 2).
                f.write(s.encode('UTF-8'))
def generic_processor(map_func,
                      result_func,
                      paths,
                      outpath=None,
                      batch_size=1000):
    """Run ``map_func`` over ``paths`` in batches and pass results to ``result_func``.

    For each slice of at most ``batch_size`` paths a ``FeatureReader`` is
    created, ``map_func`` is run through its ``multiprocessing`` method, and
    ``result_func(results, csvf)`` is called, where ``csvf`` is a
    ``csv.writer`` bound to the output stream.

    Args:
        map_func: Mapping function handed to ``FeatureReader.multiprocessing``.
        result_func: Callable taking ``(results, csvf)``; invoked once per
            batch.
        paths: Sliceable sequence of input paths.
        outpath: Path of the bz2-compressed output file. When falsy, output
            is written to ``sys.stdout`` instead.
        batch_size: Maximum number of paths handled per batch.

    Note:
        The loop body always runs at least once, so an empty ``paths`` still
        processes a single empty batch.
    """
    if outpath:
        if sys.version_info[0] >= 3:
            # csv.writer emits str on Python 3, so the compressed stream has
            # to be opened in text mode (newline='' as the csv module asks).
            f = bz2.open(outpath, "wt", encoding="utf-8", newline="")
        else:
            f = bz2.BZ2File(outpath, "w")
    else:
        f = sys.stdout

    try:
        csvf = csv.writer(f)
        n = 0
        # int() keeps the "n/m" log message free of a trailing ".0" on
        # Python 2, where math.ceil returns a float.
        m = int(math.ceil(float(len(paths)) / batch_size))

        logging.info("Script started")

        while True:
            start = time.time()
            batch, paths = (paths[:batch_size], paths[batch_size:])
            n += 1
            logging.info("Starting batch {0}/{1}".format(n, m))
            feature_reader = FeatureReader(batch)

            results = feature_reader.multiprocessing(map_func)
            result_func(results, csvf)

            logging.info("Batch of {0} volumes finished in in {1}s".format(
                len(batch),
                time.time() - start))

            if outpath:
                # Float division: on Python 2 the integer form would always
                # report 0 until the file passed 1 GiB.
                logging.debug("Output filesize is currently: {0}Gb".format(
                    os.stat(outpath).st_size / float(1024 ** 3)))

            if len(paths) == 0:
                break

        logging.info("Script done")
    finally:
        # Only close what was opened here; sys.stdout belongs to the caller
        # and must stay usable after this function returns.
        if outpath:
            f.close()