def main():
    """Count term occurrences per volume and write them to a bz2 TSV.

    Reads the first four Extracted-Features volumes matching
    data/*.json.bz2, maps get_term_volume_counts over them with
    FeatureReader.multiprocessing, and writes one
    "<vol[0]>\\t<vol[1]>\\t<term>\\t<count>" row per term to
    term_volume_counts.bz2 (UTF-8 encoded).
    """
    # Get a list of json.bz2 files to read; truncate for the example.
    paths = glob.glob('data/*.json.bz2')[0:4]

    # Start a feature reader with the paths and pass the mapping function.
    # NOTE(review): results presumably yields (vol_metadata, term_counts)
    # pairs — confirm against get_term_volume_counts.
    feature_reader = FeatureReader(paths)
    results = feature_reader.multiprocessing(get_term_volume_counts)

    # Context manager guarantees the output file is closed even if a batch
    # raises mid-loop (the original leaked the handle on error).
    with bz2.BZ2File('term_volume_counts.bz2', "w") as f:
        for vol, result in results:
            # .items() works on both Python 2 and 3; the original
            # .iteritems() raises AttributeError on Python 3.
            for term, count in result.items():
                row = "{0}\t{1}\t{2}\t{3}\n".format(vol[0], vol[1], term, count)
                f.write(row.encode('UTF-8'))
def old():
    """Legacy duplicate of the term-count export routine (kept for reference).

    Reads the first four volumes matching data/*.json.bz2, maps
    get_term_volume_counts over them in parallel, and writes one
    tab-separated "<vol[0]>\\t<vol[1]>\\t<term>\\t<count>" row per term
    to term_volume_counts.bz2 (UTF-8 encoded).
    """
    # Get a list of json.bz2 files to read; truncate for the example.
    paths = glob.glob('data/*.json.bz2')[0:4]

    # Start a feature reader with the paths and pass the mapping function.
    feature_reader = FeatureReader(paths)
    results = feature_reader.multiprocessing(get_term_volume_counts)

    # Context manager closes the output file even on error; the original
    # only closed it on the happy path.
    with bz2.BZ2File('term_volume_counts.bz2', "w") as f:
        for vol, result in results:
            # .items() is valid on Python 2 and 3; .iteritems() is
            # Python-2-only and breaks on Python 3.
            for term, count in result.items():
                row = "{0}\t{1}\t{2}\t{3}\n".format(vol[0], vol[1], term, count)
                f.write(row.encode('UTF-8'))
def generic_processor(map_func, result_func, paths, outpath=None, batch_size=1000):
    """Process volumes in batches, handing each batch's results to result_func.

    Args:
        map_func: callable passed to FeatureReader.multiprocessing for
            each batch of paths.
        result_func: callable(results, csv_writer) that persists one
            batch's mapped results.
        paths: list of input file paths; consumed batch_size at a time.
        outpath: optional path for a bz2-compressed output file. When
            None, rows are written to sys.stdout.
        batch_size: number of volumes per batch.
    """
    if outpath:
        f = bz2.BZ2File(outpath, "w")
    else:
        f = sys.stdout
    # NOTE(review): on Python 3, csv.writer over a binary BZ2File will fail
    # at write time (csv emits str); this matches the original behavior —
    # confirm the intended interpreter version before changing stream modes.
    csvf = csv.writer(f)
    n = 0
    m = math.ceil(float(len(paths)) / batch_size)
    logging.info("Script started")
    try:
        # `while paths` replaces `while True` + trailing break: identical
        # batching, but an empty input list no longer spawns one empty batch.
        while paths:
            start = time.time()
            batch, paths = paths[:batch_size], paths[batch_size:]
            n += 1
            logging.info("Starting batch {0}/{1}".format(n, m))
            feature_reader = FeatureReader(batch)
            results = feature_reader.multiprocessing(map_func)
            result_func(results, csvf)
            # Fixed doubled word ("finished in in") in the log message.
            logging.info("Batch of {0} volumes finished in {1}s".format(
                len(batch), time.time() - start))
            if outpath:
                logging.debug("Output filesize is currently: {0}Gb".format(
                    os.stat(outpath).st_size / (1024**3)))
        logging.info("Script done")
    finally:
        # Close only the file we opened: the original unconditionally called
        # f.close(), which closed sys.stdout when no outpath was given. The
        # try/finally also ensures the file is closed if a batch raises.
        if outpath:
            f.close()