def leveldb_writer(entries, db_path, batch_size, bench_freq):
    """Consume lists of entry dicts from a queue and persist them to LevelDB."""
    log = logging.getLogger(__name__).getChild('leveldb')
    log.info("Path - %s" % db_path)
    if batch_size:
        log.info("Batch Size - %s" % batch_size)
    log.info("Benchmark Freq - %s" % bench_freq)

    db = leveldb.LevelDB(db_path,
                         error_if_exists=True,
                         write_buffer_size=100 << 20,  # 100MB
                         block_cache_size=400 << 20)   # 400MB

    # When batching, puts are staged in a WriteBatch and flushed every
    # batch_size records; otherwise they go straight to the database.
    if batch_size:
        writer = leveldb.WriteBatch()
    else:
        writer = db

    b = Benchmark(bench_freq)
    for entry_list in iter(entries.get, 'STOP'):
        for entry in entry_list:
            writer.Put(entry["id"].encode(),
                       msgpack.dumps(entry, default=encode_datetime))
            b.increment()
            if batch_size and b.count % batch_size == 0:
                db.Write(writer)
                writer = leveldb.WriteBatch()  # start a fresh batch after flushing
        entries.task_done()

    # Flush any records left in the final, partially filled batch.
    if batch_size:
        db.Write(writer)

    b.print_freq()
    log.info(db.GetStats())
    entries.task_done()
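
# encode_datetime, used above as the msgpack `default` hook, is defined elsewhere
# in the project and not shown here. A minimal sketch of such a hook, assuming
# datetimes are stored as ISO-8601 strings (the real helper may differ):
#
#   def encode_datetime(obj):
#       # msgpack calls `default` for objects it cannot pack natively
#       if isinstance(obj, datetime.datetime):
#           return obj.isoformat()
#       raise TypeError("Cannot serialize %r" % obj)
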
def pjk_writer(entries, output_file, bench_freq):
    """Consume lists of entry dicts from a queue and write a Pajek citation graph."""
    pjk = PajekFactory()
    b = Benchmark(bench_freq)  # pass the requested benchmark frequency through
    for entry_list in iter(entries.get, 'STOP'):
        for entry in entry_list:
            for citation in entry["citations"]:
                pjk.add_edge(entry["id"], citation)
            b.increment()
        entries.task_done()

    b.print_freq()
    with open_file(output_file, "w") as f:
        pjk.write(f)
    entries.task_done()
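
# The writer functions above consume a joinable queue of entry lists and stop at
# the 'STOP' sentinel, calling task_done() once per queue item (including the
# sentinel). A hypothetical driver, with made-up entry data, might look like:
#
#   from multiprocessing import JoinableQueue, Process
#
#   entries = JoinableQueue()
#   worker = Process(target=pjk_writer, args=(entries, "citations.net", 1000000))
#   worker.start()
#   entries.put([{"id": "A", "citations": ["B", "C"]}])
#   entries.put('STOP')
#   entries.join()   # returns once every put() has been matched by task_done()
#   worker.join()
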
parser.add_argument("-f", "--force", help="If outptut file already exists overwrite it.", action="store_true") parser.add_argument("-a", "--after", help="Only include nodes published on or after this year") parser.add_argument("-bf", "--benchmark_freq", help="How often to emit benchmark info", type=int, default=1000000) parser.add_argument("infile", nargs="+") arguments = parser.parse_args() date_after = None if arguments.after: date_after = datetime.datetime.strptime(arguments.after, "%Y") b = Benchmark() for file_name in arguments.infile: with open_file(file_name, "r") as f: p = WOSStream(f, arguments.wos_only, arguments.sample_rate, arguments.must_cite, date_after) output_file = "%s.json" % os.path.basename(f.name).split(".", maxsplit=1)[0] if arguments.outdir: output_file = os.path.join(arguments.outdir, output_file) if not arguments.force and os.path.isfile(output_file): print("%s already exists, skipping..." % output_file) break with open(output_file, "w", encoding="utf-8") as g: for entry in p.parse(): dump(entry, g, ensure_ascii=False) g.write("\n") b.increment() b.print_freq()