Пример #1
0
def leveldb_writer(entries, db_path, batch_size, bench_freq):
    """Drain a queue of entry lists into a newly created LevelDB database.

    Items are pulled from ``entries`` until the ``'STOP'`` sentinel is
    received. Each item is iterated as a list of entries; every entry is
    stored under its encoded ``"id"`` with the msgpack-serialized entry as
    the value.

    Args:
        entries: Queue-like object exposing ``get()`` and ``task_done()``.
        db_path: Path of the database to create. Opening is done with
            ``error_if_exists=True``.
        batch_size: Number of entries to stage in a ``WriteBatch`` before
            flushing it to the database. A falsy value disables batching
            and every entry is written directly.
        bench_freq: Forwarded to ``Benchmark``.
    """
    log = logging.getLogger(__name__).getChild('leveldb')
    log.info("Path - %s", db_path)
    if batch_size:
        log.info("Batch Size - %s", batch_size)
    log.info("Benchmark Freq - %s", bench_freq)

    db = leveldb.LevelDB(
        db_path,
        error_if_exists=True,
        write_buffer_size=100 << 20,  # 100MB
        block_cache_size=400 << 20)  # 400MB

    # With batching enabled, puts are staged in a WriteBatch and flushed
    # periodically; otherwise they go straight to the database.
    writer = leveldb.WriteBatch() if batch_size else db

    b = Benchmark(bench_freq)
    for entry_list in iter(entries.get, 'STOP'):
        for entry in entry_list:
            # Must go through `writer` (not `db`), otherwise the batch
            # stays empty and batching has no effect.
            writer.Put(entry["id"].encode(),
                       msgpack.dumps(entry, default=encode_datetime))
            b.increment()
            if batch_size and b.count % batch_size == 0:
                db.Write(writer)
                # Start a fresh batch so already-flushed operations are
                # never replayed by a later Write (safe whether or not the
                # binding clears a batch on write).
                writer = leveldb.WriteBatch()
        entries.task_done()

    # Flush whatever is left in the final, partially filled batch.
    if batch_size:
        db.Write(writer)

    b.print_freq()
    log.info(db.GetStats())
    # Acknowledge the 'STOP' sentinel that ended the loop above.
    entries.task_done()
Пример #2
0
def leveldb_writer(entries, db_path, batch_size, bench_freq):
    """Consume entry lists from a queue and persist them to a new LevelDB.

    Reads items from ``entries`` until the ``'STOP'`` sentinel arrives.
    Every entry of every item is written with its encoded ``"id"`` as the
    key and the msgpack-serialized entry as the value.

    Args:
        entries: Queue-like object exposing ``get()`` and ``task_done()``.
        db_path: Path of the database to create (opened with
            ``error_if_exists=True``).
        batch_size: How many entries to collect in a ``WriteBatch`` before
            writing it out. Falsy means no batching: each entry is written
            directly to the database.
        bench_freq: Forwarded to ``Benchmark``.
    """
    log = logging.getLogger(__name__).getChild('leveldb')
    log.info("Path - %s", db_path)
    if batch_size:
        log.info("Batch Size - %s", batch_size)
    log.info("Benchmark Freq - %s", bench_freq)

    db = leveldb.LevelDB(db_path,
                         error_if_exists=True,
                         write_buffer_size=100 << 20,  # 100MB
                         block_cache_size=400 << 20)  # 400MB

    # Target for Put calls: a staging batch when batching, else the db itself.
    if batch_size:
        writer = leveldb.WriteBatch()
    else:
        writer = db

    b = Benchmark(bench_freq)
    for entry_list in iter(entries.get, 'STOP'):
        for entry in entry_list:
            # Put into `writer`, not `db`; writing to `db` here would leave
            # the batch empty and make every db.Write(writer) a no-op.
            writer.Put(entry["id"].encode(), msgpack.dumps(entry, default=encode_datetime))
            b.increment()
            if batch_size and b.count % batch_size == 0:
                db.Write(writer)
                # Begin a new batch so flushed operations are not written again.
                writer = leveldb.WriteBatch()
        entries.task_done()

    # Write out the trailing, partially filled batch.
    if batch_size:
        db.Write(writer)

    b.print_freq()
    log.info(db.GetStats())
    # Matches the get() that returned the 'STOP' sentinel.
    entries.task_done()
Пример #3
0
def pjk_writer(entries, output_file, bench_freq):
    """Build a Pajek graph from queued entries and write it to a file.

    Items are pulled from ``entries`` until the ``'STOP'`` sentinel is
    received. For every entry an edge is added from the entry's ``"id"``
    to each element of its ``"citations"``. Once the queue is drained the
    graph is written to ``output_file``.

    Args:
        entries: Queue-like object exposing ``get()`` and ``task_done()``.
        output_file: Path handed to ``open_file`` in write mode.
        bench_freq: Forwarded to ``Benchmark`` when truthy; otherwise the
            ``Benchmark`` default is used.
    """
    pjk = PajekFactory()
    # Honor the caller-supplied frequency instead of silently ignoring it;
    # fall back to the default for None/0 so existing callers keep working.
    b = Benchmark(bench_freq) if bench_freq else Benchmark()
    for entry_list in iter(entries.get, 'STOP'):
        for entry in entry_list:
            for citation in entry["citations"]:
                pjk.add_edge(entry["id"], citation)
            b.increment()
        entries.task_done()

    b.print_freq()
    with open_file(output_file, "w") as f:
        pjk.write(f)
    # Acknowledge the 'STOP' sentinel that ended the loop above.
    entries.task_done()
Пример #4
0
def pjk_writer(entries, output_file, bench_freq):
    """Collect citation edges from a queue and dump them as a Pajek file.

    Reads items from ``entries`` until ``'STOP'`` is seen. Each item is
    iterated as a list of entries, and each entry contributes one edge per
    element of ``entry["citations"]``, starting at ``entry["id"]``. The
    accumulated graph is then written to ``output_file``.

    Args:
        entries: Queue-like object exposing ``get()`` and ``task_done()``.
        output_file: Path handed to ``open_file`` in write mode.
        bench_freq: Forwarded to ``Benchmark`` when truthy; otherwise the
            ``Benchmark`` default is used.
    """
    pjk = PajekFactory()
    # Previously this argument was accepted but never used.
    if bench_freq:
        b = Benchmark(bench_freq)
    else:
        b = Benchmark()

    for entry_list in iter(entries.get, 'STOP'):
        for entry in entry_list:
            source_id = entry["id"]
            for citation in entry["citations"]:
                pjk.add_edge(source_id, citation)
            b.increment()
        entries.task_done()

    b.print_freq()
    with open_file(output_file, "w") as f:
        pjk.write(f)
    # Matches the get() that returned the 'STOP' sentinel.
    entries.task_done()
Пример #5
0
                        default=1000000)
    parser.add_argument('infile', nargs='+')
    arguments = parser.parse_args()

    date_after = None
    if arguments.after:
        date_after = datetime.datetime.strptime(arguments.after, "%Y")

    b = Benchmark()

    for file_name in arguments.infile:
        with open_file(file_name, "r") as f:
            p = WOSStream(f, arguments.wos_only, arguments.sample_rate,
                          arguments.must_cite, date_after)
            output_file = "%s.json" % os.path.basename(f.name).split(
                ".", maxsplit=1)[0]

            if arguments.outdir:
                output_file = os.path.join(arguments.outdir, output_file)

            if not arguments.force and os.path.isfile(output_file):
                print("%s already exists, skipping..." % output_file)
                break

            with open(output_file, "w", encoding="utf-8") as g:
                for entry in p.parse():
                    dump(entry, g, ensure_ascii=False)
                    g.write("\n")
                    b.increment()
    b.print_freq()
Пример #6
0
    parser.add_argument("-f", "--force", help="If outptut file already exists overwrite it.", action="store_true")
    parser.add_argument("-a", "--after", help="Only include nodes published on or after this year")
    parser.add_argument("-bf", "--benchmark_freq", help="How often to emit benchmark info", type=int, default=1000000)
    parser.add_argument("infile", nargs="+")
    arguments = parser.parse_args()

    date_after = None
    if arguments.after:
        date_after = datetime.datetime.strptime(arguments.after, "%Y")

    b = Benchmark()

    for file_name in arguments.infile:
        with open_file(file_name, "r") as f:
            p = WOSStream(f, arguments.wos_only, arguments.sample_rate, arguments.must_cite, date_after)
            output_file = "%s.json" % os.path.basename(f.name).split(".", maxsplit=1)[0]

            if arguments.outdir:
                output_file = os.path.join(arguments.outdir, output_file)

            if not arguments.force and os.path.isfile(output_file):
                print("%s already exists, skipping..." % output_file)
                break

            with open(output_file, "w", encoding="utf-8") as g:
                for entry in p.parse():
                    dump(entry, g, ensure_ascii=False)
                    g.write("\n")
                    b.increment()
    b.print_freq()