Пример #1
0
def produce_statements(output_list,
                       enrich=True,
                       no_upload=False,
                       pickle_file=None,
                       n_proc=1,
                       db=None):
    """Convert the reader output into a list of StatementData instances."""
    # Fall back to the primary database when the caller did not supply one.
    if db is None:
        db = get_primary_db()

    # Optionally enrich the raw reading data before extracting statements.
    if enrich:
        _enrich_reading_data(output_list, db=db)

    stmt_data_list = make_statements(output_list, n_proc)

    if not no_upload:
        try:
            upload_statements(stmt_data_list, db=db)
        except Exception as e:
            logger.exception(e)
            # Upload failed: make sure the statements are at least dumped to
            # a timestamped pickle so the work is not lost.
            if pickle_file is None:
                timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
                pickle_file = "failure_stmt_dump_%s.pkl" % timestamp
            logger.error(
                "Could not upload statements. Results pickled in: %s." %
                pickle_file)

    # Dump the statements if a pickle path was given or created above.
    if pickle_file is not None:
        with open(pickle_file, 'wb') as f:
            pickle.dump([sd.statement for sd in stmt_data_list], f)
        print("Statements pickled in %s." % pickle_file)

    return stmt_data_list
Пример #2
0
def test_multiproc_statements():
    """Test the multiprocessing creation of statements."""
    # Build a database pre-loaded with pubmed content, then read all of it.
    db = get_db_with_pubmed_content()
    readers = get_readers()
    content_ids = [tc.id for tc in db.select_all(db.TextContent)]
    id_dict = {'tcid': content_ids}
    outputs = rdb.make_db_readings(id_dict, readers, db=db)
    # Two worker processes should still produce a non-empty statement list.
    stmts = make_statements(outputs, 2)
    assert len(stmts)
Пример #3
0
def main():
    """Read the listed input files and pickle the readings and statements."""
    # Load arguments.
    parser = make_parser()
    args = parser.parse_args()
    if args.debug and not args.quiet:
        logger.setLevel(logging.DEBUG)

    # Load the input file.
    with open(args.input_file, 'r') as f:
        input_lines = f.readlines()
    logger.info("Found %d files." % len(input_lines))
    for ftype in ['nxml', 'txt']:
        # BUGFIX: readlines() keeps the trailing newline, so endswith(ftype)
        # on the raw line was (almost) never true; strip before testing.
        # Also renamed the loop variable so it no longer shadows the file
        # handle name 'f' used above.
        logger.debug('%d are %s' % (
            len([line for line in input_lines
                 if line.strip().endswith(ftype)]), ftype
        ))

    # Select only a sample of the lines, if sample is chosen.
    if args.n_samp is not None:
        input_lines = random.sample(input_lines, args.n_samp)

    # If a range is specified, only use that range ("start:end", end excluded).
    if args.range_str is not None:
        start_idx, end_idx = [int(n) for n in args.range_str.split(':')]
        input_lines = input_lines[start_idx:end_idx]

    # Create a single base directory
    base_dir = _get_dir('run_%s' % ('_and_'.join(args.readers)))

    # Set the verbosity. The quiet argument overrides the verbose argument.
    verbose = args.verbose and not args.quiet

    # Get the readers objects (only those the user selected by name).
    readers = [reader_class(base_dir=base_dir, n_proc=args.n_proc)
               for reader_class in get_readers()
               if reader_class.name.lower() in args.readers]

    # Read the files.
    # BUGFIX: the keyword was misspelled 'verboes', so the verbosity flag
    # was either silently swallowed by **kwargs or raised a TypeError.
    outputs = read_files(input_lines, readers, verbose=verbose)
    reading_out_path = args.name + '_readings.pkl'
    with open(reading_out_path, 'wb') as f:
        pickle.dump([output.make_tuple() for output in outputs], f)
    print("Reading outputs stored in %s." % reading_out_path)

    # Pass the requested process count through, consistent with the reading
    # step above (previously the default was silently used).
    stmt_data_list = make_statements(outputs, args.n_proc)
    stmts_pkl_path = args.name + '_stmts.pkl'
    with open(stmts_pkl_path, 'wb') as f:
        pickle.dump([sd.statement for sd in stmt_data_list], f)
    print("Statements pickled in %s." % stmts_pkl_path)
Пример #4
0
    # If a range is specified, only use that range.
    if args.range_str is not None:
        # "start:end" parsed into two ints; the slice excludes end_idx.
        start_idx, end_idx = [int(n) for n in args.range_str.split(':')]
        input_lines = input_lines[start_idx:end_idx]

    # Create a single base directory shared by all readers in this run.
    base_dir = _get_dir('run_%s' % ('_and_'.join(args.readers)))

    # Set the verbosity. The quiet argument overrides the verbose argument.
    verbose = args.verbose and not args.quiet

    # Get the readers objects: instantiate only the reader classes whose
    # (lowercased) name the user selected on the command line.
    readers = [
        reader_class(base_dir=base_dir, n_proc=args.n_proc)
        for reader_class in get_readers()
        if reader_class.name.lower() in args.readers
    ]

    # Read the files.
    # NOTE(review): 'verboes' looks like a typo for 'verbose' -- if
    # read_files accepts **kwargs the verbosity flag is silently ignored,
    # otherwise this raises TypeError. Confirm against read_files' signature.
    outputs = read_files(input_lines, readers, verboes=verbose)
    reading_out_path = args.name + '_readings.pkl'
    with open(reading_out_path, 'wb') as f:
        # Persist the readings as plain tuples for later reloading.
        pickle.dump([output.make_tuple() for output in outputs], f)
    print("Reading outputs stored in %s." % reading_out_path)

    # NOTE(review): unlike the reading step, no process count is passed here;
    # presumably make_statements defaults to a single process -- confirm.
    stmt_data_list = make_statements(outputs)
    stmts_pkl_path = args.name + '_stmts.pkl'
    with open(stmts_pkl_path, 'wb') as f:
        pickle.dump([sd.statement for sd in stmt_data_list], f)
        # NOTE(review): this print sits inside the 'with' block, unlike the
        # reading dump above -- harmless, but inconsistent.
        print("Statements pickled in %s." % stmts_pkl_path)