def test_stmt_mode_unread():
    "Test whether we can only create statements from unread content."
    # Prep the inputs.
    db = get_db_with_pubmed_content()
    tcids = {tcid for tcid, in db.select_all(db.TextContent.id)}

    # Test with just sparser for tolerable speeds.
    readers = get_readers('SPARSER')

    # First create some readings from a random half of the content.
    some_tcids = random.sample(list(tcids), len(tcids) // 2)
    workers0 = rdb.run_reading(readers, some_tcids, db=db, verbose=True)
    pre_stmt_hash_set = {sd.statement.get_hash(shallow=False)
                         for sd in workers0[0].statement_outputs}

    # Now only make statements for the content that was not read.
    workers = rdb.run_reading(readers, tcids, db=db, verbose=True,
                              stmt_mode='unread')
    stmt_hash_set = {sd.statement.get_hash(shallow=False)
                     for sd in workers[0].statement_outputs}
    assert stmt_hash_set.isdisjoint(pre_stmt_hash_set), \
        "There were overlapping statements."
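
# A minimal, hypothetical helper (not part of the original module) capturing
# the hash-set extraction pattern used twice in test_stmt_mode_unread above.
# It assumes only what the test itself does: each worker exposes
# `statement_outputs` whose elements carry an INDRA Statement with a
# `get_hash` method.
def _stmt_hashes(worker):
    return {sd.statement.get_hash(shallow=False)
            for sd in worker.statement_outputs}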
def _run_reading(self, db, tcids, reader_name):
    ids_per_job = 5000
    if len(tcids) > ids_per_job:
        raise ReadingUpdateError("Too many ids to run locally. Try "
                                 "running on batch (use_batch).")
    logger.info("Producing readings locally for %d new text refs."
                % len(tcids))
    base_dir = path.join(THIS_DIR, 'read_all_%s' % reader_name)
    readers = rdb.construct_readers([reader_name], base_dir=base_dir,
                                    n_proc=self.n_proc)
    rdb.run_reading(readers, tcids, db=db, batch_size=ids_per_job,
                    verbose=self.verbose)
    return
def test_multi_batch_run():
    "Test that reading works properly with multiple batches run."
    db = get_db_with_pubmed_content()
    readers = get_readers()
    tcids = {tcid for tcid, in db.select_all(db.TextContent.id)}
    rdb.run_reading(readers, tcids, batch_size=len(tcids)//2, db=db,
                    stmt_mode='none')

    # This should catch any repeated readings.
    num_readings = db.filter_query(db.Reading).count()

    # NOTE: This might need some special consideration given the new TRIPS
    # reader, which only reads titles.
    num_expected = len(readers)*len(tcids)
    assert num_readings == num_expected, \
        "Expected %d readings, only found %d." % (num_expected, num_readings)
def test_multiproc_statements():
    "Test the multiprocessing creation of statements."
    db = get_db_with_pubmed_content()
    readers = get_readers()
    tcids = {tcid for tcid, in db.select_all(db.TextContent.id)}
    workers = rdb.run_reading(readers, tcids, db=db)
    assert not any(worker.extant_readings for worker in workers)
    outputs = [rd for worker in workers for rd in worker.new_readings]
    stmts = rdb.make_statements(outputs, 2)
    assert len(stmts)
def test_produce_readings():
    "Comprehensive test of the high-level production of readings."
    # Prep the inputs.
    db = get_db_with_pubmed_content()
    tcids = {tcid for tcid, in db.select_all(db.TextContent.id)}

    # Test with just sparser for tolerable speeds.
    readers = get_readers('SPARSER')

    # Test the reading_mode='none' option (should yield nothing, because
    # there aren't any readings yet).
    workers = rdb.run_reading(readers, tcids, verbose=True, db=db,
                              reading_mode='none', stmt_mode='none')
    assert all(len(worker.new_readings) == 0 for worker in workers)
    assert all(len(worker.extant_readings) == 0 for worker in workers)

    # Test just getting a pickle file (nothing should be posted to the db).
    pkl_file = 'test_db_res.pkl'
    workers = rdb.run_reading(readers, tcids, verbose=True, db=db,
                              upload_readings=False, reading_pickle=pkl_file)
    N_new = len(workers[0].new_readings)
    N_old = len(workers[0].extant_readings)
    N_exp = len(readers)*len(tcids)
    assert N_new == N_exp, "Expected %d readings, got %d." % (N_exp, N_new)
    assert N_old == 0, "Found old readings when there should be none."
    assert path.exists(pkl_file), "Pickle file not created."
    with open(pkl_file, 'rb') as f:
        N_pkl = len(pickle.load(f))
    assert N_pkl == N_exp, \
        "Expected %d readings in pickle, got %d." % (N_exp, N_pkl)
    N_readings = db.filter_query(db.Reading).count()
    assert N_readings == 0, \
        "There shouldn't be any readings yet, but found %d." % N_readings

    # Test reading and inserting into the database.
    rdb.run_reading(readers, tcids, verbose=True, db=db)
    N_db = db.filter_query(db.Reading).count()
    assert N_db == N_exp, "Expected %d readings, got %d." % (N_exp, N_db)

    # Test reading again, without reading_mode='all' ('unread' by default).
    workers = rdb.run_reading(readers, tcids, verbose=True, db=db)
    N_old = len(workers[0].extant_readings)
    N_new = len(workers[0].new_readings)
    assert N_old == N_exp, \
        "Got %d old readings, expected %d." % (N_old, N_exp)
    assert N_new == 0, \
        "Got %d new readings, when none should have been read." % N_new
    assert all([rd.reading_id is not None
                for rd in workers[0].extant_readings])

    # Test with reading_mode='none' again.
    workers = rdb.run_reading(readers, tcids, verbose=True, db=db,
                              reading_mode='none')
    N_old = len(workers[0].extant_readings)
    assert N_old == N_exp
    assert all([rd.reading_id is not None
                for rd in workers[0].extant_readings])

    # Test reading_mode='all'.
    workers = rdb.run_reading(readers, tcids, verbose=True, db=db,
                              reading_mode='all')
    old = workers[0].extant_readings
    new = workers[0].new_readings
    assert len(new) == N_exp
    assert len(old) == 0
    assert all([rd.reading_id is not None for rd in new])
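
# Summary of the reading_mode behavior exercised above, as observed by this
# test (see run_reading itself for the authoritative definition):
#   'none'   - do not read anything; only return readings already in the db.
#   'unread' - (default) read only content that has no existing reading.
#   'all'    - re-read everything, regardless of existing readings.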
def main():
    arg_parser = get_parser()
    args = arg_parser.parse_args()

    s3 = boto3.client('s3')
    s3_log_prefix = get_s3_job_log_prefix(args.s3_base, args.job_name)
    logger.info("Using log prefix \"%s\"" % s3_log_prefix)
    id_list_key = args.s3_base + 'id_list'
    logger.info("Looking for id list on s3 at \"%s\"" % id_list_key)
    try:
        id_list_obj = s3.get_object(Bucket=bucket_name, Key=id_list_key)
    except botocore.exceptions.ClientError as e:
        # Handle a missing object gracefully.
        if e.response['Error']['Code'] == 'NoSuchKey':
            logger.info('Could not find PMID list file at %s, exiting'
                        % id_list_key)
            sys.exit(1)
        # If there was some other kind of problem, re-raise the exception.
        else:
            raise e

    # Get the content from the object.
    id_list_str = id_list_obj['Body'].read().decode('utf8').strip()
    id_str_list = id_list_str.splitlines()[args.start_index:args.end_index]
    random.shuffle(id_str_list)
    tcids = [int(line.strip()) for line in id_str_list]

    # Get the reader objects.
    if not os.path.exists(args.out_dir):
        os.makedirs(args.out_dir)
    kwargs = {'base_dir': args.out_dir, 'n_proc': args.num_cores}
    readers = construct_readers(args.readers, **kwargs)

    # Record the reader versions used in this run.
    reader_versions = {}
    for reader in readers:
        reader_versions[reader.name] = reader.get_version()
    s3.put_object(Bucket=bucket_name,
                  Key=get_s3_reader_version_loc(args.s3_base, args.job_name),
                  Body=json.dumps(reader_versions))

    # Some combinations of options don't make sense:
    forbidden_combos = [('all', 'unread'), ('none', 'unread'),
                        ('none', 'none')]
    assert (args.read_mode, args.rslt_mode) not in forbidden_combos, \
        ("The combination of reading mode %s and statement mode %s is not "
         "allowed." % (args.read_mode, args.rslt_mode))

    # Get a handle for the database.
    if args.test:
        from indra_db.tests.util import get_temp_db
        db = get_temp_db(clear=True)
    else:
        db = None

    # Read everything ========================================
    if args.batch is None:
        run_reading(readers, tcids, verbose=True, db=db,
                    reading_mode=args.read_mode, rslt_mode=args.rslt_mode)
    else:
        for tcid_batch in batch_iter(tcids, args.batch):
            run_reading(readers, tcid_batch, verbose=True, db=db,
                        reading_mode=args.read_mode,
                        rslt_mode=args.rslt_mode)

    # Preserve the sparser and trips logs.
    contents = os.listdir('.')
    logger.info("Checking for any log files to cache:\n"
                + '\n'.join(contents))
    sparser_logs = []
    trips_logs = []
    for fname in contents:
        # Check if this file is a sparser log.
        if fname.startswith('sparser') and fname.endswith('log'):
            sparser_logs.append(fname)
        elif is_trips_datestring(fname):
            for sub_fname in os.listdir(fname):
                if sub_fname.endswith('.log') or sub_fname.endswith('.err'):
                    trips_logs.append(os.path.join(fname, sub_fname))

    _dump_logs_to_s3(s3, s3_log_prefix, 'sparser', sparser_logs)
    _dump_logs_to_s3(s3, s3_log_prefix, 'trips', trips_logs)
    return
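
# For orientation only: a minimal, hypothetical sketch of the argument parser
# that main() above consumes, inferred purely from the attributes it accesses
# (s3_base, job_name, start_index, end_index, out_dir, num_cores, readers,
# read_mode, rslt_mode, batch, test). The real get_parser() in this repository
# may use different argument names, defaults, and help text; which arguments
# are positional vs. optional is also an assumption here.
from argparse import ArgumentParser


def get_parser_sketch():
    parser = ArgumentParser(
        description='Run readers on a list of text content ids pulled '
                    'from s3.'
    )
    parser.add_argument('job_name')
    parser.add_argument('s3_base')
    parser.add_argument('out_dir')
    parser.add_argument('readers', nargs='+')
    parser.add_argument('--start_index', type=int, default=0)
    parser.add_argument('--end_index', type=int, default=None)
    parser.add_argument('--num_cores', type=int, default=1)
    parser.add_argument('--read_mode', default='unread',
                        choices=['all', 'unread', 'none'])
    parser.add_argument('--rslt_mode', default='all',
                        choices=['all', 'unread', 'none'])
    parser.add_argument('--batch', type=int, default=None)
    parser.add_argument('--test', action='store_true')
    return parser


# Standard script entry point (a sketch; the original module may already
# define one outside this excerpt).
if __name__ == '__main__':
    main()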