def main():
    """CLI entry point: resolve a database handle and run bulk reading."""
    parser = get_parser()
    args = parser.parse_args()

    # Pick the database: a throwaway temp DB in test mode, otherwise the
    # primary or a named database.
    if args.test:
        if 'test' not in args.database:
            from indra_db.tests.util import get_temp_db
            db = get_temp_db()
        else:
            db = get_db(args.database)
    elif args.database == 'primary':
        db = get_primary_db()
    else:
        db = get_db(args.database)

    readers = ['SPARSER', 'REACH', 'TRIPS', 'ISI', 'EIDOS', 'MTI']

    # Build the appropriate manager for the requested execution method.
    if args.method == 'local':
        bulk_manager = BulkLocalReadingManager(readers,
                                               buffer_days=args.buffer,
                                               n_procs=args.num_procs)
    elif args.method == 'aws':
        bulk_manager = BulkAwsReadingManager(readers,
                                             buffer_days=args.buffer,
                                             project_name=args.project_name)
    else:
        # argparse choices should make this unreachable.
        assert False, "This shouldn't be allowed."

    # Dispatch the requested task.
    task_handlers = {'read_all': bulk_manager.read_all,
                     'read_new': bulk_manager.read_new}
    handler = task_handlers.get(args.task)
    if handler is not None:
        handler(db)
    return
def test_lazy_copier_unique_constraints():
    """Check that copy_lazy tolerates rows that violate unique constraints.

    A vanilla ``copy`` of overlapping manuscript ids must fail, while
    ``copy_lazy`` must succeed and leave no duplicate manuscript ids behind.
    """
    db = get_temp_db(clear=True)
    N = int(10**5)
    S = int(10**8)
    fake_mids_a = {('man-' + str(random.randint(0, S)),) for _ in range(N)}
    fake_mids_b = {('man-' + str(random.randint(0, S)),) for _ in range(N)}

    # The two samples must overlap for the test to be meaningful (with
    # N=1e5 draws from 1e8 values, ~100 collisions are expected).
    assert len(fake_mids_a | fake_mids_b) < len(fake_mids_a) + len(fake_mids_b)

    start = datetime.now()
    db.copy('text_ref', fake_mids_a, ('manuscript_id',))
    print("First load:", datetime.now() - start)

    # BUGFIX: the assert must be OUTSIDE the try. Previously `assert False`
    # lived inside the try, so the broad `except Exception` swallowed the
    # AssertionError and the test could never fail here.
    vanilla_succeeded = True
    try:
        db.copy('text_ref', fake_mids_b, ('manuscript_id',))
    except Exception:
        db._conn.rollback()
        vanilla_succeeded = False
    assert not vanilla_succeeded, \
        "Vanilla copy succeeded when it should have failed."

    start = datetime.now()
    db.copy_lazy('text_ref', fake_mids_b, ('manuscript_id',))
    print("Lazy copy:", datetime.now() - start)

    # No duplicate manuscript ids may remain after the lazy copy.
    mid_results = [mid for mid, in db.select_all(db.TextRef.manuscript_id)]
    assert len(mid_results) == len(set(mid_results)), \
        (len(mid_results), len(set(mid_results)))
    return
def test_lazy_report_copy():
    """copy_report_lazy should skip duplicates and report exactly those rows."""
    db = get_temp_db(True)
    first_batch = _do_init_copy(db)
    second_batch = {('b', '2'), ('c', '1'), ('d', '3')}
    skipped = db.copy_report_lazy('text_ref', second_batch, COLS)
    # Every unique row ends up in the table...
    _assert_set_equal(first_batch | second_batch, _ref_set(db))
    # ...and the overlap is precisely what was reported as left out.
    _assert_set_equal(first_batch & second_batch, {row[:2] for row in skipped})
def test_insert_and_query_pmid():
    "Test that we can add a text_ref and get the text_ref back."
    db = get_temp_db()
    pmid = '1234'
    text_ref_id = db.insert('text_ref', pmid=pmid)
    entries = db.select_all('text_ref', db.TextRef.pmid == pmid)
    assert_equal(len(entries), 1, "One item inserted, multiple entries found.")
    entry = entries[0]
    assert_equal(entry.pmid, pmid)
    # The id reported by insert must match the stored row's id.
    assert_equal(entry.id, text_ref_id, "Got back wrong text_ref_id.")
def get_test_db_with_pubmed_content(with_pm=False):
    "Populate the database with sample content from pubmed."
    db = get_temp_db(clear=True)
    pm = Pubmed(ftp_url=get_test_ftp_url(), local=True)
    pm.populate(db)
    # Optionally hand back the Pubmed manager along with the db.
    return (db, pm) if with_pm else db
def test_lazy_copy():
    """copy_lazy should add only the rows not already present."""
    db = get_temp_db(True)
    batch_one = {('a', '1'), ('b', '2')}
    batch_two = {('b', '2'), ('c', '1'), ('d', '3')}
    db.copy('text_ref', batch_one, COLS)
    _assert_set_equal(batch_one, _ref_set(db))
    # The lazy copy must skip ('b', '2') and add the rest.
    db.copy_lazy('text_ref', batch_two, COLS)
    _assert_set_equal(batch_one | batch_two, _ref_set(db))
def test_multible_pmc_oa_content():
    "Test to make sure repeated content is handled correctly."
    db = get_temp_db()
    pmc = PmcOA(ftp_url=get_test_ftp_url(), local=True)
    pmc.populate(db)
    count_after_first = len(db.select_all('text_content'))
    # Populating a second time must not create duplicate content rows.
    pmc.populate(db)
    count_after_second = len(db.select_all('text_content'))
    assert count_after_second == count_after_first, \
        "Duplicate text content allowed to be submitted."
    return
def test_multiple_pmids():
    "Test that pre-existing pmids are correctly handled."
    db = get_temp_db()
    med = Pubmed(ftp_url=get_test_ftp_url(), local=True)
    med.populate(db)
    count_after_first = len(db.select_all('text_ref'))
    # A repeat population must not add any new refs.
    med.populate(db)
    count_after_second = len(db.select_all('text_ref'))
    assert count_after_second == count_after_first, \
        "Duplicate pmids allowed to be submitted.."
    return
def test_uniqueness_text_ref_url():
    "Test whether the uniqueness imposed on the url of text_refs is enforced."
    db = get_temp_db()
    url = 'http://foobar.com'
    db.insert('text_ref', url=url)
    # A second insert with the same url must be rejected by the database.
    try:
        db.insert('text_ref', url=url)
    except IntegrityError:
        return  # PASS
    assert False, "Uniqueness was not enforced."
def test_vanilla_copy():
    """A plain copy of duplicate rows must raise an error.

    BUGFIX: the original used a bare ``except:``, which also catches
    ``SystemExit`` and ``KeyboardInterrupt``; narrowed to ``Exception``.
    """
    db = get_temp_db(True)
    inps = {('a', '1'), ('b', '1')}
    db.copy('text_ref', inps, COLS)
    assert inps == _ref_set(db)
    try:
        db.copy('text_ref', inps, COLS)
    except Exception:
        # Any database error here means the duplicate insert was rejected.
        return
    assert False, "Copy of duplicate data succeeded."
def test_detailed_copy_report():
    """copy_detailed_report_lazy must report skipped rows and fresh ids."""
    db = get_temp_db(True)
    batch_one = _do_init_copy(db)
    batch_two = {('b', '2'), ('c', '1'), ('d', '3')}
    prior_ids = {trid for trid, in db.select_all(db.TextRef.id)}
    existing_ids, new_ids, skipped_rows = \
        db.copy_detailed_report_lazy('text_ref', batch_two, COLS)
    _assert_set_equal(batch_one | batch_two, _ref_set(db))
    _assert_set_equal(batch_one & batch_two, {row[:2] for row in skipped_rows})
    # The ids minted by the lazy copy must differ from the pre-existing set.
    assert {trid for trid, in new_ids} != prior_ids
def test_push_copy():
    """copy_push should update rows that already exist."""
    db = get_temp_db(True)
    batch_one = _do_init_copy(db)
    batch_two = {('b', '2'), ('c', '1'), ('d', '3')}
    date_before = db.select_one(db.TextRef.create_date,
                                db.TextRef.pmid == 'b')
    db.copy_push('text_ref', batch_two, COLS)
    _assert_set_equal(batch_one | batch_two, _ref_set(db))
    # The overlapping row ('b') must have been touched by the push.
    date_after = db.select_one(db.TextRef.create_date,
                               db.TextRef.pmid == 'b')
    assert date_after != date_before, "PMID b was not updated."
def test_uniqueness_text_ref_doi_pmid():
    "Test uniqueness enforcement behavior for text_ref insertion."
    db = get_temp_db()
    pmid = '1234'
    doi = 'foo/1234'
    db.insert('text_ref', doi=doi, pmid=pmid)
    duplicate_rejected = False
    try:
        db.insert('text_ref', doi=doi, pmid=pmid)
    except IntegrityError:
        duplicate_rejected = True
    finally:
        # Always leave the shared temp database clean.
        db._clear(force=True)
    assert duplicate_rejected, "Uniqueness was not enforced."
def _check_kbm(Kb, *args, **kwargs):
    """Upload a knowledgebase via its manager class and sanity-check the DB.

    Extra args/kwargs are forwarded to the ``Kb`` constructor.
    """
    db = get_temp_db(clear=True)
    # The db_info entry must not exist before the upload...
    assert db.select_one(db.DBInfo.id, db.DBInfo.db_name == Kb.name) is None
    kbm = Kb(*args, **kwargs)
    kbm.upload(db)
    # ...and must exist afterwards.
    dbid = db.select_one(db.DBInfo.id, db.DBInfo.db_name == Kb.name)[0]
    assert dbid is not None
    db_stmts = db.select_all(db.RawStatements)
    print(len(db_stmts))
    assert len(db_stmts)
    # Every uploaded statement must point at this knowledgebase's db_info.
    assert all(s.db_info_id == dbid for s in db_stmts)
    db.session.close()
def test_db_lazy_insert():
    """copy_lazy on overlapping data must keep the originally loaded values.

    Also benchmarks the lazy copy against a manual filter-then-copy approach.
    """
    db = get_temp_db(clear=True)
    N = int(10**5)
    S = int(10**8)
    fake_pmids_a = {(i, str(random.randint(0, S))) for i in range(N)}
    fake_pmids_b = {(int(N / 2 + i), str(random.randint(0, S)))
                    for i in range(N)}

    # Lazy copy skips existing ids, so on overlap batch a's values win.
    expected = {id: pmid for id, pmid in fake_pmids_b}
    for id, pmid in fake_pmids_a:
        expected[id] = pmid

    start = datetime.now()
    db.copy('text_ref', fake_pmids_a, ('id', 'pmid'))
    print("First load:", datetime.now() - start)

    # BUGFIX: the assert must be OUTSIDE the try. Previously `assert False`
    # lived inside the try, so the broad `except Exception` swallowed the
    # AssertionError and the test could never fail here.
    vanilla_succeeded = True
    try:
        db.copy('text_ref', fake_pmids_b, ('id', 'pmid'))
    except Exception:
        db._conn.rollback()
        vanilla_succeeded = False
    assert not vanilla_succeeded, \
        "Vanilla copy succeeded when it should have failed."

    # Try adding more text refs lazily. Overlap is guaranteed.
    start = datetime.now()
    db.copy_lazy('text_ref', fake_pmids_b, ('id', 'pmid'))
    print("Lazy copy:", datetime.now() - start)

    refs = db.select_all([db.TextRef.id, db.TextRef.pmid])
    result = {id: pmid for id, pmid in refs}
    assert result.keys() == expected.keys()
    passed = True
    for id, pmid in expected.items():
        if result[id] != pmid:
            # Print the mismatches for diagnosis before failing.
            print(id, pmid)
            passed = False
    assert passed, "Result did not match expected."

    # As a benchmark, see how long this takes the "old fashioned" way.
    db._clear(force=True)
    start = datetime.now()
    db.copy('text_ref', fake_pmids_a, ('id', 'pmid'))
    print('Second load:', datetime.now() - start)

    start = datetime.now()
    current_ids = {trid for trid, in db.select_all(db.TextRef.id)}
    clean_fake_pmids_b = {t for t in fake_pmids_b if t[0] not in current_ids}
    db.copy('text_ref', clean_fake_pmids_b, ('id', 'pmid'))
    print('Old fashioned copy:', datetime.now() - start)
    return
def test_multiple_text_ref_pmc_oa():
    "Test whether a duplicate text ref in pmc oa is handled correctly."
    db = get_temp_db()
    pmc = PmcOA(ftp_url=get_test_ftp_url(), local=True)
    pmc.review_fname = 'test_review_multiple_text_ref_pmc_oa.txt'
    inp = dict.fromkeys(pmc.tr_cols)
    inp.update(pmcid='PMC5579538', doi='10.1021/acsomega.7b00205')
    pmc.upload_batch(db, [inp], [])
    refs_after_first = len(db.select_all('text_ref'))
    # Uploading the identical batch again must not add any refs.
    pmc.upload_batch(db, [inp], [])
    refs_after_second = len(db.select_all('text_ref'))
    assert refs_after_second == refs_after_first, \
        "Duplicate refs allowed to be submitted.."
    # Clean up the review file written by the upload.
    remove(pmc.review_fname)
    return
def test_detailed_copy_report_repeated_pmid_with_conflict():
    """A repeated pmid with conflicting pmcid is reported as existing/skipped."""
    db = get_temp_db(True)
    initial_rows = {('1', 'PMC1', '10.1/a'), ('2', 'PMC2', '10.2/b')}
    conflicting_rows = {('1', 'PMC3', '10.1/a')}
    cols = ('pmid', 'pmcid', 'doi')
    db.copy('text_ref', initial_rows, cols)
    existing_ids, new_ids, skipped_rows = db.copy_detailed_report_lazy(
        'text_ref', conflicting_rows, cols, ('pmid', 'id'))
    # The conflicting row matches the existing pmid '1' (text ref id 1)...
    assert existing_ids == [('1', 1)]
    # ...is skipped, and nothing new is created.
    assert len(skipped_rows) == 1
    assert not new_ids
def test_push_report_copy():
    """copy_report_push updates existing rows and reports which were pushed."""
    db = get_temp_db(True)
    batch_one = {('a', '1'), ('b', '2')}
    batch_two = {('b', '2'), ('c', '1'), ('d', '3')}
    db.copy('text_ref', batch_one, COLS)
    _assert_set_equal(batch_one, _ref_set(db))
    date_before = db.select_one(db.TextRef.create_date,
                                db.TextRef.pmid == 'b')
    updated = db.copy_report_push('text_ref', batch_two, COLS)
    _assert_set_equal(batch_one | batch_two, _ref_set(db))
    # The reported updates must be exactly the overlap between the batches.
    _assert_set_equal(batch_one & batch_two, {row[:2] for row in updated})
    date_after = db.select_one(db.TextRef.create_date,
                               db.TextRef.pmid == 'b')
    assert date_after != date_before, 'PMID b was not updated.'
def test_simple_db_insert():
    """Insert two raw statements and verify statement and agent counts."""
    db = get_temp_db()
    db._clear(force=True)
    stmts = [
        Phosphorylation(Agent('MEK', db_refs={'FPLX': 'MEK'}),
                        Agent('ERK', db_refs={'FPLX': 'ERK'}),
                        evidence=Evidence(source_api='test')),
        Complex([Agent(name, db_refs={'FPLX': name})
                 for name in ('MEK', 'ERK')],
                evidence=Evidence(source_api='test')),
    ]
    dbid = db.insert(db.DBInfo, db_name='test', source_api='tester')
    insert_db_stmts(db, stmts, dbid)
    stored_stmts = db.select_all(db.RawStatements)
    stored_agents = db.select_all(db.RawAgents)
    # Two statements, with four agents' worth of ref rows each.
    assert len(stored_stmts) == 2, len(stored_stmts)
    assert len(stored_agents) == 8, len(stored_agents)
    db.session.close()
def _get_db_no_pa_stmts():
    """Build a temp DB with text refs, content, readings and raw statements,
    but no preassembled statements.

    The order of the lists passed to the builder is significant: each entry
    in ``add_raw_reading_statements`` corresponds positionally to a reading
    created by ``add_readings`` (see the trailing comments on each list).
    """
    db = get_temp_db(clear=True)
    db_builder = DbBuilder(db)
    db_builder.add_text_refs([('12345', 'PMC54321'),
                              ('24680', 'PMC08642'),
                              ('97531', )])
    db_builder.add_text_content([['pubmed-ttl', 'pubmed-abs', 'pmc_oa'],
                                 ['pubmed-abs', 'manuscripts'],
                                 ['pubmed-ttl', 'pubmed-abs']])
    db_builder.add_readings([['REACH', 'TRIPS'],
                             ['REACH', 'SPARSER'],
                             ['REACH', 'ISI'],
                             ['SPARSER'],
                             ['REACH', 'SPARSER'],
                             ['SPARSER', 'TRIPS', 'REACH'],
                             ['REACH', 'EIDOS']])
    # One statement list per reading, in the same order the readings were
    # generated above; tuples pair a statement with custom evidence text.
    db_builder.add_raw_reading_statements([
        [Phosphorylation(mek, erk)],  # reach pubmed title
        [Phosphorylation(mek, erk, 'T', '124')],  # trips pubmed title
        [
            Phosphorylation(mek, erk),
            Inhibition(erk, ras),
            (Phosphorylation(mek, erk), 'in the body')
        ],  # reach pubmed-abs
        [
            Complex([mek, erk]),
            Complex([erk, ras]),
            (Phosphorylation(None, erk), 'In the body')
        ],  # sparser pubmed-abs
        [],  # reach pmc_oa
        [],  # ISI pmc_oa
        [Phosphorylation(map2k1, mapk1)],  # sparser pubmed-abs
        [],  # reach manuscripts
        [],  # sparser manuscripts
        [Inhibition(simvastatin_ng, raf),
         Activation(map2k1_mg, erk)],  # sparser pubmed title
        [],  # TRIPS pubmed title
        [],  # reach pubmed title
        [],  # reach pubmed abs
        [],  # eidos pubmed abs
    ])
    # One statement list per database, in the order given to add_databases.
    db_builder.add_databases(['biopax', 'tas', 'bel'])
    db_builder.add_raw_database_statements([[
        Activation(mek, raf),
        Inhibition(erk, ras),
        Phosphorylation(mek, erk)
    ], [Inhibition(simvastatin, raf)],
        [Phosphorylation(mek, erk, 'T', '124')]])
    return db
def test_detailed_copy_report_pmid_and_id():
    """Detailed lazy copy with extra return columns reports ids per pmid."""
    db = get_temp_db(True)
    batch_one = _do_init_copy(db)
    batch_two = {('b', '2'), ('c', '1'), ('d', '3')}
    prior_id_map = {
        pmid: trid
        for trid, pmid in db.select_all([db.TextRef.id, db.TextRef.pmid])
    }
    existing_ids, new_ids, skipped_rows = db.copy_detailed_report_lazy(
        'text_ref', batch_two, COLS, ('pmid', 'pmcid', 'id'))
    new_id_map = {pmid: trid for pmid, trid in new_ids}
    reported_existing_map = {pmid: trid for pmid, _, trid in existing_ids}
    # Only 'b' existed already, with text ref id 1.
    assert reported_existing_map == {'b': 1}
    _assert_set_equal(batch_one | batch_two, _ref_set(db))
    _assert_set_equal(batch_one & batch_two, {row[:2] for row in skipped_rows})
    # Newly minted pmids must differ from the pre-existing pmid set.
    assert set(prior_id_map.keys()) != set(new_id_map.keys())
def test_normal_db_reading_call():
    """Run the DB reading submitter end-to-end against a temp database.

    Stages a small amount of content in the database and an id list on S3,
    invokes the reading command in test mode as a subprocess, then cleans up
    the S3 artifacts. Requires AWS credentials and the 'bigmech' bucket.
    """
    s3 = boto3.client('s3')
    chdir(path.expanduser('~'))
    # Put some basic stuff in the test database
    N = 6
    db = get_temp_db(clear=True)
    db.copy('text_ref', [(i, 'PMID80945%d' % i) for i in range(N)],
            cols=('id', 'pmid'))
    text_content = [
        (i, i, 'pubmed', 'text', 'abstract',
         zip_string('MEK phosphorylates ERK in test %d.' % i))
        for i in range(N)
    ]
    # One extra fulltext entry attached to the last text ref.
    text_content += [
        (N, N-1, 'pmc_oa', 'text', 'fulltext',
         zip_string('MEK phosphorylates ERK. EGFR activates SHC.'))
    ]
    db.copy('text_content', text_content,
            cols=('id', 'text_ref_id', 'source', 'format', 'text_type',
                  'content'))

    # Put an id file on s3
    basename = 'local_db_test_run'
    s3_prefix = 'reading_results/%s/' % basename
    s3.put_object(Bucket='bigmech', Key=s3_prefix + 'id_list',
                  Body='\n'.join(['%d' % i
                                  for i in range(len(text_content))]))

    # Call the reading tool
    sub = DbReadingSubmitter(basename, ['sparser'])
    # job_name is unused here; only the command line is needed.
    job_name, cmd = sub._make_command(0, len(text_content))
    cmd += ['--test']
    check_call(cmd)
    sub.produce_report()

    # Remove garbage on s3
    res = s3.list_objects(Bucket='bigmech', Prefix=s3_prefix)
    for entry in res['Contents']:
        print("Removing %s..." % entry['Key'])
        s3.delete_object(Bucket='bigmech', Key=entry['Key'])
    return
def test_lazy_copier_update():
    """copy_push on overlapping data must overwrite with the new values."""
    db = get_temp_db(clear=True)
    N = int(10**5)
    S = int(10**8)
    fake_pmids_a = {(i, str(random.randint(0, S))) for i in range(N)}
    fake_pmids_b = {(int(N / 2 + i), str(random.randint(0, S)))
                    for i in range(N)}

    # Push semantics: rows from batch b override batch a on overlap.
    expected = {id: pmid for id, pmid in fake_pmids_a}
    for id, pmid in fake_pmids_b:
        expected[id] = pmid

    start = datetime.now()
    db.copy('text_ref', fake_pmids_a, ('id', 'pmid'))
    print("First load:", datetime.now() - start)

    # BUGFIX: the assert must be OUTSIDE the try. Previously `assert False`
    # lived inside the try, so the broad `except Exception` swallowed the
    # AssertionError and the test could never fail here.
    vanilla_succeeded = True
    try:
        db.copy('text_ref', fake_pmids_b, ('id', 'pmid'))
    except Exception:
        db._conn.rollback()
        vanilla_succeeded = False
    assert not vanilla_succeeded, \
        "Vanilla copy succeeded when it should have failed."

    # Try adding more text refs with a push. Overlap is guaranteed.
    start = datetime.now()
    db.copy_push('text_ref', fake_pmids_b, ('id', 'pmid'))
    # BUGFIX: the timing label said "Lazy copy" but this is a push.
    print("Push copy:", datetime.now() - start)

    refs = db.select_all([db.TextRef.id, db.TextRef.pmid])
    result = {id: pmid for id, pmid in refs}
    assert result.keys() == expected.keys()
    passed = True
    for id, pmid in expected.items():
        if result[id] != pmid:
            # Print the mismatches for diagnosis before failing.
            print(id, pmid)
            passed = False
    assert passed, "Result did not match expected."
def __init__(self):
    """Set up a temp DB pre-loaded with one batch of fake pmids, after
    demonstrating that a vanilla copy of an overlapping second batch fails.

    Leaves ``fake_pmids_a``/``fake_pmids_b`` and the merged ``expected``
    mapping (b overriding a) on the instance for later checks.
    """
    self.db = get_temp_db(clear=True)
    N = int(10**5)
    S = int(10**8)
    self.fake_pmids_a = {(i, str(random.randint(0, S))) for i in range(N)}
    self.fake_pmids_b = {(int(N / 2 + i), str(random.randint(0, S)))
                         for i in range(N)}
    self.expected = {id: pmid for id, pmid in self.fake_pmids_a}
    for id, pmid in self.fake_pmids_b:
        self.expected[id] = pmid
    start = datetime.now()
    self.db.copy('text_ref', self.fake_pmids_a, ('id', 'pmid'))
    print("First load:", datetime.now() - start)
    # BUGFIX: the assert must be OUTSIDE the try. Previously `assert False`
    # lived inside the try, so the broad `except Exception` swallowed the
    # AssertionError and the failure could never surface.
    vanilla_succeeded = True
    try:
        self.db.copy('text_ref', self.fake_pmids_b, ('id', 'pmid'))
    except Exception:
        self.db._conn.rollback()
        vanilla_succeeded = False
    assert not vanilla_succeeded, \
        "Vanilla copy succeeded when it should have failed."
def test_dump():
    """Run the XDD update against refs seeded from the bucket's bib files."""
    db = get_temp_db(clear=True)
    m = XddManager()

    # Enter "old" DOIs
    s3 = boto3.client('s3')
    res = s3.list_objects_v2(**m.bucket.kw())
    dois = set()
    for ref in res['Contents']:
        key = ref['Key']
        if 'bib' not in key:
            continue
        try:
            obj = s3.get_object(Key=key, **m.bucket.kw())
        except Exception:
            # Best-effort: skip unreadable objects and keep collecting.
            print('ack')
            continue
        bibs = json.loads(obj['Body'].read())
        dois |= {
            bib['identifier'][0]['id']
            for bib in bibs if 'identifier' in bib
        }
    # BUGFIX: random.sample requires a sequence; sampling directly from a
    # set was deprecated in Python 3.9 and removed in 3.11. Sort first so
    # the population order is well-defined.
    sample_dois = random.sample(sorted(dois), len(dois) // 2)
    new_trs = [db.TextRef.new(doi=doi) for doi in sample_dois]
    print(f"Adding {len(new_trs)} 'old' text refs.")
    db.session.add_all(new_trs)
    db.session.commit()

    # Run the update.
    m.run(db)

    # Check the result: every stage of the pipeline must have produced rows.
    assert db.select_all(db.TextRef)
    assert db.select_all(db.TextContent)
    assert db.select_all(db.Reading)
    assert db.select_all(db.RawStatements)
    assert db.select_all(db.RawAgents)
def _construct_database():
    """Build a small temp DB with raw and preassembled statements.

    The raw statement lists are positional: one list per reading created by
    ``add_readings`` (content 1: REACH; content 2: REACH; content 3: REACH
    then SPARSER), followed by one list per database. The integer indices
    passed to ``add_pa_statements`` refer to the flattened order of those
    raw statements (0-5 from readings, 6 from signor).
    """
    db = get_temp_db(clear=True)
    db_builder = DbBuilder(db)
    db_builder.add_text_refs([('12345', 'PMC54321'), ('24680', 'PMC08642')])
    db_builder.add_text_content([['pubmed-abs', 'pmc_oa'], ['pubmed-abs']])
    db_builder.add_readings([['REACH'], ['REACH'], ['REACH', 'SPARSER']])

    mek = Agent('MEK', db_refs={'FPLX': 'MEK'})
    erk = Agent('ERK', db_refs={'FPLX': 'ERK'})
    raf = Agent('RAF', db_refs={'FPLX': 'RAF'})

    # Raw statements 0-1, 2, 3, and 4-5 respectively, by reading.
    db_builder.add_raw_reading_statements(
        [[Phosphorylation(mek, erk), Complex([mek, erk])],
         [Phosphorylation(mek, erk)], [Activation(mek, erk)],
         [Complex([mek, erk]), Complex([raf, erk])]])
    db_builder.add_databases(['signor'])
    # Raw statement 6.
    db_builder.add_raw_database_statements([[Complex([raf, erk])]])
    # Each pa statement lists the raw statement indices it subsumes.
    db_builder.add_pa_statements([(Phosphorylation(mek, erk), [0, 2]),
                                  (Complex([mek, erk]), [1, 4]),
                                  (Activation(mek, erk), [3]),
                                  (Complex([raf, erk]), [5, 6])])
    return db
def __init__(self):
    """Create a clean temp database and the standard test input data."""
    self.test_db = get_temp_db(clear=True)
    self.test_data = _make_test_db_input()
def test_db_presence():
    """A freshly cleared temp database must accept a simple insert."""
    db = get_temp_db(clear=True)
    db.insert(db.TextRef, pmid='12345')
def test_dump_build():
    """Test the dump pipeline.

    Method
    ------
    CREATE CONTEXT:
    - Create a local principal database with a small amount of content.
      Aim for representation of stmt motifs and sources.
    - Create a local readonly database.
    - Create a fake bucket (moto)

    RUN THE DUMP

    CHECK THE RESULTS
    """
    assert config.is_db_testing()

    # Create the dump locale.
    s3 = boto3.client('s3')
    dump_head = config.get_s3_dump()
    s3.create_bucket(Bucket=dump_head.bucket)
    assert dump_head.bucket == S3_DATA_LOC['bucket']

    # Create the principal database.
    db = get_temp_db(clear=True)

    db.copy('text_ref', [                    # trid
        ('1', 1, 'PMC1', 1),                 # 1
        ('2', 2, 'PMC2', 2),                 # 2
        ('3', 3, None, None),                # 3
        (None, None, 'PMC4', 4)              # 4
    ], ('pmid', 'pmid_num', 'pmcid', 'pmcid_num'))

    db.copy('mesh_ref_annotations', [
        (1, 11, False),
        (1, 13, False),
        (1, 12, True),
        (2, 12, True),
        (3, 13, False),
        (3, 33, True)
    ], ('pmid_num', 'mesh_num', 'is_concept'))

    db.copy('text_content', [                    # tcid
        (1, 'pubmed', 'txt', 'abstract'),        # 1
        (1, 'pmc', 'xml', 'fulltext'),           # 2
        (2, 'pubmed', 'txt', 'title'),           # 3
        (3, 'pubmed', 'txt', 'abstract'),        # 4
        (3, 'pmc', 'xml', 'fulltext'),           # 5
        (4, 'pmc', 'xml', 'fulltext')            # 6
    ], ('text_ref_id', 'source', 'format', 'text_type'))

    # NOTE(review): 'emtpy' looks like a typo for 'empty', but it is a
    # runtime value stored in the reading table — confirm what consumers
    # expect before changing it.
    db.copy('reading', [(tcid, rdr, 1, reader_versions[rdr][-1], 'emtpy')
                        for tcid, rdr in [
        #   1             2             3
        (1, 'reach'), (1, 'eidos'), (1, 'isi'),
        #   4
        (2, 'reach'),
        #   5             6             7
        (3, 'reach'), (3, 'eidos'), (3, 'trips'),
        #   8
        (4, 'reach'),
        #   9
        (5, 'reach'),
        #   10
        (6, 'reach')
    ]], ('text_content_id', 'reader', 'batch_id', 'reader_version',
         'format'))

    db.copy('db_info', [
        ('signor', 'signor', 'Signor'),           # 1
        ('pc', 'biopax', 'Pathway Commons'),      # 2
        ('medscan', 'medscan', 'MedScan')         # 3
    ], ('db_name', 'source_api', 'db_full_name'))

    # Raw statements keyed by reading id (under 'reading') or db_info id
    # (under 'databases').
    raw_stmts = {
        'reading': {
            2: [
                Inhibition(
                    Agent('Fever',
                          db_refs={'TEXT': 'fever', 'MESH': 'D005334'}),
                    Agent('Cough',
                          db_refs={'TEXT': 'cough', 'MESH': 'D003371'}),
                    evidence=Evidence(text="We found fever inhibits cough.")
                )
            ],
            4: [
                Phosphorylation(
                    Agent('MEK', db_refs={'FPLX': 'MEK', 'TEXT': 'mek'}),
                    # NOTE(review): ERK grounded to FPLX 'MEK' here — the
                    # HasAgent('MEK') count asserted below depends on this;
                    # confirm it is intentional before "fixing" it.
                    Agent('ERK', db_refs={'FPLX': 'MEK', 'TEXT': 'erk'}),
                    evidence=Evidence(text="mek phosphorylates erk, so say I.")
                ),
                Activation(
                    Agent('MAP2K1', db_refs={'HGNC': '6840', 'TEXT': 'MEK1'}),
                    Agent('MAPK1', db_refs={'HGNC': '6871', 'TEXT': 'ERK1'}),
                    evidence=Evidence(text="MEK1 activates ERK1, or os I'm told.")
                ),
                Activation(
                    Agent('ERK', db_refs={'FPLX': 'ERK', 'TEXT': 'ERK'}),
                    Agent('JNK', db_refs={'FPLX': 'JNK', 'TEXT': 'JNK'}),
                    evidence=Evidence(text="ERK activates JNK, maybe.")
                ),
                Complex([
                    Agent('MEK', db_refs={'FPLX': 'MEK', 'TEXT': 'MAP2K'}),
                    Agent('ERK', db_refs={'FPLX': 'ERK', 'TEXT': 'MAPK'}),
                    Agent('RAF', db_refs={'FPLX': 'RAF', 'TEXT': 'RAF'})
                ], evidence=Evidence(text="MAP2K, MAPK, and RAF form a complex."))
            ],
            7: [
                Activation(
                    Agent('ERK', db_refs={'FPLX': 'ERK', 'TEXT': 'ERK'}),
                    Agent('JNK', db_refs={'FPLX': 'JNK', 'TEXT': 'JNK'}),
                    evidence=Evidence(text='ERK activates JNK, maybe.')
                )
            ],
            8: [
                Complex([
                    Agent('MEK', db_refs={'FPLX': 'MEK', 'TEXT': 'mek'}),
                    Agent('ERK', db_refs={'FPLX': 'ERK', 'TEXT': 'erk'})
                ], evidence=Evidence(text="...in the mek-erk complex."))
            ],
        },
        'databases': {
            2: [
                Conversion(
                    Agent('FRK', db_refs={'HGNC': '3955'}),
                    [Agent('ATP', db_refs={'MESH': 'D000255'})],
                    [Agent('hydron', db_refs={'CHEBI': 'CHEBI:15378'})]
                )
            ],
            3: [
                Phosphorylation(
                    Agent('MEK', db_refs={'FPLX': 'MEK', 'TEXT': 'MEK'}),
                    Agent('ERK', db_refs={'FPLX': 'ERK', 'TEXT': 'ERK'}),
                    evidence=Evidence(text="...MEK phosphorylates ERK medscan.")
                )
            ]
        }
    }
    simple_insert_stmts(db, raw_stmts)

    # Run preassembly.
    prass.create_corpus(db)

    # Do the dump proceedure.
    ro = get_temp_ro(clear=True)
    dump(db, ro)

    # Check that the s3 dump exists.
    all_dumps = dm.list_dumps()
    assert len(all_dumps) == 1

    # Check to make sure all the dump files are present.
    dump_path = all_dumps[0]
    file_list = dump_path.list_objects(s3)
    assert dm.Start.from_list(file_list)
    assert dm.Readonly.from_list(file_list)
    assert dm.Belief.from_list(file_list)
    assert dm.Sif.from_list(file_list)
    assert dm.StatementHashMeshId.from_list(file_list)
    assert dm.FullPaStmts.from_list(file_list)
    assert dm.End.from_list(file_list)

    # Check what tables are active in the readonly database.
    active_tables = ro.get_active_tables()
    for tbl in ro.get_tables():
        if ro.tables[tbl]._temp:
            # If it was temp, it should be gone.
            assert tbl not in active_tables
        else:
            # Otherwise, it should be there.
            assert tbl in active_tables

    # Check that the principal db has no more ro schema.
    assert 'readonly' not in db.get_schemas()

    # Check contents of the readonly database.
    assert len(ro.select_all(ro.FastRawPaLink)) \
        == len(db.select_all(db.RawUniqueLinks))

    # Check that a query basically works.
    from indra_db.client.readonly import HasAgent
    res = HasAgent('MEK').get_statements(ro)
    assert len(res.statements()) == 2, len(res.statements())

    # Check that belief is represented in the table.
    bdict = {h: b for h, b in ro.select_all([ro.SourceMeta.mk_hash,
                                             ro.SourceMeta.belief])}
    assert all(1 >= b > 0 for b in bdict.values())

    # Check to make sure lambda was diverted correctly.
    call_records = config.get_test_call_records()
    assert len(call_records) == 2
    assert all(rec.func_name == '_set_lambda_env' for rec in call_records)
    assert all(isinstance(rec.args[1], dict) for rec in call_records)
    assert 'INDRAROOVERRIDE' in call_records[0].args[1]
    assert call_records[0].args[1]['INDRAROOVERRIDE'] == str(db.url)
    assert not call_records[1].args[1]
def _build_test_set():
    """Construct a temp DB whose readonly *_meta tables are populated with a
    generated cross-product of statement types, agents, sources and mesh ids.

    Returns the database handle with ``readonly.source_meta``,
    ``readonly.mesh_meta``, ``readonly.name_meta``, ``readonly.text_meta``
    and ``readonly.other_meta`` filled.
    """
    agents = [{
        'NAME': 'ERK',
        'FPLX': 'ERK',
        'TEXT': 'MAPK'
    }, {
        'NAME': 'TP53',
        'HGNC': '11998'
    }, {
        'NAME': 'MEK',
        'FPLX': 'MEK'
    }, {
        'NAME': 'Vemurafenib',
        'CHEBI': 'CHEBI:63637'
    }]
    stypes = ['Phosphorylation', 'Activation', 'Inhibition', 'Complex']
    # (source name, 'rd' for reading / 'db' for database) pairs.
    sources = [('medscan', 'rd'), ('reach', 'rd'), ('pc11', 'db'),
               ('signor', 'db')]
    mesh_ids = ['D000225', 'D002352', 'D015536']

    # All mesh-id groups of size 0, 1 and 2, in shuffled order.
    mesh_combos = []
    for num_mesh in range(0, 3):
        if num_mesh == 1:
            mesh_groups = [[mid] for mid in mesh_ids]
        else:
            mesh_groups = combinations(mesh_ids, num_mesh)
        mesh_combos.extend(list(mesh_groups))
    random.shuffle(mesh_combos)

    # Pre-generate source profiles: every non-empty subset of sources with
    # random evidence counts; reading-backed subsets get a mesh group.
    source_data = []
    for num_srcs in range(1, 5):
        if num_srcs == 1:
            src_iter = [[src] for src in sources]
        else:
            src_iter = combinations(sources, num_srcs)
        for src_list in src_iter:
            only_src = None if len(src_list) > 1 else src_list[0][0]
            has_rd = any(t == 'rd' for _, t in src_list)
            # NOTE(review): this rebinds (shadows) the outer `mesh_ids`
            # list after the first reading-backed subset; harmless here
            # because mesh_combos was already built, but fragile.
            if has_rd:
                mesh_ids = mesh_combos[len(source_data) % len(mesh_combos)]
            else:
                mesh_ids = []
            source_data.append({
                'sources': {src: random.randint(1, 50)
                            for src, _ in src_list},
                'has_rd': any(t == 'rd' for _, t in src_list),
                'has_db': any(t == 'db' for _, t in src_list),
                'only_src': only_src,
                'mesh_ids': mesh_ids
            })
    random.shuffle(source_data)

    # Statement specs: (type, agent refs, activity, is_active). Two-agent
    # types get every ordered agent pair; ActiveForm gets single agents.
    stmts = [
        tuple(tpl) + (None, None)
        for tpl in product(stypes, permutations(agents, 2))
    ]
    stmts += [('ActiveForm', (ref, ), activity, is_active)
              for activity, is_active, ref in product(
                  ['transcription', 'activity'], [True, False], agents)]

    complex_pairs = []

    name_meta_rows = []
    name_meta_cols = ('mk_hash', 'ag_num', 'db_id', 'role_num', 'type_num',
                      'ev_count', 'activity', 'is_active', 'agent_count')
    text_meta_rows = []
    text_meta_cols = ('mk_hash', 'ag_num', 'db_id', 'role_num', 'type_num',
                      'ev_count', 'activity', 'is_active', 'agent_count')
    other_meta_rows = []
    other_meta_cols = ('mk_hash', 'ag_num', 'db_name', 'db_id', 'role_num',
                       'type_num', 'ev_count', 'activity', 'is_active',
                       'agent_count')
    source_meta_rows = []
    source_meta_cols = ('mk_hash', 'reach', 'medscan', 'pc11', 'signor',
                        'ev_count', 'type_num', 'activity', 'is_active',
                        'agent_count', 'num_srcs', 'src_json', 'only_src',
                        'has_rd', 'has_db')
    mesh_meta_rows = []
    mesh_meta_cols = ('mk_hash', 'ev_count', 'mesh_num', 'type_num',
                      'activity', 'is_active', 'agent_count')

    for stype, refs, activity, is_active in stmts:
        # Extract agents, and make a Statement.
        StmtClass = get_statement_by_name(stype)
        if stype == 'ActiveForm':
            ag = make_agent_from_ref(refs[0])
            stmt = StmtClass(ag, activity=activity, is_active=is_active)
        else:
            ag1 = make_agent_from_ref(refs[0])
            ag2 = make_agent_from_ref(refs[1])
            if stype == 'Complex':
                # Complexes are unordered: skip pairs already seen.
                if {ag1.name, ag2.name} in complex_pairs:
                    continue
                stmt = StmtClass([ag1, ag2])
                complex_pairs.append({ag1.name, ag2.name})
            else:
                stmt = StmtClass(ag1, ag2)

        # Connect with a source.
        source_dict = source_data[len(source_meta_rows) % len(source_data)]
        ev_count = sum(source_dict['sources'].values())
        src_row = (stmt.get_hash(), )
        for src_name in ['reach', 'medscan', 'pc11', 'signor']:
            src_row += (source_dict['sources'].get(src_name), )
        src_row += (ev_count, ro_type_map.get_int(stype), activity,
                    is_active, len(refs), len(source_dict['sources']),
                    json.dumps(source_dict['sources']),
                    source_dict['only_src'], source_dict['has_rd'],
                    source_dict['has_db'])
        source_meta_rows.append(src_row)

        # Add mesh rows
        for mesh_id in source_dict['mesh_ids']:
            mesh_meta_rows.append(
                (stmt.get_hash(), ev_count, int(mesh_id[1:]),
                 ro_type_map.get_int(stype), activity, is_active,
                 len(refs)))

        # Generate agent rows.
        ref_rows, _, _ = extract_agent_data(stmt, stmt.get_hash())
        for row in ref_rows:
            row = row[:4] + (ro_role_map.get_int(row[4]),
                             ro_type_map.get_int(stype), ev_count, activity,
                             is_active, len(refs))
            # NAME and TEXT rows drop the db_name column (index 2); all
            # other namespaces keep it and go to other_meta.
            if row[2] == 'NAME':
                row = row[:2] + row[3:]
                name_meta_rows.append(row)
            elif row[2] == 'TEXT':
                row = row[:2] + row[3:]
                text_meta_rows.append(row)
            else:
                other_meta_rows.append(row)

    db = get_temp_db(clear=True)
    src_meta_cols = [{'name': col} for col, _ in sources]
    db.SourceMeta.load_cols(db.engine, src_meta_cols)
    for tbl in [
        db.SourceMeta, db.MeshMeta, db.NameMeta, db.TextMeta, db.OtherMeta
    ]:
        tbl.__table__.create(db.engine)
    db.copy('readonly.source_meta', source_meta_rows, source_meta_cols)
    db.copy('readonly.mesh_meta', mesh_meta_rows, mesh_meta_cols)
    db.copy('readonly.name_meta', name_meta_rows, name_meta_cols)
    db.copy('readonly.text_meta', text_meta_rows, text_meta_cols)
    db.copy('readonly.other_meta', other_meta_rows, other_meta_cols)
    return db