def main():
    parser = get_parser()
    args = parser.parse_args()

    if args.test:
        if 'test' not in args.database:
            from indra_db.tests.util import get_temp_db
            db = get_temp_db()
        else:
            db = get_db(args.database)
    elif args.database == 'primary':
        db = get_primary_db()
    else:
        db = get_db(args.database)

    readers = ['SPARSER', 'REACH', 'TRIPS', 'ISI', 'EIDOS', 'MTI']

    if args.method == 'local':
        bulk_manager = BulkLocalReadingManager(readers,
                                               buffer_days=args.buffer,
                                               n_procs=args.num_procs)
    elif args.method == 'aws':
        bulk_manager = BulkAwsReadingManager(readers,
                                             buffer_days=args.buffer,
                                             project_name=args.project_name)
    else:
        assert False, "This shouldn't be allowed."

    if args.task == 'read_all':
        bulk_manager.read_all(db)
    elif args.task == 'read_new':
        bulk_manager.read_new(db)
    return
def test_num_evidence():
    ro = get_db('primary')
    q = HasNumEvidence(tuple(range(5, 10)))
    res = q.get_statements(ro, limit=5, ev_limit=8)
    assert all(5 <= n < 10 for n in res.evidence_totals.values())
    stmts = res.statements()
    # Totals can be exactly 5, so the lower bound must be inclusive.
    assert all(5 <= len(s.evidence) <= 8 for s in stmts)
def read_db_ids_search_terms(id_search_terms, id_type):
    """Return extracted EmmaaStatements from the INDRA database given an
    ID-to-search-term dict.

    Parameters
    ----------
    id_search_terms : dict
        A dict mapping IDs to the set of search terms that produced them.
    id_type : str
        The type of the IDs used as keys (e.g., 'pmid').

    Returns
    -------
    list[:py:class:`emmaa.model.EmmaaStatement`]
        A list of EmmaaStatements extracted from the given IDs.
    """
    ids = list(id_search_terms.keys())
    date = datetime.datetime.utcnow()
    db = get_db('primary')
    id_stmts = get_raw_stmt_jsons_from_papers(ids, id_type=id_type, db=db)
    estmts = []
    for _id, stmt_jsons in id_stmts.items():
        stmts = stmts_from_json(stmt_jsons)
        id_estmts = to_emmaa_stmts(stmts, date, id_search_terms[_id],
                                   {'internal': True})
        estmts += id_estmts
    return estmts
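# A minimal usage sketch for read_db_ids_search_terms. Illustrative only:
# the PMIDs are made up, and it assumes a SearchTerm class along the lines
# of emmaa.priors.SearchTerm.
def _example_read_db_ids_search_terms():
    from emmaa.priors import SearchTerm
    # Map each paper ID to the search terms that retrieved it.
    st = SearchTerm(type='gene', name='TP53', db_refs={'HGNC': '11998'},
                    search_term='TP53')
    id_search_terms = {'12345678': [st], '23456789': [st]}
    # Extract EmmaaStatements for the raw statements from those papers.
    return read_db_ids_search_terms(id_search_terms, id_type='pmid')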
def test_evidence_filtering_trios():
    ro = get_db('primary')
    q1 = HasAgent('TP53')
    q_list = [
        ~HasOnlySource('medscan'),
        HasSources(['reach', 'sparser']),
        HasDatabases(),
        HasReadings(),
        FromMeshId('D001943')
    ]
    for q2, q3, q4 in combinations(q_list, 3):
        query = q1 | q2 | q3 | q4
        ev_filter = q2.ev_filter() & q3.ev_filter() & q4.ev_filter()
        query.get_statements(ro, limit=2, ev_limit=5,
                             evidence_filter=ev_filter)
        ev_filter = q2.ev_filter() | q3.ev_filter() | q4.ev_filter()
        query.get_statements(ro, limit=2, ev_limit=5,
                             evidence_filter=ev_filter)

    for q2, q3, q4 in permutations(q_list, 3):
        query = q1 | q2 | q3 | q4
        ev_filter = q2.ev_filter() & q3.ev_filter() | q4.ev_filter()
        query.get_statements(ro, limit=2, ev_limit=5,
                             evidence_filter=ev_filter)
def get_belief(db=None, partition=True):
    if db is None:
        db = dbu.get_db('primary')

    if partition:
        import networkx as nx

        hashes = {h for h, in db.select_all(db.PAStatements.mk_hash)}
        link_pair = [
            db.PASupportLinks.supporting_mk_hash,
            db.PASupportLinks.supported_mk_hash
        ]
        links = {tuple(link) for link in db.select_all(link_pair)}

        g = nx.Graph()
        g.add_nodes_from(hashes)
        g.add_edges_from(links)

        group = set()
        beliefs = {}
        for c in nx.connected_components(g):
            group |= c
            if len(group) >= 10000:
                sg = g.subgraph(group)
                stmts = load_mock_statements(db, hashes=group,
                                             sup_links=list(sg.edges))
                beliefs.update(calculate_belief(stmts))
                group = set()
        # Flush any remaining components that didn't reach the batch size;
        # without this, the final partial batch would silently be dropped.
        if group:
            sg = g.subgraph(group)
            stmts = load_mock_statements(db, hashes=group,
                                         sup_links=list(sg.edges))
            beliefs.update(calculate_belief(stmts))
        return beliefs
    else:
        stmts = load_mock_statements(db)
        return calculate_belief(stmts)
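# Usage sketch for get_belief, assuming the principal database is configured
# and load_mock_statements/calculate_belief are importable as used above.
def _example_get_belief():
    # Partitioned mode walks connected components of the support-link graph
    # in batches, so belief can be computed without loading every statement
    # at once.
    beliefs = get_belief(partition=True)
    # beliefs maps preassembled statement hashes to belief scores.
    return beliefs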
def __check_stmts(self, json_stmts, check_support=False, check_stmts=False):
    # `is not 0` compares identity, not value; use != for the count check.
    assert len(json_stmts) != 0, \
        'Did not get any statements.'
    stmts = stmts_from_json(json_stmts)
    for s in stmts:
        assert s.evidence, "Statement lacks evidence."
        for ev in s.evidence:
            if ev.source_api in {'reach', 'sparser', 'trips'} \
                    and ev.pmid is None:
                # Check because occasionally there is genuinely no pmid.
                from indra_db.util import get_db
                db = get_db('primary')
                tr = db.select_one(db.TextRef,
                                   db.TextRef.id == ev.text_refs['TRID'])
                assert tr.pmid is None, \
                    ('Statement from reading missing pmid:\n%s\n%s.'
                     % (s, json.dumps(ev.to_json(), indent=2)))

    # To allow for faster response-times, we currently do not include
    # support links in the response.
    if check_support:
        assert any([s.supports + s.supported_by for s in stmts]), \
            ("Some statements lack support: %s."
             % str([str(s) for s in stmts
                    if not s.supports + s.supported_by]))
        if check_stmts:
            assert all([not s1.matches(s2)
                        for s1, s2 in combinations(stmts, 2)]), \
                ("Some statements match: %s."
                 % str([(s1, s2) for s1, s2 in combinations(stmts, 2)
                        if s1.matches(s2)]))
    return
def get_unique_text_refs():
    """Get unique INDRA DB TextRefs for all identifiers in CORD19.

    Queries TextRef IDs with PMIDs, PMCIDs, and DOIs from CORD19, then
    deduplicates to obtain a unique set of TextRefs.

    Returns
    -------
    list of TextRef
        The unique TextRef entries from the database (note: full TextRef
        objects, not just their integer IDs).
    """
    pmcids = get_ids('pmcid')
    pmids = [fix_pmid(pmid) for pmid in get_ids('pubmed_id')]
    dois = [fix_doi(doi) for doi in get_ids('doi')]
    # Get unique text_refs from the DB
    db = get_db('primary')
    print("Getting TextRefs by PMCID")
    tr_pmcids = db.select_all(db.TextRef.id, db.TextRef.pmcid_in(pmcids))
    print("Getting TextRefs by PMID")
    tr_pmids = db.select_all(db.TextRef.id, db.TextRef.pmid_in(pmids))
    tr_dois = []
    for ix, doi_batch in enumerate(batch_iter(dois, 10000)):
        print("Getting TextRefs by DOI batch", ix)
        tr_doi_batch = db.select_all(
            db.TextRef.id, db.TextRef.doi_in(doi_batch, filter_ids=True))
        tr_dois.extend(tr_doi_batch)
    ids = set([res.id for res_list in (tr_dois, tr_pmcids, tr_pmids)
               for res in res_list])
    print(len(ids), "unique TextRefs in DB")
    trs = db.select_all(db.TextRef, db.TextRef.id.in_(ids))
    return trs
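# Usage sketch: collect the unique TextRefs and build an ID-keyed metadata
# dict. The tr_dicts shape shown here is an assumption based on how
# get_reach_readings and get_raw_stmts below consume it.
def _example_unique_text_refs():
    trs = get_unique_text_refs()
    # Key minimal metadata dicts by TextRef ID for the alignment steps.
    tr_dicts = {tr.id: {'TRID': tr.id} for tr in trs}
    return tr_dicts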
def get_text_content_stats(fname=None, db=None):
    if db is None:
        db = get_db('primary')
    tc_rdng_link = db.TextContent.id == db.Reading.text_content_id
    __report_stat("\nText Content statistics:", fname)
    __report_stat('------------------------', fname)
    total_content = db.count(db.TextContent)
    # Pass fname here too, so this line lands in the report file like
    # the rest.
    __report_stat("Total number of text content entries: %d" % total_content,
                  fname)
    latest_updates = (db.session.query(db.Updates.source,
                                       func.max(db.Updates.datetime))
                      .group_by(db.Updates.source)
                      .all())
    __report_stat(("Latest updates:\n    %s"
                   % '\n    '.join(['%s: %s' % (s, d)
                                    for s, d in latest_updates])),
                  fname)
    content_read = db.count(db.Reading.text_content_id)
    __report_stat("Total content read: %d" % content_read, fname)
    fulltext_content = db.count(db.TextContent,
                                db.TextContent.text_type == 'fulltext')
    __report_stat("Number of fulltext entries: %d" % fulltext_content,
                  fname)
    fulltext_read = db.count(db.TextContent,
                             db.TextContent.text_type == 'fulltext',
                             tc_rdng_link)
    __report_stat("Number of fulltext entries read: %d" % fulltext_read,
                  fname)
    _report_groups(db, db.TextContent.id, db.TextContent.source, fname)
    _report_groups(db, db.TextContent.id, db.TextContent.source, fname,
                   tc_rdng_link)
    return
def dump(self, continuing=False):
    if self.use_principal:
        ro = get_db(self.db_label)
    else:
        ro = get_ro(self.db_label)
    s3_path = self.get_s3_path()
    dump_sif(s3_path, ro=ro)
def get_db_statistics(fname=None, db=None, tables=None):
    """Get statistics on the contents of the database."""
    if db is None:
        db = get_db('primary')

    task_dict = {
        'text_ref': get_text_ref_stats,
        'text_content': get_text_content_stats,
        'readings': get_readings_stats,
        'raw_statements': get_statements_stats,
        'pa_statements': get_pa_statement_stats
    }
    task_order = ['text_ref', 'text_content', 'readings', 'raw_statements',
                  'pa_statements']

    # Get the statistics
    if tables is None:
        for task_name in task_order:
            stat_meth = task_dict[task_name]
            stat_meth(fname, db)
    else:
        table_set = set(tables)
        for task_name in [tn for tn in task_order if tn in table_set]:
            task_dict[task_name](fname, db)
    return
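# Usage sketch for get_db_statistics: report on a subset of tables and write
# to a file. The file name is hypothetical.
def _example_db_statistics():
    db = get_db('primary')
    # Only report the content and reading tables, writing to db_stats.txt.
    get_db_statistics(fname='db_stats.txt', db=db,
                      tables=['text_content', 'readings'])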
def get_statements_stats(fname=None, db=None, indra_version=None):
    if db is None:
        db = get_db('primary')
    tc_rdng_link = db.TextContent.id == db.Reading.text_content_id
    stmt_rdng_link = db.Reading.id == db.RawStatements.reader_ref

    __report_stat('\nStatement Statistics:', fname)
    __report_stat('---------------------', fname)
    if indra_version is not None:
        filters = [db.RawStatements.indra_version == indra_version]
    else:
        filters = []
    total_raw_statements = db.count(db.RawStatements, *filters)
    __report_stat("Total number of raw statements: %d"
                  % total_raw_statements, fname)
    readers = db.session.query(db.Reading.reader).distinct().all()
    sources = db.session.query(db.TextContent.source).distinct().all()
    stats = ''
    for reader, in readers:
        for src, in sources:
            cnt = db.count(db.RawStatements, stmt_rdng_link, tc_rdng_link,
                           db.Reading.reader == reader,
                           db.TextContent.source == src, *filters)
            stats += ('    Raw statements from %s reading %s: %d\n'
                      % (reader, src, cnt))
    __report_stat("Statements by reader and content source:\n%s" % stats,
                  fname)
    _report_groups(db, db.RawStatements.id, db.DBInfo.db_name, fname,
                   db.RawStatements.db_info_id == db.DBInfo.id)
    if indra_version is None:
        _report_groups(db, db.RawStatements.id,
                       db.RawStatements.indra_version, fname)
    return
def _main():
    parser = _make_parser()
    args = parser.parse_args()
    if args.debug:
        logger.setLevel(logging.DEBUG)
        from indra_db.databases import logger as db_logger
        db_logger.setLevel(logging.DEBUG)
    print("Getting %s database." % args.database)
    db = get_db(args.database)
    assert db is not None
    db.grab_session()
    s3_cache = S3Path.from_string(args.cache)
    pa = DbPreassembler(args.batch, s3_cache, stmt_type=args.stmt_type,
                        yes_all=args.yes_all)
    desc = 'Continuing' if args.continuing else 'Beginning'
    print("%s to %s preassembled corpus." % (desc, args.task))
    if args.task == 'create':
        pa.create_corpus(db, args.continuing)
    elif args.task == 'update':
        pa.supplement_corpus(db, args.continuing)
    else:
        raise IndraDBPreassemblyError('Unrecognized task: %s.' % args.task)
def test_has_hash():
    ro = get_db('primary')
    hashes = {h for h, in ro.session.query(ro.SourceMeta.mk_hash).limit(10)}
    q = HasHash(hashes)
    res = q.get_statements(ro, limit=5, ev_limit=8)
    assert set(res.results.keys()) < hashes
    assert set(res.results.keys()) == set(res.source_counts.keys())
def load_readonly_dump(db_label, ro_label, dump_file):
    principal_db = get_db(db_label)
    readonly_db = get_ro(ro_label)
    logger.info("Using dump_file = \"%s\"." % dump_file)
    logger.info("%s - Beginning upload of content (est. ~30 minutes)"
                % datetime.now())
    with ReadonlyTransferEnv(principal_db, readonly_db):
        readonly_db.load_dump(dump_file)
def test_from_papers():
    ro = get_db('primary')
    pmid = '27014235'
    q = FromPapers([('pmid', pmid)])
    res = q.get_statements(ro, limit=5)
    assert res.statements()
    assert all(any(ev.text_refs.get('PMID') == pmid for ev in s.evidence)
               for s in res.statements())
def dump(self, continuing=False):
    if self.use_principal:
        ro = get_db(self.db_label)
    else:
        ro = get_ro(self.db_label)
    query_res = ro.session.query(ro.FastRawPaLink.pa_json.distinct())
    json_list = [json.loads(js[0]) for js in query_res.all()]
    s3 = boto3.client('s3')
    s3.put_object(Body=json.dumps(json_list), **self.get_s3_path().kw())
def load_readonly(from_dump):
    """Load the readonly database with the readonly schema dump."""
    start = Start.from_date(from_dump)
    dump_file = Readonly.from_list(start.manifest).get_s3_path()
    if not dump_file:
        print(f"ERROR: No readonly dump for {start.date_stamp}")
        return
    load_readonly_dump(get_db('primary', protected=True),
                       get_ro('primary', protected=False),
                       dump_file)
def get_text_refs_for_pubmed_search_term(search_term, **kwargs):
    """Return text ref IDs for PMIDs obtained using a PubMed search."""
    print('Searching for %s' % search_term)
    pmids = pubmed_client.get_ids(search_term, **kwargs)
    print('Getting TextRefs for %d PMIDs' % len(pmids))
    db = get_db('primary')
    tr_pmids = db.select_all(db.TextRef.id, db.TextRef.pmid_in(pmids))
    trids = {res.id for res in tr_pmids}
    return trids
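# Usage sketch: the search term is illustrative. Keyword arguments pass
# through to pubmed_client.get_ids, so standard options such as retmax
# should work if that client supports them.
def _example_pubmed_search():
    # Get TextRef IDs for papers matching a PubMed query.
    trids = get_text_refs_for_pubmed_search_term('"SARS-CoV-2"', retmax=1000)
    print('%d TextRefs found' % len(trids))
    return trids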
def test_get_agents():
    ro = get_db('primary')
    query = HasAgent('TP53')
    res = query.get_agents(ro, limit=10)
    assert isinstance(res, QueryResult)
    assert len(res.results) <= 10, len(res.results)
    js = res.json()
    assert 'results' in js
    assert len(js['results']) == len(res.results)
def get_reach_readings(tr_dicts, dump_dir=None):
    db = get_db('primary')
    # Get text ref dicts with article metadata aligned between DB and CORD19
    # Get REACH readings
    reach_data = db.select_all(
        (db.Reading, db.TextRef, db.TextContent.source,
         db.TextContent.text_type),
        db.TextRef.id.in_(tr_dicts.keys()),
        db.TextContent.text_ref_id == db.TextRef.id,
        db.Reading.text_content_id == db.TextContent.id,
        db.Reading.reader == 'REACH')

    # Group readings by TextRef
    def tr_id_key_func(rd):
        return rd[1].id

    def content_priority_func(rd):
        text_type_priorities = {'fulltext': 0, 'abstract': 1, 'title': 2}
        source_priorities = {'pmc_oa': 0, 'manuscripts': 1, 'elsevier': 2,
                             'pubmed': 3}
        return (rd[1].id, text_type_priorities[rd[3]],
                source_priorities[rd[2]])

    # Sort by TextRef ID and content type/source
    reach_data.sort(key=content_priority_func)
    # Iterate over groups
    rds_filt = []
    for tr_id, tr_group in groupby(reach_data, tr_id_key_func):
        rds = list(tr_group)
        best_reading = rds[0]
        tr_dicts[tr_id]['READING_ID'] = best_reading.Reading.id
        rds_filt.append(best_reading)
    # If a dump directory is given, put all files in it
    trs_by_cord = {}
    if dump_dir:
        json_dir = join(dump_dir, 'json')
        os.mkdir(json_dir)
        for reading_result in rds_filt:
            tr = reading_result.TextRef
            reading = reading_result.Reading
            # If the reading output is empty, skip
            if not reading.bytes:
                continue
            text_ref = tr_dicts[tr.id]
            cord_uid = text_ref['CORD19_UID']
            trs_by_cord[cord_uid] = text_ref
            with open(join(json_dir, f'{cord_uid}.json'), 'wt') as f:
                content = zlib.decompress(reading.bytes,
                                          16 + zlib.MAX_WBITS)
                f.write(content.decode('utf8'))
        # Dump the metadata dictionary
        with open(join(dump_dir, 'metadata.json'), 'wt') as f:
            json.dump(trs_by_cord, f, indent=2)
    return rds_filt
def get_raw_stmts(tr_dicts, date_limit=None):
    """Return all raw stmts in the INDRA DB for a given set of TextRef IDs.

    Parameters
    ----------
    tr_dicts : dict of text ref information
        Keys are text ref IDs (ints) mapped to dictionaries of text ref
        metadata.
    date_limit : Optional[int]
        If given, only include statements from readings created within
        this many days.

    Returns
    -------
    list of stmts
        Raw INDRA Statements retrieved from the INDRA DB.
    """
    # Get raw statement IDs from the DB for the given TextRefs
    db = get_db('primary')
    # Get statements for the given text refs
    text_ref_ids = list(tr_dicts.keys())
    print(f"Distilling statements for {len(text_ref_ids)} TextRefs")
    start = time.time()
    clauses = [
        db.TextRef.id.in_(text_ref_ids),
        db.TextContent.text_ref_id == db.TextRef.id,
        db.Reading.text_content_id == db.TextContent.id,
        db.RawStatements.reading_id == db.Reading.id
    ]
    if date_limit:
        start_date = (datetime.datetime.utcnow()
                      - datetime.timedelta(days=date_limit))
        print(f'Limiting to stmts from readings in the last '
              f'{date_limit} days')
        clauses.append(db.Reading.create_date > start_date)
    db_stmts = distill_stmts(db, get_full_stmts=True, clauses=clauses)
    # Group lists of statements by the IDs of the TextRefs that they
    # come from
    stmts_by_trid = {}
    for stmt in db_stmts:
        trid = stmt.evidence[0].text_refs['TRID']
        if trid not in stmts_by_trid:
            stmts_by_trid[trid] = [stmt]
        else:
            stmts_by_trid[trid].append(stmt)
    # For every statement, update the text ref dictionary of the evidence
    # object with the aligned DB/CORD19 dictionaries obtained from the
    # function cord19_metadata_for_trs:
    stmts_flat = []
    for tr_id, stmt_list in stmts_by_trid.items():
        tr_dict = tr_dicts[tr_id]
        if tr_dict:
            for stmt in stmt_list:
                stmt.evidence[0].text_refs.update(tr_dict)
        stmts_flat += stmt_list
    elapsed = time.time() - start
    print(f"{elapsed} seconds")
    return stmts_flat
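# Usage sketch tying the helpers above together: get unique TextRefs, build
# the tr_dicts mapping (shape assumed, as noted earlier), and pull raw
# statements from recent readings.
def _example_raw_stmts():
    trs = get_unique_text_refs()
    tr_dicts = {tr.id: {'TRID': tr.id} for tr in trs}
    # Limit to statements extracted from readings in the last 7 days.
    return get_raw_stmts(tr_dicts, date_limit=7)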
def dump(self, continuing=False):
    if self.use_principal:
        ro = get_db(self.db_label)
    else:
        ro = get_ro(self.db_label)
    # select_all executes the query and returns the result rows directly,
    # so they can be pickled without a further .all() call.
    res = ro.select_all([ro.MeshMeta.mk_hash, ro.MeshMeta.mesh_num])
    s3 = boto3.client('s3')
    s3.put_object(Body=pickle.dumps(res), **self.get_s3_path().kw())
def show_list(): """List the readers and their most recent runs.""" import tabulate from indra_db.util import get_db db = get_db('primary') rows = [(rn, format_date(lu)) for rn, lu in ReadingManager.get_latest_updates(db).items()] headers = ('Reader', 'Last Updated') print(tabulate.tabulate(rows, headers))
def test_evidence_count_is_10():
    ro = get_db('primary')
    query = HasAgent('TP53') - HasOnlySource('medscan')
    res = query.get_statements(ro, limit=2, ev_limit=10)
    assert isinstance(res, StatementQueryResult)
    stmts = res.statements()
    assert len(stmts) == 2
    assert all(len(s.evidence) <= 10 for s in stmts)
    assert res.returned_evidence == 20
    assert sum(res.evidence_totals.values()) > 20
def show_list(): """List the knowledge sources and their status.""" import tabulate from indra_db.util import get_db db = get_db('primary') rows = [(M.name, M.short_name, format_date(M.get_last_update(db))) for M in KnowledgebaseManager.__subclasses__()] print( tabulate.tabulate(rows, ('Name', 'Short Name', 'Last Updated'), tablefmt='simple'))
from contextlib import contextmanager


# The docstring says this is a context manager, so the @contextmanager
# decorator (apparently dropped in transcription) is restored here.
@contextmanager
def managed_db(db_label='primary', protected=False):
    """Get an indra_db handle managed with a context manager.

    The session is cleaned up even if an error occurs while the handle
    is open.
    """
    db = get_db(db_label, protected)
    try:
        yield db
    finally:
        db.session.rollback()
        db.session.close()
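# Usage sketch for managed_db: the handle is valid inside the with-block,
# and the session is rolled back and closed on exit, even on error.
def _example_managed_db():
    with managed_db('primary', protected=True) as db:
        # Any query here runs against the managed session.
        n_refs = db.count(db.TextRef)
    return n_refs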
def test_has_sources():
    ro = get_db('primary')
    q = HasSources(['reach', 'sparser'])
    res = q.get_statements(ro, limit=5, ev_limit=8)
    assert len(res.results) == 5
    stmts = res.statements()
    res_json = res.json()
    assert 'results' in res_json
    assert len(stmts) == len(res.results)
    assert all(sc[r] > 0 for sc in res.source_counts.values()
               for r in ['reach', 'sparser'])
def main(db_name):
    db = get_db(db_name)
    data_json = {}
    data_json.update(get_all_daily_counts(db))
    print('Dumping json...')
    with open(db_name + '_stats.json', 'w') as f:
        json.dump(data_json, f, indent=2)
    return
def dump(self, continuing=False):
    principal_db = get_db(self.db_label)
    logger.info("%s - Generating readonly schema (est. a long time)"
                % datetime.now())
    principal_db.generate_readonly(allow_continue=continuing)
    logger.info("%s - Beginning dump of database (est. 1 + epsilon hours)"
                % datetime.now())
    principal_db.dump_readonly(self.get_s3_path())
    return
def test_evidence_count_is_none():
    ro = get_db('primary')
    query = HasAgent('TP53') - HasOnlySource('medscan')
    res = query.get_statements(ro, limit=2)
    assert isinstance(res, StatementQueryResult)
    stmts = res.statements()
    assert len(stmts) == 2
    ev_list = stmts[0].evidence
    assert len(ev_list) > 10
    assert all(len(s.evidence) == res.evidence_totals[s.get_hash()]
               for s in stmts)
    assert res.returned_evidence == sum(res.evidence_totals.values())