def upload_readings(output_list, db=None):
    """Put the reading output on the database.

    Readings that match an entry already present in the `reading` table (as
    judged by `ReadingData.matches`) are left out of the copy, so that the
    upload does not run into uniqueness conflicts.

    Parameters
    ----------
    output_list : list [ReadingData]
        The reading results to upload. Each entry must provide a `tcid`
        attribute and the `matches` and `make_tuple` methods.
    db : indra_db.DatabaseManager instance
        Optional, default None, in which case the primary database provided
        by `get_primary_db` is used.
    """
    if db is None:
        db = get_primary_db()

    # Fetch every reading that shares a text content id with the new output,
    # and index them by that id once, rather than rescanning the full result
    # list for each new reading.
    existing_readings = db.select_all(
        db.Reading,
        db.Reading.text_content_id.in_([rd.tcid for rd in output_list]))
    existing_by_tcid = {}
    for reading in existing_readings:
        existing_by_tcid.setdefault(reading.text_content_id, []).append(
            reading)

    # Create the list of records to be copied, skipping exact matches.
    upload_list = []
    for reading_data in output_list:
        candidates = existing_by_tcid.get(reading_data.tcid, ())
        if any(reading_data.matches(r) for r in candidates):
            continue
        # If there were no conflicts, we can add this to the copy list.
        upload_list.append(reading_data.make_tuple())

    # Copy into the database.
    logger.info("Adding %d/%d reading entries to the database."
                % (len(upload_list), len(output_list)))
    db.copy('reading', upload_list, ReadingData.get_cols())
    return
def produce_statements(output_list, enrich=True, no_upload=False,
                       pickle_file=None, n_proc=1, db=None):
    """Convert the reader output into a list of StatementData instances.

    Parameters
    ----------
    output_list : list
        The reading outputs handed to `make_statements`.
    enrich : bool
        Default True. If True, `_enrich_reading_data` is run on the outputs
        before statements are made.
    no_upload : bool
        Default False. If True, the statements are not uploaded.
    pickle_file : str or None
        Default None. If given, the statements are pickled to this path. If
        the upload fails and no path was given, a timestamped failure dump
        path is generated and used instead.
    n_proc : int
        Default 1. Passed through to `make_statements`.
    db : indra_db.DatabaseManager instance
        Default None, in which case the primary database is used.

    Returns
    -------
    stmt_data_list : list
        The result of `make_statements`.
    """
    if db is None:
        db = get_primary_db()

    if enrich:
        _enrich_reading_data(output_list, db=db)

    stmt_data_list = make_statements(output_list, n_proc)

    should_upload = not no_upload
    if should_upload:
        try:
            upload_statements(stmt_data_list, db=db)
        except Exception as e:
            # A failed upload must not lose the results: log the problem and
            # make sure there is a pickle path to dump them to below.
            logger.exception(e)
            if pickle_file is None:
                stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
                pickle_file = "failure_stmt_dump_%s.pkl" % stamp
            logger.error(
                "Could not upload statements. Results pickled in: %s."
                % pickle_file)

    if pickle_file is None:
        return stmt_data_list

    with open(pickle_file, 'wb') as f:
        pickle.dump([sd.statement for sd in stmt_data_list], f)
    print("Statements pickled in %s." % pickle_file)
    return stmt_data_list
def get_priority_tcids(id_dict, priorities, always_add=None, db=None):
    """For all ids, besides tcids, choose best content available.

    This function will convert all ids to tcids.

    Parameters
    ----------
    id_dict : dict {<id_type>: [<id value>, ...]}
        The ids to resolve. Any values under the 'tcid' key are included in
        the result as they are. The dict itself is not modified.
    priorities : list [str]
        Text content sources, from most to least preferred. For each text
        ref, the content with the best-ranked source is chosen. A source in
        this list is always preferred over a source that is not in it.
    always_add : list [str] or None
        Sources whose content is always included, whether or not it is the
        best content for its text ref.
    db : indra_db.DatabaseManager instance
        Default None, in which case the primary database is used.

    Returns
    -------
    tcids : set
        The selected text content ids.
    """
    if db is None:
        db = get_primary_db()

    # Rank each source by its first position in `priorities`.
    rank = {}
    for position, source_name in enumerate(priorities):
        rank.setdefault(source_name, position)

    def is_better(new, old):
        # An unlisted source never displaces anything. A listed source
        # displaces an unlisted one, so the choice does not depend on the
        # arbitrary order in which the rows of a set are visited.
        if new not in rank:
            return False
        if old not in rank:
            return True
        return rank[new] < rank[old]

    logger.debug("Getting content prioritized by %s." % str(priorities))

    # Work on a shallow copy so the caller's dict keeps its 'tcid' entry.
    other_ids = dict(id_dict)
    tcids = set(other_ids.pop('tcid', []))
    clauses = get_clauses(other_ids, db)

    # Collect (text ref id, text content id, source) for every matching id.
    tcid_source = set()
    for clause in clauses:
        q = (db.session.query(db.TextRef.id, db.TextContent.id,
                              db.TextContent.source)
             .filter(db.TextContent.text_ref_id == db.TextRef.id, clause))
        id_set = set(q.all())
        logger.debug("Got %d more ids." % len(id_set))
        tcid_source |= id_set
    logger.debug("Got %d id's total." % len(tcid_source))

    # Keep the best content per text ref, plus anything from `always_add`.
    tr_best = {}
    for trid, tcid, source in tcid_source:
        if trid not in tr_best or is_better(source, tr_best[trid][0]):
            tr_best[trid] = (source, tcid)
        if always_add is not None and source in always_add:
            tcids.add(tcid)

    tcids |= {tcid for _, tcid in tr_best.values()}
    return tcids
def get_unique_text_refs():
    """Get the INDRA DB TextRefs for all identifiers in CORD19.

    Queries TextRef IDs by the PMCIDs, PMIDs, and DOIs from CORD19 (DOIs in
    batches of 10000), deduplicates the IDs, and then fetches the TextRef
    entries for the unique IDs.

    Returns
    -------
    trs
        The result of `db.select_all` on `db.TextRef`, restricted to the
        unique TextRef IDs that were found.
    """
    pmcids = get_ids('pmcid')
    pmids = [fix_pmid(pmid) for pmid in get_ids('pubmed_id')]
    dois = [fix_doi(doi) for doi in get_ids('doi')]

    # Look the identifiers up in the DB, one id type at a time.
    db = get_primary_db()
    text_ref_id = db.TextRef.id

    print("Getting TextRefs by PMCID")
    tr_pmcids = db.select_all(text_ref_id, db.TextRef.pmcid_in(pmcids))

    print("Getting TextRefs by PMID")
    tr_pmids = db.select_all(text_ref_id, db.TextRef.pmid_in(pmids))

    tr_dois = []
    for ix, doi_batch in enumerate(batch_iter(dois, 10000)):
        print("Getting Text Refs by DOI batch", ix)
        tr_dois.extend(db.select_all(
            text_ref_id, db.TextRef.doi_in(doi_batch, filter_ids=True)))

    # Merge the three result lists into one set of unique TextRef IDs.
    ids = set()
    for res_list in (tr_dois, tr_pmcids, tr_pmids):
        ids.update(res.id for res in res_list)
    print(len(ids), "unique TextRefs in DB")

    return db.select_all(db.TextRef, db.TextRef.id.in_(ids))
def __init__(self, tcids, reader, verbose=True, reading_mode='unread',
             rslt_mode='all', batch_size=1000, db=None, n_proc=1):
    """Store the reading configuration and reset the reader.

    If no `db` is given, the primary database is used. The reader's `reset`
    method is called as part of construction.
    """
    # Plain configuration.
    self.tcids = tcids
    self.reader = reader
    self.reader.reset()
    self.verbose = verbose
    self.reading_mode = reading_mode
    self.rslt_mode = rslt_mode
    self.batch_size = batch_size
    self.n_proc = n_proc

    # Database handle, and the clause joining text content to readings.
    self._db = get_primary_db() if db is None else db
    self._tc_rd_link = \
        self._db.TextContent.id == self._db.Reading.text_content_id

    summary = ("Instantiating reading handler for reader %s with version "
               "%s using reading mode %s and statement mode %s for %d "
               "tcids." % (reader.name, reader.get_version(), reading_mode,
                           rslt_mode, len(tcids)))
    logger.info(summary)

    # To be filled.
    self.extant_readings = []
    self.new_readings = []
    self.result_outputs = []
    self.starts = {}
    self.stops = {}
    return
def get_text_refs_for_pubmed_search_term(search_term, **kwargs):
    """Return text ref IDs for PMIDs obtained using a PubMed search.

    Parameters
    ----------
    search_term : str
        The term handed to `pubmed_client.get_ids`.
    **kwargs
        Passed through to `pubmed_client.get_ids`.

    Returns
    -------
    set
        The `id` of every TextRef in the primary database whose PMID was
        returned by the search.
    """
    print('Searching for %s' % search_term)
    pmids = pubmed_client.get_ids(search_term, **kwargs)

    print('Getting TextRefs for %d PMIDs' % len(pmids))
    db = get_primary_db()
    rows = db.select_all(db.TextRef.id, db.TextRef.pmid_in(pmids))
    return {row.id for row in rows}
def get_reach_readings(tr_dicts, dump_dir=None):
    """Get the best REACH reading for each of the given TextRefs.

    For every TextRef with REACH readings, the reading of the most preferred
    content is chosen (fulltext over abstract over title, then by source),
    and its reading ID is stored in `tr_dicts` under 'READING_ID'.

    Parameters
    ----------
    tr_dicts : dict
        Keyed by TextRef ID, with a dict of text ref metadata as each value.
        Modified in place: 'READING_ID' is set on the entries for which a
        reading is found. When dumping, each such entry must also have a
        'CORD19_UID' key.
    dump_dir : str or None
        Default None. If given, the decompressed content of each non-empty
        reading is written to `<dump_dir>/json/<CORD19_UID>.json`, and the
        metadata of the dumped text refs to `<dump_dir>/metadata.json`.

    Returns
    -------
    rds_filt : list
        One result row (Reading, TextRef, source, text type) per TextRef.
    """
    db = get_primary_db()

    # Get REACH readings
    reach_data = db.select_all((db.Reading, db.TextRef,
                                db.TextContent.source,
                                db.TextContent.text_type),
                               db.TextRef.id.in_(tr_dicts.keys()),
                               db.TextContent.text_ref_id == db.TextRef.id,
                               db.Reading.text_content_id == db.TextContent.id,
                               db.Reading.reader == 'REACH')

    text_type_priorities = {'fulltext': 0, 'abstract': 1, 'title': 2}
    source_priorities = {'pmc_oa': 0, 'manuscripts': 1, 'elsevier': 2,
                         'pubmed': 3}

    # Group readings by TextRef
    def tr_id_key_func(rd):
        return rd[1].id

    def content_priority_func(rd):
        # Text types and sources that are not listed sort after all listed
        # ones, instead of aborting the whole sort with a KeyError.
        return (rd[1].id,
                text_type_priorities.get(rd[3], len(text_type_priorities)),
                source_priorities.get(rd[2], len(source_priorities)))

    # Sort by TextRef ID and content type/source, so that the first row of
    # each group is the preferred reading for that TextRef.
    reach_data.sort(key=content_priority_func)

    rds_filt = []
    for tr_id, tr_group in groupby(reach_data, tr_id_key_func):
        best_reading = next(tr_group)
        tr_dicts[tr_id]['READING_ID'] = best_reading.Reading.id
        rds_filt.append(best_reading)

    # If a dump directory is given, put all files in it
    if dump_dir:
        trs_by_cord = {}
        json_dir = join(dump_dir, 'json')
        # Tolerate an existing directory and create missing parents.
        os.makedirs(json_dir, exist_ok=True)
        for reading_result in rds_filt:
            tr = reading_result.TextRef
            reading = reading_result.Reading
            # If the reading output is empty, skip
            if not reading.bytes:
                continue
            text_ref = tr_dicts[tr.id]
            cord_uid = text_ref['CORD19_UID']
            trs_by_cord[cord_uid] = text_ref
            content = zlib.decompress(reading.bytes, 16 + zlib.MAX_WBITS)
            # The content is UTF-8, so write it as such regardless of the
            # locale's default encoding.
            with open(join(json_dir, f'{cord_uid}.json'), 'wt',
                      encoding='utf-8') as f:
                f.write(content.decode('utf8'))
        # Dump the metadata dictionary
        with open(join(dump_dir, 'metadata.json'), 'wt',
                  encoding='utf-8') as f:
            json.dump(trs_by_cord, f, indent=2)
    return rds_filt
def get_raw_stmts(tr_dicts, date_limit=None):
    """Return all raw stmts in INDRA DB for a given set of TextRef IDs.

    Parameters
    ----------
    tr_dicts : dict of text ref information
        Keys are text ref IDs (ints) mapped to dictionaries of text ref
        metadata.
    date_limit : Optional[int]
        If given, only statements from readings created within this many
        days are included.

    Returns
    -------
    list of stmts
        Raw INDRA Statements retrieved from the INDRA DB. Where the text ref
        has a non-empty metadata dict, that dict has been merged into the
        `text_refs` of each statement's first evidence.
    """
    db = get_primary_db()

    text_ref_ids = list(tr_dicts.keys())
    print(f"Distilling statements for {len(text_ref_ids)} TextRefs")
    start = time.time()

    # Join text refs through content and readings down to raw statements.
    clauses = [
        db.TextRef.id.in_(text_ref_ids),
        db.TextContent.text_ref_id == db.TextRef.id,
        db.Reading.text_content_id == db.TextContent.id,
        db.RawStatements.reading_id == db.Reading.id
    ]
    if date_limit:
        window = datetime.timedelta(days=date_limit)
        start_date = datetime.datetime.utcnow() - window
        print(f'Limiting to stmts from readings in the last {date_limit} days')
        clauses.append(db.Reading.create_date > start_date)
    db_stmts = distill_stmts(db, get_full_stmts=True, clauses=clauses)

    # Group the statements by the ID of the TextRef that they come from.
    stmts_by_trid = {}
    for stmt in db_stmts:
        trid = stmt.evidence[0].text_refs['TRID']
        stmts_by_trid.setdefault(trid, []).append(stmt)

    # Merge the given metadata into the text refs of each statement's first
    # evidence, then flatten the groups into a single list.
    stmts_flat = []
    for tr_id, stmt_list in stmts_by_trid.items():
        tr_dict = tr_dicts[tr_id]
        if tr_dict:
            for stmt in stmt_list:
                stmt.evidence[0].text_refs.update(tr_dict)
        stmts_flat.extend(stmt_list)

    elapsed = time.time() - start
    print(f"{elapsed} seconds")
    return stmts_flat
def get_raw_stmt_jsons_from_papers(id_list, id_type='pmid', db=None):
    """Get raw statement jsons for a given list of papers.

    Parameters
    ----------
    id_list : list
        A list of ints or strs that are ids of papers of type `id_type`.
    id_type : str
        Default is 'pmid'. The type of ids given in id_list, e.g. 'pmid',
        'pmcid', 'trid'.
    db : :py:class:`DatabaseManager`
        Optionally specify a database manager that attaches to something
        besides the primary database, for example a local database instance.

    Returns
    -------
    result_dict : dict
        A dictionary keyed by id (of `id_type`) with a list of raw statement
        json objects as each value. Ids for which no statements are found
        will not be included in the dict.
    """
    if db is None:
        db = get_primary_db()

    # Select each matching text ref together with its raw statement jsons.
    id_attr = _get_id_col(db.TextRef, id_type)
    rows = db.select_all([db.TextRef, db.RawStatements.json],
                         id_attr.in_(id_list),
                         *db.link(db.RawStatements, db.TextRef))

    ref_id_types = ['trid', 'pmid', 'pmcid', 'doi']
    result_dict = defaultdict(list)
    for tr, rjson_bytes in rows:
        id_val = _get_id_col(tr, id_type)

        rjson = json.loads(rjson_bytes.decode('utf-8'))
        ev = rjson['evidence'][0]

        # The pmid stored in the json is replaced by the text ref's pmid.
        ev['pmid'] = tr.pmid

        # Fill the text_refs of the first evidence from the text ref.
        text_refs = ev.setdefault('text_refs', {})
        for idt in ref_id_types:
            text_refs[idt.upper()] = _get_id_col(tr, idt)

        result_dict[id_val].append(rjson)

    return result_dict
def get_id_dict(id_str_list):
    """Parse the list of id strings into a dict of id lists keyed by type.

    The id types are the column names of the TextRef table other than 'id',
    plus 'trid' and 'tcid'. Values of type 'trid' and 'tcid' are converted
    to int; all other values are kept as returned by `_convert_id_entry`.
    """
    int_id_types = ('trid', 'tcid')

    id_types = get_primary_db().TextRef.__table__.columns.keys()
    id_types.remove('id')
    id_types.extend(int_id_types)

    id_dict = dict((id_type, []) for id_type in id_types)
    for id_entry in id_str_list:
        id_type, id_val = _convert_id_entry(id_entry, id_types)
        value = int(id_val) if id_type in int_id_types else id_val
        id_dict[id_type].append(value)
    return id_dict
def test_statements_by_hashes_large_query(self):
    # TODO: Figure out a way to query hashes that isn't excruciatingly slow.
    # Sample 1000 evidence count entries and map each hash to its count.
    db = get_primary_db()
    sampled = db.select_sample_from_table(1000, db.EvidenceCounts)
    hash_cnt_dict = {}
    for ev_cts in sampled:
        hash_cnt_dict[ev_cts.mk_hash] = ev_cts.ev_count
    hashes = list(hash_cnt_dict.keys())

    # Post the hashes and time the query. The double-underscore helpers are
    # private to the enclosing test class.
    resp, dt, size = self.__time_query('post', 'statements/from_hashes',
                                       hashes=hashes)
    resp_dict = json.loads(resp.data.decode('utf-8'))

    # The statements must pass the checks, and arrive within 20 seconds.
    self.__check_stmts(resp_dict['statements'].values())
    self.__check_time(dt, time_goal=20)
    return
def upload_statements(stmt_data_list, db=None):
    """Upload the statements, and then their agents, to the database.

    Parameters
    ----------
    stmt_data_list : list [StatementData]
        The statements to upload. Each entry must provide `make_tuple`,
        `reading_id`, and a `statement` with a `uuid`.
    db : indra_db.DatabaseManager instance
        Default None, in which case the primary database is used.
    """
    if db is None:
        db = get_primary_db()

    logger.info("Uploading %d statements to the database." %
                len(stmt_data_list))
    stmt_tuples = [s.make_tuple() for s in stmt_data_list]
    db.copy('raw_statements', stmt_tuples, StatementData.get_cols())

    logger.info("Uploading agents to the database.")
    reading_id_set = {sd.reading_id for sd in stmt_data_list}
    if not reading_id_set:
        return

    def _iter_uploaded_stmts():
        # Lazily look up the freshly copied row of each statement by uuid.
        for s in stmt_data_list:
            yield db.select_one(
                db.RawStatements,
                db.RawStatements.uuid.like(s.statement.uuid))

    insert_agents(db, 'raw', _iter_uploaded_stmts(), verbose=True)
    return
def _enrich_reading_data(reading_data_iter, db=None):
    """Get db ids for all ReadingData objects that correspond to a db ref.

    Note that the objects are modified IN PLACE, so nothing is returned, and
    if a copy of the objects is passed as an argument, this function will have
    no effect. This does nothing if the readings are not in the database.

    Parameters
    ----------
    reading_data_iter : iterable of ReadingData
        The reading data to enrich. Any iterable is accepted, including a
        one-shot iterator; each object must provide `tcid`, `reading_id`,
        and `matches`.
    db : indra_db.DatabaseManager instance
        Default None, in which case the primary database is used.
    """
    logger.debug("Enriching the reading data with database refs.")
    if db is None:
        db = get_primary_db()

    # The data is traversed twice (once to build the query, once to apply
    # the matches), so materialize it in case it is a one-shot iterator.
    reading_data_list = list(reading_data_iter)

    tcids = [rd.tcid for rd in reading_data_list if rd.reading_id is None]
    if not tcids:
        # Nothing lacks a reading id, so there is nothing to look up.
        return

    possible_matches = db.select_all(
        'reading',
        db.Reading.text_content_id.in_(tcids))
    for rdata in reading_data_list:
        for reading in possible_matches:
            if rdata.matches(reading):
                rdata.reading_id = reading.id
                break
    return
def get_db_readings(id_dict, readers, force_fulltext=False, batch_size=1000,
                    db=None):
    """Get the existing readings for the given ids from the database.

    Parameters
    ----------
    id_dict : dict {<id_type>: [<id value>, ...]}
        The ids whose readings are wanted.
    readers : list
        The readers whose readings are wanted.
    force_fulltext : bool
        Default False. Passed through to `get_readings_query`.
    batch_size : int
        Default 1000. The `yield_per` size used when iterating the query.
    db : indra_db.DatabaseManager instance
        Default None, in which case the primary database is used.

    Returns
    -------
    prev_readings : list [ReadingData]
        One entry per reading returned by the query; empty if no query could
        be formed.
    """
    if db is None:
        db = get_primary_db()

    # Note that this is meant to be run BEFORE any new readings are posted,
    # otherwise there would be duplicates.
    query = get_readings_query(id_dict, readers, db=db,
                               force_fulltext=force_fulltext)
    if query is None:
        return []

    return [ReadingData.from_db_reading(r)
            for r in query.yield_per(batch_size)]
def get_pmids_for_mesh_terms(mesh_list):
    """Return the numeric PMIDs annotated with any of the given MeSH terms.

    Each MeSH ID is converted to a number by dropping its first character
    before it is matched against `MeshRefAnnotations.mesh_num`.
    """
    num_mesh_list = []
    for mid in mesh_list:
        num_mesh_list.append(int(mid[1:]))

    db = get_primary_db()
    annotations = db.MeshRefAnnotations
    rows = db.select_all(annotations.pmid_num,
                         annotations.mesh_num.in_(num_mesh_list))
    return [row[0] for row in rows]
def get_direct_raw_stmt_jsons_from_agents(agents=None, stmt_type=None, db=None,
                                          max_stmts=None, offset=None):
    """Get Raw statement jsons from a list of agent refs and Statement type.

    Parameters
    ----------
    agents : list [(role, ag_dbid, ns)]
        The agent constraints; a statement must satisfy all of them. `role`
        and `ns` may be None, in which case they are not constrained. Despite
        the default of None, a list must be given: the value is iterated
        unconditionally.
    stmt_type : str or None
        If given, only statements of this type are returned.
    db : indra_db.DatabaseManager instance
        Default None, in which case the primary database is used.
    max_stmts : int or None
        If given, the maximum number of statements to return.
    offset : int or None
        If given, the offset applied to the statement query.

    Returns
    -------
    raw_stmt_jsons : dict
        Raw statement jsons keyed by statement id. The first evidence of each
        has its `text_refs` (including 'TCID' and 'READING_ID') filled in,
        and its `pmid` set when the text ref has one.
    """
    if db is None:
        db = get_primary_db()

    # Turn the agents parameters into an intersection of queries for stmt ids.
    entity_queries = []
    for role, ag_dbid, ns in agents:
        # Make the id match paradigms for the database.
        ag_dbid = regularize_agent_id(ag_dbid, ns)

        # Sanitize wildcards: prefix the LIKE wildcards with a backslash.
        for char in ['%', '_']:
            ag_dbid = ag_dbid.replace(char, '\\' + char)

        # Generate the query
        q = (db.session.query(db.RawAgents.stmt_id.label('stmt_id'))
             .filter(db.RawAgents.db_id.like(ag_dbid)))
        if ns is not None:
            q = q.filter(db.RawAgents.db_name.like(ns))
        if role is not None:
            q = q.filter(db.RawAgents.role == role.upper())
        entity_queries.append(q)

    ag_query_al = intersect_all(*entity_queries).alias('intersection')
    ag_query = db.session.query(ag_query_al).distinct().subquery('ag_stmt_ids')

    # Create a query for the raw statement json
    rid_c = db.RawStatements.reading_id.label('rid')
    json_q = (db.session.query(db.RawStatements.json, rid_c, ag_query)
              .filter(db.RawStatements.id == ag_query.c.stmt_id))

    # Filter by type, if applicable.
    if stmt_type is not None:
        json_q = json_q.filter(db.RawStatements.type == stmt_type)

    # Apply count limits and such.
    if max_stmts is not None:
        json_q = json_q.limit(max_stmts)
    if offset is not None:
        json_q = json_q.offset(offset)

    # Construct final query, that joins with text ref info on the database.
    json_q = json_q.subquery('json_content')
    ref_q = (db.session.query(json_q,
                              db.Reading.text_content_id.label('tcid'),
                              db.TextRef)
             .outerjoin(db.Reading, db.Reading.id == json_q.c.rid)
             .join(db.TextContent,
                   db.TextContent.id == db.Reading.text_content_id)
             .join(db.TextRef, db.TextRef.id == db.TextContent.text_ref_id))

    # Process the jsons, filling text ref info.
    raw_stmt_jsons = {}
    for json_bytes, rid, sid, tcid, tr in ref_q.all():
        raw_j = json.loads(json_bytes)
        ev = raw_j['evidence'][0]
        ev['text_refs'] = tr.get_ref_dict()
        ev['text_refs']['TCID'] = tcid
        ev['text_refs']['READING_ID'] = rid
        if tr.pmid:
            ev['pmid'] = tr.pmid

        raw_stmt_jsons[sid] = raw_j

    return raw_stmt_jsons
def produce_readings(id_dict, reader_list, verbose=False, read_mode='unread',
                     get_preexisting=True, force_fulltext=False,
                     batch_size=1000, no_upload=False, pickle_file=None,
                     db=None, log_readers=True, prioritize=False):
    """Produce the reading output for the given ids, and upload them to db.

    Pre-existing readings can be retrieved from the database instead of
    being produced again.

    Parameters
    ----------
    id_dict : dict {<id_type>:[<id value>, ...]}
        A dict of lists of the id's to be read, keyed by id_type.
    reader_list : list [Reader]
        A list of Reader descendents to be used in reading.
    verbose : bool
        Optional, default False - passed to `make_db_readings` as `verbose`.
    read_mode : str : 'all', 'unread', or 'none'
        Optional, default 'unread' - If 'all', read everything, without
        retrieving old readings; if 'unread', content with a retrieved
        pre-existing reading is skipped; if 'none', don't read, and only
        retrieve existing readings.
    get_preexisting : bool
        Optional, default True. If True, retrieve old readings where
        available (if `read_mode` is not 'all'). If False, don't retrieve
        old readings.
    force_fulltext : bool
        Optional, default False - If True, only read fulltext article,
        ignoring abstracts.
    batch_size : int
        Optional, default 1000 - The number of text content entries to be
        yielded by the database at a given time.
    no_upload : bool
        Optional, default False - If True, do not upload content to the
        database.
    pickle_file : str or None
        Optional, default None - otherwise the path to a file in which the
        reading data will be saved. If the upload fails and no path was
        given, a timestamped failure dump path is used.
    db : indra_db.DatabaseManager instance
        Optional, default is None, in which case the primary database
        provided by `get_primary_db` function is used. Used to interface with
        a different database.
    log_readers : bool
        Default True. Passed to `make_db_readings` as `log`.
    prioritize : bool
        Default False. If True, choose only the best content to read.

    Returns
    -------
    outputs : list [ReadingData]
        The new readings followed by the retrieved pre-existing readings.
    """
    logger.debug("Producing readings in %s mode." % read_mode)
    if db is None:
        db = get_primary_db()

    # Narrow the ids down to the preferred content, if requested.
    if prioritize:
        logger.debug("Prioritizing...")
        best_tcids = get_priority_tcids(
            id_dict, ['pmc_oa', 'manuscripts', 'elsevier'],
            always_add=['pubmed'], db=db)
        id_dict = {'tcid': list(best_tcids)}

    will_read = read_mode != 'none'
    use_old_readings = get_preexisting and read_mode != 'all'

    # Retrieve old readings, and note what each reader need not read again.
    prev_readings = []
    skip_reader_tcid_dict = None
    if use_old_readings:
        prev_readings = get_db_readings(id_dict, reader_list, force_fulltext,
                                        batch_size, db=db)
        skip_reader_tcid_dict = {r.name: [] for r in reader_list}
        logger.info("Found %d pre-existing readings." % len(prev_readings))
        if will_read:
            for rd in prev_readings:
                skip_reader_tcid_dict[rd.reader].append(rd.tcid)

    # Produce whatever new readings are needed.
    outputs = []
    if will_read:
        outputs = make_db_readings(id_dict, reader_list, verbose=verbose,
                                   skip_dict=skip_reader_tcid_dict, db=db,
                                   force_fulltext=force_fulltext,
                                   force_read=(read_mode == 'all'),
                                   batch_size=batch_size, log=log_readers)
        logger.info("Made %d new readings." % len(outputs))

    # Upload the new readings; on failure make sure they get pickled below.
    if not no_upload:
        try:
            upload_readings(outputs, db=db)
        except Exception as e:
            logger.exception(e)
            if pickle_file is None:
                stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
                pickle_file = "failure_reading_dump_%s.pkl" % stamp
            logger.error(
                "Cound not upload readings. Results are pickled in: "
                + pickle_file)

    outputs += prev_readings

    if pickle_file is None:
        return outputs

    with open(pickle_file, 'wb') as f:
        pickle.dump([output.make_tuple() for output in outputs], f)
    print("Reading outputs stored in %s." % pickle_file)
    return outputs
def make_db_readings(id_dict, readers, batch_size=1000, force_fulltext=False,
                     force_read=False, skip_dict=None, db=None, **kwargs):
    """Read contents retrieved from the database.

    The content will be retrieved in batches, given by the `batch_size`
    argument. This prevents the system RAM from being overloaded.

    Parameters
    ----------
    id_dict : dict {<id_type>:[<id value>, ...]}
        A dict of lists of the id's to be read, keyed by id_type.
    readers : list of reader objects
        A list of the readers that will be used. Each must provide a `name`
        attribute and a `read` method.
    batch_size : int
        The number of content entries read for each batch. Default 1000.
    force_fulltext : bool
        If True, only get fulltext content from the database. Default False.
    force_read : bool
        If True, read even if text_content id is found in skip_dict.
    skip_dict : dict {<reader> : list [int]}
        A dict containing text content id's to be skipped, keyed by reader
        name. If None (and not `force_read`), the database is checked for a
        previous reading of each content by each reader.
    db : indra_db.DatabaseManager instance
        A handle to a database. Default None; if None, a handle to the
        primary database (see indra_db) is retrieved.

    Other keyword arguments are passed to the `read` methods of the readers.

    Returns
    -------
    outputs : list of ReadingData instances
        The results of the readings with relevant metadata.
    """
    if db is None:
        db = get_primary_db()

    # Get the iterator.
    logger.debug("Getting iterator.")
    tc_read_q = get_content_query(id_dict, readers, db=db,
                                  force_fulltext=force_fulltext,
                                  force_read=force_read)
    logger.debug("Begginning to iterate.")
    batch_list_dict = {r.name: [] for r in readers}
    new_outputs = []
    if tc_read_q is None:
        return new_outputs

    # Use sets for the skip lookups, which happen once per content entry per
    # reader and would otherwise each scan a whole list.
    skip_sets = None
    if not force_read and skip_dict is not None:
        skip_sets = {name: set(ids) for name, ids in skip_dict.items()}

    def _read_batch(reader):
        # Read the reader's pending batch, collect the results, and reset.
        results = reader.read(batch_list_dict[reader.name], **kwargs)
        if results is not None:
            new_outputs.extend(results)
        batch_list_dict[reader.name] = []

    # The query yields results in batches, so as not to overwhelm RAM. We
    # need to read in batches for much the same reason.
    for text_content in tc_read_q.yield_per(batch_size):
        for r in readers:
            if not force_read:
                if skip_sets is not None:
                    if text_content.id in skip_sets[r.name]:
                        continue
                else:
                    # Try to get a previous reading from this reader.
                    reading = db.select_one(
                        db.Reading,
                        db.Reading.text_content_id == text_content.id,
                        _get_matches_clause(db, r))
                    if reading is not None:
                        continue

            processed_content = process_content(text_content)
            if processed_content is None:
                continue
            batch_list_dict[r.name].append(processed_content)

            # Read as soon as a full batch has been collected. The check is
            # only made after an append, so an empty batch is never read.
            if len(batch_list_dict[r.name]) >= batch_size:
                logger.debug("Reading batch of files for %s." % r.name)
                _read_batch(r)
    logger.debug("Finished iteration.")

    # Pick up any stragglers.
    for r in readers:
        if len(batch_list_dict[r.name]) > 0:
            logger.debug("Reading remaining files for %s." % r.name)
            _read_batch(r)

    return new_outputs
def run():
    """Calculate belief for the mock statements loaded with the primary db.

    Returns the result of `calculate_belief`.
    """
    primary_db = get_primary_db()
    mock_stmts = load_mock_statements(primary_db)
    belief = calculate_belief(mock_stmts)
    return belief
def get_content_query(ids, readers, db=None, force_fulltext=False,
                      force_read=False, debug=False, print_summary=False):
    """Construct a query to access all the content that will be read.

    If ids is not 'all', and does not contain any ids, None is returned.

    Parameters
    ----------
    ids : 'all' or dict {<id type> : [str/int]}
        If 'all', then all the content will be included in the query.
        Otherwise the content will be constrained to that corresponding to
        the ids in id_dict, which are matched using text refs.
    readers : list [Reader child instances]
        A list of the reader objects, which contain the required metadata
        (name and version of the reader) used to find content that needs to
        be read.
    db : indra_db.DatabaseManager instance
        Optional, default None, in which case the primary database is used.
        If specified, the alternative database will be used. This function
        should not alter the database.
    force_fulltext : bool
        Optional, default False - If True, only fulltext content will be
        read, as opposed to including abstracts.
    force_read : bool
        Optional, default False - If True, all content will be returned,
        whether it has been read or not.
    debug : bool
        Optional, default False - If True, the module logger is set to the
        DEBUG level.
    print_summary : bool
        Optional, default False - If True, try to log a summary of the text
        content selected by the query; failure to do so is only logged.

    Returns
    -------
    tc_tbr_query : sqlalchemy query object or None
        The query of the text content to be read (tc_tbr). If there are no
        ids contained in ids, or it is not 'all', return None.
    """
    if debug:
        logger.setLevel(logging.DEBUG)

    if db is None:
        db = get_primary_db()
    logger.debug("Got db handle.")

    # Compare by value: identity comparison against a string literal is not
    # reliable for strings created at run time.
    get_everything = (ids == 'all')

    # If we are not actually getting anything, we return None.
    if not get_everything \
            and not any(len(id_list) > 0 for id_list in ids.values()):
        logger.debug("No ids in id_dict, so no query formed.")
        return None

    # These allow conditions on different tables to equal conditions on the
    # dependent tables.
    tc_tr_binding = db.TextContent.text_ref_id == db.TextRef.id
    rd_tc_binding = db.Reading.text_content_id == db.TextContent.id

    # Begin the list of clauses with the binding between text content and
    # text refs.
    clauses = [tc_tr_binding]

    # Add a fulltext requirement, if applicable.
    if force_fulltext:
        clauses.append(db.TextContent.text_type == texttypes.FULLTEXT)

    if not get_everything:
        sub_clauses = get_clauses(ids, db)
        if len(sub_clauses) > 1:
            clauses.append(sql.or_(*sub_clauses))
        else:
            # Zero or one clause; extend copes with both.
            clauses.extend(sub_clauses)

    # Get the text content query object
    tc_query = db.filter_query(db.TextContent, *clauses).distinct()

    if not force_read:
        logger.debug("Getting content to be read.")
        # Each sub query is a set of content that has been read by one of
        # the readers. Content read by all of the readers is excluded.
        tc_q_subs = [tc_query.filter(rd_tc_binding,
                                     _get_matches_clause(db, r))
                     for r in readers]
        tc_tbr_query = tc_query.except_(sql.intersect(*tc_q_subs))
    else:
        logger.debug('All content will be read (force_read).')
        tc_tbr_query = tc_query

    if print_summary:
        # The summary is best-effort only; it must not prevent the query
        # from being returned.
        try:
            logger.debug("Going to try to make a nice summary...")
            logger.info(get_text_content_summary_string(tc_tbr_query, db))
        except Exception:
            logger.debug("Could not print summary of results.")

    return tc_tbr_query.distinct()
gatherer.add('refs', len(filtered_tr_records)) # Process the text content data filtered_tc_records, flawed_tcs = \ self.filter_text_content(db, mod_tc_data) # Upload the text content data. logger.info('Adding %d more text content entries...' % len(filtered_tc_records)) self.copy_into_db(db, 'text_content', filtered_tc_records, self.tc_cols) gatherer.add('content', len(filtered_tc_records)) return { 'filtered_tr_records': filtered_tr_records, 'flawed_tr_records': flawed_tr_records, 'mod_tc_data': mod_tc_data, 'filtered_tc_records': filtered_tc_records } if __name__ == '__main__': download_latest_data() md = get_metadata_dict() md = [ e for e in md if e['doi'] and e['doi'].upper() != '0.1126/SCIENCE.ABB7331' ] cm = Cord19Manager(md) db = get_primary_db() res = cm.populate(db)
def get_readings_query(ids, readers, db=None, force_fulltext=False):
    """Create a query to access all the relevant existing readings.

    Note that if ids is not 'all' and ids is a dict with no ids in it,
    this function returns None.

    Parameters
    ----------
    ids : 'all' or dict {<id_type> : [str/int]}
        If 'all', then all possible readings in the database matching the
        given readers and other conditions will be returned. Otherwise, only
        those that correspond to one of the ids in ids dict will be
        contained. If an ids dict has no ids in it, None is returned.
    readers : list [Reader child instances]
        A list of the readers whose names and versions you wish to match in
        the readings queried from the database.
    db : indra_db.DatabaseManager instance
        Optional, default None, in which case the primary database is used.
        If specified, the alternative database will be used. This function
        should not alter the database.
    force_fulltext : bool
        Optional, default False - If True, only readings corresponding to
        fulltext content will be read, as opposed to including readings
        created from abstracts.

    Returns
    -------
    readings_query : sql query instance or None
        Returns a query that can be used to access the specified content, or
        else None if no content was specified.
    """
    if db is None:
        db = get_primary_db()

    get_everything = (ids == 'all')

    # If no content was specified, there is no query to form.
    if not get_everything and not any(id_list for id_list in ids.values()):
        return None

    clauses = [
        # Bind conditions on readings to conditions on content.
        db.Reading.text_content_id == db.TextContent.id,

        # Bind text content to text refs
        db.TextContent.text_ref_id == db.TextRef.id,

        # Check if at least one of the readers has read the content
        sql.or_(*[_get_matches_clause(db, reader) for reader in readers])
    ]
    if force_fulltext:
        clauses.append(db.TextContent.text_type == texttypes.FULLTEXT)

    # Conditions generated from the list of ids.
    if not get_everything:
        sub_clauses = get_clauses(ids, db)
        if len(sub_clauses) > 1:
            clauses.append(sql.or_(*sub_clauses))
        else:
            # Zero or one clause; extend copes with both.
            clauses.extend(sub_clauses)

    # The bindings and the reader condition are already in `clauses`, so
    # they are not passed to the query a second time.
    readings_query = db.filter_query(db.Reading, *clauses)

    return readings_query.distinct()
def get_pmids_for_mesh_terms(mesh_list):
    """Return the PMIDs annotated with any of the given MeSH IDs.

    The MeSH IDs are matched as given against `MeshRefAnnotations.mesh_id`.
    """
    db = get_primary_db()
    annotations = db.MeshRefAnnotations
    rows = db.select_all(annotations.pmid,
                         annotations.mesh_id.in_(mesh_list))
    pmids = []
    for row in rows:
        pmids.append(row[0])
    return pmids