def add_raw_database_statements(self, stmt_lists):
    """Add raw statements that came from knowledge bases/databases."""
    assert self.databases is not None
    if self.raw_statements is None:
        self.raw_statements = []
    new_raw_statements = []
    for dbidx, stmt_list in enumerate(stmt_lists):
        db_info = self.databases[dbidx]
        for stmt in stmt_list:
            # Attach a knowledge-base Evidence to each statement and build
            # the corresponding raw statement ORM row.
            ev = Evidence(db_info.source_api)
            stmt.evidence.append(ev)
            src_hash = ev.get_source_hash()
            raw_json = stmt.to_json()
            db_rs = self.db.RawStatements(
                db_info_id=db_info.id,
                json=json.dumps(raw_json).encode('utf-8'),
                type=raw_json['type'],
                uuid=stmt.uuid,
                batch_id=1,
                source_hash=src_hash,
                mk_hash=stmt.get_hash(),
                indra_version="test"
            )
            self.raw_statements.append(db_rs)
            new_raw_statements.append(db_rs)
    self.db.session.add_all(new_raw_statements)
    self.db.session.commit()
    insert_raw_agents(self.db, 1,
                      [s for slist in stmt_lists for s in slist])
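
# A minimal usage sketch for add_raw_database_statements, assuming a test
# harness instance `dbm` whose `databases` attribute already holds two
# db_info records; `dbm` and its setup are hypothetical. Each inner list is
# matched by position to dbm.databases[i].
from indra.statements import Agent, Phosphorylation, Activation

stmt_lists = [
    [Phosphorylation(Agent('MAP2K1'), Agent('MAPK1'))],  # -> databases[0]
    [Activation(Agent('MAPK1'), Agent('ELK1'))],         # -> databases[1]
]
# dbm.add_raw_database_statements(stmt_lists)
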
def simple_insert_stmts(db, stmts_dict):
    """Insert raw statements from readings into the database.

    `stmts_dict` must be of the form {<source_type>: {<source_id>: [stmts]}},
    where `source_type` is "reading" or "databases", and `source_id` would be
    a reading ID or a db_info_id, respectively.
    """
    batch_id = db.make_copy_batch_id()
    stmt_data = []
    cols = ('uuid', 'mk_hash', 'db_info_id', 'reading_id', 'type', 'json',
            'batch_id', 'source_hash', 'indra_version')
    all_stmts = []
    for category, stmts in stmts_dict.items():
        for src_id, stmt_list in stmts.items():
            for stmt in stmt_list:
                stmt_info = {
                    'uuid': stmt.uuid,
                    'mk_hash': stmt.get_hash(refresh=True),
                    'type': stmt.__class__.__name__,
                    'json': json.dumps(stmt.to_json()).encode('utf-8'),
                    'batch_id': batch_id,
                    'source_hash': -1,
                    'indra_version': get_version()
                }
                if category == 'reading':
                    stmt_info['reading_id'] = src_id
                else:
                    stmt_info['db_info_id'] = src_id
                stmt_data.append(tuple(stmt_info.get(col) for col in cols))
                all_stmts.append(stmt)
    db.copy('raw_statements', stmt_data, cols)
    insert_raw_agents(db, batch_id, all_stmts)
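
# A minimal sketch of the input simple_insert_stmts expects, assuming `db`
# is a connected database manager and that reading ID 101 and db_info_id 7
# already exist in it (all three are hypothetical).
from indra.statements import Agent, Phosphorylation, Inhibition

stmts_dict = {
    'reading': {101: [Phosphorylation(Agent('MEK'), Agent('ERK'))]},
    'databases': {7: [Inhibition(Agent('VEM'), Agent('BRAF'))]},
}
# simple_insert_stmts(db, stmts_dict)
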
def insert_the_statements(self, input_tuples):
    print("Loading %d statements..." % len(input_tuples))
    cols = self.test_data['raw_statements']['cols']
    new_input_dict = {}
    batch_id_set = set()
    for t in input_tuples:
        batch_id_set.add(t[cols.index('batch_id')])
        rid = t[cols.index('reading_id')]
        dbid = t[cols.index('db_info_id')]
        mk_hash = t[cols.index('mk_hash')]
        # Deduplicate on the fields of the relevant uniqueness constraint.
        if rid is not None:
            key = (mk_hash, rid, t[cols.index('text_hash')])
        elif dbid is not None:
            key = (mk_hash, dbid, t[cols.index('source_hash')])
        else:
            raise ValueError("Either rid or dbid must be non-None.")
        new_input_dict[key] = t
    logger.debug("Loading %d/%d statements."
                 % (len(new_input_dict), len(input_tuples)))
    self.test_db.copy_push('raw_statements', new_input_dict.values(), cols,
                           constraint='reading_raw_statement_uniqueness')
    print("Inserting agents...")
    for batch_id in batch_id_set:
        dbu.insert_raw_agents(self.test_db, batch_id)
    return set(new_input_dict.values())
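
# An isolated sketch of the dedup keying above: reading rows key on
# (mk_hash, reading_id, text_hash), database rows on (mk_hash, db_info_id,
# source_hash), and dict assignment means the last tuple seen for a key
# wins. The 5-field tuples here are illustrative, not the real column layout.
def dedup_key(t):
    mk_hash, rid, dbid, text_hash, source_hash = t
    if rid is not None:
        return (mk_hash, rid, text_hash)
    elif dbid is not None:
        return (mk_hash, dbid, source_hash)
    raise ValueError("Either rid or dbid must be non-None.")

rows = [(1, 10, None, 'a', None), (1, 10, None, 'a', None)]
assert len({dedup_key(t): t for t in rows}) == 1  # duplicates collapse
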
def dump_statements(self, db):
    tc_rows = set(self.text_content.values())
    tc_cols = ('text_ref_id', 'source', 'format', 'text_type', 'preprint')
    logger.info(f"Dumping {len(tc_rows)} text content.")
    db.copy_lazy('text_content', tc_rows, tc_cols)

    # Look up tcids for newly entered content.
    tcids = db.select_all(
        [db.TextContent.text_ref_id, db.TextContent.id],
        db.TextContent.text_ref_id.in_(self.statements.keys()),
        db.TextContent.format == 'xdd'
    )
    tcid_lookup = {trid: tcid for trid, tcid in tcids}

    # Compile reading and statements into rows.
    r_rows = set()
    r_cols = ('id', 'text_content_id', 'reader', 'reader_version',
              'format', 'batch_id')
    s_rows = set()
    rd_batch_id = db.make_copy_batch_id()
    stmt_batch_id = db.make_copy_batch_id()
    stmts = []
    for trid, trid_set in self.statements.items():
        for reader, stmt_list in trid_set.items():
            tcid = tcid_lookup[trid]
            reader_version = self.reader_versions[reader.upper()]
            reading_id = generate_reading_id(tcid, reader, reader_version)
            r_rows.add((reading_id, tcid, reader.upper(), reader_version,
                        'xdd', rd_batch_id))
            for sj in stmt_list:
                stmt = Statement._from_json(sj)
                stmts.append(stmt)
                sd = DatabaseStatementData(
                    stmt, reading_id, indra_version=self.indra_version
                )
                s_rows.add(sd.make_tuple(stmt_batch_id))

    logger.info(f"Dumping {len(r_rows)} readings.")
    db.copy_lazy('reading', r_rows, r_cols, commit=False)

    logger.info(f"Dumping {len(s_rows)} raw statements.")
    db.copy_lazy('raw_statements', s_rows,
                 DatabaseStatementData.get_cols(), commit=False)
    if len(stmts):
        insert_raw_agents(db, stmt_batch_id, stmts, verbose=False,
                          commit=False)

    update_rows = [(json.dumps(self.reader_versions), self.indra_version,
                    group.key[:-1])
                   for group in self.groups]
    db.copy('xdd_updates', update_rows,
            ('reader_versions', 'indra_version', 'day_str'))
    return
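
# A minimal sketch of the nested structure dump_statements walks, assuming
# xDD-style input keyed first by text_ref_id and then by reader name; the
# ID and reader name here are hypothetical.
from indra.statements import Agent, Phosphorylation

stmt_json = Phosphorylation(Agent('MEK'), Agent('ERK')).to_json()
statements = {
    123456: {                  # text_ref_id
        'reach': [stmt_json],  # reader name -> list of raw statement JSONs
    },
}
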
def add_raw_reading_statements(self, stmt_lists):
    """Add raw statements that came from readings."""
    assert self.readings is not None
    if self.raw_statements is None:
        self.raw_statements = []
    new_raw_statements = []
    for ridx, stmt_list in enumerate(stmt_lists):
        rid = self.readings[ridx].id
        reader = self.readings[ridx].reader
        if reader == 'SPARSER':
            pmid = None
        else:
            pmid = self.readings[ridx].text_content.text_ref.pmid

        def ev(stmt, detail=None):
            # Build an Evidence whose text records the reading provenance.
            reading = self.readings[ridx]
            text = f"{reading.text_content.source} from trid " \
                   f"{reading.text_content.text_ref_id} indicates " \
                   f"{type(stmt).__name__}: {stmt.agent_list()}."
            if detail is not None:
                text = f"{detail} {text}"
            return Evidence(self.readings[ridx].reader.lower(), pmid=pmid,
                            text=text)

        for stmt in stmt_list:
            if isinstance(stmt, tuple):
                stmt, detail = stmt
            else:
                detail = None
            stmt.evidence.append(ev(stmt, detail))
            raw_json = stmt.to_json()
            db_rs = self.db.RawStatements(
                reading_id=rid,
                json=json.dumps(raw_json).encode('utf-8'),
                type=raw_json['type'],
                uuid=stmt.uuid,
                batch_id=0,
                text_hash=DatabaseStatementData(stmt)._get_text_hash(),
                source_hash=stmt.evidence[0].get_source_hash(),
                mk_hash=stmt.get_hash(),
                indra_version="test"
            )
            self.raw_statements.append(db_rs)
            new_raw_statements.append(db_rs)
    self.db.session.add_all(new_raw_statements)
    self.db.session.commit()
    insert_raw_agents(self.db, 0, [
        s[0] if isinstance(s, tuple) else s
        for slist in stmt_lists for s in slist
    ])
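
# Usage sketch: items may be bare Statements or (Statement, detail) tuples,
# where the detail string is prepended to the generated evidence text. The
# harness instance `dbm`, its pre-loaded `readings`, and the detail string
# are all hypothetical.
from indra.statements import Agent, Phosphorylation

stmt_lists = [[
    Phosphorylation(Agent('MEK'), Agent('ERK')),
    (Phosphorylation(Agent('RAF'), Agent('MEK')), "From figure 2:"),
]]
# dbm.add_raw_reading_statements(stmt_lists)
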
def dump_results_to_db(self):
    """Upload the results to the database."""
    self.starts['dump_results_db'] = datetime.utcnow()
    logger.info("Uploading %d results to the database."
                % len(self.result_outputs))
    batch_id = self._db.make_copy_batch_id()
    if self.reader.results_type == 'statements':
        # Find and filter out duplicate statements.
        stmt_tuples = {}
        stmts = []
        stmt_dups = {}
        for sd in self.result_outputs:
            tpl = sd.make_tuple(batch_id)
            key = (tpl[1], tpl[4], tpl[9])
            if key in stmt_tuples.keys():
                logger.warning('Duplicate key found: %s.' % str(key))
                if key in stmt_dups.keys():
                    stmt_dups[key].append(tpl)
                else:
                    stmt_dups[key] = [tpl]
            else:
                stmt_tuples[key] = tpl
                stmts.append(sd.result)

        # Dump the good statements into the raw statements table.
        updated = self._db.copy_report_push(
            'raw_statements',
            stmt_tuples.values(),
            DatabaseStatementData.get_cols(),
            constraint='reading_raw_statement_uniqueness',
            commit=False,
            return_cols=('uuid',)
        )
        gatherer.add('new_stmts', len(stmt_tuples) - len(updated))
        gatherer.add('upd_stmts', len(updated))

        # Dump the duplicates into a separate table for debugging.
        self._db.copy('rejected_statements',
                      [tpl for dl in stmt_dups.values() for tpl in dl],
                      DatabaseStatementData.get_cols(),
                      commit=False)

        # Add the agents for the accepted statements.
        logger.info("Uploading agents to the database.")
        if len(stmts):
            insert_raw_agents(self._db, batch_id, stmts, verbose=False)
        self.stops['dump_statements_db'] = datetime.utcnow()
    else:
        mesh_term_tuples = set()
        for mrd in self.result_outputs:
            tpl = mrd.make_tuple(batch_id)
            mesh_term_tuples.add(tpl)

        # Dump mesh terms to the table.
        skipped = self._db.copy_report_lazy('mti_ref_annotations_test',
                                            mesh_term_tuples,
                                            DatabaseMeshRefData.get_cols())
        gatherer.add('new_mesh_terms', len(mesh_term_tuples) - len(skipped))
        gatherer.add('skp_mesh_terms', len(skipped))
    return
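
# An isolated, first-wins version of the duplicate filter above: the first
# tuple seen for a key is kept and later ones are routed to a reject map
# (mirroring stmt_tuples/stmt_dups); key_fn stands in for the real
# (tpl[1], tpl[4], tpl[9]) key.
def split_dups(tuples, key_fn):
    kept, dups = {}, {}
    for tpl in tuples:
        key = key_fn(tpl)
        if key in kept:
            dups.setdefault(key, []).append(tpl)
        else:
            kept[key] = tpl
    return kept, dups

kept, dups = split_dups([('a', 1), ('a', 2)], key_fn=lambda t: t[0])
assert list(kept.values()) == [('a', 1)] and dups == {'a': [('a', 2)]}
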
def dump_statements(self, db):
    from indra_db.reading.read_db import DatabaseStatementData, \
        generate_reading_id
    tc_rows = set(self.text_content.values())
    tc_cols = ('text_ref_id', 'source', 'format', 'text_type', 'preprint')
    logger.info(f"Dumping {len(tc_rows)} text content.")
    db.copy_lazy('text_content', tc_rows, tc_cols)

    # Look up tcids for newly entered content.
    tcids = db.select_all(
        [db.TextContent.text_ref_id, db.TextContent.id],
        db.TextContent.text_ref_id.in_(self.statements.keys()),
        db.TextContent.format == 'xdd'
    )
    tcid_lookup = {trid: tcid for trid, tcid in tcids}

    # Compile reading and statements into rows.
    r_rows = set()
    r_cols = ('id', 'text_content_id', 'reader', 'reader_version',
              'format', 'batch_id')
    s_rows = set()
    rd_batch_id = db.make_copy_batch_id()
    stmt_batch_id = db.make_copy_batch_id()
    stmts = []
    for trid, trid_set in self.statements.items():
        for reader, stmt_list in trid_set.items():
            tcid = tcid_lookup[trid]
            reader_version = self.reader_versions[reader.upper()]
            reading_id = generate_reading_id(tcid, reader, reader_version)
            r_rows.add((reading_id, tcid, reader.upper(), reader_version,
                        'xdd', rd_batch_id))
            for sj in stmt_list:
                stmt = Statement._from_json(sj)
                stmts.append(stmt)
                sd = DatabaseStatementData(
                    stmt, reading_id, indra_version=self.indra_version
                )
                s_rows.add(sd.make_tuple(stmt_batch_id))

    logger.info(f"Dumping {len(r_rows)} readings.")
    db.copy_lazy('reading', r_rows, r_cols, commit=False,
                 constraint='reading-uniqueness')

    logger.info(f"Dumping {len(s_rows)} raw statements.")
    skipped = db.copy_report_lazy('raw_statements', s_rows,
                                  DatabaseStatementData.get_cols(),
                                  commit=False)
    skipped_uuids = {
        t[DatabaseStatementData.get_cols().index('uuid')] for t in skipped
    }
    new_stmts = [s for s in stmts if s.uuid not in skipped_uuids]
    if len(new_stmts):
        insert_raw_agents(db, stmt_batch_id, new_stmts, verbose=False,
                          commit=False)
    return
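
# A minimal sketch of the "report lazy" contract relied on above, assuming
# the copy helper skips rows that violate a uniqueness constraint and
# returns the skipped tuples so the caller can exclude them from follow-up
# work; `lazy_copy` and its arguments are hypothetical stand-ins.
def lazy_copy(existing_keys, rows, key_idx):
    skipped = [r for r in rows if r[key_idx] in existing_keys]
    new = [r for r in rows if r[key_idx] not in existing_keys]
    existing_keys.update(r[key_idx] for r in new)
    return skipped

skipped = lazy_copy({'u1'}, [('u1', 'dup'), ('u2', 'new')], key_idx=0)
assert [r[0] for r in skipped] == ['u1']  # only pre-existing rows reported
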