Example #1
0
    def add_raw_database_statements(self, stmt_lists, batch_id=1):
        """Add raw statements that came from knowledge bases/databases.

        Parameters
        ----------
        stmt_lists : list[list[Statement]]
            One list of statements per entry in ``self.databases``; the
            list at index ``i`` is attributed to ``self.databases[i]``.
        batch_id : int
            Copy-batch ID recorded on each RawStatements row and passed to
            ``insert_raw_agents``. Defaults to 1, preserving the previous
            hard-coded behavior.
        """
        assert self.databases is not None
        if self.raw_statements is None:
            self.raw_statements = []
        new_raw_statements = []
        for dbidx, stmt_list in enumerate(stmt_lists):
            db_info = self.databases[dbidx]

            for stmt in stmt_list:
                # Tag each statement with one Evidence naming the source
                # API of the database it came from.
                ev = Evidence(db_info.source_api)
                stmt.evidence.append(ev)
                src_hash = ev.get_source_hash()
                raw_json = stmt.to_json()
                db_rs = self.db.RawStatements(
                    db_info_id=db_info.id,
                    json=json.dumps(raw_json).encode('utf-8'),
                    type=raw_json['type'],
                    uuid=stmt.uuid,
                    batch_id=batch_id,
                    source_hash=src_hash,
                    mk_hash=stmt.get_hash(),
                    indra_version="test")
                self.raw_statements.append(db_rs)
                new_raw_statements.append(db_rs)

        self.db.session.add_all(new_raw_statements)
        self.db.session.commit()

        # Insert agents under the same batch so they join back to the raw
        # statements committed above.
        insert_raw_agents(self.db, batch_id,
                          [s for slist in stmt_lists for s in slist])
Example #2
0
File: util.py Project: steppi/indra_db
def simple_insert_stmts(db, stmts_dict):
    """Insert raw statements from readings into the database.

    `stmts_dict` must be of the form {<source_type>: {<source_id>: [stmts]}}
    where `source_type` is "reading" or "databases", and source_id would be a
    reading ID or a db_info_id, respectively.
    """
    batch_id = db.make_copy_batch_id()

    cols = ('uuid', 'mk_hash', 'db_info_id', 'reading_id', 'type', 'json',
            'batch_id', 'source_hash', 'indra_version')

    rows = []
    inserted_stmts = []
    for src_type, src_groups in stmts_dict.items():
        # Readings fill reading_id; everything else fills db_info_id.
        id_col = 'reading_id' if src_type == 'reading' else 'db_info_id'
        for src_id, stmt_list in src_groups.items():
            for stmt in stmt_list:
                record = {
                    'uuid': stmt.uuid,
                    'mk_hash': stmt.get_hash(refresh=True),
                    'type': stmt.__class__.__name__,
                    'json': json.dumps(stmt.to_json()).encode('utf-8'),
                    'batch_id': batch_id,
                    'source_hash': -1,
                    'indra_version': get_version(),
                    id_col: src_id,
                }
                # Missing id column falls through to None via .get().
                rows.append(tuple(record.get(c) for c in cols))
                inserted_stmts.append(stmt)

    db.copy('raw_statements', rows, cols)
    insert_raw_agents(db, batch_id, inserted_stmts)
Example #3
0
File: util.py Project: steppi/indra_db
    def insert_the_statements(self, input_tuples):
        """Load raw statement tuples into the test db after deduplication.

        Rows are keyed on (mk_hash, reading_id, text_hash) for reading-based
        statements, or (mk_hash, db_info_id, source_hash) for database-based
        ones; when keys collide the last tuple seen wins.
        """
        print("Loading %d statements..." % len(input_tuples))
        cols = self.test_data['raw_statements']['cols']
        # Hoist the column positions used for every row.
        batch_idx = cols.index('batch_id')
        rid_idx = cols.index('reading_id')
        dbid_idx = cols.index('db_info_id')
        hash_idx = cols.index('mk_hash')

        batch_ids = set()
        deduped = {}
        for row in input_tuples:
            batch_ids.add(row[batch_idx])

            reading_id = row[rid_idx]
            db_info_id = row[dbid_idx]
            stmt_hash = row[hash_idx]
            if reading_id is not None:
                dedup_key = (stmt_hash, reading_id,
                             row[cols.index('text_hash')])
            elif db_info_id is not None:
                dedup_key = (stmt_hash, db_info_id,
                             row[cols.index('source_hash')])
            else:
                raise ValueError("Either rid or dbid must be non-none.")
            deduped[dedup_key] = row

        logger.debug("Loading %d/%d statements." %
                     (len(deduped), len(input_tuples)))

        self.test_db.copy_push('raw_statements', deduped.values(), cols,
                               constraint='reading_raw_statement_uniqueness')

        print("Inserting agents...")
        for batch_id in batch_ids:
            dbu.insert_raw_agents(self.test_db, batch_id)

        return set(deduped.values())
Example #4
0
    def dump_statements(self, db):
        """Dump cached xDD text content, readings, and raw statements to db.

        Reads `self.text_content`, `self.statements`, `self.reader_versions`,
        `self.indra_version`, and `self.groups`; writes the text_content,
        reading, raw_statements, and xdd_updates tables via the db manager's
        bulk-copy helpers.
        """
        tc_rows = set(self.text_content.values())
        tc_cols = ('text_ref_id', 'source', 'format', 'text_type', 'preprint')
        logger.info(f"Dumping {len(tc_rows)} text content.")
        db.copy_lazy('text_content', tc_rows, tc_cols)

        # Look up tcids for newly entered content.
        tcids = db.select_all(
            [db.TextContent.text_ref_id, db.TextContent.id],
            db.TextContent.text_ref_id.in_(self.statements.keys()),
            db.TextContent.format == 'xdd'
        )
        tcid_lookup = {trid: tcid for trid, tcid in tcids}

        # Compile reading and statements into rows.
        r_rows = set()
        r_cols = ('id', 'text_content_id', 'reader', 'reader_version',
                  'format', 'batch_id')
        s_rows = set()
        # Separate copy batches for readings and statements.
        rd_batch_id = db.make_copy_batch_id()
        stmt_batch_id = db.make_copy_batch_id()
        stmts = []
        for trid, trid_set in self.statements.items():
            for reader, stmt_list in trid_set.items():
                tcid = tcid_lookup[trid]
                reader_version = self.reader_versions[reader.upper()]
                reading_id = generate_reading_id(tcid, reader, reader_version)
                r_rows.add((reading_id, tcid, reader.upper(), reader_version,
                            'xdd', rd_batch_id))
                for sj in stmt_list:
                    # Statements arrive as JSON; rebuild objects for hashing
                    # and agent insertion.
                    stmt = Statement._from_json(sj)
                    stmts.append(stmt)
                    sd = DatabaseStatementData(
                        stmt,
                        reading_id,
                        indra_version=self.indra_version
                    )
                    s_rows.add(sd.make_tuple(stmt_batch_id))

        logger.info(f"Dumping {len(r_rows)} readings.")
        # commit=False on the next three copies; presumably the final
        # db.copy below commits the whole transaction -- confirm.
        db.copy_lazy('reading', r_rows, r_cols, commit=False)

        logger.info(f"Dumping {len(s_rows)} raw statements.")
        db.copy_lazy('raw_statements', s_rows,
                     DatabaseStatementData.get_cols(), commit=False)
        if len(stmts):
            insert_raw_agents(db, stmt_batch_id, stmts, verbose=False,
                              commit=False)

        # Record this update run, one row per group.
        # NOTE(review): group.key[:-1] presumably drops a trailing element
        # to yield the day_str value -- confirm against self.groups.
        update_rows = [(json.dumps(self.reader_versions), self.indra_version,
                        group.key[:-1])
                       for group in self.groups]
        db.copy('xdd_updates', update_rows,
                ('reader_versions', 'indra_version', 'day_str'))
        return
Example #5
0
    def add_raw_reading_statements(self, stmt_lists):
        """Add raw statements attributed to `self.readings`.

        `stmt_lists` holds one list per reading in `self.readings`; each
        entry is a Statement or a (Statement, detail) tuple, where `detail`
        is prepended to the generated evidence text.  New RawStatements rows
        are committed and their agents inserted under batch_id 0.
        """
        assert self.readings is not None
        if self.raw_statements is None:
            self.raw_statements = []
        new_raw_statements = []
        for ridx, stmt_list in enumerate(stmt_lists):
            rid = self.readings[ridx].id
            reader = self.readings[ridx].reader
            # SPARSER readings get no PMID on their evidence.
            if reader == 'SPARSER':
                pmid = None
            else:
                pmid = self.readings[ridx].text_content.text_ref.pmid

            def ev(stmt, detail=None):
                # Build one Evidence for this reading with a synthetic text
                # describing the statement's provenance.
                reading = self.readings[ridx]
                text = f"{reading.text_content.source} from trid " \
                       f"{reading.text_content.text_ref_id} indicates " \
                       f"{type(stmt).__name__}: {stmt.agent_list()}."
                if detail is not None:
                    text = f"{detail} {text}"
                return Evidence(self.readings[ridx].reader.lower(),
                                pmid=pmid,
                                text=text)

            for stmt in stmt_list:
                # Entries may be (stmt, detail) pairs; unpack if so.
                if isinstance(stmt, tuple):
                    stmt, detail = stmt
                else:
                    detail = None
                stmt.evidence.append(ev(stmt, detail))
                raw_json = stmt.to_json()
                db_rs = self.db.RawStatements(
                    reading_id=rid,
                    json=json.dumps(raw_json).encode('utf-8'),
                    type=raw_json['type'],
                    uuid=stmt.uuid,
                    batch_id=0,
                    text_hash=DatabaseStatementData(stmt)._get_text_hash(),
                    # NOTE(review): uses evidence[0], which differs from the
                    # evidence appended above if the statement arrived with
                    # pre-existing evidence -- confirm intended.
                    source_hash=stmt.evidence[0].get_source_hash(),
                    mk_hash=stmt.get_hash(),
                    indra_version="test")
                self.raw_statements.append(db_rs)
                new_raw_statements.append(db_rs)
        self.db.session.add_all(new_raw_statements)
        self.db.session.commit()

        # Insert agents for all the new statements under the same batch (0),
        # unpacking any (stmt, detail) tuples first.
        insert_raw_agents(self.db, 0, [
            s[0] if isinstance(s, tuple) else s for slist in stmt_lists
            for s in slist
        ])
Example #6
0
    def dump_results_to_db(self):
        """Upload the results to the database.

        Dispatches on `self.reader.results_type`: statement results are
        deduplicated and copied to raw_statements (with duplicates routed to
        rejected_statements); any other results type is copied to the
        mti_ref_annotations_test table.  Start/stop timestamps and counts
        are recorded on `self.starts`/`self.stops` and `gatherer`.
        """
        self.starts['dump_results_db'] = datetime.utcnow()
        logger.info("Uploading %d results to the database." %
                    len(self.result_outputs))
        batch_id = self._db.make_copy_batch_id()

        if self.reader.results_type == 'statements':
            # Find and filter out duplicate statements.
            stmt_tuples = {}
            stmts = []
            stmt_dups = {}
            for sd in self.result_outputs:
                tpl = sd.make_tuple(batch_id)
                # NOTE(review): 1, 4, 9 are positional indices into the
                # DatabaseStatementData tuple -- confirm which columns these
                # are against DatabaseStatementData.get_cols().
                key = (tpl[1], tpl[4], tpl[9])
                if key in stmt_tuples.keys():
                    logger.warning('Duplicate key found: %s.' % str(key))
                    if key in stmt_dups.keys():
                        stmt_dups[key].append(tpl)
                    else:
                        stmt_dups[key] = [tpl]
                else:
                    # First occurrence wins; later duplicates are rejected.
                    stmt_tuples[key] = tpl
                    stmts.append(sd.result)

            # Dump the good statements into the raw statements table.
            updated = self._db.copy_report_push(
                'raw_statements',
                stmt_tuples.values(),
                DatabaseStatementData.get_cols(),
                constraint='reading_raw_statement_uniqueness',
                commit=False,
                return_cols=('uuid',)
            )
            gatherer.add('new_stmts', len(stmt_tuples) - len(updated))
            gatherer.add('upd_stmts', len(updated))

            # Dump the duplicates into a separate table for debugging.
            self._db.copy('rejected_statements',
                          [tpl for dl in stmt_dups.values() for tpl in dl],
                          DatabaseStatementData.get_cols(),
                          commit=False)

            # Add the agents for the accepted statements.
            logger.info("Uploading agents to the database.")
            if len(stmts):
                insert_raw_agents(self._db, batch_id, stmts, verbose=False)
            self.stops['dump_statements_db'] = datetime.utcnow()
        else:
            # Presumably MeSH-term annotation results -- they go into the
            # mti_ref_annotations_test table below.
            mesh_term_tuples = set()
            for mrd in self.result_outputs:
                tpl = mrd.make_tuple(batch_id)
                mesh_term_tuples.add(tpl)

            # Dump mesh_terms to the table
            skipped = self._db.copy_report_lazy('mti_ref_annotations_test',
                                                mesh_term_tuples,
                                                DatabaseMeshRefData.get_cols())

            gatherer.add('new_mesh_terms', len(mesh_term_tuples) - len(skipped))
            gatherer.add('skp_mesh_terms', len(skipped))

        return
Example #7
0
File: xdd.py Project: pagreene/indra_db
    def dump_statements(self, db):
        """Dump xDD text content, readings, and raw statements into `db`.

        Raw statement rows reported as skipped by the copy (already present)
        are excluded when inserting agents, so agents are only added for
        statements that were actually inserted.
        """
        from indra_db.reading.read_db import DatabaseStatementData, \
            generate_reading_id
        tc_rows = set(self.text_content.values())
        tc_cols = ('text_ref_id', 'source', 'format', 'text_type', 'preprint')
        logger.info(f"Dumping {len(tc_rows)} text content.")
        db.copy_lazy('text_content', tc_rows, tc_cols)

        # Look up tcids for newly entered content.
        tcids = db.select_all([db.TextContent.text_ref_id, db.TextContent.id],
                              db.TextContent.text_ref_id.in_(
                                  self.statements.keys()),
                              db.TextContent.format == 'xdd')
        tcid_lookup = {trid: tcid for trid, tcid in tcids}

        # Compile reading and statements into rows.
        r_rows = set()
        r_cols = ('id', 'text_content_id', 'reader', 'reader_version',
                  'format', 'batch_id')
        s_rows = set()
        # Separate copy batches for readings and statements.
        rd_batch_id = db.make_copy_batch_id()
        stmt_batch_id = db.make_copy_batch_id()
        stmts = []
        for trid, trid_set in self.statements.items():
            for reader, stmt_list in trid_set.items():
                tcid = tcid_lookup[trid]
                reader_version = self.reader_versions[reader.upper()]
                reading_id = generate_reading_id(tcid, reader, reader_version)
                r_rows.add((reading_id, tcid, reader.upper(), reader_version,
                            'xdd', rd_batch_id))
                for sj in stmt_list:
                    # Statements arrive as JSON; rebuild objects for hashing
                    # and agent insertion.
                    stmt = Statement._from_json(sj)
                    stmts.append(stmt)
                    sd = DatabaseStatementData(
                        stmt, reading_id, indra_version=self.indra_version)
                    s_rows.add(sd.make_tuple(stmt_batch_id))

        logger.info(f"Dumping {len(r_rows)} readings.")
        db.copy_lazy('reading',
                     r_rows,
                     r_cols,
                     commit=False,
                     constraint='reading-uniqueness')

        logger.info(f"Dumping {len(s_rows)} raw statements.")
        # copy_report_lazy presumably returns the rows it skipped as
        # duplicates -- confirm against the db manager.
        skipped = db.copy_report_lazy('raw_statements',
                                      s_rows,
                                      DatabaseStatementData.get_cols(),
                                      commit=False)
        skipped_uuids = {
            t[DatabaseStatementData.get_cols().index('uuid')]
            for t in skipped
        }
        # Only insert agents for the statements that actually went in.
        new_stmts = [s for s in stmts if s.uuid not in skipped_uuids]
        if len(new_stmts):
            insert_raw_agents(db,
                              stmt_batch_id,
                              new_stmts,
                              verbose=False,
                              commit=False)
        return