Exemplo n.º 1
0
def get_statement_object(db_stmt):
    """Get an INDRA Statement object from a db_stmt."""
    # Accept either raw json bytes or a db object carrying them.
    json_bytes = db_stmt if isinstance(db_stmt, bytes) else db_stmt.json
    stmt_json = json.loads(json_bytes.decode('utf-8'))
    return Statement._from_json(stmt_json)
Exemplo n.º 2
0
def test_conversion_keying():
    """Smoke-test that a Conversion statement can be keyed without error."""
    conv_json = {
        "type": "Conversion",
        "subj": {"name": "inflammatory response", "db_refs": {}},
        "obj_from": [
            {"name": "KNG1", "db_refs": {"HGNC": "6383", "UP": "P01042"}}
        ],
        "obj_to": [
            {"name": "Kallidin", "db_refs": {"SCHEM": "Kallidin"}}
        ],
        "id": "d2361669-dfe5-45e0-914a-c96a82ad25fb"
    }
    conv_stmts = [Statement._from_json(conv_json)]
    conv_stmts[0].agent_list()
    list(_get_keyed_stmts(conv_stmts))
    return
Exemplo n.º 3
0
def _choose_unique(not_duplicates, get_full_stmts, stmt_tpl_grp):
    """Choose one of the statements from a redundant set."""
    assert stmt_tpl_grp, "This cannot be empty."
    if len(stmt_tpl_grp) == 1:
        s_tpl = stmt_tpl_grp[0]
        duplicate_ids = set()
    else:
        stmt_tpl_set = set(stmt_tpl_grp)
        preferred_tpls = {tpl for tpl in stmt_tpl_set
                          if tpl[1] in not_duplicates}
        if not preferred_tpls:
            s_tpl = stmt_tpl_set.pop()
        elif len(preferred_tpls) == 1:
            s_tpl = preferred_tpls.pop()
        else:  # len(preferred_stmts) > 1
            assert False, \
                ("Duplicate deduplicated statements found: %s"
                 % str(preferred_tpls))
        duplicate_ids = {tpl[1] for tpl in stmt_tpl_set
                         if tpl[1] not in not_duplicates}

    if get_full_stmts:
        stmt_json = json.loads(s_tpl[2].decode('utf-8'))
        ret_stmt = Statement._from_json(stmt_json)
    else:
        ret_stmt = s_tpl[1]
    return ret_stmt, duplicate_ids
Exemplo n.º 4
0
 def get_statement(cls, cl_statement):
     """Get an INDRA Statement from cl-json"""
     # First translate the cl-json into plain statement json.
     stmt_json = cls.converter.cl_to_json(cl_statement)
     if not stmt_json:
         return None
     # A list of jsons yields a list of Statements; a dict yields one.
     if isinstance(stmt_json, list):
         return stmts_from_json(stmt_json)
     return Statement._from_json(stmt_json)
Exemplo n.º 5
0
def _get_reading_statement_dict(db, clauses=None, get_full_stmts=True):
    """Get a nested dict of statements, keyed by ref, content, and reading."""
    # Build the metadata query joining text refs, content, readings, and
    # raw statements.
    q = (db.session.query(db.TextRef, db.TextContent.id,
                          db.TextContent.source, db.Reading.id,
                          db.Reading.reader_version, db.RawStatements.id,
                          db.RawStatements.json)
         .filter(db.RawStatements.reading_id == db.Reading.id,
                 db.Reading.text_content_id == db.TextContent.id,
                 db.TextContent.text_ref_id == db.TextRef.id))
    if clauses:
        q = q.filter(*clauses)

    # Counters for evidence-level duplication statistics.
    dup_ev_count = 0
    uniq_ev_count = 0

    # Populate a dict with all the data.
    stmt_nd = NestedDict()
    for tr, tcid, src, rid, rv, sid, sjson in q.yield_per(1000):
        # Recover the reader name from its version string.
        reader = next((rdr for rdr, rv_list in reader_versions.items()
                       if rv in rv_list), None)
        if reader is None:
            raise Exception("rv %s not recognized." % rv)

        # Deserialize the statement and attach the text ref to its evidence.
        stmt = Statement._from_json(json.loads(sjson.decode('utf8')))
        _set_evidence_text_ref(stmt, tr)

        # Hash the combined stmt and evidence matches key.
        stmt_hash = stmt.get_hash(shallow=False)

        # Leaf dict for this (ref, source, content, reader, version,
        # reading) path.
        s_dict = stmt_nd[tr.id][src][tcid][reader][rv][rid]

        # Initialize the value to a set, and count duplicates.
        if stmt_hash not in s_dict.keys():
            s_dict[stmt_hash] = set()
            uniq_ev_count += 1
        else:
            dup_ev_count += 1

        # Keep the full statement only when requested; always keep the id.
        s_dict[stmt_hash].add((sid, stmt if get_full_stmts else None))

    # Report on the results.
    print("Found %d relevant text refs with statements." % len(stmt_nd))
    print("number of statement exact duplicates: %d" % dup_ev_count)
    print("number of unique statements: %d" % uniq_ev_count)
    return stmt_nd
Exemplo n.º 6
0
    def dump_statements(self, db):
        """Copy the accumulated text content, readings, and raw statements
        into the database, then record the update.

        The steps are order-dependent: text content must be dumped first so
        that tcids exist to be looked up for the reading rows.
        """
        # Dump the text content rows first so tcids can be looked up below.
        tc_rows = set(self.text_content.values())
        tc_cols = ('text_ref_id', 'source', 'format', 'text_type', 'preprint')
        logger.info(f"Dumping {len(tc_rows)} text content.")
        db.copy_lazy('text_content', tc_rows, tc_cols)

        # Look up tcids for newly entered content.
        tcids = db.select_all(
            [db.TextContent.text_ref_id, db.TextContent.id],
            db.TextContent.text_ref_id.in_(self.statements.keys()),
            db.TextContent.format == 'xdd'
        )
        # NOTE(review): assumes at most one 'xdd' content row per text ref;
        # a later row would silently overwrite an earlier one here — confirm.
        tcid_lookup = {trid: tcid for trid, tcid in tcids}

        # Compile reading and statements into rows.
        r_rows = set()
        r_cols = ('id', 'text_content_id', 'reader', 'reader_version',
                  'format', 'batch_id')
        s_rows = set()
        rd_batch_id = db.make_copy_batch_id()
        stmt_batch_id = db.make_copy_batch_id()
        stmts = []
        for trid, trid_set in self.statements.items():
            for reader, stmt_list in trid_set.items():
                tcid = tcid_lookup[trid]
                reader_version = self.reader_versions[reader.upper()]
                # Reading ids are derived from (tcid, reader, version).
                reading_id = generate_reading_id(tcid, reader, reader_version)
                r_rows.add((reading_id, tcid, reader.upper(), reader_version,
                            'xdd', rd_batch_id))
                for sj in stmt_list:
                    stmt = Statement._from_json(sj)
                    stmts.append(stmt)
                    sd = DatabaseStatementData(
                        stmt,
                        reading_id,
                        indra_version=self.indra_version
                    )
                    s_rows.add(sd.make_tuple(stmt_batch_id))

        logger.info(f"Dumping {len(r_rows)} readings.")
        db.copy_lazy('reading', r_rows, r_cols, commit=False)

        logger.info(f"Dumping {len(s_rows)} raw statements.")
        db.copy_lazy('raw_statements', s_rows,
                     DatabaseStatementData.get_cols(), commit=False)
        # Agents are inserted under the same batch id as the statements.
        if len(stmts):
            insert_raw_agents(db, stmt_batch_id, stmts, verbose=False,
                              commit=False)

        # Record which reader and INDRA versions were used for each group.
        update_rows = [(json.dumps(self.reader_versions), self.indra_version,
                        group.key[:-1])
                       for group in self.groups]
        db.copy('xdd_updates', update_rows,
                ('reader_versions', 'indra_version', 'day_str'))
        return
Exemplo n.º 7
0
def test_conversion_keying():
    """Check that Conversion statements survive the keying round-trip."""
    stmt_json = {
        "type": "Conversion",
        "subj": {
            "name": "inflammatory response",
            "db_refs": {},
        },
        "obj_from": [{
            "name": "KNG1",
            "db_refs": {"HGNC": "6383", "UP": "P01042"},
        }],
        "obj_to": [{
            "name": "Kallidin",
            "db_refs": {"SCHEM": "Kallidin"},
        }],
        "id": "d2361669-dfe5-45e0-914a-c96a82ad25fb",
    }
    conv_stmt = Statement._from_json(stmt_json)
    conv_stmt.agent_list()
    list(_get_keyed_stmts([conv_stmt]))
    return
Exemplo n.º 8
0
    def insert_the_statements(self, input_tuples):
        """Copy the given raw statement tuples into the test database."""
        print("Loading %d statements..." % len(input_tuples))
        # Each row is extended with the hash of its first evidence source.
        cols = self.test_data['raw_statements']['cols'] + ('source_hash', )
        rows = []
        for tpl in input_tuples:
            stmt = Statement._from_json(json.loads(tpl[-1].decode('utf-8')))
            rows.append(tpl + (stmt.evidence[0].get_source_hash(), ))

        self.test_db.copy('raw_statements', rows, cols)
        print("Inserting agents...")
        dbu.insert_agents(self.test_db, 'raw')
        return
Exemplo n.º 9
0
def _get_input_stmt_tuples(num_stmts):
    """Prepare raw statement tuples with computed hashes for a db copy."""
    print("\tPrepping the raw statements...")
    stmt_tuples, col_names = _get_stmt_tuples(num_stmts)
    copy_col_names = ('uuid', 'mk_hash', 'type', 'indra_version', 'json',
                      'reading_id', 'db_info_id')
    copy_stmt_tuples = []
    for tpl in stmt_tuples:
        entry = dict(zip(col_names, tpl))
        # The matches-key hash is computed from the deserialized statement.
        stmt = Statement._from_json(json.loads(entry['json'].decode('utf-8')))
        entry['mk_hash'] = stmt.get_hash()
        copy_stmt_tuples.append(tuple(entry[col] for col in copy_col_names))
    return copy_stmt_tuples, copy_col_names
Exemplo n.º 10
0
def _json_str_to_stmts_dict(json_str):
    """Make a dict of statements keyed by their uuid's from json representation

    This function is the inverse of _stmts_dict_to_json_str()

    Parameters
    ----------
    json_str : str
        A json compatible string

    Returns
    -------
    stmt_dict : dict
        Dict with statements keyed by their uuid's: {uuid: stmt}
    """
    deserialized = (Statement._from_json(sj) for sj in json.loads(json_str))
    return {stmt.uuid: stmt for stmt in deserialized}
Exemplo n.º 11
0
def str_imp(o, uuid=None, other_stmt_keys=None):
    """Render a short one-line summary of a db object, keyed by class name."""
    if o is None:
        return '~'
    cname = o.__class__.__name__
    if cname == 'TextRef':
        return '<TextRef: trid: %s, pmid: %s, pmcid: %s>' \
            % (o.id, o.pmid, o.pmcid)
    elif cname == 'TextContent':
        return '<TextContent: tcid: %s, trid: %s, src: %s>' \
            % (o.id, o.text_ref_id, o.source)
    elif cname == 'Reading':
        return '<Reading: rid: %s, tcid: %s, reader: %s, rv: %s>' \
            % (o.id, o.text_content_id, o.reader, o.reader_version)
    elif cname == 'RawStatements':
        stmt = Statement._from_json(json.loads(o.json.decode()))
        s_str = ('<RawStmt: %s sid: %s, uuid: %s, type: %s, iv: %s, hash: %s>'
                 % (str(stmt), o.id, o.uuid, o.type, o.indra_version,
                    o.mk_hash))
        # Prefix '+' for statements whose hash matches another key set,
        # and '*' for the statement matching the requested uuid.
        if other_stmt_keys and stmt.get_hash(shallow=True) in other_stmt_keys:
            s_str = '+' + s_str
        if stmt.uuid == uuid:
            s_str = '*' + s_str
        return s_str
Exemplo n.º 12
0
def get_filtered_db_stmts(db, get_full_stmts=False, clauses=None):
    """Get the set of statements/ids from databases minus exact duplicates."""
    # Select the json column only if full statements are requested.
    cols = [db.RawStatements.json] if get_full_stmts \
        else [db.RawStatements.id]

    db_s_q = db.filter_query(cols, db.RawStatements.db_info_id.isnot(None))

    # Narrow the query with any caller-supplied criteria.
    if clauses:
        db_s_q = db_s_q.filter(*clauses)

    # Stream the rows and collapse them into a set (de-duplicating).
    db_stmt_data = db_s_q.yield_per(10000)
    if not get_full_stmts:
        return {sid for sid, in db_stmt_data}
    return {Statement._from_json(json.loads(sj.decode('utf-8')))
            for sj, in db_stmt_data}
Exemplo n.º 13
0
def _get_statement_object(db_stmt):
    """Get an INDRA Statement object from a db_stmt."""
    stmt_json = json.loads(db_stmt.json.decode('utf-8'))
    return Statement._from_json(stmt_json)
Exemplo n.º 14
0
def _stmt_from_json(stmt_json_bytes):
    """Deserialize raw statement json bytes into an INDRA Statement."""
    stmt_json = json.loads(stmt_json_bytes.decode('utf-8'))
    return Statement._from_json(stmt_json)
Exemplo n.º 15
0
def distill_stmts_from_reading(db, get_full_stmts=False, clauses=None):
    """Get a corpus of statements from clauses and filters duplicate evidence.

    Note that this will only get statements from reading.

    Parameters
    ----------
    db : :py:class:`DatabaseManager`
        A database manager instance to access the database.
    get_full_stmts : bool
        By default (False), only Statement ids (the primary index of Statements
        on the database) are returned. However, if set to True, serialized
        INDRA Statements will be returned. Note that this will in general be
        VERY large in memory, and therefore should be used with caution.
    clauses : None or list of sqlalchemy clauses
        By default None. Specify sqlalchemy clauses to reduce the scope of
        statements, e.g. `clauses=[db.Statements.type == 'Phosphorylation']` or
        `clauses=[db.Statements.uuid.in_([<uuids>])]`.

    Returns
    -------
    stmt_dn : NestedDict
        A deeply nested recursive dictionary, carrying the metadata for the
        Statements.
    stmt_ret : set
        A set of either statement ids or serialized statements, depending on
        `get_full_stmts`.
    """
    # Construct the query for metadata from the database.
    q = (db.session.query(db.TextContent.text_ref_id, db.TextContent.id,
                          db.TextContent.source, db.Readings.id,
                          db.Readings.reader_version, db.Statements.id,
                          db.Statements.json).filter(
                              db.TextContent.id == db.Readings.text_content_id,
                              db.Readings.id == db.Statements.reader_ref))
    if clauses:
        # Bug fix: Query.filter returns a *new* query object, so the result
        # must be reassigned; previously the clauses were silently ignored.
        q = q.filter(*clauses)

    # Specify sources of fulltext content, and order priorities.
    full_text_content = ['manuscripts', 'pmc_oa', 'elsevier']

    # Specify versions of readers, and preference.
    sparser_versions = ['sept14-linux\n', 'sept14-linux']
    reach_versions = ['61059a-biores-e9ee36', '1.3.3-61059a-biores-']

    # Prime some counters.
    num_duplicate_evidence = 0
    num_unique_evidence = 0

    # Populate a dict with all the data.
    stmt_nd = NestedDict()
    for trid, tcid, src, rid, rv, sid, sjson in q.yield_per(1000):
        # Back out the reader name from the reader version string.
        if rv in sparser_versions:
            reader = 'sparser'
        elif rv in reach_versions:
            reader = 'reach'
        else:
            raise Exception("rv %s not recognized." % rv)

        # Get the json for comparison and/or storage
        stmt_json = json.loads(sjson.decode('utf8'))
        stmt = Statement._from_json(stmt_json)

        # Hash the combined stmt and evidence matches key.
        m_key = stmt.matches_key() + stmt.evidence[0].matches_key()
        stmt_hash = hash(m_key)

        # For convenience get the endpoint statement dict
        s_dict = stmt_nd[trid][src][tcid][reader][rv][rid]

        # Initialize the value to a set, and count duplicates
        if stmt_hash not in s_dict.keys():
            s_dict[stmt_hash] = set()
            num_unique_evidence += 1
        else:
            num_duplicate_evidence += 1

        # Either store the statement, or the statement id.
        if get_full_stmts:
            s_dict[stmt_hash].add(stmt)
        else:
            s_dict[stmt_hash].add(sid)

    # Report on the results.
    print("Found %d relevant text refs with statements." % len(stmt_nd))
    print("number of statement exact duplicates: %d" % num_duplicate_evidence)
    print("number of unique statements: %d" % num_unique_evidence)

    # Now we filter and get the set of statements/statement ids.
    stmts = set()
    for trid, src_dict in stmt_nd.items():
        # Filter out unneeded fulltext: keep at most one non-pubmed source.
        while sum([k != 'pubmed' for k in src_dict.keys()]) > 1:
            # NOTE(review): `x` iterates the keys of `src_dict`, which the
            # loop above treats as source-name strings ('pubmed', etc.), so
            # `x[0]` is a single character and never appears in
            # `full_text_content` — this would raise ValueError when reached.
            # The intent was presumably `full_text_content.index(x)`; confirm
            # the key structure before changing.
            worst_src = min(src_dict,
                            key=lambda x: full_text_content.index(x[0]))
            del src_dict[worst_src]

        # Filter out the older reader versions, keeping only the best one.
        for reader, rv_list in [('reach', reach_versions),
                                ('sparser', sparser_versions)]:
            for rv_dict in src_dict.gets(reader):
                best_rv = max(rv_dict, key=lambda x: rv_list.index(x))

                # Take any one of the duplicates. Statements/Statement ids are
                # already grouped into sets of duplicates keyed by the
                # Statement and Evidence matches key hashes. We only want one
                # of each.
                stmts |= {(ev_hash, list(ev_set)[0])
                          for ev_hash, ev_set in rv_dict[best_rv].items()}

    return stmt_nd, stmts
Exemplo n.º 16
0
    def dump_statements(self, db):
        """Copy text content, readings, and raw statements into the db,
        skipping agents for any statements the copy reports as skipped.

        The steps are order-dependent: text content must be dumped first so
        that tcids exist to be looked up for the reading rows.
        """
        from indra_db.reading.read_db import DatabaseStatementData, \
            generate_reading_id
        # Dump the text content rows first so tcids can be looked up below.
        tc_rows = set(self.text_content.values())
        tc_cols = ('text_ref_id', 'source', 'format', 'text_type', 'preprint')
        logger.info(f"Dumping {len(tc_rows)} text content.")
        db.copy_lazy('text_content', tc_rows, tc_cols)

        # Look up tcids for newly entered content.
        tcids = db.select_all([db.TextContent.text_ref_id, db.TextContent.id],
                              db.TextContent.text_ref_id.in_(
                                  self.statements.keys()),
                              db.TextContent.format == 'xdd')
        # NOTE(review): assumes at most one 'xdd' content row per text ref;
        # a later row would silently overwrite an earlier one here — confirm.
        tcid_lookup = {trid: tcid for trid, tcid in tcids}

        # Compile reading and statements into rows.
        r_rows = set()
        r_cols = ('id', 'text_content_id', 'reader', 'reader_version',
                  'format', 'batch_id')
        s_rows = set()
        rd_batch_id = db.make_copy_batch_id()
        stmt_batch_id = db.make_copy_batch_id()
        stmts = []
        for trid, trid_set in self.statements.items():
            for reader, stmt_list in trid_set.items():
                tcid = tcid_lookup[trid]
                reader_version = self.reader_versions[reader.upper()]
                # Reading ids are derived from (tcid, reader, version).
                reading_id = generate_reading_id(tcid, reader, reader_version)
                r_rows.add((reading_id, tcid, reader.upper(), reader_version,
                            'xdd', rd_batch_id))
                for sj in stmt_list:
                    stmt = Statement._from_json(sj)
                    stmts.append(stmt)
                    sd = DatabaseStatementData(
                        stmt, reading_id, indra_version=self.indra_version)
                    s_rows.add(sd.make_tuple(stmt_batch_id))

        logger.info(f"Dumping {len(r_rows)} readings.")
        db.copy_lazy('reading',
                     r_rows,
                     r_cols,
                     commit=False,
                     constraint='reading-uniqueness')

        logger.info(f"Dumping {len(s_rows)} raw statements.")
        # copy_report_lazy returns the rows that were NOT inserted; use the
        # uuid column of those rows to avoid inserting agents for them.
        skipped = db.copy_report_lazy('raw_statements',
                                      s_rows,
                                      DatabaseStatementData.get_cols(),
                                      commit=False)
        skipped_uuids = {
            t[DatabaseStatementData.get_cols().index('uuid')]
            for t in skipped
        }
        # Only insert agents for statements that actually made it in.
        new_stmts = [s for s in stmts if s.uuid not in skipped_uuids]
        if len(new_stmts):
            insert_raw_agents(db,
                              stmt_batch_id,
                              new_stmts,
                              verbose=False,
                              commit=False)
        return