Example #1
def get_rows_from_statement(
    statement: Statement,
    allow_duplicates: bool = True,
    keep_only_pmids: Union[None, str, Collection[str]] = None,
) -> Iterable[Row]:
    """Convert an INDRA statement into an iterable of BEL curation rows.

    :param statement: The INDRA statement
    :param allow_duplicates: If true, keep multiple evidences for the same
     INDRA statement
    :param keep_only_pmids: If given, only keep evidences from this PMID (or
     collection of PMIDs). Warning: there might still be multiple evidences.
    """
    # Drop evidences that should not be kept (e.g., those from BioPax)
    statement.evidence = [e for e in statement.evidence if _keep_evidence(e)]

    if not statement.evidence:
        return iter([])

    if isinstance(keep_only_pmids, str):
        keep_only_pmids = {keep_only_pmids}
    if keep_only_pmids is not None:
        statement.evidence = [
            evidence for evidence in statement.evidence
            if evidence.pmid in keep_only_pmids
        ]
    # Even after filtering by PMID, several evidences from the same
    # document may remain; if duplicates are not allowed, keep only
    # the first remaining evidence.
    if not allow_duplicates:
        del statement.evidence[1:]

    yield from _get_rows_from_statement(statement)
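A minimal usage sketch, assuming the surrounding module (with _keep_evidence, Row, and _get_rows_from_statement) is available; the statement, PMIDs, and evidence texts below are made up for illustration:

# Hypothetical usage: keep at most one evidence, only from PMID 12345.
from indra.statements import Agent, Evidence, Phosphorylation

stmt = Phosphorylation(
    Agent('MAP2K1'), Agent('MAPK1'),
    evidence=[Evidence(source_api='reach', pmid='12345',
                       text='MEK phosphorylates ERK.'),
              Evidence(source_api='sparser', pmid='67890',
                       text='ERK is phosphorylated by MEK.')])

# A single PMID string is normalized to a set internally.
rows = list(get_rows_from_statement(stmt, allow_duplicates=False,
                                    keep_only_pmids='12345'))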
Example #2
def read_stmts(fname):
    jsons = read_jsons(fname)
    stmts = []
    for js in jsons:
        st = Statement.from_json(json.dumps(js))
        stmts.append(st)
    return stmts
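For context, a sketch of producing a file this function could read back, assuming read_jsons yields one JSON object per line of the file:

import json
from indra.statements import Agent, Evidence, Phosphorylation

stmts = [Phosphorylation(Agent('MAP2K1'), Agent('MAPK1'),
                         evidence=[Evidence(pmid='12345')])]
with open('stmts.jsonl', 'w') as fh:
    for stmt in stmts:
        # One statement JSON per line; read_stmts reverses this.
        fh.write(json.dumps(stmt.to_json()) + '\n')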
Example #3
def get_statement_object(db_stmt):
    """Get an INDRA Statement object from a db_stmt."""
    if isinstance(db_stmt, bytes):
        jb = db_stmt
    else:
        jb = db_stmt.json
    return Statement._from_json(json.loads(jb.decode('utf-8')))
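A round-trip sketch of what this helper undoes, since Statement._from_json is the inverse of Statement.to_json:

import json
from indra.statements import Agent, Phosphorylation, Statement

stmt = Phosphorylation(Agent('MAP2K1'), Agent('MAPK1'))
jb = json.dumps(stmt.to_json()).encode('utf-8')  # plays the role of db_stmt.json
assert stmt.matches(Statement._from_json(json.loads(jb.decode('utf-8'))))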
Example #4
def test_conversion_keying():
    stmt_json = {
        "type": "Conversion",
        "subj": {
            "name": "inflammatory response",
            "db_refs": {}
        },
        "obj_from": [{
            "name": "KNG1",
            "db_refs": {
                "HGNC": "6383",
                "UP": "P01042"
            }
        }],
        "obj_to": [{
            "name": "Kallidin",
            "db_refs": {
                "SCHEM": "Kallidin"
            }
        }],
        "id": "d2361669-dfe5-45e0-914a-c96a82ad25fb"
    }
    stmt_list = [Statement._from_json(stmt_json)]
    stmt_list[0].agent_list()
    list(_get_keyed_stmts(stmt_list))
    return
Example #5
def extend_refinements_graph(
    g: networkx.DiGraph,
    stmt: Statement,
    less_specifics: List[int],
    matches_fun: Optional[Callable[[Statement], str]] = None,
) -> networkx.DiGraph:
    """Extend refinements graph with a new statement and its refinements.

    Parameters
    ----------
    g :
        A refinements graph to be extended.
    stmt :
        The statement to be added to the refinements graph.
    less_specifics :
        A list of statement hashes of statements that are refined
        by this statement (i.e., are less specific versions of it).
    matches_fun :
        An optional function to calculate the matches key and hash of a
        given statement. Default: None
    """
    sh = stmt.get_hash(matches_fun=matches_fun)
    g.add_node(sh, stmt=stmt)
    for less_spec in less_specifics:
        g.add_edge(less_spec, sh)
    return g
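A minimal sketch of extending a refinements graph, where a more specific statement refines a general one; the statements here are illustrative:

import networkx
from indra.statements import Agent, Phosphorylation

general = Phosphorylation(None, Agent('MAPK1'))  # enzyme unspecified
specific = Phosphorylation(Agent('MAP2K1'), Agent('MAPK1'))

g = networkx.DiGraph()
g.add_node(general.get_hash(), stmt=general)
g = extend_refinements_graph(g, specific,
                             less_specifics=[general.get_hash()])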
Example #6
def _choose_unique(not_duplicates, get_full_stmts, stmt_tpl_grp):
    """Choose one of the statements from a redundant set."""
    assert stmt_tpl_grp, "This cannot be empty."
    if len(stmt_tpl_grp) == 1:
        s_tpl = stmt_tpl_grp[0]
        duplicate_ids = set()
    else:
        stmt_tpl_set = set(stmt_tpl_grp)
        preferred_tpls = {tpl for tpl in stmt_tpl_set
                          if tpl[1] in not_duplicates}
        if not preferred_tpls:
            s_tpl = stmt_tpl_set.pop()
        elif len(preferred_tpls) == 1:
            s_tpl = preferred_tpls.pop()
        else:  # len(preferred_tpls) > 1
            assert False, \
                ("Duplicate deduplicated statements found: %s"
                 % str(preferred_tpls))
        duplicate_ids = {tpl[1] for tpl in stmt_tpl_set
                         if tpl[1] not in not_duplicates}

    if get_full_stmts:
        stmt_json = json.loads(s_tpl[2].decode('utf-8'))
        ret_stmt = Statement._from_json(stmt_json)
    else:
        ret_stmt = s_tpl[1]
    return ret_stmt, duplicate_ids
Example #7
    def get_statement(cls, cl_statement):
        """Get an INDRA Statement from cl-json."""
        stmt_json = cls.converter.cl_to_json(cl_statement)
        if not stmt_json:
            return None
        elif isinstance(stmt_json, list):
            return stmts_from_json(stmt_json)
        else:
            return Statement._from_json(stmt_json)
Example #8
def _get_reading_statement_dict(db, clauses=None, get_full_stmts=True):
    """Get a nested dict of statements, keyed by ref, content, and reading."""
    # Construct the query for metadata from the database.
    q = (db.session.query(db.TextRef, db.TextContent.id,
                          db.TextContent.source, db.Reading.id,
                          db.Reading.reader_version, db.RawStatements.id,
                          db.RawStatements.json)
         .filter(db.RawStatements.reading_id == db.Reading.id,
                 db.Reading.text_content_id == db.TextContent.id,
                 db.TextContent.text_ref_id == db.TextRef.id))
    if clauses:
        q = q.filter(*clauses)

    # Prime some counters.
    num_duplicate_evidence = 0
    num_unique_evidence = 0

    # Populate a dict with all the data.
    stmt_nd = NestedDict()
    for tr, tcid, src, rid, rv, sid, sjson in q.yield_per(1000):
        # Back out the reader name.
        for reader, rv_list in reader_versions.items():
            if rv in rv_list:
                break
        else:
            raise Exception("rv %s not recognized." % rv)

        # Get the json for comparison and/or storage
        stmt_json = json.loads(sjson.decode('utf8'))
        stmt = Statement._from_json(stmt_json)
        _set_evidence_text_ref(stmt, tr)

        # Hash the combined stmt and evidence matches key.
        stmt_hash = stmt.get_hash(shallow=False)

        # For convenience get the endpoint statement dict
        s_dict = stmt_nd[tr.id][src][tcid][reader][rv][rid]

        # Initialize the value to a set, and count duplicates
        if stmt_hash not in s_dict.keys():
            s_dict[stmt_hash] = set()
            num_unique_evidence += 1
        else:
            num_duplicate_evidence += 1

        # Either store the statement, or the statement id.
        if get_full_stmts:
            s_dict[stmt_hash].add((sid, stmt))
        else:
            s_dict[stmt_hash].add((sid, None))

    # Report on the results.
    print("Found %d relevant text refs with statements." % len(stmt_nd))
    print("number of statement exact duplicates: %d" % num_duplicate_evidence)
    print("number of unique statements: %d" % num_unique_evidence)
    return stmt_nd
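A hedged call sketch, assuming db is an indra_db DatabaseManager; the PMID clause is illustrative:

# Restrict the nested dict to statements read from a single paper.
stmt_nd = _get_reading_statement_dict(
    db, clauses=[db.TextRef.pmid == '12345'], get_full_stmts=False)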
Example #9
    def dump_statements(self, db):
        tc_rows = set(self.text_content.values())
        tc_cols = ('text_ref_id', 'source', 'format', 'text_type', 'preprint')
        logger.info(f"Dumping {len(tc_rows)} text content.")
        db.copy_lazy('text_content', tc_rows, tc_cols)

        # Look up tcids for newly entered content.
        tcids = db.select_all(
            [db.TextContent.text_ref_id, db.TextContent.id],
            db.TextContent.text_ref_id.in_(self.statements.keys()),
            db.TextContent.format == 'xdd'
        )
        tcid_lookup = {trid: tcid for trid, tcid in tcids}

        # Compile reading and statements into rows.
        r_rows = set()
        r_cols = ('id', 'text_content_id', 'reader', 'reader_version',
                  'format', 'batch_id')
        s_rows = set()
        rd_batch_id = db.make_copy_batch_id()
        stmt_batch_id = db.make_copy_batch_id()
        stmts = []
        for trid, trid_set in self.statements.items():
            for reader, stmt_list in trid_set.items():
                tcid = tcid_lookup[trid]
                reader_version = self.reader_versions[reader.upper()]
                reading_id = generate_reading_id(tcid, reader, reader_version)
                r_rows.add((reading_id, tcid, reader.upper(), reader_version,
                            'xdd', rd_batch_id))
                for sj in stmt_list:
                    stmt = Statement._from_json(sj)
                    stmts.append(stmt)
                    sd = DatabaseStatementData(
                        stmt,
                        reading_id,
                        indra_version=self.indra_version
                    )
                    s_rows.add(sd.make_tuple(stmt_batch_id))

        logger.info(f"Dumping {len(r_rows)} readings.")
        db.copy_lazy('reading', r_rows, r_cols, commit=False)

        logger.info(f"Dumping {len(s_rows)} raw statements.")
        db.copy_lazy('raw_statements', s_rows,
                     DatabaseStatementData.get_cols(), commit=False)
        if len(stmts):
            insert_raw_agents(db, stmt_batch_id, stmts, verbose=False,
                              commit=False)

        update_rows = [(json.dumps(self.reader_versions), self.indra_version,
                        group.key[:-1])
                       for group in self.groups]
        db.copy('xdd_updates', update_rows,
                ('reader_versions', 'indra_version', 'day_str'))
        return
Example #10
def _simple_scorer_update(G, edge):
    evidence_list = []
    for stmt_data in G.edges[edge]['statements']:
        for k, v in stmt_data['source_counts'].items():
            if k in db_source_mapping:
                s = db_source_mapping[k]
            else:
                s = k
            for _ in range(v):
                evidence_list.append(Evidence(source_api=s))
    return simple_scorer.score_statement(st=Statement(evidence=evidence_list))
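For reference, a sketch of the underlying scorer call, assuming simple_scorer is an indra.belief.SimpleScorer instance as in the module above; the evidence counts are made up:

from indra.belief import SimpleScorer
from indra.statements import Evidence, Statement

simple_scorer = SimpleScorer()
# Three evidences from one source, as if source_counts were {'reach': 3}.
evidence_list = [Evidence(source_api='reach') for _ in range(3)]
belief = simple_scorer.score_statement(st=Statement(evidence=evidence_list))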
Example #11
    def insert_the_statements(self, input_tuples):
        print("Loading %d statements..." % len(input_tuples))
        cols = self.test_data['raw_statements']['cols'] + ('source_hash', )
        new_input_tuples = []
        for t in input_tuples:
            s = Statement._from_json(json.loads(t[-1].decode('utf-8')))
            t += (s.evidence[0].get_source_hash(), )
            new_input_tuples.append(t)

        self.test_db.copy('raw_statements', new_input_tuples, cols)
        print("Inserting agents...")
        dbu.insert_agents(self.test_db, 'raw')
        return
Example #12
def _get_input_stmt_tuples(num_stmts):
    print("\tPrepping the raw statements...")
    stmt_tuples, col_names = _get_stmt_tuples(num_stmts)
    copy_col_names = ('uuid', 'mk_hash', 'type', 'indra_version', 'json',
                      'reading_id', 'db_info_id')
    copy_stmt_tuples = []
    for tpl in stmt_tuples:
        entry_dict = dict(zip(col_names, tpl))
        json_bytes = entry_dict['json']
        stmt = Statement._from_json(json.loads(json_bytes.decode('utf-8')))
        entry_dict['mk_hash'] = stmt.get_hash()
        ret_tpl = tuple([entry_dict[col] for col in copy_col_names])
        copy_stmt_tuples.append(ret_tpl)
    return copy_stmt_tuples, copy_col_names
Example #13
def _json_str_to_stmts_dict(json_str):
    """Make a dict of statements keyed by their UUIDs from a JSON string.

    This function is the inverse of _stmts_dict_to_json_str().

    Parameters
    ----------
    json_str : str
        A JSON-compatible string

    Returns
    -------
    stmt_dict : dict
        Dict with statements keyed by their UUIDs: {uuid: stmt}
    """
    stmt_jsons = json.loads(json_str)
    stmts = [Statement._from_json(s) for s in stmt_jsons]
    return {s.uuid: s for s in stmts}
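The inverse mentioned in the docstring might look roughly like this; a sketch, not the actual _stmts_dict_to_json_str:

import json

def _stmts_dict_to_json_str_sketch(stmt_dict):
    # Statement.to_json() records the uuid, so it is recovered on load.
    return json.dumps([stmt.to_json() for stmt in stmt_dict.values()])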
Example #14
def _simple_scorer_update(G, edge):
    evidence_list = []
    for stmt_data in G.edges[edge]['statements']:
        for k, v in stmt_data['source_counts'].items():
            if k in db_source_mapping:
                s = db_source_mapping[k]
            else:
                s = k
            for _ in range(v):
                evidence_list.append(Evidence(source_api=s))

    try:
        ag_belief = simple_scorer.score_statement(
                                            Statement(evidence=evidence_list))
    # Catch underflow
    except FloatingPointError as err:
        # Numpy precision
        NP_PRECISION = 10 ** -np.finfo(np.longfloat).precision
        logger.warning('%s: Resetting ag_belief to 10*np.longfloat precision '
                       '(%.0e)' % (err, Decimal(NP_PRECISION * 10)))
        ag_belief = NP_PRECISION * 10
    return ag_belief
Example #15
def str_imp(o, uuid=None, other_stmt_keys=None):
    if o is None:
        return '~'
    cname = o.__class__.__name__
    if cname == 'TextRef':
        return ('<TextRef: trid: %s, pmid: %s, pmcid: %s>'
                % (o.id, o.pmid, o.pmcid))
    if cname == 'TextContent':
        return ('<TextContent: tcid: %s, trid: %s, src: %s>'
                % (o.id, o.text_ref_id, o.source))
    if cname == 'Reading':
        return ('<Reading: rid: %s, tcid: %s, reader: %s, rv: %s>'
                % (o.id, o.text_content_id, o.reader, o.reader_version))
    if cname == 'RawStatements':
        s = Statement._from_json(json.loads(o.json.decode()))
        s_str = ('<RawStmt: %s sid: %s, uuid: %s, type: %s, iv: %s, hash: %s>'
                 % (str(s), o.id, o.uuid, o.type, o.indra_version, o.mk_hash))
        if other_stmt_keys and s.get_hash(shallow=True) in other_stmt_keys:
            s_str = '+' + s_str
        if s.uuid == uuid:
            s_str = '*' + s_str
        return s_str
Example #16
def get_filtered_db_stmts(db, get_full_stmts=False, clauses=None):
    """Get the set of statements/ids from databases minus exact duplicates."""
    # Only get the json if it's going to be used.
    if get_full_stmts:
        tbl_list = [db.RawStatements.json]
    else:
        tbl_list = [db.RawStatements.id]

    db_s_q = db.filter_query(tbl_list, db.RawStatements.db_info_id.isnot(None))

    # Add any other criterion specified at higher levels.
    if clauses:
        db_s_q = db_s_q.filter(*clauses)

    # Produce a generator of statement groups.
    db_stmt_data = db_s_q.yield_per(10000)
    if get_full_stmts:
        return {
            Statement._from_json(json.loads(s_json.decode('utf-8')))
            for s_json, in db_stmt_data
        }
    else:
        return {sid for sid, in db_stmt_data}
Example #17
def _get_statement_object(db_stmt):
    """Get an INDRA Statement object from a db_stmt."""
    return Statement._from_json(json.loads(db_stmt.json.decode('utf-8')))
Example #18
    def dump_statements(self, db):
        from indra_db.reading.read_db import DatabaseStatementData, \
            generate_reading_id
        tc_rows = set(self.text_content.values())
        tc_cols = ('text_ref_id', 'source', 'format', 'text_type', 'preprint')
        logger.info(f"Dumping {len(tc_rows)} text content.")
        db.copy_lazy('text_content', tc_rows, tc_cols)

        # Look up tcids for newly entered content.
        tcids = db.select_all([db.TextContent.text_ref_id, db.TextContent.id],
                              db.TextContent.text_ref_id.in_(
                                  self.statements.keys()),
                              db.TextContent.format == 'xdd')
        tcid_lookup = {trid: tcid for trid, tcid in tcids}

        # Compile reading and statements into rows.
        r_rows = set()
        r_cols = ('id', 'text_content_id', 'reader', 'reader_version',
                  'format', 'batch_id')
        s_rows = set()
        rd_batch_id = db.make_copy_batch_id()
        stmt_batch_id = db.make_copy_batch_id()
        stmts = []
        for trid, trid_set in self.statements.items():
            for reader, stmt_list in trid_set.items():
                tcid = tcid_lookup[trid]
                reader_version = self.reader_versions[reader.upper()]
                reading_id = generate_reading_id(tcid, reader, reader_version)
                r_rows.add((reading_id, tcid, reader.upper(), reader_version,
                            'xdd', rd_batch_id))
                for sj in stmt_list:
                    stmt = Statement._from_json(sj)
                    stmts.append(stmt)
                    sd = DatabaseStatementData(
                        stmt, reading_id, indra_version=self.indra_version)
                    s_rows.add(sd.make_tuple(stmt_batch_id))

        logger.info(f"Dumping {len(r_rows)} readings.")
        db.copy_lazy('reading',
                     r_rows,
                     r_cols,
                     commit=False,
                     constraint='reading-uniqueness')

        logger.info(f"Dumping {len(s_rows)} raw statements.")
        skipped = db.copy_report_lazy('raw_statements',
                                      s_rows,
                                      DatabaseStatementData.get_cols(),
                                      commit=False)
        skipped_uuids = {
            t[DatabaseStatementData.get_cols().index('uuid')]
            for t in skipped
        }
        new_stmts = [s for s in stmts if s.uuid not in skipped_uuids]
        if len(new_stmts):
            insert_raw_agents(db,
                              stmt_batch_id,
                              new_stmts,
                              verbose=False,
                              commit=False)
        return
Example #19
def fix_invalidities_stmt(stmt: Statement):
    """Fix invalidities of a single INDRA Statement in place."""
    for ev in stmt.evidence:
        fix_invalidities_evidence(ev)
    for agent in stmt.real_agent_list():
        fix_invalidities_agent(agent)
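A usage sketch, assuming the fix_invalidities_evidence and fix_invalidities_agent helpers from the same module are importable; the statement is illustrative:

from indra.statements import Agent, Evidence, Phosphorylation

stmt = Phosphorylation(Agent('MAP2K1'), Agent('MAPK1'),
                       evidence=[Evidence(pmid='12345')])
fix_invalidities_stmt(stmt)  # repairs evidences, then agents, in place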
Example #20
def distill_stmts_from_reading(db, get_full_stmts=False, clauses=None):
    """Get a corpus of statements from clauses and filters duplicate evidence.

    Note that this will only get statements from reading.

    Parameters
    ----------
    db : :py:class:`DatabaseManager`
        A database manager instance to access the database.
    get_full_stmts : bool
        By default (False), only Statement ids (the primary index of Statements
        on the database) are returned. However, if set to True, serialized
        INDRA Statements will be returned. Note that this will in general be
        VERY large in memory, and therefore should be used with caution.
    clauses : None or list of sqlalchemy clauses
        By default None. Specify sqlalchemy clauses to reduce the scope of
        statements, e.g. `clauses=[db.Statements.type == 'Phosphorylation']` or
        `clauses=[db.Statements.uuid.in_([<uuids>])]`.

    Returns
    -------
    stmt_dn : NestedDict
        A deeply nested recursive dictionary, carrying the metadata for the
        Statements.
    stmt_ret : set
        A set of either statement ids or serialized statements, depending on
        `get_full_stmts`.
    """
    # Construct the query for metadata from the database.
    q = (db.session.query(db.TextContent.text_ref_id, db.TextContent.id,
                          db.TextContent.source, db.Readings.id,
                          db.Readings.reader_version, db.Statements.id,
                          db.Statements.json).filter(
                              db.TextContent.id == db.Readings.text_content_id,
                              db.Readings.id == db.Statements.reader_ref))
    if clauses:
        q = q.filter(*clauses)

    # Specify sources of fulltext content, and order priorities.
    full_text_content = ['manuscripts', 'pmc_oa', 'elsevier']

    # Specify versions of readers, and preference.
    sparser_versions = ['sept14-linux\n', 'sept14-linux']
    reach_versions = ['61059a-biores-e9ee36', '1.3.3-61059a-biores-']

    # Prime some counters.
    num_duplicate_evidence = 0
    num_unique_evidence = 0

    # Populate a dict with all the data.
    stmt_nd = NestedDict()
    for trid, tcid, src, rid, rv, sid, sjson in q.yield_per(1000):
        # Back out the reader name.
        if rv in sparser_versions:
            reader = 'sparser'
        elif rv in reach_versions:
            reader = 'reach'
        else:
            raise Exception("rv %s not recognized." % rv)

        # Get the json for comparison and/or storage
        stmt_json = json.loads(sjson.decode('utf8'))
        stmt = Statement._from_json(stmt_json)

        # Hash the combined stmt and evidence matches key.
        m_key = stmt.matches_key() + stmt.evidence[0].matches_key()
        stmt_hash = hash(m_key)

        # For convenience get the endpoint statement dict
        s_dict = stmt_nd[trid][src][tcid][reader][rv][rid]

        # Initialize the value to a set, and count duplicates
        if stmt_hash not in s_dict.keys():
            s_dict[stmt_hash] = set()
            num_unique_evidence += 1
        else:
            num_duplicate_evidence += 1

        # Either store the statement, or the statement id.
        if get_full_stmts:
            s_dict[stmt_hash].add(stmt)
        else:
            s_dict[stmt_hash].add(sid)

    # Report on the results.
    print("Found %d relevant text refs with statements." % len(stmt_nd))
    print("number of statement exact duplicates: %d" % num_duplicate_evidence)
    print("number of unique statements: %d" % num_unique_evidence)

    # Now we filter and get the set of statements/statement ids.
    stmts = set()
    for trid, src_dict in stmt_nd.items():
        # Filter out unneeded fulltext: drop lower-priority full text
        # sources until at most one non-pubmed source remains.
        while sum([k != 'pubmed' for k in src_dict.keys()]) > 1:
            worst_src = min((k for k in src_dict if k != 'pubmed'),
                            key=full_text_content.index)
            del src_dict[worst_src]

        # Filter out the older reader versions
        for reader, rv_list in [('reach', reach_versions),
                                ('sparser', sparser_versions)]:
            for rv_dict in src_dict.gets(reader):
                best_rv = max(rv_dict, key=lambda x: rv_list.index(x))

                # Take any one of the duplicates. Statements/Statement ids are
                # already grouped into sets of duplicates keyed by the
                # Statement and Evidence matches key hashes. We only want one
                # of each.
                stmts |= {(ev_hash, list(ev_set)[0])
                          for ev_hash, ev_set in rv_dict[best_rv].items()}

    return stmt_nd, stmts
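A call sketch using the docstring's own clause example; db is assumed to be a DatabaseManager instance:

stmt_nd, stmt_ids = distill_stmts_from_reading(
    db, get_full_stmts=False,
    clauses=[db.Statements.type == 'Phosphorylation'])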
Example #21
def _stmt_from_json(stmt_json_bytes):
    return Statement._from_json(json.loads(stmt_json_bytes.decode('utf-8')))