Exemplo n.º 1
0
def get_db_agent_mod_stmts(filename, cached=True):
    """Return reading-derived statements whose agents have site-level mods.

    Parameters
    ----------
    filename : str
        Path of the pickle file used as a cache. It is read when `cached` is
        True, and overwritten after every processed database batch when
        `cached` is False.
    cached : bool
        If True (default), load and return the list previously pickled at
        `filename` without touching the database. If False, rebuild the list
        from the primary database.

    Returns
    -------
    list
        The statements for which at least one non-None agent has a
        modification condition with both `residue` and `position` set.
    """
    if cached:
        # NOTE(security): pickle.load can execute arbitrary code embedded in
        # the file; only use cache files written by this function.
        with open(filename, 'rb') as fh:
            return pickle.load(fh)

    def has_site_pos(mc):
        return mc.position is not None and mc.residue is not None

    def has_mod_agents(stmt):
        return any(has_site_pos(mc)
                   for agent in stmt.agent_list()
                   if agent is not None
                   for mc in agent.mods)

    batch_size = 100000
    db = get_primary_db()
    site_stmts = []
    # Only raw statements that have a reading_id are selected.
    for idx, db_stmt_batch in db.select_all_batched(
            batch_size, db.RawStatements,
            db.RawStatements.reading_id.isnot(None)):
        stmt_tuples = get_raw_stmts_frm_db_list(db,
                                                db_stmt_batch,
                                                fix_refs=False)
        # The second element of each returned tuple is used as the statement.
        site_stmts.extend(s[1] for s in stmt_tuples if has_mod_agents(s[1]))
        print('Finished batch %d' % idx)
        print('Currently have %d site statements' % len(site_stmts))
        # Re-dump after every batch so that progress so far is on disk even
        # if a later batch fails.
        with open(filename, 'wb') as f:
            pickle.dump(site_stmts, f)
    return site_stmts
Exemplo n.º 2
0
def _process_pa_statement_res_wev(db,
                                  stmt_iterable,
                                  count=1000,
                                  fix_refs=True):
    """Build pa statements and attach evidence from their raw statements.

    Parameters
    ----------
    db : DatabaseManager instance
        Passed through to `get_raw_stmts_frm_db_list`.
    stmt_iterable : iterable
        Yields (pa statement db object, raw statement db object) pairs.
    count : int
        Batch size handed to `batch_iter`. Default is 1000.
    fix_refs : bool
        Passed through to `get_raw_stmts_frm_db_list`. Default is True.

    Returns
    -------
    dict
        Statement objects keyed by the `mk_hash` of their pa statement db
        object, each with its `evidence` list replaced by the first evidence
        of every linked raw statement.
    """
    warnings.warn(('This module is being taken out of service, as the tools '
                   'have become deprecated. Moreover, the service has been '
                   're-implemented to use newer tools as best as possible, '
                   'but some results may be subtly different.'),
                  DeprecationWarning)
    pa_stmts_by_hash = {}
    raw_ids_by_hash = {}
    raw_stmts_by_sid = {}
    num_ev = 0
    for pair_batch in batch_iter(stmt_iterable, count):
        # Create each pa statement once, and remember which raw statement
        # ids are linked to it.
        batch_raw_objs = []
        for pa_obj, raw_obj in pair_batch:
            mk_hash = pa_obj.mk_hash
            if mk_hash not in pa_stmts_by_hash:
                pa_stmts_by_hash[mk_hash] = get_statement_object(pa_obj)
                raw_ids_by_hash[mk_hash] = []
            raw_ids_by_hash[mk_hash].append(raw_obj.id)
            batch_raw_objs.append(raw_obj)
            num_ev += 1

        progress_msg = ("Up to %d pa statements, with %d pieces of "
                        "evidence in all." % (len(pa_stmts_by_hash), num_ev))
        logger.info(progress_msg)

        # Turn this batch's raw db objects into (sid, statement) pairs.
        sid_stmt_pairs = get_raw_stmts_frm_db_list(db,
                                                   batch_raw_objs,
                                                   fix_refs,
                                                   with_sids=True)
        for sid, raw_stmt in sid_stmt_pairs:
            raw_stmts_by_sid[sid] = raw_stmt
        logger.info("Processed %d raw statements." % len(sid_stmt_pairs))

    # Replace each pa statement's evidence with that of its raw statements.
    logger.info("Inserting evidence.")
    for mk_hash, linked_sids in raw_ids_by_hash.items():
        gathered = []
        for sid in linked_sids:
            gathered.append(raw_stmts_by_sid[sid].evidence[0])
        pa_stmts_by_hash[mk_hash].evidence = gathered
    return pa_stmts_by_hash
Exemplo n.º 3
0
def get_evidence(pa_stmt_list, db=None, fix_refs=True, use_views=True):
    """Fill in the evidence for a list of pre-assembled statements.

    Parameters
    ----------
    pa_stmt_list : list[Statement]
        A list of unique statements, generally drawn from the database
        pa_statement table (via `get_statements`).
    db : DatabaseManager instance or None
        An instance of a database manager. If None, defaults to the "primary"
        database, as defined in the db_config.ini file in .config/indra.
    fix_refs : bool
        The paper refs within the evidence objects are not populated in the
        database, and thus must be filled using the relations in the database.
        If True (default), the `pmid` field of each Statement Evidence object
        is set to the correct PMIDs, or None if no PMID is available. If False,
        the `pmid` field defaults to the value populated by the reading
        system.
    use_views : bool
        If True (default), read the raw statement JSON from `FastRawPaLink`
        and build the Evidence objects directly from it. If False, join
        `PAStatements`, `RawUniqueLinks` and `RawStatements` and rebuild the
        raw statements with `get_raw_stmts_frm_db_list`.

    Returns
    -------
    None - modifications are made to the Statements "in-place".
    """
    warnings.warn(('This module is being taken out of service, as the tools '
                   'have become deprecated. Moreover, the service has been '
                   're-implemented to use newer tools as best as possible, '
                   'but some results may be subtly different.'),
                  DeprecationWarning)
    if db is None:
        db = get_primary_db()

    # Turn the list into a dict.
    stmt_dict = {s.get_hash(shallow=True): s for s in pa_stmt_list}

    if use_views:
        link_cols = [db.FastRawPaLink.mk_hash, db.FastRawPaLink.raw_json]
        if fix_refs:
            link_cols.append(db.FastRawPaLink.reading_id)
            ref_cols = [getattr(db.ReadingRefLink, k) for k in ('pmid', 'rid')]
        raw_links = db.select_all(
            link_cols, db.FastRawPaLink.mk_hash.in_(stmt_dict.keys()))

        # Reading ids whose pmid has already been looked up.
        rid_ref_dict = {}
        # Evidence objects waiting for a pmid lookup, grouped by reading id.
        myst_rid_rs_dict = defaultdict(list)

        def resolve_pending():
            """Look up pmids for all pending reading ids and apply them."""
            if not myst_rid_rs_dict:
                return
            ref_data_list = db.select_all(
                ref_cols,
                db.ReadingRefLink.rid.in_(myst_rid_rs_dict.keys()))
            for pmid, known_rid in ref_data_list:
                rid_ref_dict[known_rid] = pmid
                for pending_ev in myst_rid_rs_dict[known_rid]:
                    pending_ev.pmid = pmid
            myst_rid_rs_dict.clear()

        for info in raw_links:
            if fix_refs:
                mk_hash, raw_json, rid = info
            else:
                mk_hash, raw_json = info
            json_dict = json.loads(raw_json.decode('utf-8'))
            ev_json = json_dict.get('evidence', [])
            assert len(ev_json) == 1, \
                "Raw statements must have one evidence, got %d." % len(ev_json)
            ev = Evidence._from_json(ev_json[0])
            stmt_dict[mk_hash].evidence.append(ev)
            if not fix_refs:
                continue
            # Test membership rather than the stored value, so a reading id
            # already resolved to a None pmid is not queried again.
            if rid in rid_ref_dict:
                ev.pmid = rid_ref_dict[rid]
            else:
                myst_rid_rs_dict[rid].append(ev)
                # Query in groups of 1000 reading ids to limit round trips.
                if len(myst_rid_rs_dict) >= 1000:
                    resolve_pending()

        # Resolve the final, partial group; without this the last (<1000)
        # reading ids would never have their pmids fixed.
        if fix_refs:
            resolve_pending()
    else:
        # Get the data from the database
        raw_list = list(db.select_all(
            [db.PAStatements.mk_hash, db.RawStatements],
            db.PAStatements.mk_hash.in_(stmt_dict.keys()),
            db.PAStatements.mk_hash == db.RawUniqueLinks.pa_stmt_mk_hash,
            db.RawUniqueLinks.raw_stmt_id == db.RawStatements.id))

        # Nothing matched: there is no evidence to attach, and unpacking
        # zip(*[]) below would raise a ValueError.
        if not raw_list:
            return

        # Note that this step depends on the ordering being maintained.
        mk_hashes, raw_stmt_objs = zip(*raw_list)
        raw_stmts = get_raw_stmts_frm_db_list(db,
                                              raw_stmt_objs,
                                              fix_refs,
                                              with_sids=False)

        # Now attach the evidence
        for mk_hash, raw_stmt in zip(mk_hashes, raw_stmts):
            # Each raw statement can have just one piece of evidence.
            stmt_dict[mk_hash].evidence.append(raw_stmt.evidence[0])

    return
Exemplo n.º 4
0
def get_statements(clauses,
                   count=1000,
                   do_stmt_count=False,
                   db=None,
                   preassembled=True,
                   with_support=False,
                   fix_refs=True,
                   with_evidence=True):
    """Select statements according to a given set of clauses.

    Parameters
    ----------
    clauses : list
        list of sqlalchemy WHERE clauses to pass to the filter query. The
        list is not modified.
    count : int
        Number of statements to retrieve and process in each batch.
    do_stmt_count : bool
        Whether or not to perform an initial statement counting step to give
        more meaningful progress messages. Only used when `preassembled` is
        False.
    db : :py:class:`DatabaseManager`
        Optionally specify a database manager that attaches to something
        besides the primary database, for example a local database instance.
    preassembled : bool
        If true, statements will be selected from the table of pre-assembled
        statements. Otherwise, they will be selected from the raw statements.
        Default is True.
    with_support : bool
        Choose whether to populate the supports and supported_by list
        attributes of the Statement objects. Generally results in slower
        queries. Only used when `preassembled` is True.
    fix_refs : bool
        The paper refs within the evidence objects are not populated in the
        database, and thus must be filled using the relations in the database.
        If True (default), the `pmid` field of each Statement Evidence object
        is set to the correct PMIDs, or None if no PMID is available. If False,
        the `pmid` field defaults to the value populated by the reading
        system.
    with_evidence : bool
        Choose whether or not to populate the evidence list attribute of the
        Statements. As with `with_support`, setting this to True will take
        longer. Only used when `preassembled` is True.

    Returns
    -------
    list of Statements from the database corresponding to the query.
    """
    warnings.warn(('This module is being taken out of service, as the tools '
                   'have become deprecated. Moreover, the service has been '
                   're-implemented to use newer tools as best as possible, '
                   'but some results may be subtly different.'),
                  DeprecationWarning)
    cnt = count
    if db is None:
        db = get_primary_db()

    stmts_tblname = 'pa_statements' if preassembled else 'raw_statements'

    if not preassembled:
        stmts = []
        q = db.filter_query(stmts_tblname, *clauses)
        if do_stmt_count:
            logger.info("Counting statements...")
            num_stmts = q.count()
            logger.info("Total of %d statements" % num_stmts)
        db_stmts = q.yield_per(cnt)
        for subset in batch_iter(db_stmts, cnt):
            stmts.extend(
                get_raw_stmts_frm_db_list(db,
                                          subset,
                                          with_sids=False,
                                          fix_refs=fix_refs))
            if do_stmt_count:
                logger.info("%d of %d statements" % (len(stmts), num_stmts))
            else:
                logger.info("%d statements" % len(stmts))
    else:
        logger.info("Getting preassembled statements.")
        if with_evidence:
            # Get pairs of pa statements with their linked raw statements.
            # Build a new list instead of using `+=`, which would extend the
            # caller's list in place (and fail outright for a tuple).
            link_clauses = list(clauses) + [
                db.PAStatements.mk_hash == db.RawUniqueLinks.pa_stmt_mk_hash,
                db.RawStatements.id == db.RawUniqueLinks.raw_stmt_id
            ]
            pa_raw_stmt_pairs = \
                db.select_all([db.PAStatements, db.RawStatements],
                              *link_clauses, yield_per=cnt)
            stmt_dict = _process_pa_statement_res_wev(db,
                                                      pa_raw_stmt_pairs,
                                                      count=cnt,
                                                      fix_refs=fix_refs)
        else:
            # Get just pa statements without their supporting raw statement(s).
            pa_stmts = db.select_all(db.PAStatements, *clauses, yield_per=cnt)
            stmt_dict = _process_pa_statement_res_nev(pa_stmts, count=cnt)

        # Populate the supports/supported by fields.
        if with_support:
            get_support(stmt_dict, db=db)

        stmts = list(stmt_dict.values())
        logger.info("In all, there are %d pa statements." % len(stmts))

    return stmts