示例#1
0
def get_statements_stats(fname=None, db=None, indra_version=None):
    """Report statistics of the raw statements in the database.

    If `indra_version` is given, counts are restricted to statements
    produced by that INDRA version.
    """
    if db is None:
        db = get_primary_db()

    # Join conditions: content -> reading, and reading -> raw statement.
    content_link = db.TextContent.id == db.Reading.text_content_id
    statement_link = db.Reading.id == db.RawStatements.reader_ref

    __report_stat('\nStatement Statistics:', fname)
    __report_stat('---------------------', fname)

    filters = []
    if indra_version is not None:
        filters.append(db.RawStatements.indra_version == indra_version)

    total_raw_statements = db.count(db.RawStatements, *filters)
    __report_stat("Total number of raw statements: %d" % total_raw_statements,
                  fname)

    reader_list = db.session.query(db.Reading.reader).distinct().all()
    source_list = db.session.query(db.TextContent.source).distinct().all()
    lines = []
    for rdr, in reader_list:
        for source, in source_list:
            n = db.count(db.RawStatements, statement_link, content_link,
                         db.Reading.reader == rdr,
                         db.TextContent.source == source, *filters)
            lines.append('    Raw statements from %s reading %s: %d\n'
                         % (rdr, source, n))
    __report_stat("Statements by reader and content source:\n%s"
                  % ''.join(lines), fname)

    # Break down counts by database source, and (when not version-filtered)
    # by the INDRA version that produced them.
    _report_groups(db, db.RawStatements.id, db.DBInfo.db_name, fname,
                   db.RawStatements.db_info_id == db.DBInfo.id)
    if indra_version is None:
        _report_groups(db, db.RawStatements.id, db.RawStatements.indra_version,
                       fname)
    return
示例#2
0
def main():
    """Entry point: select a database and run the requested bulk reading task."""
    args = get_parser().parse_args()

    # Resolve the database handle. In test mode, fall back to a temporary
    # database unless the named database is itself a test database.
    if args.test:
        if 'test' not in args.database:
            from indra_db.tests.util import get_temp_db
            db = get_temp_db()
        else:
            db = get_db(args.database)
    elif args.database == 'primary':
        db = get_primary_db()
    else:
        db = get_db(args.database)

    readers = ['SPARSER', 'REACH', 'TRIPS', 'ISI', 'EIDOS', 'MTI']
    if args.method == 'local':
        bulk_manager = BulkLocalReadingManager(readers,
                                               buffer_days=args.buffer,
                                               n_procs=args.num_procs)
    elif args.method == 'aws':
        bulk_manager = BulkAwsReadingManager(readers,
                                             buffer_days=args.buffer,
                                             project_name=args.project_name)
    else:
        assert False, "This shouldn't be allowed."

    # Dispatch on the requested task; unrecognized tasks are a no-op,
    # matching the original if/elif behavior.
    task_methods = {'read_all': bulk_manager.read_all,
                    'read_new': bulk_manager.read_new}
    task_method = task_methods.get(args.task)
    if task_method is not None:
        task_method(db)
    return
示例#3
0
def get_db_statistics(fname=None, db=None, tables=None):
    """Get statistics on the contents of the database"""
    if db is None:
        db = get_primary_db()

    # Ordered (table name, stat function) pairs; the order controls the
    # order of sections in the report.
    stat_methods = [
        ('text_ref', get_text_ref_stats),
        ('text_content', get_text_content_stats),
        ('readings', get_readings_stats),
        ('raw_statements', get_statements_stats),
        ('pa_statements', get_pa_statement_stats),
    ]

    # Run every stat method, or only those whose table was requested.
    selected = None if tables is None else set(tables)
    for table_name, stat_method in stat_methods:
        if selected is None or table_name in selected:
            stat_method(fname, db)

    return
示例#4
0
def get_text_content_stats(fname=None, db=None):
    """Report statistics of the text content in the database.

    Parameters
    ----------
    fname : Optional[str]
        A file name passed through to `__report_stat` alongside printing.
        If None, stats are presumably only printed.
    db : Optional[DatabaseManager]
        The database to inspect. Defaults to the primary database.
    """
    if db is None:
        db = get_primary_db()
    # Join condition linking text content to readings made from it.
    tc_rdng_link = db.TextContent.id == db.Reading.text_content_id
    __report_stat("\nText Content statistics:", fname)
    __report_stat('------------------------', fname)
    total_content = db.count(db.TextContent)
    # Fix: `fname` was previously omitted from this call, so this line was
    # inconsistent with every other report line in this function.
    __report_stat("Total number of text content entries: %d" % total_content,
                  fname)
    latest_updates = (db.session.query(db.Updates.source,
                                       func.max(db.Updates.datetime)).group_by(
                                           db.Updates.source).all())
    __report_stat(
        ("Latest updates:\n    %s" %
         '\n    '.join(['%s: %s' % (s, d) for s, d in latest_updates])), fname)
    content_read = db.count(db.Reading.text_content_id)
    __report_stat("Total content read: %d" % content_read, fname)
    fulltext_content = db.count(db.TextContent,
                                db.TextContent.text_type == 'fulltext')
    __report_stat("Number of fulltext entries: %d" % fulltext_content, fname)
    fulltext_read = db.count(db.TextContent,
                             db.TextContent.text_type == 'fulltext',
                             tc_rdng_link)
    __report_stat("Number of fulltext entries read: %d" % fulltext_read, fname)
    # Break down content counts by source, both overall and for read content.
    _report_groups(db, db.TextContent.id, db.TextContent.source, fname)
    _report_groups(db, db.TextContent.id, db.TextContent.source, fname,
                   tc_rdng_link)
    return
示例#5
0
def read_db_ids_search_terms(id_search_terms, id_type):
    """Return extracted EmmaaStatements from INDRA database given an
    ID-search term dict.

    Parameters
    ----------
    id_search_terms : dict
        A dict representing a set of IDs pointing to search terms that
        produced them.
    id_type : str
        The type of the IDs used as keys (e.g. 'pmid'), passed through to
        `get_raw_stmt_jsons_from_papers`.

    Returns
    -------
    list[:py:class:`emmaa.model.EmmaaStatement`]
        A list of EmmaaStatements extracted from the given IDs.
    """
    ids = list(id_search_terms.keys())
    # All resulting statements share a single extraction timestamp.
    date = datetime.datetime.utcnow()
    db = get_primary_db()
    id_stmts = get_raw_stmt_jsons_from_papers(ids, id_type=id_type, db=db)
    estmts = []
    for _id, stmt_jsons in id_stmts.items():
        stmts = stmts_from_json(stmt_jsons)
        for stmt in stmts:
            es = EmmaaStatement(stmt, date, id_search_terms[_id])
            estmts.append(es)
    return estmts
示例#6
0
def get_db_agent_mod_stmts(filename, cached=True):
    """Collect raw statements having an agent with a full modification site.

    A statement qualifies if any of its agents carries a modification with
    both a residue and a position set.

    Parameters
    ----------
    filename : str
        Path of the pickle file used both as the cache to load (when
        `cached` is True) and as the checkpoint written after each batch.
    cached : bool
        If True (default), load and return previously pickled results from
        `filename` instead of querying the database.

    Returns
    -------
    list
        Statements with at least one fully specified modification site.
    """
    # Fix: previously this load-and-return ran unconditionally, making the
    # database query below unreachable and ignoring the `cached` flag.
    if cached:
        with open(filename, 'rb') as fh:
            return pickle.load(fh)

    def has_site_pos(mc):
        # A "full" site requires both the residue and its position.
        return mc.position is not None and mc.residue is not None

    def has_mod_agents(stmt):
        for agent in stmt.agent_list():
            if agent is None:
                continue
            if any(has_site_pos(mc) for mc in agent.mods):
                return True
        return False

    batch_size = 100000
    db = get_primary_db()
    site_stmts = []
    for idx, db_stmt_batch in db.select_all_batched(
            batch_size, db.RawStatements,
            db.RawStatements.reading_id.isnot(None)):
        stmt_tuples = get_raw_stmts_frm_db_list(db,
                                                db_stmt_batch,
                                                fix_refs=False)
        stmts = [s[1] for s in stmt_tuples]
        site_stmts.extend(stmt for stmt in stmts if has_mod_agents(stmt))
        print('Finished batch %d' % idx)
        print('Currently have %d site statements' % len(site_stmts))
        # Checkpoint after every batch so progress survives interruption.
        with open(filename, 'wb') as f:
            pickle.dump(site_stmts, f)
    return site_stmts
示例#7
0
def read_db_pmid_search_terms(pmid_search_terms):
    """Return extracted EmmaaStatements from INDRA database given a
    PMID-search term dict.

    Parameters
    ----------
    pmid_search_terms : dict
        A dict representing a set of PMIDs pointing to search terms that
        produced them.

    Returns
    -------
    list[:py:class:`emmaa.model.EmmaaStatement`]
        A list of EmmaaStatements extracted from the given PMIDs.
    """
    # All statements produced here share one extraction timestamp.
    timestamp = datetime.datetime.utcnow()
    db = get_primary_db()
    stmts_by_pmid = get_statements_by_paper(list(pmid_search_terms.keys()),
                                            id_type='pmid', db=db,
                                            preassembled=False)
    return [EmmaaStatement(stmt, timestamp, pmid_search_terms[pmid])
            for pmid, stmts in stmts_by_pmid.items()
            for stmt in stmts]
示例#8
0
def get_statement_jsons_from_papers(paper_refs, db=None, **kwargs):
    """Get the statements from a list of papers.

    Parameters
    ----------
    paper_refs : list[(<id_type>, <paper_id>)]
        A list of tuples, where each tuple indicates an id-type (e.g. 'pmid')
        and an id value for a particular paper.
    db : :py:class:`DatabaseManager`
        Optionally specify a database manager that attaches to something
        besides the primary database, for example a local database instance.

    Some keyword arguments are passed directly to a lower level function:

    Other Parameters (kwargs)
    -------------------------
    max_stmts : int or None
        Limit the number of statements queried. If None, no restriction is
        applied.
    offset : int or None
        Start reading statements by a given offset. If None, no offset is
        applied. Most commonly used in conjunction with `max_stmts`.
    ev_limit : int or None
        Limit the amount of evidence returned per Statement.
    best_first : bool
        If True, the preassembled statements will be sorted by the amount of
        evidence they have, and those with the most evidence will be
        prioritized. When using `max_stmts`, this means you will get the "best"
        statements. If False, statements will be queried in arbitrary order.

    Returns
    -------
    A dictionary data structure containing, among other metadata, a dict of
    statement jsons under the key 'statements', themselves keyed by their
    shallow matches-key hashes.
    """
    if db is None:
        db = get_primary_db()

    # Create a sub-query on the reading metadata
    q = db.session.query(db.ReadingRefLink.rid.label('rid'))
    conditions = []
    for id_type, paper_id in paper_refs:
        tbl_attr = getattr(db.ReadingRefLink, id_type)
        # trid/tcid are integer columns, so exact equality is used; other id
        # types are text columns matched with `like`.
        if id_type in ['trid', 'tcid']:
            conditions.append(tbl_attr == paper_id)
        else:
            conditions.append(tbl_attr.like(paper_id))
    q = q.filter(or_(*conditions))
    sub_al = q.subquery('reading_ids')

    # Map the reading metadata query to mk_hashes with statement counts.
    mk_hashes_q = (db.session.query(
        db.PaMeta.mk_hash.label('mk_hash'),
        db.PaMeta.ev_count.label('ev_count')).filter(
            db.PaMeta.mk_hash == db.FastRawPaLink.mk_hash,
            db.FastRawPaLink.reading_id == sub_al.c.rid))

    return _get_pa_stmt_jsons_w_mkhash_subquery(db, mk_hashes_q, **kwargs)
示例#9
0
def get_statement_jsons_from_hashes(mk_hashes, db=None, **kwargs):
    """Get statement jsons using the appropriate hashes."""
    if db is None:
        db = get_primary_db()
    # Select (hash, evidence count) pairs restricted to the requested hashes.
    query = db.session.query(db.PaMeta.mk_hash, db.PaMeta.ev_count)
    query = query.filter(db.PaMeta.mk_hash.in_(mk_hashes))
    return _get_pa_stmt_jsons_w_mkhash_subquery(db, query, **kwargs)
示例#10
0
def get_stmt_count_from_db():
    """Not recommended, very slow.

    Count raw statements per HGNC gene, checkpointing progress to a pickle
    so the run can be resumed after interruption.

    Returns
    -------
    dict
        A map of HGNC gene name to its raw statement count.
    """
    hgnc_entries = get_hgnc_entries()
    # Fixed seed so the shuffled order is stable across resumed runs.
    random.seed(1)
    random.shuffle(hgnc_entries)

    db = get_primary_db()
    CHECKPOINT_FILE = 'checkpoint.pkl'

    if os.path.exists(CHECKPOINT_FILE):
        print("Loading from checkpoint")
        with open(CHECKPOINT_FILE, 'rb') as f:
            start_ix, stmt_counts = pickle.load(f)
        if start_ix == len(hgnc_entries):
            return stmt_counts
    else:
        start_ix = 0
        stmt_counts = {}

    start = time.time()
    CHECKPOINT_INTERVAL = 100
    for ix in range(start_ix, len(hgnc_entries)):
        hgnc_name, hgnc_id = hgnc_entries[ix]
        # Save the state of the dict
        if ix != 0 and ix % CHECKPOINT_INTERVAL == 0:
            print("Saving checkpoint")
            with open(CHECKPOINT_FILE, 'wb') as f:
                pickle.dump((ix, stmt_counts), f)
        # Run the query
        q = db.filter_query(db.RawStatements,
                            db.RawAgents.stmt_id == db.RawStatements.id,
                            db.RawAgents.db_name.like('HGNC'),
                            db.RawAgents.db_id.like(str(hgnc_id)))
        # Get the statement count
        stmt_count = q.count()
        # Print some stats
        elapsed = time.time() - start
        time_per_gene = elapsed / (ix - start_ix + 1)
        num_remaining = len(hgnc_entries) - (ix + 1)
        sec_remaining = time_per_gene * num_remaining
        min_remaining = sec_remaining / 60.
        print("%d of %d: %d statements for %s (%s): Est %.2f min remaining" %
              (ix + 1, len(hgnc_entries), stmt_count, hgnc_name, hgnc_id,
               min_remaining))
        # Put count into dict
        stmt_counts[hgnc_name] = stmt_count
    # Save final results.
    # Fix: previously `pickle.dump(len(hgnc_entries), stmt_counts)` passed the
    # counts dict as the file argument (a TypeError) and dropped the
    # (index, counts) tuple format that the resume logic above expects.
    with open(CHECKPOINT_FILE, 'wb') as f:
        pickle.dump((len(hgnc_entries), stmt_counts), f)

    return stmt_counts
示例#11
0
def get_pa_statement_stats(fname=None, db=None):
    """Report statistics of the pre-assembled statements in the database.

    Parameters
    ----------
    fname : Optional[str]
        A file name passed through to `__report_stat` alongside printing.
    db : Optional[DatabaseManager]
        The database to inspect. Defaults to the primary database.
    """
    if db is None:
        db = get_primary_db()
    __report_stat('\nStatement Statistics:', fname)
    __report_stat('---------------------', fname)
    stmt_q = db.filter_query(db.PAStatements)
    # Fix: corrected the "statments" typo in the report line.
    __report_stat("Total number of statements: %d" % stmt_q.count(), fname)
    statements_produced_by_indra_version = (db.session.query(
        db.PAStatements.indra_version, func.count(
            db.PAStatements.id)).group_by(db.PAStatements.indra_version).all())
    __report_stat(
        ("Number of statements by indra version:\n    %s" % '\n    '.join([
            '%s: %d' % (s, n) for s, n in statements_produced_by_indra_version
        ])), fname)
    return
示例#12
0
def get_statement_essentials(clauses, count=1000, db=None, preassembled=True):
    """Get the type, agents, and id data for the specified statements.

    This function is useful for light-weight searches of basic mechanistic
    information, without the need to follow as many links in the database to
    populate the Statement objects.

    To get full statements, use `get_statements`.

    Parameters
    ----------
    clauses : list
        list of sqlalchemy WHERE clauses to pass to the filter query.
    count : int
        Number of statements to retrieve and process in each batch.
    db : :py:class:`DatabaseManager`
        Optionally specify a database manager that attaches to something
        besides the primary database, for example a local database instance.
    preassembled : bool
        If true, statements will be selected from the table of pre-assembled
        statements. Otherwise, they will be selected from the raw statements.
        Default is True.

    Returns
    -------
    A list of tuples containing:
        `(uuid, sid, hash, type, (agent_1, agent_2, ...))`.
    """
    # NOTE: the docstring previously documented a `do_stmt_count` parameter
    # that this function does not accept; it has been removed.
    if db is None:
        db = get_primary_db()

    stmts_tblname = 'pa_statements' if preassembled else 'raw_statements'

    stmt_data = []
    # yield_per streams results in batches of `count` rather than loading
    # the full result set at once.
    db_stmts = db.select_all(stmts_tblname, *clauses, yield_per=count)
    for db_stmt in db_stmts:
        stmt = get_statement_object(db_stmt)
        # Pre-assembled statement rows may not carry an `id` attribute.
        sid = db_stmt.id if hasattr(db_stmt, 'id') else None
        stmt_data.append(
            (db_stmt.uuid, sid, stmt.get_hash(shallow=True), db_stmt.type,
             stmt.agent_list()))

    return stmt_data
示例#13
0
def get_curations(db=None, **params):
    """Get all curations for a certain level given certain criteria.

    Parameters
    ----------
    db : Optional[DatabaseManager]
        The database to query. Defaults to the primary database.
    params : kwargs
        Column-name keyword filters on the Curation table. The aliases
        `hash_val` and `ev_hash` map to the `pa_hash` and `source_hash`
        columns respectively. List/set/tuple values become IN-clauses;
        scalars become equality constraints.
    """
    if db is None:
        db = get_primary_db()
    cur = db.Curation

    # Accept friendlier aliases for the underlying column names.
    key_aliases = {'hash_val': 'pa_hash', 'ev_hash': 'source_hash'}

    constraints = []
    for key, val in params.items():
        key = key_aliases.get(key, key)
        # Idiom: a single isinstance call with a tuple of types replaces the
        # original chain of three isinstance checks.
        if isinstance(val, (list, set, tuple)):
            constraints.append(getattr(cur, key).in_(val))
        else:
            constraints.append(getattr(cur, key) == val)

    return db.select_all(cur, *constraints)
示例#14
0
def get_curator_counts(db=None):
    """Return a Counter of the number of curations submitted by each user.

    Parameters
    ----------
    db : Optional[DatabaseManager]
        A database manager object used to access the database. If not given,
        the database configured as primary is used.

    Returns
    -------
    collections.Counter
        A Counter of curator users by the number of curations they have
        submitted.
    """
    if db is None:
        db = get_primary_db()
    # Tally the curator of every curation row directly into a Counter.
    return Counter(entry.curator for entry in db.select_all(db.Curation))
示例#15
0
def get_text_ref_stats(fname=None, db=None):
    """Report statistics of the text refs in the database.

    Parameters
    ----------
    fname : Optional[str]
        A file name passed through to `__report_stat` alongside printing.
    db : Optional[DatabaseManager]
        The database to inspect. Defaults to the primary database.
    """
    if db is None:
        db = get_primary_db()
    # Join condition linking text content to readings made from it.
    tc_rdng_link = db.TextContent.id == db.Reading.text_content_id
    __report_stat("Text ref statistics:", fname)
    __report_stat("--------------------", fname)
    total_refs = db.count(db.TextRef)
    __report_stat('Total number of text refs: %d' % total_refs, fname)
    refs_with_content = db.count(db.TextContent.text_ref_id)
    __report_stat('Total number of refs with content: %d' % refs_with_content,
                  fname)
    refs_by_type = _report_groups(db, db.TextContent.text_ref_id,
                                  db.TextContent.text_type, fname)
    # Robustness fix: use .get so an empty database (no fulltext entries)
    # does not raise a KeyError here.
    fulltext_refs = refs_by_type.get('fulltext', 0)
    __report_stat(('Number of refs with only abstract: %d' %
                   (refs_with_content - fulltext_refs)), fname)
    refs_with_reading = db.count(db.TextContent.text_ref_id, tc_rdng_link)
    __report_stat('Number of refs that have been read: %d' % refs_with_reading,
                  fname)
    _report_groups(db, db.TextContent.text_ref_id, db.TextContent.text_type,
                   fname, tc_rdng_link)
    return
示例#16
0
def get_readings_stats(fname=None, db=None):
    """Report statistics of the readings in the database.

    Parameters
    ----------
    fname : Optional[str]
        A file name passed through to `__report_stat` alongside printing.
    db : Optional[DatabaseManager]
        The database to inspect. Defaults to the primary database.
    """
    if db is None:
        db = get_primary_db()

    __report_stat('\nReading statistics:', fname)
    __report_stat('-------------------', fname)
    total_readings = db.count(db.Reading)
    # Fix: corrected "number or readings" typo in the report line.
    __report_stat('Total number of readings: %d' % total_readings, fname)
    # There may be a way to do this more neatly with a group_by clause, however
    # the naive way of doing it leaves us with a miscount due to indistinct.
    reader_versions = (db.session.query(
        db.Reading.reader_version).distinct().all())
    sources = db.session.query(db.TextContent.source).distinct().all()
    stats = ''
    for rv, in reader_versions:
        for src, in sources:
            cnt = db.count(db.Reading,
                           db.TextContent.id == db.Reading.text_content_id,
                           db.TextContent.source == src,
                           db.Reading.reader_version == rv)
            stats += '    Readings by %s from %s: %d\n' % (rv, src, cnt)
    __report_stat("Readings by reader version and content source:\n%s" % stats,
                  fname)
    return
示例#17
0
def get_support(statements, db=None, recursive=False):
    """Populate the supports and supported_by lists of the given statements."""
    warnings.warn(('This module is being taken out of service, as the tools '
                   'have become deprecated. Moreover, the service has been '
                   're-implemented to use newer tools as best as possible, '
                   'but some results may be subtly different.'),
                  DeprecationWarning)
    # TODO: Allow recursive mode (argument should probably be an integer level)
    if db is None:
        db = get_primary_db()

    # Normalize the input to a dict keyed by shallow matches-key hash.
    if isinstance(statements, dict):
        stmt_dict = statements
    else:
        stmt_dict = {s.get_hash(shallow=True): s for s in statements}

    logger.info("Populating support links.")
    support_links = db.select_all(
        [db.PASupportLinks.supported_mk_hash,
         db.PASupportLinks.supporting_mk_hash],
        or_(db.PASupportLinks.supported_mk_hash.in_(stmt_dict.keys()),
            db.PASupportLinks.supporting_mk_hash.in_(stmt_dict.keys())))

    def _lookup(mk_hash):
        # Hashes outside the input set become Unresolved placeholders.
        stmt = stmt_dict.get(mk_hash)
        return Unresolved(shallow_hash=mk_hash) if stmt is None else stmt

    # Deduplicate links, then wire up both directions of each support pair.
    for supported_hash, supporting_hash in set(support_links):
        if supported_hash == supporting_hash:
            assert False, 'Self-support found on-load.'
        supported_stmt = _lookup(supported_hash)
        supporting_stmt = _lookup(supporting_hash)
        supported_stmt.supported_by.append(supporting_stmt)
        supporting_stmt.supports.append(supported_stmt)
    return
示例#18
0
def submit_curation(hash_val,
                    tag,
                    curator,
                    ip,
                    api_key,
                    text=None,
                    ev_hash=None,
                    source='direct_client',
                    db=None):
    """Submit a curation for a given preassembled or raw extraction.

    Parameters
    ----------
    hash_val : int
        The hash corresponding to the statement.
    tag : str
        A very short phrase categorizing the error or type of curation.
    curator : str
        The name or identifier for the curator.
    ip : str
        The ip address of user's computer.
    api_key : str
        If you have one, this can help identify you as a curator, and may lend
        extra weight to your curation(s).
    text : str
        A brief description of the problem.
    ev_hash : int
        A hash of the sentence and other evidence information. Elsewhere
        referred to as `source_hash`.
    source : str
        The name of the access point through which the curation was performed.
        The default is 'direct_client', meaning this function was used
        directly. Any higher-level application should identify itself here.
    db : DatabaseManager
        A database manager object used to access the database.
    """
    if db is None:
        db = get_primary_db()

    inp = {
        'tag': tag,
        'text': text,
        'curator': curator,
        'ip': ip,
        'source': source,
        'pa_hash': hash_val,
        'source_hash': ev_hash
    }

    auth = db._get_auth_info(api_key)
    if auth is None:
        raise NoAuthError(api_key, 'curation')
    inp['auth_id'] = auth[0]

    logger.info("Adding curation: %s" % str(inp))

    try:
        dbid = db.insert(db.Curation, **inp)
    except IntegrityError as e:
        logger.error("Got a bad entry.")
        msg = e.args[0]
        detail_line = msg.splitlines()[1]
        # A foreign-key violation on pa_hash means the hash is unknown.
        # Fix: the pattern is now a raw string; the un-escaped backslashes
        # previously triggered invalid-escape-sequence warnings.
        m = re.match(r"DETAIL: .*?\(pa_hash\)=\((\d+)\).*?not present.*?pa.*?",
                     detail_line)
        if m is None:
            # Not the pa_hash violation we know how to handle; re-raise.
            raise e
        else:
            h = m.groups()[0]
            assert int(h) == int(hash_val), \
                "Erred hash %s does not match input hash %s." % (h, hash_val)
            logger.error("Bad hash: %s" % h)
            raise BadHashError(h)
    return dbid
示例#19
0
def get_statement_jsons_from_agents(agents=None,
                                    stmt_type=None,
                                    db=None,
                                    **kwargs):
    """Get json's for statements given agent refs and Statement type.

    Parameters
    ----------
    agents : list[(<role>, <id>, <namespace>)]
        A list of agents, each specified by a tuple of information including:
        the `role`, which can be 'subject', 'object', or None, an `id`, such as
        the HGNC id, a CHEMBL id, or a FPLX id, etc, and the
        `namespace` which specifies which of the above is given in `id`.

        Some examples:
            (None, 'MEK', 'FPLX')
            ('object', '11998', 'HGNC')
            ('subject', 'MAP2K1', 'TEXT')

        Note that you will get the logical AND of the conditions given, in
        other words, each Statement will satisfy all constraints.
    stmt_type : str or None
        The type of statement to retrieve, e.g. 'Phosphorylation'. If None, no
        type restriction is imposed.
    db : :py:class:`DatabaseManager`
        Optionally specify a database manager that attaches to something
        besides the primary database, for example a local database instance.

    Some keyword arguments are passed directly to a lower level function:

    Other Parameters (kwargs)
    -------------------------
    max_stmts : int or None
        Limit the number of statements queried. If None, no restriction is
        applied.
    offset : int or None
        Start reading statements by a given offset. If None, no offset is
        applied. Most commonly used in conjunction with `max_stmts`.
    ev_limit : int or None
        Limit the amount of evidence returned per Statement.
    best_first : bool
        If True, the preassembled statements will be sorted by the amount of
        evidence they have, and those with the most evidence will be
        prioritized. When using `max_stmts`, this means you will get the "best"
        statements. If False, statements will be queried in arbitrary order.

    Returns
    -------
    A dictionary data structure containing, among other metadata, a dict of
    statement jsons under the key 'statements', themselves keyed by their
    shallow matches-key hashes.
    """
    # First look for statements matching the role'd agents.
    if db is None:
        db = get_primary_db()

    # TODO: Extend this to allow retrieval of raw statements.
    # NOTE(review): despite the `agents=None` default, the loop below iterates
    # `agents` unconditionally, so passing None raises a TypeError — at least
    # one agent tuple appears to be required; confirm intended contract.
    mk_hashes_q = None
    # Labeled columns shared by all per-agent sub-queries.
    mk_hash_c = db.PaMeta.mk_hash.label('mk_hash')
    ev_count_c = db.PaMeta.ev_count.label('ev_count')
    for role, ag_dbid, ns in agents:
        # Make the id match paradigms for the database.
        ag_dbid = regularize_agent_id(ag_dbid, ns)

        # Create this query (for this agent)
        q = (db.session.query(mk_hash_c,
                              ev_count_c).filter(db.PaMeta.db_id.like(ag_dbid),
                                                 db.PaMeta.db_name.like(ns)))
        if stmt_type is not None:
            q = q.filter(db.PaMeta.type.like(stmt_type))

        if role is not None:
            q = q.filter(db.PaMeta.role == role.upper())

        # Intersect with the previous query. Successive intersections
        # implement the logical AND over all agent constraints.
        if mk_hashes_q:
            mk_hashes_q = mk_hashes_q.intersect(q)
        else:
            mk_hashes_q = q
    assert mk_hashes_q, "No conditions imposed."

    return _get_pa_stmt_jsons_w_mkhash_subquery(db, mk_hashes_q, **kwargs)
示例#20
0
                                       prioritize=True,
                                       verbose=self.verbose)
        logger.info("Made %d readings." % len(outputs))
        logger.info("Making statements...")
        rdb.produce_statements(outputs, n_proc=self.n_proc, db=db)
        return


if __name__ == '__main__':
    if args.test:
        if 'test' not in args.database:
            db = get_test_db()
        else:
            db = get_db(args.database)
    elif args.database == 'primary':
        db = get_primary_db()
    else:
        db = get_db(args.database)

    if args.method == 'local':
        bulk_managers = [
            BulkLocalReadingManager(reader_name,
                                    buffer_days=args.buffer,
                                    n_proc=args.num_procs)
            for reader_name in ['SPARSER', 'REACH']
        ]
    elif args.method == 'aws':
        bulk_managers = [
            BulkAwsReadingManager(reader_name,
                                  buffer_days=args.buffer,
                                  project_name=args.project_name)
示例#21
0
def get_statements(clauses,
                   count=1000,
                   do_stmt_count=False,
                   db=None,
                   preassembled=True,
                   with_support=False,
                   fix_refs=True,
                   with_evidence=True):
    """Select statements according to a given set of clauses.

    Parameters
    ----------
    clauses : list
        list of sqlalchemy WHERE clauses to pass to the filter query.
    count : int
        Number of statements to retrieve and process in each batch.
    do_stmt_count : bool
        Whether or not to perform an initial statement counting step to give
        more meaningful progress messages.
    db : :py:class:`DatabaseManager`
        Optionally specify a database manager that attaches to something
        besides the primary database, for example a local database instance.
    preassembled : bool
        If true, statements will be selected from the table of pre-assembled
        statements. Otherwise, they will be selected from the raw statements.
        Default is True.
    with_support : bool
        Choose whether to populate the supports and supported_by list
        attributes of the Statement objects. General results in slower queries.
    with_evidence : bool
        Choose whether or not to populate the evidence list attribute of the
        Statements. As with `with_support`, setting this to True will take
        longer.
    fix_refs : bool
        The paper refs within the evidence objects are not populated in the
        database, and thus must be filled using the relations in the database.
        If True (default), the `pmid` field of each Statement Evidence object
        is set to the correct PMIDs, or None if no PMID is available. If False,
        the `pmid` field defaults to the value populated by the reading
        system.

    Returns
    -------
    list of Statements from the database corresponding to the query.
    """
    warnings.warn(('This module is being taken out of service, as the tools '
                   'have become deprecated. Moreover, the service has been '
                   're-implemented to use newer tools as best as possible, '
                   'but some results may be subtly different.'),
                  DeprecationWarning)
    cnt = count
    if db is None:
        db = get_primary_db()

    stmts_tblname = 'pa_statements' if preassembled else 'raw_statements'

    if not preassembled:
        stmts = []
        q = db.filter_query(stmts_tblname, *clauses)
        if do_stmt_count:
            logger.info("Counting statements...")
            num_stmts = q.count()
            logger.info("Total of %d statements" % num_stmts)
        db_stmts = q.yield_per(cnt)
        for subset in batch_iter(db_stmts, cnt):
            stmts.extend(
                get_raw_stmts_frm_db_list(db,
                                          subset,
                                          with_sids=False,
                                          fix_refs=fix_refs))
            if do_stmt_count:
                logger.info("%d of %d statements" % (len(stmts), num_stmts))
            else:
                logger.info("%d statements" % len(stmts))
    else:
        logger.info("Getting preassembled statements.")
        if with_evidence:
            # Get pairs of pa statements with their linked raw statements.
            # Fix: build a new list instead of `clauses += [...]`, which
            # mutated the caller's list in place. Also removed a duplicated
            # "Getting preassembled statements." log line.
            clauses = list(clauses) + [
                db.PAStatements.mk_hash == db.RawUniqueLinks.pa_stmt_mk_hash,
                db.RawStatements.id == db.RawUniqueLinks.raw_stmt_id
            ]
            pa_raw_stmt_pairs = \
                db.select_all([db.PAStatements, db.RawStatements],
                              *clauses, yield_per=cnt)
            stmt_dict = _process_pa_statement_res_wev(db,
                                                      pa_raw_stmt_pairs,
                                                      count=cnt,
                                                      fix_refs=fix_refs)
        else:
            # Get just pa statements without their supporting raw statement(s).
            pa_stmts = db.select_all(db.PAStatements, *clauses, yield_per=cnt)
            stmt_dict = _process_pa_statement_res_nev(pa_stmts, count=cnt)

        # Populate the supports/supported by fields.
        if with_support:
            get_support(stmt_dict, db=db)

        stmts = list(stmt_dict.values())
        logger.info("In all, there are %d pa statements." % len(stmts))

    return stmts
示例#22
0
def get_evidence(pa_stmt_list, db=None, fix_refs=True, use_views=True):
    """Fill in the evidence for a list of pre-assembled statements.

    Parameters
    ----------
    pa_stmt_list : list[Statement]
        A list of unique statements, generally drawn from the database
        pa_statement table (via `get_statements`).
    db : DatabaseManager instance or None
        An instance of a database manager. If None, defaults to the "primary"
        database, as defined in the db_config.ini file in .config/indra.
    fix_refs : bool
        The paper refs within the evidence objects are not populated in the
        database, and thus must be filled using the relations in the database.
        If True (default), the `pmid` field of each Statement Evidence object
        is set to the correct PMIDs, or None if no PMID is available. If False,
        the `pmid` field defaults to the value populated by the reading
        system.
    use_views : bool
        If True (default), fetch the raw statement JSON through the
        `FastRawPaLink` materialized view; otherwise join the raw-statement
        tables explicitly (slower path).

    Returns
    -------
    None - modifications are made to the Statements "in-place".
    """
    warnings.warn(('This module is being taken out of service, as the tools '
                   'have become deprecated. Moreover, the service has been '
                   're-implemented to use newer tools as best as possible, '
                   'but some results may be subtly different.'),
                  DeprecationWarning)
    if db is None:
        db = get_primary_db()

    # Turn the list into a dict keyed by shallow matches-key hash, the same
    # hash stored in the pa_statements table.
    stmt_dict = {s.get_hash(shallow=True): s for s in pa_stmt_list}

    if use_views:
        if fix_refs:
            raw_links = db.select_all([
                db.FastRawPaLink.mk_hash, db.FastRawPaLink.raw_json,
                db.FastRawPaLink.reading_id
            ], db.FastRawPaLink.mk_hash.in_(stmt_dict.keys()))
            rel_refs = ['pmid', 'rid']
            ref_cols = [getattr(db.ReadingRefLink, k) for k in rel_refs]
        else:
            raw_links = db.select_all(
                [db.FastRawPaLink.mk_hash, db.FastRawPaLink.raw_json],
                db.FastRawPaLink.mk_hash.in_(stmt_dict.keys()))

        # Cache of reading id -> pmid, filled in lazily in batches.
        rid_ref_dict = {}
        # Evidence objects waiting for their reading id to be resolved.
        myst_rid_rs_dict = defaultdict(list)

        def _flush_pmid_batch():
            # Resolve all accumulated reading ids to pmids with a single
            # query, back-fill the waiting Evidence objects, and reset the
            # pending batch. Only meaningful when fix_refs is True (ref_cols
            # is defined on that path only).
            if not myst_rid_rs_dict:
                return
            ref_data_list = db.select_all(
                ref_cols,
                db.ReadingRefLink.rid.in_(myst_rid_rs_dict.keys()))
            for pmid, batch_rid in ref_data_list:
                rid_ref_dict[batch_rid] = pmid
                for waiting_ev in myst_rid_rs_dict[batch_rid]:
                    waiting_ev.pmid = pmid
            myst_rid_rs_dict.clear()

        for info in raw_links:
            if fix_refs:
                mk_hash, raw_json, rid = info
            else:
                mk_hash, raw_json = info
                rid = None
            json_dict = json.loads(raw_json.decode('utf-8'))
            ev_json = json_dict.get('evidence', [])
            assert len(ev_json) == 1, \
                "Raw statements must have one evidence, got %d." % len(ev_json)
            ev = Evidence._from_json(ev_json[0])
            stmt_dict[mk_hash].evidence.append(ev)
            if fix_refs:
                # Membership test (rather than `.get(...) is None`) so that
                # a rid already resolved to a None pmid is not re-queried.
                if rid in rid_ref_dict:
                    ev.pmid = rid_ref_dict[rid]
                else:
                    myst_rid_rs_dict[rid].append(ev)
                    # Resolve in batches to bound the size of the IN clause.
                    if len(myst_rid_rs_dict) >= 1000:
                        _flush_pmid_batch()
        if fix_refs:
            # BUGFIX: the final partial batch (< 1000 pending rids) was
            # previously never resolved, leaving those Evidence pmids unset.
            _flush_pmid_batch()
    else:
        # Get the data from the database by joining the raw statement tables
        # directly (no materialized view).
        raw_list = db.select_all(
            [db.PAStatements.mk_hash, db.RawStatements],
            db.PAStatements.mk_hash.in_(stmt_dict.keys()),
            db.PAStatements.mk_hash == db.RawUniqueLinks.pa_stmt_mk_hash,
            db.RawUniqueLinks.raw_stmt_id == db.RawStatements.id)

        # Note that this step depends on the ordering being maintained.
        mk_hashes, raw_stmt_objs = zip(*raw_list)
        raw_stmts = get_raw_stmts_frm_db_list(db,
                                              raw_stmt_objs,
                                              fix_refs,
                                              with_sids=False)
        raw_stmt_mk_pairs = zip(mk_hashes, raw_stmts)

        # Now attach the evidence
        for mk_hash, raw_stmt in raw_stmt_mk_pairs:
            # Each raw statement can have just one piece of evidence.
            stmt_dict[mk_hash].evidence.append(raw_stmt.evidence[0])

    return