Example #1
def show_list(with_raw):
    """List the latest updates for each type of Statement."""
    import tabulate

    db = get_db('primary')
    rows = list(list_last_updates(db).items())
    header = ('Statement Type', 'Last Update')
    if with_raw:
        print("This may take a while...", end='', flush=True)
        raw_stmt_dates = list_latest_raw_stmts(db)
        print("\r", end='')
        new_rows = []
        for st, lu in rows:
            raw_date = raw_stmt_dates.get(st)
            if raw_date is None:
                new_rows.append((st, format_date(lu), "[None]", "No"))
            else:
                new_rows.append((st, format_date(lu), format_date(raw_date),
                                 "Yes" if raw_date > lu else "No"))
        rows = new_rows
        header += ('Latest Raw Stmt', 'Needs Update?')
    else:
        rows = [(st, format_date(lu)) for st, lu in rows]
    rows.sort()
    print(tabulate.tabulate(rows, header))
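A minimal usage sketch, assuming the helpers used above (list_last_updates, format_date, list_latest_raw_stmts) are available in the enclosing module:

show_list(with_raw=False)  # just the last preassembly update per statement type
show_list(with_raw=True)   # also query the latest raw statement dates (slower)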
Example #2
    def __check_stmts(self,
                      json_stmts,
                      check_support=False,
                      check_stmts=False):
        assert len(json_stmts) != 0, \
            'Did not get any statements.'
        stmts = stmts_from_json(json_stmts)
        for s in stmts:
            assert s.evidence, "Statement lacks evidence."
            for ev in s.evidence:
                if ev.source_api in {'reach', 'sparser', 'trips'} \
                        and ev.pmid is None:

                    # Check because occasionally there is genuinely no pmid.
                    from indra_db.util import get_db
                    db = get_db('primary')
                    tr = db.select_one(db.TextRef,
                                       db.TextRef.id == ev.text_refs['TRID'])
                    assert tr.pmid is None, \
                        ('Statement from reading missing pmid:\n%s\n%s.'
                         % (s, json.dumps(ev.to_json(), indent=2)))

        # To allow for faster response-times, we currently do not include
        # support links in the response.
        if check_support:
            assert any([s.supports + s.supported_by for s in stmts]),\
                ("Some statements lack support: %s."
                 % str([str(s) for s in stmts if not s.supports+s.supported_by]))
            if check_stmts:
                assert all([not s1.matches(s2)
                            for s1, s2 in combinations(stmts, 2)]),\
                    ("Some statements match: %s."
                     % str([(s1, s2) for s1, s2 in combinations(stmts, 2)
                            if s1.matches(s2)]))
        return
Example #3
    def __init__(self, tcids, reader, verbose=True, reading_mode='unread',
                 rslt_mode='all', batch_size=1000, db=None, n_proc=1):
        self.tcids = tcids
        self.reader = reader
        self.reader.reset()
        self.verbose = verbose
        self.reading_mode = reading_mode
        self.rslt_mode = rslt_mode
        self.batch_size = batch_size
        self.n_proc = n_proc
        if db is None:
            self._db = get_db('primary')
        else:
            self._db = db
        self._tc_rd_link = \
            self._db.TextContent.id == self._db.Reading.text_content_id
        logger.info("Instantiating reading handler for reader %s with version "
                    "%s using reading mode %s and statement mode %s for %d "
                    "tcids." % (reader.name, reader.get_version(),
                                reading_mode, rslt_mode, len(tcids)))

        # To be filled.
        self.extant_readings = []
        self.new_readings = []
        self.result_outputs = []
        self.starts = {}
        self.stops = {}
        return
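A hedged instantiation sketch; only the __init__ body is shown above, so the class name and the reader factory below are assumptions:

reader = make_reader('reach')                 # hypothetical reader factory
handler = DbReadingManager(tcids=[1, 2, 3],   # hypothetical class name
                           reader=reader, reading_mode='unread',
                           rslt_mode='all', batch_size=1000)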
Example #4
def update_curations():
    CURATIONS['cache'] = {}

    attr_maps = [('tag', 'error_type'), ('text', 'comment'),
                 ('curator', 'email'), 'source', 'ip', 'date', 'id',
                 ('pa_hash', 'stmt_hash'), 'source_hash']

    # Build up the curation dict.
    db = get_db('primary')
    curations = db.select_all(db.Curation)
    for curation in curations:
        key = (curation.pa_hash, curation.source_hash)
        if key not in CURATIONS['cache']:
            CURATIONS['cache'][key] = []

        cur_dict = {}
        for attr_map in attr_maps:
            if isinstance(attr_map, tuple):
                db_attr, dict_key = attr_map
                cur_dict[dict_key] = getattr(curation, db_attr)
            else:
                cur_dict[attr_map] = getattr(curation, attr_map)
        CURATIONS['cache'][key].append(cur_dict)

    CURATIONS['last_updated'] = datetime.now()
    return
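A sketch of the module-level cache this function assumes; the exact initialization is an assumption, but the keys match those used above:

CURATIONS = {'cache': {}, 'last_updated': None}  # assumed global

update_curations()
print(len(CURATIONS['cache']), 'curated (pa_hash, source_hash) pairs')
print('last refreshed:', CURATIONS['last_updated'])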
Example #5
def get_raw_stmt_jsons_from_agents(agents=None, stmt_type=None, db=None,
                                   max_stmts=None, offset=None):
    """Get Raw statement jsons from a list of agent refs and Statement type."""
    if db is None:
        db = get_db('primary')

    if agents is None:
        agents = []

    # Turn the agents parameters into an intersection of queries for stmt ids.
    entity_queries = []
    for role, ag_dbid, ns in agents:
        # Make the id match paradigms for the database.
        ag_dbid = regularize_agent_id(ag_dbid, ns)

        # Sanitize wildcards.
        for char in ['%', '_']:
            ag_dbid = ag_dbid.replace(char, '\\' + char)

        # Generate the query
        q = db.session.query(
            db.RawAgents.stmt_id.label('stmt_id')
        ).filter(
            db.RawAgents.db_id.like(ag_dbid)
        )

        if ns is not None:
            q = q.filter(db.RawAgents.db_name.like(ns))

        if role is not None:
            q = q.filter(db.RawAgents.role == role.upper())

        entity_queries.append(q)

    # Add a constraint for the statement type.
    if stmt_type is not None:
        q = db.session.query(
            db.RawStatements.id.label('stmt_id')
        ).filter(
            db.RawStatements.type == stmt_type
        )
        entity_queries.append(q)

    # Generate the sub-query.
    ag_query_al = intersect_all(*entity_queries).alias('intersection')
    ag_query = db.session.query(ag_query_al).distinct().subquery('ag_stmt_ids')

    # Get the raw statement JSONs from the database.
    res = get_raw_stmt_jsons([db.RawStatements.id == ag_query.c.stmt_id], db=db,
                             max_stmts=max_stmts, offset=offset)
    return res
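Each entry of agents is unpacked above as a (role, db_id, namespace) triple. A hedged usage sketch, with placeholder values:

jsons = get_raw_stmt_jsons_from_agents(
    agents=[('SUBJECT', 'MEK', 'FPLX')],  # role/id/namespace are placeholders
    stmt_type='Phosphorylation',
    max_stmts=10)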
Example #6
def get_raw_statements_for_pmids(pmids, mode='all', batch_size=100):
    """Return EmmaaStatements based on extractions from given PMIDs.

    Parameters
    ----------
    pmids : set or list of str
        A set of PMIDs to find raw INDRA Statements for in the INDRA DB.
    mode : 'all' or 'distilled'
        The 'distilled' mode makes sure that the "best", non-redundant
        set of raw statements is found across potentially redundant text
        contents and reader versions. The 'all' mode doesn't do such
        distillation but is significantly faster.
    batch_size : Optional[int]
        Determines how many PMIDs to fetch statements for in each
        iteration. Default: 100.

    Returns
    -------
    dict
        A dict keyed by PMID whose values are lists of INDRA Statements
        obtained from that PMID.
    """
    db = get_db('primary')
    logger.info(f'Getting raw statements for {len(pmids)} PMIDs')
    all_stmts = defaultdict(list)
    for pmid_batch in tqdm.tqdm(batch_iter(pmids,
                                           return_func=set,
                                           batch_size=batch_size),
                                total=len(pmids) / batch_size):
        if mode == 'distilled':
            clauses = [
                db.TextRef.pmid.in_(pmid_batch),
                db.TextContent.text_ref_id == db.TextRef.id,
                db.Reading.text_content_id == db.TextContent.id,
                db.RawStatements.reading_id == db.Reading.id
            ]
            distilled_stmts = distill_stmts(db,
                                            get_full_stmts=True,
                                            clauses=clauses)
            for stmt in distilled_stmts:
                all_stmts[stmt.evidence[0].pmid].append(stmt)
        else:
            id_stmts = \
                get_raw_stmt_jsons_from_papers(pmid_batch, id_type='pmid',
                                               db=db)
            for pmid, stmt_jsons in id_stmts.items():
                all_stmts[pmid] += stmts_from_json(stmt_jsons)
    all_stmts = dict(all_stmts)
    return all_stmts
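A hedged usage sketch with placeholder PMIDs:

stmts_by_pmid = get_raw_statements_for_pmids({'12345678', '23456789'},
                                             mode='all')
for pmid, stmts in stmts_by_pmid.items():
    print(pmid, len(stmts))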
Example #7
def run_preassembly(mode, project_name):
    """Construct a submitter and begin submitting jobs to Batch for preassembly.

    This function determines which statement types need to be updated and how
    far back they go, creates the appropriate
    :class:`PreassemblySubmitter
    <indra_db.preassembly.submitter.PreassemblySubmitter>`
    instance, and runs the jobs with pre-set parameters on the statement
    types that need updating.

    Parameters
    ----------
    mode : str
        Either 'update', to incrementally preassemble only statement types
        with new raw statements, or 'create', to run preassembly from scratch
        (requires an empty pa_statements table).
    project_name : str
        This name is used to tag the various AWS resources used for accounting
        purposes.
    """
    from indra_db.preassembly.submitter import VALID_STATEMENTS, \
        PreassemblySubmitter
    db = get_db('primary')
    if mode == 'update':
        # Find the latest update for each statement type.
        last_updates = list_last_updates(db)

        # Get the most recent raw statement datetimes
        latest_raw_stmts = list_latest_raw_stmts(db)

        # Only include statements types that have new raw statements.
        need_to_update = [
            s_type for s_type, last_upd in last_updates.items()
            if s_type in latest_raw_stmts.keys()
            and latest_raw_stmts[s_type] > last_upd
        ]
    else:
        # Make sure the pa_statements table is truly empty.
        if db.select_one(db.PAStatements):
            raise IndraDbException("Please clear the pa_statements table "
                                   "before running create. If you want to run "
                                   "an incremental update, please run with "
                                   "mode 'update'.")

        # Just run them all.
        need_to_update = VALID_STATEMENTS[:]

    # Create the submitter, and run it.
    basename = datetime.utcnow().strftime('%Y%m%d_%H%M%S')
    ps = PreassemblySubmitter(basename, mode, project_name=project_name)
    ps.set_max_jobs(4)
    ps.run(need_to_update, 100000, True, stagger=600, poll_interval=120)
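A hedged invocation; the project name is a placeholder:

run_preassembly('update', 'my-project')  # or mode 'create' on an empty table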
Example #8
def main(project_name):
    db = get_db('primary')
    pa_updates = db.select_all(db.PreassemblyUpdates)
    last_full_update = max(filter_updates(None, pa_updates))
    last_updates = {st: max(filter_updates(st, pa_updates) | {last_full_update})
                    for st in VALID_STATEMENTS}

    need_to_update = []
    for stmt_type, last_update in last_updates.items():
        res = db.select_one(db.RawStatements,
                            db.RawStatements.type == stmt_type,
                            db.RawStatements.create_date > last_update)
        if res:
            need_to_update.append(stmt_type)

    basename = datetime.utcnow().strftime('%Y%m%d_%H%M%S')
    ps = PreassemblySubmitter(basename, 'update', project_name=project_name)
    ps.set_max_jobs(4)
    ps.run(need_to_update, 100000, True, stagger=600, poll_interval=120)
Example #9
def get_raw_stmt_jsons(clauses=None, db=None, max_stmts=None, offset=None):
    """Get Raw Statements from the principle database, given arbitrary clauses.
    """
    if db is None:
        db = get_db('primary')

    if clauses is None:
        clauses = []

    q = db.session.query(
        db.RawStatements.id,
        db.RawStatements.json,
        db.Reading.id,
        db.TextContent.id,
        db.TextRef
    ).filter(
        *clauses
    ).outerjoin(
        db.Reading,
        db.Reading.id == db.RawStatements.reading_id
    ).outerjoin(
        db.TextContent,
        db.TextContent.id == db.Reading.text_content_id
    ).outerjoin(
        db.TextRef,
        db.TextRef.id == db.TextContent.text_ref_id
    )

    if max_stmts is not None:
        q = q.limit(max_stmts)

    if offset is not None:
        q = q.offset(offset)

    raw_stmt_jsons = {}
    for sid, json_bytes, rid, tcid, tr in q.all():
        raw_j = json.loads(json_bytes)
        if rid is not None:
            _fix_evidence(raw_j['evidence'][0], rid, tcid, tr.get_ref_dict())
        raw_stmt_jsons[sid] = raw_j

    return raw_stmt_jsons
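Since clauses are arbitrary SQLAlchemy filter expressions over the manager's tables, a minimal sketch mirroring the clause style used above:

db = get_db('primary')
jsons = get_raw_stmt_jsons([db.RawStatements.type == 'Activation'],
                           db=db, max_stmts=5)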
Example #10
def get_curations(db=None, **params):
    """Get all curations for a certain level given certain criteria."""
    if db is None:
        db = get_db('primary')
    cur = db.Curation

    constraints = []
    for key, val in params.items():
        if key == 'hash_val':
            key = 'pa_hash'
        elif key == 'ev_hash':
            key = 'source_hash'

        if isinstance(val, (list, set, tuple)):
            constraints.append(getattr(cur, key).in_(val))
        else:
            constraints.append(getattr(cur, key) == val)

    return [c.to_json() for c in db.select_all(cur, *constraints)]
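A hedged usage sketch; keyword names map onto Curation columns, with hash_val and ev_hash translated to pa_hash and source_hash as above, and all values below placeholders:

curs = get_curations(tag='grounding')           # single-value constraint
curs = get_curations(hash_val=[-12345, 67890])  # sequences become IN (...)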
Example #11
def get_curator_counts(db=None):
    """Return a Counter of the number of curations submitted by each user.

    Parameters
    ----------
    db : Optional[DatabaseManager]
        A database manager object used to access the database. If not given,
        the database configured as primary is used.

    Returns
    -------
    collections.Counter
        A Counter of curator users by the number of curations they have
        submitted.
    """
    if db is None:
        db = get_db('primary')
    res = db.select_all(db.Curation)
    curators = [r.curator for r in res]
    counter = Counter(curators)
    return counter
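A usage sketch built on the returned collections.Counter:

counts = get_curator_counts()
for curator, n_curations in counts.most_common(5):
    print(curator, n_curations)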
Example #12
File: xdd_client.py  Project: cthoyt/emmaa
def get_document_figures(paper_id, paper_id_type):
    """Get figures and tables from a given paper.

    Parameters
    ----------
    paper_id : str or int
        ID of a paper.
    paper_id_type : str
        A name of a paper ID type (PMID, PMCID, DOI, TRID).

    Returns
    -------
    figures : list[tuple]
        A list of tuples where each tuple is a figure title and bytes content.
    """
    paper_id_type = paper_id_type.upper()
    if paper_id_type == 'DOI':
        doi = paper_id
    else:
        db = get_db('primary')
        if paper_id_type == 'TRID':
            tr = db.select_one(db.TextRef, db.TextRef.id == paper_id)
        elif paper_id_type == 'PMID':
            tr = db.select_one(db.TextRef, db.TextRef.pmid == paper_id)
        elif paper_id_type == 'PMCID':
            tr = db.select_one(db.TextRef, db.TextRef.pmcid == paper_id)
        else:
            raise ValueError(f'Unsupported paper_id_type: {paper_id_type}')
        ref_dict = tr.get_ref_dict()
        doi = ref_dict.get('DOI')
    if not doi:
        logger.warning(f'Could not get DOI from {paper_id_type} {paper_id}, '
                       'returning 0 figures and tables')
        return []
    objects = get_document_objects(doi)
    if not objects:
        return []
    figures = []
    for obj in objects:
        figures.append(get_figure_from_document_object(obj))
    logger.info(f'Returning {len(figures)} figures and tables.')
    return figures
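A hedged usage sketch with a placeholder PMID; DOI lookups skip the database round-trip entirely:

figures = get_document_figures('32221519', 'pmid')
for title, content_bytes in figures:
    print(title, len(content_bytes))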
Example #13
def _get_trid_title(trid):
    db = get_db('primary')
    tc = db.select_one(db.TextContent, db.TextContent.text_ref_id == trid,
                       db.TextContent.text_type == 'title')
    if tc:
        title = unpack(tc.content)
        return title
    tr = db.select_one(db.TextRef, db.TextRef.id == trid)
    ref_dict = tr.get_ref_dict()
    if 'PMID' in ref_dict:
        pmid = ref_dict['PMID']
        pmids_to_titles = _get_pmid_titles([pmid])
        if pmid in pmids_to_titles:
            return pmids_to_titles[pmid]
    if 'PMCID' in ref_dict:
        title = _get_pmcid_title(ref_dict['PMCID'])
        if title:
            return title
    if 'DOI' in ref_dict:
        title = _get_doi_title(ref_dict['DOI'])
        if title:
            return title
Example #14
    def get_paper_titles_and_links(self, trids):
        """Return a dictionary mapping paper IDs to their titles."""
        if self.paper_id_type == 'pii':
            return {}, {}
        db = get_db('primary')
        trs = db.select_all(db.TextRef, db.TextRef.id.in_(trids))
        ref_dicts = [tr.get_ref_dict() for tr in trs]
        trid_to_title = {}
        trid_to_link = {}
        trid_to_pmids = {}
        trid_to_pmcids = {}
        trid_to_dois = {}
        check_in_db = []
        # Map TRIDs to available PMIDs, DOIs, PMCIDs in this order
        for ref_dict in ref_dicts:
            link = _get_publication_link(ref_dict)
            trid_to_link[str(ref_dict['TRID'])] = link
            if ref_dict.get('PMID'):
                trid_to_pmids[ref_dict['TRID']] = ref_dict['PMID']
            elif ref_dict.get('PMCID'):
                trid_to_pmcids[ref_dict['TRID']] = ref_dict['PMCID']
            elif ref_dict.get('DOI'):
                trid_to_dois[ref_dict['TRID']] = ref_dict['DOI']

        logger.info(f'From {len(trids)} TRIDs got {len(trid_to_pmids)} PMIDs,'
                    f' {len(trid_to_pmcids)} PMCIDs, {len(trid_to_dois)} DOIs')

        # First get titles for available PMIDs
        if trid_to_pmids:
            logger.info(f'Getting titles for {len(trid_to_pmids)} PMIDs')
            pmids = list(trid_to_pmids.values())
            pmids_to_titles = _get_pmid_titles(pmids)

            for trid, pmid in trid_to_pmids.items():
                if pmid in pmids_to_titles:
                    trid_to_title[str(trid)] = pmids_to_titles[pmid]
                else:
                    check_in_db.append(trid)

        # Then get titles for available PMCIDs
        if trid_to_pmcids:
            logger.info(f'Getting titles for {len(trid_to_pmcids)} PMCIDs')
            for trid, pmcid in trid_to_pmcids.items():
                title = _get_pmcid_title(pmcid)
                if title:
                    trid_to_title[str(trid)] = title
                else:
                    check_in_db.append(trid)

        # Then get titles for available DOIs
        if trid_to_dois:
            logger.info(f'Getting titles for {len(trid_to_dois)} DOIs')
            for trid, doi in trid_to_dois.items():
                title = _get_doi_title(doi)
                if title:
                    trid_to_title[str(trid)] = title
                else:
                    check_in_db.append(trid)

        # Try getting remaining titles from db
        if check_in_db:
            logger.info(f'Getting titles for {len(check_in_db)} remaining '
                        'TRIDs from DB')
            tcs = db.select_all(db.TextContent,
                                db.TextContent.text_ref_id.in_(check_in_db),
                                db.TextContent.text_type == 'title')
            for tc in tcs:
                title = unpack(tc.content)
                trid_to_title[str(tc.text_ref_id)] = title

        return trid_to_title, trid_to_link
Example #15
def get_raw_stmt_jsons_from_papers(id_list, id_type='pmid', db=None,
                                   max_stmts=None, offset=None):
    """Get raw statement jsons for a given list of papers.

    Parameters
    ----------
    id_list : list
        A list of ints or strs that are ids of papers of type `id_type`.
    id_type : str
        The type of ids given in id_list, e.g. 'pmid', 'pmcid', 'trid'.
        Default is 'pmid'.
    db : :py:class:`DatabaseManager`
        Optionally specify a database manager that attaches to something
        besides the primary database, for example a local database instance.

    Returns
    -------
    result_dict : dict
        A dictionary keyed by id (of `id_type`) with a list of raw statement
        json objects as each value. Ids for which no statements are found will
        not be included in the dict.
    """
    if db is None:
        db = get_db('primary')

    # Get the attribute for this id type.
    if id_type == 'pmid':
        id_constraint = db.TextRef.pmid_in(id_list, filter_ids=True)
    elif id_type == 'pmcid':
        id_constraint = db.TextRef.pmcid_in(id_list, filter_ids=True)
    elif id_type == 'doi':
        id_constraint = db.TextRef.doi_in(id_list, filter_ids=True)
    else:
        id_constraint = _get_id_col(db.TextRef, id_type).in_(id_list)

    # Get the results.
    res = db.select_all([db.TextRef, db.RawStatements.json], id_constraint,
                        *db.link(db.RawStatements, db.TextRef))

    # Organize the results into a dict of lists keyed by id value.
    # Fix pmids along the way.
    result_dict = defaultdict(list)
    for tr, rjson_bytes in res:
        id_val = _get_id_col(tr, id_type)

        # Decode and unpack the json
        rjson = json.loads(rjson_bytes.decode('utf-8'))

        # Fix the pmids in this json.
        rjson['evidence'][0]['pmid'] = tr.pmid

        # Set the text_refs in this json
        ev = rjson['evidence'][0]
        if 'text_refs' not in ev.keys():
            ev['text_refs'] = {}
        for idt in ['trid', 'pmid', 'pmcid', 'doi']:
            ev['text_refs'][idt.upper()] = _get_id_col(tr, idt)

        # Add this to the results.
        result_dict[id_val].append(rjson)

    return result_dict
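A hedged usage sketch with placeholder PMIDs; papers with no statements are simply absent from the result:

jsons_by_pmid = get_raw_stmt_jsons_from_papers(['20164304', '18621663'],
                                               id_type='pmid')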
Example #16
def get_pmids_for_mesh_terms(mesh_list):
    db = get_db('primary')
    res = db.select_all(db.MeshRefAnnotations.pmid,
                        db.MeshRefAnnotations.mesh_id.in_(mesh_list))
    return [t[0] for t in res]
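A minimal sketch; the MeSH descriptor ID below is a placeholder:

pmids = get_pmids_for_mesh_terms(['D000818'])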
Example #17
    def test_mesh_concept_ev_limit(self):
        """Test a specific bug in which evidence was duplicated.

        When querying for mesh concepts, with an evidence limit, the evidence
        was repeated numerous times.
        """
        db = get_db('primary')
        q = HasAgent('ACE2') & FromMeshIds(['C000657245'])
        resp, dt, size = self.__time_query('post',
                                           'statements/from_query_json',
                                           'limit=50&ev_limit=6',
                                           query=q.to_json(),
                                           with_auth=True)
        assert resp.status_code == 200, f"Query failed: {resp.data.decode()}"
        assert dt < 30, "Query would have timed out."
        if dt > 15:
            logger.warning(f"Query took a long time: {dt} seconds.")

        resp_json = json.loads(resp.data)
        pmids = set()
        for h, data in resp_json['results'].items():
            ev_list = data['evidence']
            assert len(ev_list) <= 6, "Evidence limit exceeded."
            ev_tuples = {(ev.get('text'), ev.get('source_hash'),
                          ev.get('source_api'), str(ev.get('text_refs')))
                         for ev in ev_list}
            assert len(ev_tuples) == len(ev_list), "Evidence is not unique."
            for ev in ev_list:
                found_pmid = False
                if 'pmid' in ev:
                    pmids.add(ev['pmid'])
                    found_pmid = True

                if 'text_refs' in ev:
                    tr_dict = ev['text_refs']
                    if 'TRID' in tr_dict:
                        tr = db.select_one(db.TextRef,
                                           db.TextRef.id == tr_dict['TRID'])
                        pmids.add(tr.pmid)
                        found_pmid = True
                    if 'PMID' in tr_dict:
                        pmids.add(tr_dict['PMID'])
                        found_pmid = True
                    if 'DOI' in tr_dict:
                        tr_list = db.select_all(
                            db.TextRef, db.TextRef.doi_in([tr_dict['DOI']]))
                        pmids |= {tr.pmid for tr in tr_list if tr.pmid}
                        found_pmid = True

                assert found_pmid,\
                    "How could this have been mapped to mesh?"
        pmids = {int(pmid) for pmid in pmids if pmid is not None}

        mesh_pmids = {
            n
            for n, in db.select_all(db.MeshRefAnnotations.pmid_num,
                                    db.MeshRefAnnotations.pmid_num.in_(pmids),
                                    db.MeshRefAnnotations.mesh_num == 657245,
                                    db.MeshRefAnnotations.is_concept.is_(True))
        }
        mesh_pmids |= {
            n
            for n, in db.select_all(
                db.MtiRefAnnotationsTest.pmid_num,
                db.MtiRefAnnotationsTest.pmid_num.in_(pmids),
                db.MtiRefAnnotationsTest.mesh_num == 657245,
                db.MtiRefAnnotationsTest.is_concept.is_(True))
        }

        assert pmids == mesh_pmids, "Not all pmids mapped to the mesh term."
Example #18
def submit_curation(hash_val,
                    tag,
                    curator,
                    ip,
                    text=None,
                    ev_hash=None,
                    source='direct_client',
                    pa_json=None,
                    ev_json=None,
                    db=None):
    """Submit a curation for a given preassembled or raw extraction.

    Parameters
    ----------
    hash_val : int
        The hash corresponding to the statement.
    tag : str
        A very short phrase categorizing the error or type of curation.
    curator : str
        The name or identifier for the curator.
    ip : str
        The IP address of the user's computer.
    text : str
        A brief description of the problem.
    ev_hash : int
        A hash of the sentence and other evidence information. Elsewhere
        referred to as `source_hash`.
    source : str
        The name of the access point through which the curation was performed.
        The default is 'direct_client', meaning this function was used
        directly. Any higher-level application should identify itself here.
    pa_json : Optional[dict]
        The JSON of a preassembled or raw statement that was curated. If None,
        we will try to get the pa_json from the database.
    ev_json : Optional[dict]
        The JSON of the evidence that was curated. This cannot be retrieved from
        the database if not given.
    db : DatabaseManager
        A database manager object used to access the database.
    """
    if db is None:
        db = get_db('primary')

    if pa_json is None:
        pa_json_strs = db.select_one(db.PAStatements.json,
                                     db.PAStatements.mk_hash == int(hash_val))
        if pa_json_strs is not None:
            pa_json = json.loads(pa_json_strs[0])

    inp = {
        'tag': tag,
        'text': text,
        'curator': curator,
        'ip': ip,
        'source': source,
        'pa_hash': hash_val,
        'source_hash': ev_hash,
        'pa_json': pa_json,
        'ev_json': ev_json
    }

    logger.info("Adding curation: %s" % str(inp))

    try:
        dbid = db.insert(db.Curation, **inp)
    except IntegrityError as e:
        logger.error("Got a bad entry.")
        msg = e.args[0]
        detail_line = msg.splitlines()[1]
        m = re.match("DETAIL: .*?\(pa_hash\)=\((\d+)\).*?not present.*?pa.*?",
                     detail_line)
        if m is None:
            raise e
        else:
            h = m.groups()[0]
            assert int(h) == int(hash_val), \
                "Erred hash %s does not match input hash %s." % (h, hash_val)
            logger.error("Bad hash: %s" % h)
            raise BadHashError(h)
    return dbid
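A hedged usage sketch; all values below are placeholders, and BadHashError is raised when hash_val is not present in pa_statements:

cur_id = submit_curation(hash_val=-30896426976861140, tag='grounding',
                         curator='user@example.com', ip='127.0.0.1',
                         text='The agent is mis-grounded.',
                         ev_hash=1234567890)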
Example #19
        drug_list.append((compound, counts_by_name[name]))
    with open('indra_drug_list.tsv', 'w') as fh:
        for compound in drug_list:
            fh.write(
                '%s\t%s\t%s\n' %
                (compound[0], compound[1], 'INDRA (text mining/databases)'))


misgrounding_map = {
    'CTSL': ['MEP'],
    'CTSB': ['APPs'],
    'FURIN': ['pace', 'Fur']
}

if __name__ == '__main__':
    db = get_db('primary')
    db_curations = get_curations(db=db)
    tp = tas.process_from_web()
    #targets = ['TMPRSS2', 'ACE2', 'FURIN', 'CTSB', 'CTSL']
    targets = [
        'PIKFYVE', 'INPP5E', 'PIK3C2A', 'PIK3C2B', 'PIK3C2G', 'PI4K2A',
        'PI4K2B', 'PI4KB', 'EHD3', 'PIK3C3'
    ]
    all_stmts = []
    all_ev_counts = {}
    with open('ctd_drugbank_tas_pikfyve.pkl', 'rb') as f:
        all_ctd_stmts = pickle.load(f)
        all_ctd_stmts = filter_neg(all_ctd_stmts)
    for target in targets:
        stmts = get_statements(target)
        fname = '%s.html' % target
Example #20
def get_direct_raw_stmt_jsons_from_agents(agents=None, stmt_type=None, db=None,
                                          max_stmts=None, offset=None):
    """Get Raw statement jsons from a list of agent refs and Statement type."""
    if db is None:
        db = get_db('primary')

    # Turn the agents parameters into an intersection of queries for stmt ids.
    entity_queries = []
    for role, ag_dbid, ns in agents:
        # Make the id match paradigms for the database.
        ag_dbid = regularize_agent_id(ag_dbid, ns)

        # Sanitize wildcards.
        for char in ['%', '_']:
            ag_dbid = ag_dbid.replace(char, '\\' + char)

        # Generate the query
        q = (db.session
             .query(db.RawAgents.stmt_id.label('stmt_id'))
             .filter(db.RawAgents.db_id.like(ag_dbid)))

        if ns is not None:
            q = q.filter(db.RawAgents.db_name.like(ns))

        if role is not None:
            q = q.filter(db.RawAgents.role == role.upper())

        entity_queries.append(q)

    ag_query_al = intersect_all(*entity_queries).alias('intersection')
    ag_query = db.session.query(ag_query_al).distinct().subquery('ag_stmt_ids')

    # Create a query for the raw statement json
    rid_c = db.RawStatements.reading_id.label('rid')
    json_q = (db.session.query(db.RawStatements.json, rid_c, ag_query)
              .filter(db.RawStatements.id == ag_query.c.stmt_id))

    # Filter by type, if applicable.
    if stmt_type is not None:
        json_q = json_q.filter(db.RawStatements.type == stmt_type)

    # Apply count limits and such.
    if max_stmts is not None:
        json_q = json_q.limit(max_stmts)

    if offset is not None:
        json_q = json_q.offset(offset)

    # Construct final query, that joins with text ref info on the database.
    json_q = json_q.subquery('json_content')
    ref_q = (db.session
             .query(json_q, db.Reading.text_content_id.label('tcid'),
                    db.TextRef)
             .outerjoin(db.Reading, db.Reading.id == json_q.c.rid)
             .join(db.TextContent,
                   db.TextContent.id == db.Reading.text_content_id)
             .join(db.TextRef, db.TextRef.id == db.TextContent.text_ref_id))

    # Process the jsons, filling text ref info.
    raw_stmt_jsons = {}
    for json_bytes, rid, sid, tcid, tr in ref_q.all():
        raw_j = json.loads(json_bytes)
        ev = raw_j['evidence'][0]
        ev['text_refs'] = tr.get_ref_dict()
        ev['text_refs']['TCID'] = tcid
        ev['text_refs']['READING_ID'] = rid
        if tr.pmid:
            ev['pmid'] = tr.pmid

        raw_stmt_jsons[sid] = raw_j

    return raw_stmt_jsons
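A hedged usage sketch; the agents triples follow the same (role, db_id, namespace) convention as get_raw_stmt_jsons_from_agents above, and the values are placeholders:

jsons = get_direct_raw_stmt_jsons_from_agents(
    agents=[(None, '1956', 'EGID')],  # a None role matches any role
    stmt_type='Inhibition', max_stmts=10)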