Example #1
def upload_readings(output_list, db=None):
    """Put the reading output on the database."""
    if db is None:
        db = get_primary_db()

    # Create the list of records to be copied, ensuring no uniqueness conflicts
    r_list = db.select_all(
        db.Reading,
        db.Reading.text_content_id.in_([rd.tcid for rd in output_list]))
    existing_tcid_set = {r.text_content_id for r in r_list}
    upload_list = []
    for reading_data in output_list:
        # First check if this tcid is even in the set of existing tcids in the
        # readings table.
        if reading_data.tcid in existing_tcid_set:
            r_tcid_list = [
                r for r in r_list if r.text_content_id == reading_data.tcid
            ]
            # Now check for any exact matches:
            if any([reading_data.matches(r) for r in r_tcid_list]):
                continue

        # If there were no conflicts, we can add this to the copy list.
        upload_list.append(reading_data.make_tuple())

    # Copy into the database.
    logger.info("Adding %d/%d reading entries to the database." %
                (len(upload_list), len(output_list)))
    db.copy('reading', upload_list, ReadingData.get_cols())
    return
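
A minimal usage sketch for the uploader above; the import path is hypothetical, a live INDRA DB is assumed, and `outputs` stands for a list of ReadingData instances produced by a reader run (see Example #18 for how such a list is made):

# Hypothetical import path; upload_readings is the function shown above.
from read_db import upload_readings, get_primary_db

db = get_primary_db()
# Entries whose text content has already been read by the same reader and
# version are skipped; the rest are bulk-copied into the 'reading' table.
upload_readings(outputs, db=db)  # outputs: list of ReadingData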
Example #2
def produce_statements(output_list,
                       enrich=True,
                       no_upload=False,
                       pickle_file=None,
                       n_proc=1,
                       db=None):
    """Convert the reader output into a list of StatementData instances."""
    if db is None:
        db = get_primary_db()

    if enrich:
        _enrich_reading_data(output_list, db=db)

    stmt_data_list = make_statements(output_list, n_proc)

    if not no_upload:
        try:
            upload_statements(stmt_data_list, db=db)
        except Exception as e:
            logger.exception(e)
            if pickle_file is None:
                pickle_file = ("failure_stmt_dump_%s.pkl" %
                               datetime.now().strftime('%Y%m%d_%H%M%S'))
            logger.error(
                "Could not upload statements. Results pickled in: %s." %
                pickle_file)
    if pickle_file is not None:
        with open(pickle_file, 'wb') as f:
            pickle.dump([sd.statement for sd in stmt_data_list], f)
        print("Statements pickled in %s." % pickle_file)

    return stmt_data_list
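
A usage sketch, assuming `outputs` is the ReadingData list returned by `produce_readings` (Example #17) and the surrounding module is importable:

# With no_upload=True, nothing is written to the database; the extracted
# statements are pickled to the given file and returned as StatementData.
stmt_data = produce_statements(outputs, no_upload=True,
                               pickle_file='stmts.pkl', n_proc=4)
stmts = [sd.statement for sd in stmt_data]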
Example #3
def get_priority_tcids(id_dict, priorities, always_add=None, db=None):
    """For all ids, besides tcids, choose best content available.

    This function will convert all ids to tcids.
    """
    if db is None:
        db = get_primary_db()

    def is_better(new, old):
        if new in priorities and old in priorities:
            return priorities.index(new) < priorities.index(old)
        return False

    logger.debug("Getting content prioritized by %s." % str(priorities))
    tcids = set(id_dict.pop('tcid', []))
    clauses = get_clauses(id_dict, db)
    tcid_source = set()
    for clause in clauses:
        q = (db.session.query(db.TextRef.id, db.TextContent.id,
                              db.TextContent.source).filter(
                                  db.TextContent.text_ref_id == db.TextRef.id,
                                  clause))
        id_set = set(q.all())
        logger.debug("Got %d more ids." % len(id_set))
        tcid_source |= id_set
    logger.debug("Got %d id's total." % len(tcid_source))
    tr_best = {}
    for trid, tcid, source in tcid_source:
        if trid not in tr_best.keys() or is_better(source, tr_best[trid][0]):
            tr_best[trid] = (source, tcid)
        if always_add is not None and source in always_add:
            tcids.add(tcid)
    tcids |= {tcid for _, tcid in tr_best.values()}
    return tcids
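
Example #17 below shows a real call: priorities are content sources listed from most to least preferred, and sources in `always_add` are included regardless of ranking:

tcids = get_priority_tcids(id_dict,
                           ['pmc_oa', 'manuscripts', 'elsevier'],
                           always_add=['pubmed'],
                           db=db)
id_dict = {'tcid': list(tcids)}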
Example #4
def get_unique_text_refs():
    """Get unique INDRA DB TextRef IDs for all identifiers in CORD19.

    Queries TextRef IDs with PMIDs, PMCIDs, and DOIs from CORD19, then
    deduplicates to obtain a unique set of TextRefs.

    Returns
    -------
    list of TextRef
        The unique TextRef records found in the DB for the CORD19 ids.
    """
    pmcids = get_ids('pmcid')
    pmids = [fix_pmid(pmid) for pmid in get_ids('pubmed_id')]
    dois = [fix_doi(doi) for doi in get_ids('doi')]
    # Get unique text_refs from the DB
    db = get_primary_db()
    print("Getting TextRefs by PMCID")
    tr_pmcids = db.select_all(db.TextRef.id, db.TextRef.pmcid_in(pmcids))
    print("Getting TextRefs by PMID")
    tr_pmids = db.select_all(db.TextRef.id, db.TextRef.pmid_in(pmids))
    tr_dois = []
    for ix, doi_batch in enumerate(batch_iter(dois, 10000)):
        print("Getting Text Refs by DOI batch", ix)
        tr_doi_batch = db.select_all(
            db.TextRef.id, db.TextRef.doi_in(doi_batch, filter_ids=True))
        tr_dois.extend(tr_doi_batch)
    ids = set([
        res.id for res_list in (tr_dois, tr_pmcids, tr_pmids)
        for res in res_list
    ])
    print(len(ids), "unique TextRefs in DB")
    trs = db.select_all(db.TextRef, db.TextRef.id.in_(ids))
    return trs
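
The function returns TextRef records; their integer IDs can be extracted if needed. A usage sketch (a live primary DB with CORD19 content is assumed):

trs = get_unique_text_refs()
tr_ids = {tr.id for tr in trs}  # integer TextRef IDs for downstream use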
Example #5
    def __init__(self,
                 tcids,
                 reader,
                 verbose=True,
                 reading_mode='unread',
                 rslt_mode='all',
                 batch_size=1000,
                 db=None,
                 n_proc=1):
        self.tcids = tcids
        self.reader = reader
        self.reader.reset()
        self.verbose = verbose
        self.reading_mode = reading_mode
        self.rslt_mode = rslt_mode
        self.batch_size = batch_size
        self.n_proc = n_proc
        if db is None:
            self._db = get_primary_db()
        else:
            self._db = db
        self._tc_rd_link = \
            self._db.TextContent.id == self._db.Reading.text_content_id
        logger.info("Instantiating reading handler for reader %s with version "
                    "%s using reading mode %s and statement mode %s for %d "
                    "tcids." % (reader.name, reader.get_version(),
                                reading_mode, rslt_mode, len(tcids)))

        # To be filled.
        self.extant_readings = []
        self.new_readings = []
        self.result_outputs = []
        self.starts = {}
        self.stops = {}
        return
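
The enclosing class is not shown in this excerpt; a hypothetical instantiation (the class name `ReadingHandler` is a placeholder) might look like:

# `reader` is a Reader instance constructed elsewhere; the tcids are text
# content IDs to process. The class name stands in for the real one.
handler = ReadingHandler([101, 102, 103], reader,
                         reading_mode='unread', rslt_mode='all',
                         batch_size=500)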
Example #6
def get_text_refs_for_pubmed_search_term(search_term, **kwargs):
    """"Returns text ref IDs for PMIDs obtained using a PubMed search."""
    print('Searching for %s' % search_term)
    pmids = pubmed_client.get_ids(search_term, **kwargs)
    print('Getting TextRefs for %d PMIDs' % len(pmids))
    db = get_primary_db()
    tr_pmids = db.select_all(db.TextRef.id, db.TextRef.pmid_in(pmids))
    trids = {res.id for res in tr_pmids}
    return trids
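
A usage sketch; keyword arguments pass through to `pubmed_client.get_ids`, and a live primary DB is assumed:

trids = get_text_refs_for_pubmed_search_term('SARS-CoV-2')
print('%d TextRef IDs found' % len(trids))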
Example #7
def get_reach_readings(tr_dicts, dump_dir=None):
    db = get_primary_db()
    # Get text ref dicts with article metadata aligned between DB and CORD19
    # Get REACH readings
    reach_data = db.select_all((db.Reading, db.TextRef, db.TextContent.source,
                                db.TextContent.text_type),
                               db.TextRef.id.in_(tr_dicts.keys()),
                               db.TextContent.text_ref_id == db.TextRef.id,
                               db.Reading.text_content_id == db.TextContent.id,
                               db.Reading.reader == 'REACH')

    # Group readings by TextRef
    def tr_id_key_func(rd):
        return rd[1].id

    def content_priority_func(rd):
        text_type_priorities = {'fulltext': 0, 'abstract': 1, 'title': 2}
        source_priorities = {
            'pmc_oa': 0,
            'manuscripts': 1,
            'elsevier': 2,
            'pubmed': 3
        }
        return (rd[1].id, text_type_priorities[rd[3]],
                source_priorities[rd[2]])

    # Sort by TextRef ID and content type/source
    reach_data.sort(key=content_priority_func)
    # Iterate over groups
    rds_filt = []
    for tr_id, tr_group in groupby(reach_data, tr_id_key_func):
        rds = list(tr_group)
        best_reading = rds[0]
        tr_dicts[tr_id]['READING_ID'] = best_reading.Reading.id
        rds_filt.append(best_reading)
    # If a dump directory is given, put all files in it
    trs_by_cord = {}
    if dump_dir:
        json_dir = join(dump_dir, 'json')
        os.mkdir(json_dir)
        for reading_result in rds_filt:
            tr = reading_result.TextRef
            reading = reading_result.Reading
            # If the reading output is empty, skip
            if not reading.bytes:
                continue
            text_ref = tr_dicts[tr.id]
            cord_uid = text_ref['CORD19_UID']
            trs_by_cord[cord_uid] = text_ref
            with open(join(json_dir, f'{cord_uid}.json'), 'wt') as f:
                content = zlib.decompress(reading.bytes, 16 + zlib.MAX_WBITS)
                f.write(content.decode('utf8'))
        # Dump the metadata dictionary
        with open(join(dump_dir, 'metadata.json'), 'wt') as f:
            json.dump(trs_by_cord, f, indent=2)
    return rds_filt
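
A usage sketch, assuming `tr_dicts` maps TextRef IDs to metadata dicts that carry a 'CORD19_UID' key. Note that `os.mkdir` raises `FileExistsError` if `dump_dir` already contains a `json` subdirectory:

best_readings = get_reach_readings(tr_dicts, dump_dir='reach_dump')
# reach_dump/json/<cord_uid>.json holds the decompressed REACH output;
# reach_dump/metadata.json holds the per-article metadata dictionary.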
Example #8
def get_raw_stmts(tr_dicts, date_limit=None):
    """Return all raw stmts in INDRA DB for a given set of TextRef IDs.

    Parameters
    ----------
    tr_dicts : dict of text ref information
        Keys are text ref IDs (ints) mapped to dictionaries of text ref
        metadata.

    date_limit : Optional[int]
        If given, only include statements from readings created within the
        last `date_limit` days.

    Returns
    -------
    list of stmts
        Raw INDRA Statements retrieved from the INDRA DB.
    """
    # Get raw statement IDs from the DB for the given TextRefs
    db = get_primary_db()
    # Get statements for the given text refs
    text_ref_ids = list(tr_dicts.keys())
    print(f"Distilling statements for {len(text_ref_ids)} TextRefs")
    start = time.time()
    clauses = [
        db.TextRef.id.in_(text_ref_ids),
        db.TextContent.text_ref_id == db.TextRef.id,
        db.Reading.text_content_id == db.TextContent.id,
        db.RawStatements.reading_id == db.Reading.id
    ]
    if date_limit:
        start_date = (datetime.datetime.utcnow() -
                      datetime.timedelta(days=date_limit))
        print(f'Limiting to stmts from readings in the last {date_limit} days')
        clauses.append(db.Reading.create_date > start_date)
    db_stmts = distill_stmts(db, get_full_stmts=True, clauses=clauses)
    # Group statements by the ID of the TextRef they come from
    stmts_by_trid = {}
    for stmt in db_stmts:
        trid = stmt.evidence[0].text_refs['TRID']
        if trid not in stmts_by_trid:
            stmts_by_trid[trid] = [stmt]
        else:
            stmts_by_trid[trid].append(stmt)
    # For every statement, update the text ref dictionary of the evidence
    # object with the aligned DB/CORD19 dictionaries obtained from the
    # function cord19_metadata_for_trs:
    stmts_flat = []
    for tr_id, stmt_list in stmts_by_trid.items():
        tr_dict = tr_dicts[tr_id]
        if tr_dict:
            for stmt in stmt_list:
                stmt.evidence[0].text_refs.update(tr_dict)
        stmts_flat += stmt_list
    elapsed = time.time() - start
    print(f"{elapsed} seconds")
    return stmts_flat
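
A usage sketch; `tr_dicts` is keyed by TextRef ID as described in the docstring, and each returned statement has its `evidence[0].text_refs` updated in place with the aligned metadata:

# Restrict to statements from readings created in the last 30 days.
stmts = get_raw_stmts(tr_dicts, date_limit=30)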
Example #9
def get_raw_stmt_jsons_from_papers(id_list, id_type='pmid', db=None):
    """Get raw statement jsons for a given list of papers.

    Parameters
    ----------
    id_list : list
        A list of ints or strs that are ids of papers of type `id_type`.
    id_type : str
        Default is 'pmid'. The type of ids given in id_list, e.g. 'pmid',
        'pmcid', 'trid'.
    db : :py:class:`DatabaseManager`
        Optionally specify a database manager that attaches to something
        besides the primary database, for example a local database instance.

    Returns
    -------
    result_dict : dict
        A dictionary keyed by id (of `id_type`) with a list of raw statement
        json objects as each value. Ids for which no statements are found will
        not be included in the dict.
    """
    if db is None:
        db = get_primary_db()

    # Get the attribute for this id type.
    id_attr = _get_id_col(db.TextRef, id_type)

    # Get the results.
    res = db.select_all([db.TextRef, db.RawStatements.json],
                        id_attr.in_(id_list),
                        *db.link(db.RawStatements, db.TextRef))

    # Organize the results into a dict of lists keyed by id value.
    # Fix pmids along the way.
    result_dict = defaultdict(list)
    for tr, rjson_bytes in res:
        id_val = _get_id_col(tr, id_type)

        # Decode and unpack the json
        rjson = json.loads(rjson_bytes.decode('utf-8'))

        # Fix the pmids in this json.
        rjson['evidence'][0]['pmid'] = tr.pmid

        # Set the text_refs in this json
        ev = rjson['evidence'][0]
        if 'text_refs' not in ev.keys():
            ev['text_refs'] = {}
        for idt in ['trid', 'pmid', 'pmcid', 'doi']:
            ev['text_refs'][idt.upper()] = _get_id_col(tr, idt)

        # Add this to the results.
        result_dict[id_val].append(rjson)

    return result_dict
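
A usage sketch against a live primary DB; the PMIDs are illustrative:

stmt_jsons = get_raw_stmt_jsons_from_papers(['20805372', '27014235'],
                                            id_type='pmid')
for pmid, jsons in stmt_jsons.items():
    print(pmid, len(jsons), 'raw statement JSONs')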
Example #10
def get_id_dict(id_str_list):
    """Parse the list of id string into a dict."""
    id_types = get_primary_db().TextRef.__table__.columns.keys()
    id_types.remove('id')
    id_types += ['trid', 'tcid']
    id_dict = {id_type: [] for id_type in id_types}
    for id_entry in id_str_list:
        id_type, id_val = _convert_id_entry(id_entry, id_types)
        if id_type in ['trid', 'tcid']:
            id_dict[id_type].append(int(id_val))
        else:
            id_dict[id_type].append(id_val)
    return id_dict
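
A usage sketch; the '<id_type>:<id_value>' string format is an assumption here, since `_convert_id_entry` is not shown in this excerpt:

id_dict = get_id_dict(['pmid:20805372', 'pmcid:PMC3717945', 'trid:12345'])
# trid and tcid values are cast to int; all other id types stay strings.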
Example #11
    def test_statements_by_hashes_large_query(self):
        # TODO: Figure out a way to query hashes that isn't excruciatingly slow.
        # Get a set of hashes.
        db = get_primary_db()
        res = db.select_sample_from_table(1000, db.EvidenceCounts)
        hash_cnt_dict = {ev_cts.mk_hash: ev_cts.ev_count for ev_cts in res}

        # Run the test.
        resp, dt, size = self.__time_query('post',
                                           'statements/from_hashes',
                                           hashes=list(hash_cnt_dict.keys()))
        resp_dict = json.loads(resp.data.decode('utf-8'))
        self.__check_stmts(resp_dict['statements'].values())
        self.__check_time(dt, time_goal=20)
        return
Example #12
def upload_statements(stmt_data_list, db=None):
    """Upload the statements to the database."""
    if db is None:
        db = get_primary_db()

    logger.info("Uploading %d statements to the database." %
                len(stmt_data_list))
    db.copy('raw_statements', [s.make_tuple() for s in stmt_data_list],
            StatementData.get_cols())

    logger.info("Uploading agents to the database.")
    reading_id_set = set([sd.reading_id for sd in stmt_data_list])
    if len(reading_id_set):
        db_stmts = (db.select_one(db.RawStatements,
                                  db.RawStatements.uuid.like(s.statement.uuid))
                    for s in stmt_data_list)
        insert_agents(db, 'raw', db_stmts, verbose=True)
    return
Example #13
def _enrich_reading_data(reading_data_iter, db=None):
    """Get db ids for all ReadingData objects that correspond to a db ref.

    Note that the objects are modified IN PLACE, so nothing is returned, and if
    a copy of the objects is passed as an argument, this function will have no
    effect. This does nothing if the readings are not in the database.
    """
    logger.debug("Enriching the reading data with database refs.")
    if db is None:
        db = get_primary_db()
    possible_matches = db.select_all(
        'reading',
        db.Reading.text_content_id.in_(
            [rd.tcid for rd in reading_data_iter if rd.reading_id is None]))
    for rdata in reading_data_iter:
        for reading in possible_matches:
            if rdata.matches(reading):
                rdata.reading_id = reading.id
                break
    return
Example #14
def get_db_readings(id_dict,
                    readers,
                    force_fulltext=False,
                    batch_size=1000,
                    db=None):
    """Get readings from the database."""
    if db is None:
        db = get_primary_db()

    # Get any previous readings. Note that we do this BEFORE posting the new
    # readings. Otherwise we would have duplicates.
    previous_readings_query = get_readings_query(id_dict,
                                                 readers,
                                                 db=db,
                                                 force_fulltext=force_fulltext)
    if previous_readings_query is not None:
        prev_readings = [
            ReadingData.from_db_reading(r)
            for r in previous_readings_query.yield_per(batch_size)
        ]
    else:
        prev_readings = []
    return prev_readings
Example #15
def get_pmids_for_mesh_terms(mesh_list):
    num_mesh_list = [int(mid[1:]) for mid in mesh_list]
    db = get_primary_db()
    res = db.select_all(db.MeshRefAnnotations.pmid_num,
                        db.MeshRefAnnotations.mesh_num.in_(num_mesh_list))
    return [t[0] for t in res]
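
A usage sketch; the IDs keep their letter prefix in the argument, and `mid[1:]` strips it before the numeric `mesh_num` comparison:

pmids = get_pmids_for_mesh_terms(['D014780', 'D003141'])  # example MeSH IDs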
Example #16
def get_direct_raw_stmt_jsons_from_agents(agents=None,
                                          stmt_type=None,
                                          db=None,
                                          max_stmts=None,
                                          offset=None):
    """Get Raw statement jsons from a list of agent refs and Statement type."""
    if db is None:
        db = get_primary_db()

    # Turn the agents parameters into an intersection of queries for stmt ids.
    entity_queries = []
    for role, ag_dbid, ns in agents:
        # Make the id match paradigms for the database.
        ag_dbid = regularize_agent_id(ag_dbid, ns)

        # Sanitize wildcards.
        for char in ['%', '_']:
            ag_dbid = ag_dbid.replace(char, '\\' + char)

        # Generate the query
        q = (db.session.query(db.RawAgents.stmt_id.label('stmt_id')).filter(
            db.RawAgents.db_id.like(ag_dbid)))

        if ns is not None:
            q = q.filter(db.RawAgents.db_name.like(ns))

        if role is not None:
            q = q.filter(db.RawAgents.role == role.upper())

        entity_queries.append(q)

    ag_query_al = intersect_all(*entity_queries).alias('intersection')
    ag_query = db.session.query(ag_query_al).distinct().subquery('ag_stmt_ids')

    # Create a query for the raw statement json
    rid_c = db.RawStatements.reading_id.label('rid')
    json_q = (db.session.query(
        db.RawStatements.json, rid_c,
        ag_query).filter(db.RawStatements.id == ag_query.c.stmt_id))

    # Filter by type, if applicable.
    if stmt_type is not None:
        json_q = json_q.filter(db.RawStatements.type == stmt_type)

    # Apply count limits and such.
    if max_stmts is not None:
        json_q = json_q.limit(max_stmts)

    if offset is not None:
        json_q = json_q.offset(offset)

    # Construct final query, that joins with text ref info on the database.
    json_q = json_q.subquery('json_content')
    ref_q = (db.session.query(
        json_q, db.Reading.text_content_id.label('tcid'),
        db.TextRef).outerjoin(db.Reading, db.Reading.id == json_q.c.rid).join(
            db.TextContent,
            db.TextContent.id == db.Reading.text_content_id).join(
                db.TextRef, db.TextRef.id == db.TextContent.text_ref_id))

    # Process the jsons, filling text ref info.
    raw_stmt_jsons = {}
    for json_bytes, rid, sid, tcid, tr in ref_q.all():
        raw_j = json.loads(json_bytes)
        ev = raw_j['evidence'][0]
        ev['text_refs'] = tr.get_ref_dict()
        ev['text_refs']['TCID'] = tcid
        ev['text_refs']['READING_ID'] = rid
        if tr.pmid:
            ev['pmid'] = tr.pmid

        raw_stmt_jsons[sid] = raw_j

    return raw_stmt_jsons
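
A usage sketch; each agent spec is a (role, db_id, namespace) tuple, and role or namespace may be None to leave that constraint off. The specific role and namespace strings below are illustrative:

raw_jsons = get_direct_raw_stmt_jsons_from_agents(
    agents=[('SUBJECT', '1956', 'EGID'), (None, 'MAPK1', 'NAME')],
    stmt_type='Phosphorylation',
    max_stmts=100)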
Example #17
def produce_readings(id_dict,
                     reader_list,
                     verbose=False,
                     read_mode='unread',
                     get_preexisting=True,
                     force_fulltext=False,
                     batch_size=1000,
                     no_upload=False,
                     pickle_file=None,
                     db=None,
                     log_readers=True,
                     prioritize=False):
    """Produce the reading output for the given ids, and upload them to db.

    This function will also retrieve pre-existing readings from the database,
    thus improving performance.

    Parameters
    ----------
    id_dict : dict {<id_type>:[<id value>, ...]}
        A dict of lists of the id's to be read, keyed by id_type.
    reader_list : list [Reader]
        A list of Reader descendents to be used in reading.
    verbose : bool
        Optional, default False - If True, log and print the output of the
        commandline reader utilities, if False, don't.
    read_mode : str : 'all', 'unread', or 'none'
        Optional, default 'unread' - If 'all', read everything (generally
        slow); if 'unread', only read things that were unread (the cache of
        old readings may still be used if `stmt_mode='all'` to get
        everything); if 'none', don't read, and only retrieve existing
        readings.
    get_preexisting : bool
        Optional, default True. If True, retrieve old readings where available
        (if `read_mode` is not 'all'). If False, don't retrieve old readings.
    force_fulltext : bool
        Optional, default False - If True, only read fulltext article, ignoring
        abstracts.
    batch_size : int
        Optional, default 1000 - The number of text content entries to be
        yielded by the database at a given time.
    no_upload : bool
        Optional, default False - If True, do not upload content to the
        database.
    pickle_file : str or None
        Optional, default None - otherwise the path to a file in which the
        reading data will be saved.
    db : indra_db.DatabaseManager instance
        Optional, default is None, in which case the primary database provided
        by `get_primary_db` function is used. Used to interface with a
        different database.
    log_readers : bool
        Default True. If True, stash the logs of the readers in a file.
    prioritize : bool
        Default False. If True, choose only the best content to read.

    Returns
    -------
    outputs : list [ReadingData]
        A list of the outputs of the readings in the form of ReadingData
        instances.
    """
    # Get a database instance.
    logger.debug("Producing readings in %s mode." % read_mode)
    if db is None:
        db = get_primary_db()

    # Sort out our priorities
    if prioritize:
        logger.debug("Prioritizing...")
        tcids = get_priority_tcids(id_dict,
                                   ['pmc_oa', 'manuscripts', 'elsevier'],
                                   always_add=['pubmed'],
                                   db=db)
        id_dict = {'tcid': list(tcids)}

    # Handle the cases where I need to retrieve old readings.
    prev_readings = []
    skip_reader_tcid_dict = None
    if get_preexisting and read_mode != 'all':
        prev_readings = get_db_readings(id_dict,
                                        reader_list,
                                        force_fulltext,
                                        batch_size,
                                        db=db)
        skip_reader_tcid_dict = {r.name: [] for r in reader_list}
        logger.info("Found %d pre-existing readings." % len(prev_readings))
        if read_mode != 'none':
            for rd in prev_readings:
                skip_reader_tcid_dict[rd.reader].append(rd.tcid)

    # Now produce any new readings that need to be produced.
    outputs = []
    if read_mode != 'none':
        outputs = make_db_readings(id_dict,
                                   reader_list,
                                   verbose=verbose,
                                   skip_dict=skip_reader_tcid_dict,
                                   db=db,
                                   force_fulltext=force_fulltext,
                                   force_read=(read_mode == 'all'),
                                   batch_size=batch_size,
                                   log=log_readers)
        logger.info("Made %d new readings." % len(outputs))

    if not no_upload:
        try:
            upload_readings(outputs, db=db)
        except Exception as e:
            logger.exception(e)
            if pickle_file is None:
                pickle_file = ("failure_reading_dump_%s.pkl" %
                               datetime.now().strftime('%Y%m%d_%H%M%S'))
            logger.error(
                "Could not upload readings. Results are pickled in: " +
                pickle_file)

    outputs += prev_readings

    if pickle_file is not None:
        with open(pickle_file, 'wb') as f:
            pickle.dump([output.make_tuple() for output in outputs], f)
        print("Reading outputs stored in %s." % pickle_file)

    return outputs
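
A sketch of the full pipeline, combining several of the functions in this section (a live DB and constructed Reader instances are assumed):

id_dict = get_id_dict(['pmid:20805372', 'pmcid:PMC3717945'])
outputs = produce_readings(id_dict, readers, read_mode='unread',
                           prioritize=True, pickle_file='readings.pkl')
stmt_data = produce_statements(outputs, n_proc=4)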
Example #18
def make_db_readings(id_dict,
                     readers,
                     batch_size=1000,
                     force_fulltext=False,
                     force_read=False,
                     skip_dict=None,
                     db=None,
                     **kwargs):
    """Read contents retrieved from the database.

    The content will be retrieved in batches, with batch size given by the
    `batch_size` argument. This prevents the system RAM from being overloaded.

    Parameters
    ----------
    id_dict : dict {<id_type>:[<id value>, ...]}
        A dict of lists of the id's to be read, keyed by id_type.
    readers : list of reader objects
        A list of the reader objects to be used, for example a list
        containing a single REACH reader instance.
    batch_size : int
        The number of content entries read for each batch. Default 1000.
    force_fulltext : bool
        If True, only get fulltext content from the database. Default False.
    force_read : bool
        If True, read even if text_content id is found in skip_dict.
    skip_dict : dict {<reader> : list [int]}
        A dict containing text content id's to be skipped.
    db : indra_db.DatabaseManager instance
        A handle to a database. Default None; if None, a handle to the primary
        database (see indra_db) is retrieved.

    Other keyword arguments are passed to the `read` methods of the readers.

    Returns
    -------
    outputs : list of ReadingData instances
        The results of the readings with relevant metadata.
    """
    if db is None:
        db = get_primary_db()

    # Get the iterator.
    logger.debug("Getting iterator.")
    tc_read_q = get_content_query(id_dict,
                                  readers,
                                  db=db,
                                  force_fulltext=force_fulltext,
                                  force_read=force_read)
    logger.debug("Begginning to iterate.")
    batch_list_dict = {r.name: [] for r in readers}
    new_outputs = []
    if tc_read_q is not None:
        for text_content in tc_read_q.yield_per(batch_size):
            # The get_content function returns an iterator which yields
            # results in batches, so as not to overwhelm RAM. We need to read
            # in batches for much the same reason.
            for r in readers:
                if not force_read:
                    if skip_dict is not None:
                        if text_content.id in skip_dict[r.name]:
                            continue
                    else:
                        # Try to get a previous reading from this reader.
                        reading = db.select_one(
                            db.Reading,
                            db.Reading.text_content_id == text_content.id,
                            _get_matches_clause(db, r))
                        if reading is not None:
                            continue
                processed_content = process_content(text_content)
                if processed_content is not None:
                    batch_list_dict[r.name].append(processed_content)

                if (len(batch_list_dict[r.name]) + 1) % batch_size == 0:
                    # TODO: this is a bit cludgy...maybe do this better?
                    # Perhaps refactor read_content.
                    logger.debug("Reading batch of files for %s." % r.name)
                    results = r.read(batch_list_dict[r.name], **kwargs)
                    if results is not None:
                        new_outputs += results
                    batch_list_dict[r.name] = []
        logger.debug("Finished iteration.")
        # Pick up any stragglers.
        for r in readers:
            if len(batch_list_dict[r.name]) > 0:
                logger.debug("Reading remaining files for %s." % r.name)
                results = r.read(batch_list_dict[r.name], **kwargs)
                if results is not None:
                    new_outputs += results
    return new_outputs
Example #19
def run():
    db = get_primary_db()
    stmts = load_mock_statements(db)
    return calculate_belief(stmts)
Example #20
def get_content_query(ids,
                      readers,
                      db=None,
                      force_fulltext=False,
                      force_read=False,
                      debug=False,
                      print_summary=False):
    """Construct a query to access all the content that will be read.

    If ids is not 'all', and does not contain any ids, None is returned.

    Parameters
    ----------
    ids : 'all' or dict {<id type> : [str/int]}
        If 'all', then all the content will be included in the query.
        Otherwise the content will be constrained to that corresponding to
        the given ids, which are matched using text refs.
    readers : list [Reader child instances]
        A list of the reader objects, which contain the required metadata (name
        and version of the reader) used to find content that needs to be read.
    db : indra_db.DatabaseManager instance
        Optional, default None, in which case the primary database is used. If
        specified, the alternative database will be used. This function should
        not alter the database.
    force_fulltext : bool
        Optional, default False - If True, only fulltext content will be read,
        as opposed to including abstracts.
    force_read : bool
        Optional, default False - If True, all content will be returned,
        whether it has been read or not.

    Returns
    -------
    tc_tbr_query : sqlalchemy query object or None
        The query of the text content to be read (tc_tbr). If `ids` is not
        'all' and contains no ids, None is returned.
    """
    if debug:
        logger.setLevel(logging.DEBUG)
    if db is None:
        db = get_primary_db()
    logger.debug("Got db handle.")

    # These allow conditions on different tables to equal conditions on the
    # dependent tables.
    tc_tr_binding = db.TextContent.text_ref_id == db.TextRef.id
    rd_tc_binding = db.Reading.text_content_id == db.TextContent.id

    # Begin the list of clauses with the binding between text content and
    # text refs.
    clauses = [tc_tr_binding]

    # Add a fulltext requirement, if applicable.
    if force_fulltext:
        clauses.append(db.TextContent.text_type == texttypes.FULLTEXT)

    # Check whether we are actually getting anything; otherwise return None.
    if ids == 'all' or any(len(id_list) > 0 for id_list in ids.values()):
        if ids != 'all':
            sub_clauses = get_clauses(ids, db)
            if len(sub_clauses) > 1:
                clauses.append(sql.or_(*sub_clauses))
            else:
                clauses.append(*sub_clauses)

        # Get the text content query object
        tc_query = db.filter_query(db.TextContent, *clauses).distinct()

        if not force_read:
            logger.debug("Getting content to be read.")
            # Each sub query is a set of content that has been read by one of
            # the readers.
            tc_q_subs = [
                tc_query.filter(rd_tc_binding, _get_matches_clause(db, r))
                for r in readers
            ]
            tc_tbr_query = tc_query.except_(sql.intersect(*tc_q_subs))
        else:
            logger.debug('All content will be read (force_read).')
            tc_tbr_query = tc_query

        if print_summary:
            try:
                logger.debug("Going to try to make a nice summary...")
                logger.info(get_text_content_summary_string(tc_tbr_query, db))
            except Exception:
                logger.debug("Could not print summary of results.")
    else:
        logger.debug("No ids in id_dict, so no query formed.")
        return None

    return tc_tbr_query.distinct()
Example #21
        gatherer.add('refs', len(filtered_tr_records))

        # Process the text content data
        filtered_tc_records, flawed_tcs = \
            self.filter_text_content(db, mod_tc_data)

        # Upload the text content data.
        logger.info('Adding %d more text content entries...' %
                    len(filtered_tc_records))
        self.copy_into_db(db, 'text_content', filtered_tc_records,
                          self.tc_cols)
        gatherer.add('content', len(filtered_tc_records))
        return {
            'filtered_tr_records': filtered_tr_records,
            'flawed_tr_records': flawed_tr_records,
            'mod_tc_data': mod_tc_data,
            'filtered_tc_records': filtered_tc_records
        }


if __name__ == '__main__':
    download_latest_data()
    md = get_metadata_dict()
    md = [
        e for e in md
        if e['doi'] and e['doi'].upper() != '0.1126/SCIENCE.ABB7331'
    ]
    cm = Cord19Manager(md)
    db = get_primary_db()
    res = cm.populate(db)
Example #22
def get_readings_query(ids, readers, db=None, force_fulltext=False):
    """Create a query to access all the relevant existing readings.

    Note that if ids is not 'all' and ids is a dict with no ids in it,
    this function returns None.

    Parameters
    ----------
    ids : 'all' or dict {<id_type> : [str/int]}
        If 'all', then all possible readings in the database matching the given
        readers and other conditions will be returned. Otherwise, only those
        that correspond to one of the ids in ids dict will be contained. If an
        ids dict has no ids in it, None is returned.
    readers : list [Reader child instances]
        A list of the readers whose names and versions you wish to match in the
        readings queried from the database.
    db : indra_db.DatabaseManager instance
        Optional, default None, in which case the primary database is used. If
        specified, the alternative database will be used. This function should
        not alter the database.
    force_fulltext : bool
        Optional, default False - If True, only readings corresponding to
        fulltext content will be read, as opposed to including readings created
        from abstracts.

    Returns
    -------
    readings_query : sql query instance or None
        Returns a query that can be used to access the specified content, or
        else None if no content was specified.
    """
    if db is None:
        db = get_primary_db()
    clauses = [
        # Bind conditions on readings to conditions on content.
        db.Reading.text_content_id == db.TextContent.id,

        # Bind text content to text refs
        db.TextContent.text_ref_id == db.TextRef.id,

        # Check if at least one of the readers has read the content
        sql.or_(*[_get_matches_clause(db, reader) for reader in readers])
    ]
    if force_fulltext:
        clauses.append(db.TextContent.text_type == texttypes.FULLTEXT)

    if ids == 'all' or any([id_list for id_list in ids.values()]):
        if ids != 'all':
            sub_clauses = get_clauses(ids, db)
            if len(sub_clauses) > 1:
                clauses.append(sql.or_(*sub_clauses))
            else:
                clauses.append(*sub_clauses)

        readings_query = db.filter_query(
            db.Reading,
            # The conditions assembled above: the reading-content and
            # content-ref bindings, the reader matches clause, and any
            # clauses generated from the given ids.
            *clauses)
    else:
        return None

    return readings_query.distinct()
Example #23
def get_pmids_for_mesh_terms(mesh_list):
    db = get_primary_db()
    res = db.select_all(db.MeshRefAnnotations.pmid,
                        db.MeshRefAnnotations.mesh_id.in_(mesh_list))
    return [t[0] for t in res]
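
Unlike the variant in Example #15, this version matches the full MeSH ID strings against `mesh_id` directly rather than stripping the prefix for a numeric comparison:

pmids = get_pmids_for_mesh_terms(['D014780'])  # IDs passed with their prefix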