Example No. 1
def dump_sif(df_file=None,
             db_res_file=None,
             csv_file=None,
             src_count_file=None,
             reload=False,
             reconvert=True,
             ro=None):
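    """Build and dump a SIF dataframe of PA statements with grounded agents."""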
    if ro is None:
        ro = get_db('primary')

    # Get the db content from a new DB dump or from file
    db_content = load_db_content(reload=reload,
                                 ns_list=NS_LIST,
                                 pkl_filename=db_res_file,
                                 ro=ro)

    # Convert the database query result into a set of pairwise relationships
    df = make_dataframe(pkl_filename=df_file,
                        reconvert=reconvert,
                        db_content=db_content)

    if csv_file:
        if isinstance(csv_file, str) and csv_file.startswith('s3:'):
            csv_file = S3Path.from_string(csv_file)
        # Aggregate rows by genes and stmt type
        logger.info("Saving to CSV...")
        filt_df = df.filter(items=[
            'agA_ns', 'agA_id', 'agA_name', 'agB_ns', 'agB_id', 'agB_name',
            'stmt_type', 'evidence_count'
        ])
        type_counts = filt_df.groupby(by=[
            'agA_ns', 'agA_id', 'agA_name', 'agB_ns', 'agB_id', 'agB_name',
            'stmt_type'
        ]).sum()
        # This requires package s3fs under the hood. See:
        # https://pandas.pydata.org/pandas-docs/stable/whatsnew/v0.20.0.html#s3-file-handling
        if isinstance(csv_file, S3Path):
            try:
                type_counts.to_csv(csv_file.to_string())
            except Exception as e:
                try:
                    logger.warning('Failed to upload csv to s3 using direct '
                                   's3 url, trying boto3: %s.' % e)
                    s3 = get_s3_client(unsigned=False)
                    csv_buf = StringIO()
                    type_counts.to_csv(csv_buf)
                    s3.put_object(Body=csv_buf.getvalue(), **csv_file.kw())
                    logger.info('Uploaded CSV file to s3')
                except Exception as second_e:
                    logger.error('Failed to upload csv file with fallback '
                                 'method')
                    logger.exception(second_e)
        # save locally
        else:
            type_counts.to_csv(csv_file)

    if src_count_file:
        _ = get_source_counts(src_count_file, ro=ro)
    return
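
A minimal usage sketch (not part of the original example): the file paths below are placeholders, and the call assumes the module-level get_db helper used in the function body.

# Hypothetical invocation of the dump_sif variant above; paths are placeholders.
dump_sif(df_file='sif_df.pkl',
         db_res_file='db_content.pkl',
         csv_file='sif_counts.csv',
         src_count_file='source_counts.pkl',
         reload=True,
         reconvert=True,
         ro=get_db('primary'))
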
Example No. 2
def main():
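    """Parse command line arguments and run dump_sif accordingly."""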
    args = get_parser().parse_args()

    ymd = args.s3_ymd
    if args.s3:
        logger.info('Uploading to %s/%s/%s on s3 instead of saving locally'
                    % (S3_SIF_BUCKET, S3_SUBDIR, ymd))
    db_res_file = _pseudo_key(args.db_dump, ymd) if args.s3 and args.db_dump\
        else args.db_dump
    df_file = _pseudo_key(args.dataframe, ymd) if args.s3 and args.dataframe\
        else args.dataframe
    csv_file = _pseudo_key(args.csv_file, ymd) if args.s3 and args.csv_file\
        else args.csv_file
    src_count_file = _pseudo_key(args.src_counts, ymd) if args.s3 and \
        args.src_counts else args.src_counts

    reload = args.reload
    if reload:
        logger.info('Reloading the database content from the database')
    else:
        logger.info('Loading cached database content from %s' % db_res_file)

    reconvert = args.reconvert
    if reconvert:
        logger.info('Reconverting database content into pandas dataframe')
    else:
        logger.info('Loading cached dataframe from %s' % df_file)

    for f in [db_res_file, df_file, csv_file, src_count_file]:
        if f:
            logger.info('Using file name %s' % f)

    dump_sif(df_file, db_res_file, csv_file, src_count_file, reload, reconvert,
             get_db('primary') if args.principal else get_ro('primary'))
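
The get_parser function is not shown in this example. Below is a hypothetical reconstruction, sketched only from the attribute names main() reads (args.db_dump, args.dataframe, args.csv_file, args.src_counts, args.s3, args.s3_ymd, args.reload, args.reconvert, args.principal); the real flag names, defaults and help texts may differ.

import argparse


def get_parser():
    # Hypothetical sketch; only the destination names are taken from main().
    parser = argparse.ArgumentParser(description='Dump a SIF from the INDRA DB')
    parser.add_argument('--db-dump',
                        help='Path or s3 key for the raw DB content pickle')
    parser.add_argument('--dataframe',
                        help='Path or s3 key for the SIF dataframe pickle')
    parser.add_argument('--csv-file',
                        help='Path or s3 key for the aggregated CSV')
    parser.add_argument('--src-counts',
                        help='Path or s3 key for the source count file')
    parser.add_argument('--s3', action='store_true',
                        help='Upload outputs to s3 instead of saving locally')
    parser.add_argument('--s3-ymd',
                        help='Date string used in the s3 key prefix')
    parser.add_argument('--reload', action='store_true',
                        help='Reload the content from the database')
    parser.add_argument('--reconvert', action='store_true',
                        help='Rebuild the dataframe from the database content')
    parser.add_argument('--principal', action='store_true',
                        help='Use the principal DB instead of the readonly DB')
    return parser
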
Example No. 3
def dump_sif(src_count_file, res_pos_file, belief_file, df_file=None,
             db_res_file=None, csv_file=None, reload=True, reconvert=True,
             ro=None, normalize_names: bool = True):
    """Build and dump a sif dataframe of PA statements with grounded agents

    Parameters
    ----------
    src_count_file : Union[str, S3Path]
        A location to load the source count dict from. Can be local file
        path, an s3 url string or an S3Path instance.
    res_pos_file : Union[str, S3Path]
        A location to load the residue-position dict from. Can be local file
        path, an s3 url string or an S3Path instance.
    belief_file : Union[str, S3Path]
        A location to load the belief dict from. Can be local file path,
        an s3 url string or an S3Path instance.
    df_file : Optional[Union[str, S3Path]]
        If provided, dump the sif to this location. Can be local file path,
        an s3 url string or an S3Path instance.
    db_res_file : Optional[Union[str, S3Path]]
        If provided, save the db content to this location. Can be local file
        path, an s3 url string or an S3Path instance.
    csv_file : Optional[Union[str, S3Path]]
        If provided, calculate dataframe statistics and save to local file
        or s3. Can be local file path, an s3 url string or an S3Path instance.
    reconvert : bool
        Whether to generate a new DataFrame from the database content or
        to load and return a DataFrame from `df_file`. If False, `df_file`
        must be given. Default: True.
    reload : bool
        If True, load new content from the database and make a new
        dataframe. If False, content can be loaded from provided files.
        Default: True.
    ro : Optional[PrincipalDatabaseManager]
        Provide a DatabaseManager to load database content from. If not
        provided, `get_db('primary')` will be used.
    normalize_names :
        If True, detect and try to merge name duplicates (same entity with
        different names, e.g. Loratadin vs loratadin). Default: True.
    """
    def _load_file(path):
        # Load a pickle or json dict either from s3 or from the local
        # file system.
        if isinstance(path, S3Path) or \
                (isinstance(path, str) and path.startswith('s3:')):
            s3path = path if isinstance(path, S3Path) \
                else S3Path.from_string(path)
            if s3path.to_string().endswith('pkl'):
                return load_pickle_from_s3(s3path)
            elif s3path.to_string().endswith('json'):
                return load_json_from_s3(s3path)
            else:
                raise ValueError(f'Unknown file format of {path}')
        else:
            if path.endswith('pkl'):
                with open(path, 'rb') as f:
                    return pickle.load(f)
            elif path.endswith('json'):
                with open(path, 'r') as f:
                    return json.load(f)
            else:
                raise ValueError(f'Unknown file format of {path}')

    if ro is None:
        ro = get_db('primary')

    # Get the db content from a new DB dump or from file
    db_content = load_db_content(reload=reload, ns_list=NS_LIST,
                                 pkl_filename=db_res_file, ro=ro)

    # Load supporting files
    res_pos = _load_file(res_pos_file)
    src_count = _load_file(src_count_file)
    belief = _load_file(belief_file)

    # Convert the database query result into a set of pairwise relationships
    df = make_dataframe(pkl_filename=df_file, reconvert=reconvert,
                        db_content=db_content, src_count_dict=src_count,
                        res_pos_dict=res_pos, belief_dict=belief,
                        normalize_names=normalize_names)

    if csv_file:
        if isinstance(csv_file, str) and csv_file.startswith('s3:'):
            csv_file = S3Path.from_string(csv_file)
        # Aggregate rows by genes and stmt type
        logger.info("Saving to CSV...")
        filt_df = df.filter(items=['agA_ns', 'agA_id', 'agA_name',
                                   'agB_ns', 'agB_id', 'agB_name',
                                   'stmt_type', 'evidence_count'])
        type_counts = filt_df.groupby(by=['agA_ns', 'agA_id', 'agA_name',
                                          'agB_ns', 'agB_id', 'agB_name',
                                          'stmt_type']).sum()
        # This requires package s3fs under the hood. See:
        # https://pandas.pydata.org/pandas-docs/stable/whatsnew/v0.20.0.html#s3-file-handling
        if isinstance(csv_file, S3Path):
            try:
                type_counts.to_csv(csv_file.to_string())
            except Exception as e:
                try:
                    logger.warning('Failed to upload csv to s3 using direct '
                                   's3 url, trying boto3: %s.' % e)
                    s3 = get_s3_client(unsigned=False)
                    csv_buf = StringIO()
                    type_counts.to_csv(csv_buf)
                    csv_file.upload(s3, csv_buf)
                    logger.info('Uploaded CSV file to s3')
                except Exception as second_e:
                    logger.error('Failed to upload csv file with fallback '
                                 'method')
                    logger.exception(second_e)
        # save locally
        else:
            type_counts.to_csv(csv_file)
    return
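
A hedged usage sketch for this extended variant (not part of the original example): every path and s3 key below is a placeholder, and the call assumes the same S3Path and get_db helpers used in the function body.

# Hypothetical call; every path and s3 key below is a placeholder.
dump_sif(src_count_file='s3://my-bucket/dumps/source_counts.pkl',
         res_pos_file='s3://my-bucket/dumps/res_pos.pkl',
         belief_file='s3://my-bucket/dumps/belief_scores.pkl',
         df_file='sif_df.pkl',
         db_res_file='db_content.pkl',
         csv_file='sif_counts.csv',
         reload=True,
         reconvert=True,
         ro=get_db('primary'),
         normalize_names=True)
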
Example No. 4
def get_pa_stmt_jsons(clauses=None, with_evidence=True, db=None, limit=1000):
    """Load preassembled Statements from the principal database."""
    if db is None:
        db = get_db('primary')

    if clauses is None:
        clauses = []

    # Construct the core query.
    if with_evidence:
        text_ref_cols = [
            db.Reading.id, db.TextContent.id, db.TextRef.pmid,
            db.TextRef.pmcid, db.TextRef.doi, db.TextRef.url, db.TextRef.pii
        ]
        text_ref_types = tuple([
            str if isinstance(col.type, String) else int
            for col in text_ref_cols
        ])
        text_ref_cols = tuple([
            cast(col, String) if not isinstance(col.type, String) else col
            for col in text_ref_cols
        ])
        text_ref_labels = ('rid', 'tcid', 'pmid', 'pmcid', 'doi', 'url', 'pii')
        core_q = db.session.query(
            db.PAStatements.mk_hash.label('mk_hash'),
            db.PAStatements.json.label('json'),
            func.array_agg(db.RawStatements.json).label("raw_jsons"),
            func.array_agg(array(text_ref_cols)).label("text_refs")
        ).outerjoin(
            db.RawUniqueLinks,
            db.RawUniqueLinks.pa_stmt_mk_hash == db.PAStatements.mk_hash).join(
                db.RawStatements,
                db.RawStatements.id == db.RawUniqueLinks.raw_stmt_id
            ).outerjoin(
                db.Reading,
                db.Reading.id == db.RawStatements.reading_id).outerjoin(
                    db.TextContent,
                    db.TextContent.id == db.Reading.text_content_id).outerjoin(
                        db.TextRef,
                        db.TextRef.id == db.TextContent.text_ref_id)
    else:
        text_ref_types = None
        text_ref_labels = None
        core_q = db.session.query(db.PAStatements.mk_hash.label('mk_hash'),
                                  db.PAStatements.json.label('json'),
                                  null().label('raw_jsons'),
                                  null().label('text_refs'))
    core_q = core_q.filter(*clauses).group_by(db.PAStatements.mk_hash,
                                              db.PAStatements.json)
    if limit:
        core_q = core_q.limit(limit)
    core_sq = core_q.subquery().alias('core')

    # Construct the layer of the query that gathers agent info.
    agent_tuple = (cast(db.PAAgents.ag_num,
                        String), db.PAAgents.db_name, db.PAAgents.db_id)
    at_sq = db.session.query(
        core_sq.c.mk_hash, core_sq.c.json, core_sq.c.raw_jsons,
        core_sq.c.text_refs,
        func.array_agg(array(agent_tuple)).label('db_refs')).filter(
            db.PAAgents.stmt_mk_hash == core_sq.c.mk_hash).group_by(
                core_sq.c.mk_hash, core_sq.c.json, core_sq.c.raw_jsons,
                core_sq.c.text_refs).subquery().alias('agent_tuples')

    # Construct the layer of the query that gathers supports/supported by.
    sup_from = aliased(db.PASupportLinks, name='sup_from')
    sup_to = aliased(db.PASupportLinks, name='sup_to')
    q = db.session.query(
        at_sq.c.mk_hash, at_sq.c.json, at_sq.c.raw_jsons, at_sq.c.text_refs,
        at_sq.c.db_refs,
        func.array_agg(sup_from.supporting_mk_hash).label('supporting_hashes'),
        func.array_agg(
            sup_to.supported_mk_hash).label('supported_hashes')).outerjoin(
                sup_from,
                sup_from.supported_mk_hash == at_sq.c.mk_hash).outerjoin(
                    sup_to,
                    sup_to.supporting_mk_hash == at_sq.c.mk_hash).group_by(
                        at_sq.c.mk_hash, at_sq.c.json, at_sq.c.raw_jsons,
                        at_sq.c.text_refs, at_sq.c.db_refs)

    # Run and parse the query.
    stmt_jsons = {}
    stmts_by_hash = {}
    for h, sj, rjs, text_refs, db_refs, supping, supped in q.all():
        # Gather the agent refs.
        db_ref_dicts = defaultdict(lambda: defaultdict(list))
        for ag_num, db_name, db_id in db_refs:
            db_ref_dicts[int(ag_num)][db_name].append(db_id)
        db_ref_dicts = {k: dict(v) for k, v in db_ref_dicts.items()}

        # Clean supping and supped.
        supping = [h for h in set(supping) if h is not None]
        supped = [h for h in set(supped) if h is not None]

        # Parse the JSON bytes into JSON.
        stmt_json = json.loads(sj)
        if 'supports' not in stmt_json:
            stmt_json['supports'] = []
        if 'supported_by' not in stmt_json:
            stmt_json['supported_by'] = []

        # Load the evidence.
        if rjs is not None:
            for rj, text_ref_values in zip(rjs, text_refs):
                raw_json = json.loads(rj)
                ev = raw_json['evidence'][0]
                if any(v is not None for v in text_ref_values):
                    tr_dict = {
                        lbl.upper(): None if val == "None" else typ(val)
                        for lbl, typ, val in zip(
                            text_ref_labels, text_ref_types, text_ref_values)
                    }
                    _fix_evidence(ev, tr_dict.pop('RID'), tr_dict.pop('TCID'),
                                  tr_dict)
                if 'evidence' not in stmt_json:
                    stmt_json['evidence'] = []
                stmt_json['evidence'].append(ev)

        # Resolve supports/supported-by links, as much as possible.
        stmts_by_hash[h] = stmt_json
        for supped_h in (h for h in supped if h in stmts_by_hash):
            stmt_json['supports'].append(stmts_by_hash[supped_h]['id'])
            stmts_by_hash[supped_h]['supported_by'].append(stmt_json['id'])
        for supping_h in (h for h in supping if h in stmts_by_hash):
            stmt_json['supported_by'].append(stmts_by_hash[supping_h]['id'])
            stmts_by_hash[supping_h]['supports'].append(stmt_json['id'])

        # Put it together in a dictionary.
        result_dict = {
            "mk_hash": h,
            "stmt": stmt_json,
            "db_refs": db_ref_dicts,
            "supports_hashes": supping,
            "supported_by_hashes": supped
        }
        stmt_jsons[h] = result_dict
    return stmt_jsons
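
A brief usage sketch (not from the original source): the hash values and limit are placeholders, and the filter clause reuses the db.PAStatements.mk_hash column already referenced in the query above.

# Hypothetical usage of get_pa_stmt_jsons; the hashes below are placeholders.
db = get_db('primary')
clauses = [db.PAStatements.mk_hash.in_([-12345678901234567,
                                        98765432109876543])]
stmt_jsons = get_pa_stmt_jsons(clauses=clauses, with_evidence=True, db=db,
                               limit=10)
for mk_hash, entry in stmt_jsons.items():
    stmt = entry['stmt']
    print(mk_hash, stmt.get('type'), len(stmt.get('evidence', [])))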