Example No. 1
def load_db_content(ns_list, pkl_filename=None, ro=None, reload=False):
    if isinstance(pkl_filename, str) and pkl_filename.startswith('s3:'):
        pkl_filename = S3Path.from_string(pkl_filename)
    # Get the raw data
    if reload or not pkl_filename:
        if not ro:
            ro = get_ro('primary')
        logger.info("Querying the database for statement metadata...")
        results = []
        for ns in ns_list:
            logger.info("Querying for {ns}".format(ns=ns))
            res = ro.select_all([
                ro.PaMeta.mk_hash, ro.PaMeta.db_name, ro.PaMeta.db_id,
                ro.PaMeta.ag_num, ro.PaMeta.ev_count, ro.PaMeta.type_num
            ], ro.PaMeta.db_name.like(ns))
            results.extend(res)
        results = {(h, dbn, dbi, ag_num, ev_cnt, ro_type_map.get_str(tn))
                   for h, dbn, dbi, ag_num, ev_cnt, tn in results}
        if pkl_filename:
            if isinstance(pkl_filename, S3Path):
                upload_pickle_to_s3(results, pkl_filename)
            else:
                with open(pkl_filename, 'wb') as f:
                    pickle.dump(results, f)
    # Get a cached pickle
    else:
        logger.info("Loading database content from %s" % pkl_filename)
        if isinstance(pkl_filename, S3Path):
            results = load_pickle_from_s3(pkl_filename)
        else:
            with open(pkl_filename, 'rb') as f:
                results = pickle.load(f)
    logger.info("{len} stmts loaded".format(len=len(results)))
    return results
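
A minimal usage sketch for the function above; the namespace list and the local file name are illustrative, and the snippet assumes the module-level imports (get_ro, pickle, logger, the S3 helpers) are already in place:

# Query metadata for agents from the HGNC and FPLX namespaces and cache
# the result in a local pickle ('db_content.pkl' is a hypothetical name).
content = load_db_content(['HGNC', 'FPLX'], pkl_filename='db_content.pkl',
                          reload=True)

# Each element is a tuple of
# (stmt_hash, agent_ns, agent_id, agent_num, ev_count, stmt_type).
for stmt_hash, ag_ns, ag_id, ag_num, ev_count, stmt_type in list(content)[:5]:
    print(stmt_hash, ag_ns, ag_id, stmt_type)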
Example No. 2
def get_source_counts(pkl_filename=None, ro=None):
    """Returns a dict of dicts with evidence count per source, per statement

    The dictionary is at the top level keyed by statement hash and each
    entry contains a dictionary keyed by the source that support the
    statement where the entries are the evidence count for that source."""
    logger.info('Getting source counts per statement')
    if isinstance(pkl_filename, str) and pkl_filename.startswith('s3:'):
        pkl_filename = S3Path.from_string(pkl_filename)
    if not ro:
        ro = get_ro('primary-ro')
    ev = {h: j for h, j in ro.select_all([ro.SourceMeta.mk_hash,
                                          ro.SourceMeta.src_json])}

    if pkl_filename:
        if isinstance(pkl_filename, S3Path):
            upload_pickle_to_s3(obj=ev, s3_path=pkl_filename)
        else:
            with open(pkl_filename, 'wb') as f:
                pickle.dump(ev, f)
    return ev
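
A short usage sketch; the local file name is illustrative, and the example values in the comment are hypothetical:

# Build the per-source evidence counts and cache them in a local pickle.
src_counts = get_source_counts(pkl_filename='source_counts.pkl')

# src_counts maps statement hash -> {source_name: evidence_count},
# e.g. {-12345: {'reach': 10, 'sparser': 2}} (values purely illustrative).
some_hash = next(iter(src_counts))
print(some_hash, src_counts[some_hash])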
Example No. 3
def load_res_pos(ro=None):
    """Return residue/position data keyed by hash"""
    logger.info('Getting residue and position info')
    if ro is None:
        ro = get_ro('primary')
    res = {'residue': {}, 'position': {}}
    for stmt_type in get_all_descendants(Modification):
        stmt_name = stmt_type.__name__
        if stmt_name in ('Modification', 'AddModification',
                         'RemoveModification'):
            continue
        logger.info(f'Getting statements for type {stmt_name}')
        type_num = ro_type_map.get_int(stmt_name)
        query = ro.select_all(ro.FastRawPaLink.pa_json,
                              ro.FastRawPaLink.type_num == type_num)
        for jsb, in query:
            js = json.loads(jsb)
            if 'residue' in js:
                res['residue'][int(js['matches_hash'])] = js['residue']
            if 'position' in js:
                res['position'][int(js['matches_hash'])] = js['position']
    return res
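
A sketch of how the returned structure might be consumed; the residue and position values in the comment are just examples of the expected format:

res_pos = load_res_pos()

# res_pos['residue'] and res_pos['position'] map modification statement
# hashes to strings such as 'T' and '185', respectively.
for mk_hash, residue in list(res_pos['residue'].items())[:5]:
    position = res_pos['position'].get(mk_hash)
    print(mk_hash, residue, position)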
Example No. 4
def main():
    args = get_parser().parse_args()

    ymd = args.s3_ymd
    if args.s3:
        logger.info('Uploading to %s/%s/%s on s3 instead of saving locally'
                    % (S3_SIF_BUCKET, S3_SUBDIR, ymd))
    db_res_file = _pseudo_key(args.db_dump, ymd) if args.s3 and args.db_dump\
        else args.db_dump
    df_file = _pseudo_key(args.dataframe, ymd) if args.s3 and args.dataframe\
        else args.dataframe
    csv_file = _pseudo_key(args.csv_file, ymd) if args.s3 and args.csv_file\
        else args.csv_file
    src_count_file = _pseudo_key(args.src_counts, ymd) if args.s3 and \
        args.src_counts else args.src_counts

    reload = args.reload
    if reload:
        logger.info('Reloading the database content from the database')
    else:
        logger.info('Loading cached database content from %s' % db_res_file)

    reconvert = args.reconvert
    if reconvert:
        logger.info('Reconverting database content into pandas dataframe')
    else:
        logger.info('Loading cached dataframe from %s' % df_file)

    for f in [db_res_file, df_file, csv_file, src_count_file]:
        if f:
            logger.info('Using file name %s' % f)

    dump_sif(df_file, db_res_file, csv_file, src_count_file, reload, reconvert,
             get_db('primary') if args.principal else get_ro('primary'))
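
For reference, the same pipeline can be driven without the argument parser by calling dump_sif directly with the positional arguments used above; the file names are illustrative and this is a sketch rather than the canonical entry point:

# Rebuild the dataframe and CSV from a previously cached DB dump
# (reload=False) while reconverting the dataframe (reconvert=True).
dump_sif('sif_df.pkl',         # df_file
         'db_content.pkl',     # db_res_file
         'sif.csv',            # csv_file
         'source_counts.pkl',  # src_count_file
         False,                # reload
         True,                 # reconvert
         get_ro('primary'))    # readonly DB, as in the non-principal branch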
Example No. 5
def load_db_content(ns_list, pkl_filename=None, ro=None, reload=False):
    """Get preassembled stmt metadata from the DB for export.

    Queries the NameMeta, TextMeta, and OtherMeta tables as needed to get
    agent/stmt metadata for agents from the given namespaces.

    Parameters
    ----------
    ns_list : list of str
        List of agent namespaces to include in the metadata query.
    pkl_filename : str
        Name of pickle file to save to (if reloading) or load from (if not
        reloading). If an S3 path is given (i.e., pkl_filename starts with
        `s3:`), the file is saved to/loaded from S3. If not given, the
        content is reloaded from the database (overriding `reload`).
    ro : ReadonlyDatabaseManager
        Readonly database to load the content from. If not given, calls
        `get_ro('primary')` to get the primary readonly DB.
    reload : bool
        Whether to re-query the database for content or to load the content
        from `pkl_filename`. Note that even if `reload` is False, the data
        will be reloaded anyway if no `pkl_filename` is given.

    Returns
    -------
    set of tuples
        Set of tuples containing statement information organized
        by agent. Tuples contain (stmt_hash, agent_ns, agent_id, agent_num,
        evidence_count, stmt_type).
    """
    if isinstance(pkl_filename, str) and pkl_filename.startswith('s3:'):
        pkl_filename = S3Path.from_string(pkl_filename)
    # Get the raw data
    if reload or not pkl_filename:
        if not ro:
            ro = get_ro('primary')
        logger.info("Querying the database for statement metadata...")
        results = {}
        for ns in ns_list:
            logger.info("Querying for {ns}".format(ns=ns))
            filters = []
            if ns == 'NAME':
                tbl = ro.NameMeta
            elif ns == 'TEXT':
                tbl = ro.TextMeta
            else:
                tbl = ro.OtherMeta
                filters.append(tbl.db_name.like(ns))
            filters.append(tbl.is_complex_dup == False)
            res = ro.select_all([tbl.mk_hash, tbl.db_id, tbl.ag_num,
                                 tbl.ev_count, tbl.type_num], *filters)
            results[ns] = res
        results = {(h, dbn, dbi, ag_num, ev_cnt, ro_type_map.get_str(tn))
                   for dbn, value_list in results.items()
                   for h, dbi, ag_num, ev_cnt, tn in value_list}
        if pkl_filename:
            if isinstance(pkl_filename, S3Path):
                upload_pickle_to_s3(results, pkl_filename)
            else:
                with open(pkl_filename, 'wb') as f:
                    pickle.dump(results, f)
    # Get a cached pickle
    else:
        logger.info("Loading database content from %s" % pkl_filename)
        if isinstance(pkl_filename, S3Path):
            results = load_pickle_from_s3(pkl_filename)
        else:
            with open(pkl_filename, 'rb') as f:
                results = pickle.load(f)
    logger.info("{len} stmts loaded".format(len=len(results)))
    return results
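
An S3-backed variant of the same call is sketched below; the bucket and key are hypothetical, valid AWS credentials are assumed, and it is assumed that S3Path.from_string accepts paths in this 's3://bucket/key' form:

# Query NAME/TEXT/HGNC metadata and upload the pickled result to S3.
content = load_db_content(
    ['NAME', 'TEXT', 'HGNC'],
    pkl_filename='s3://my-bucket/indra_db/db_content.pkl',
    reload=True)

# A later call with reload=False reads the cached pickle back from S3.
cached = load_db_content(
    ['NAME', 'TEXT', 'HGNC'],
    pkl_filename='s3://my-bucket/indra_db/db_content.pkl')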