def load_db_content(ns_list, pkl_filename=None, ro=None, reload=False):
    if isinstance(pkl_filename, str) and pkl_filename.startswith('s3:'):
        pkl_filename = S3Path.from_string(pkl_filename)
    # Get the raw data
    if reload or not pkl_filename:
        if not ro:
            ro = get_ro('primary')
        logger.info("Querying the database for statement metadata...")
        results = []
        for ns in ns_list:
            logger.info("Querying for {ns}".format(ns=ns))
            res = ro.select_all([ro.PaMeta.mk_hash, ro.PaMeta.db_name,
                                 ro.PaMeta.db_id, ro.PaMeta.ag_num,
                                 ro.PaMeta.ev_count, ro.PaMeta.type_num],
                                ro.PaMeta.db_name.like(ns))
            results.extend(res)
        results = {(h, dbn, dbi, ag_num, ev_cnt, ro_type_map.get_str(tn))
                   for h, dbn, dbi, ag_num, ev_cnt, tn in results}
        if pkl_filename:
            if isinstance(pkl_filename, S3Path):
                upload_pickle_to_s3(results, pkl_filename)
            else:
                with open(pkl_filename, 'wb') as f:
                    pickle.dump(results, f)
    # Get a cached pickle
    else:
        logger.info("Loading database content from %s" % pkl_filename)
        # pkl_filename was converted to an S3Path above if it was an s3 path,
        # so check the type rather than the 's3:' prefix here.
        if isinstance(pkl_filename, S3Path):
            results = load_pickle_from_s3(pkl_filename)
        else:
            with open(pkl_filename, 'rb') as f:
                results = pickle.load(f)
    logger.info("{len} stmts loaded".format(len=len(results)))
    return results
def get_source_counts(pkl_filename=None, ro=None):
    """Return a dict of dicts with evidence counts per source, per statement.

    The dictionary is keyed at the top level by statement hash, and each
    entry is a dictionary keyed by the sources that support the statement,
    with the evidence count for each source as the value.
    """
    logger.info('Getting source counts per statement')
    if isinstance(pkl_filename, str) and pkl_filename.startswith('s3:'):
        pkl_filename = S3Path.from_string(pkl_filename)
    if not ro:
        ro = get_ro('primary-ro')
    ev = {h: j for h, j in ro.select_all([ro.SourceMeta.mk_hash,
                                          ro.SourceMeta.src_json])}
    if pkl_filename:
        if isinstance(pkl_filename, S3Path):
            upload_pickle_to_s3(obj=ev, s3_path=pkl_filename)
        else:
            with open(pkl_filename, 'wb') as f:
                pickle.dump(ev, f)
    return ev
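# Usage sketch (illustrative, not part of the original module): the mapping
# returned by get_source_counts() goes from statement hash to a per-source
# evidence-count dict, so the total evidence per statement can be recovered
# by summing the inner dicts. Assumes a configured readonly DB is available.
def _example_total_evidence():
    src_counts = get_source_counts()
    return {stmt_hash: sum(counts.values())
            for stmt_hash, counts in src_counts.items()}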
def load_res_pos(ro=None):
    """Return residue/position data keyed by hash"""
    logger.info('Getting residue and position info')
    if ro is None:
        ro = get_ro('primary')
    res = {'residue': {}, 'position': {}}
    for stmt_type in get_all_descendants(Modification):
        stmt_name = stmt_type.__name__
        if stmt_name in ('Modification', 'AddModification',
                         'RemoveModification'):
            continue
        logger.info(f'Getting statements for type {stmt_name}')
        type_num = ro_type_map.get_int(stmt_name)
        query = ro.select_all(ro.FastRawPaLink.pa_json,
                              ro.FastRawPaLink.type_num == type_num)
        for jsb, in query:
            js = json.loads(jsb)
            if 'residue' in js:
                res['residue'][int(js['matches_hash'])] = js['residue']
            if 'position' in js:
                res['position'][int(js['matches_hash'])] = js['position']
    return res
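# Usage sketch (illustrative only): look up the modified residue and position
# for a given modification statement hash. The hash passed in is a
# placeholder supplied by the caller; statements without residue/position
# information simply return None.
def _example_res_pos(stmt_hash):
    res_pos = load_res_pos()
    residue = res_pos['residue'].get(stmt_hash)    # e.g. 'S'
    position = res_pos['position'].get(stmt_hash)  # e.g. '473'
    return residue, position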
def main():
    args = get_parser().parse_args()
    ymd = args.s3_ymd
    if args.s3:
        logger.info('Uploading to %s/%s/%s on s3 instead of saving locally'
                    % (S3_SIF_BUCKET, S3_SUBDIR, ymd))
    db_res_file = _pseudo_key(args.db_dump, ymd) if args.s3 and args.db_dump \
        else args.db_dump
    df_file = _pseudo_key(args.dataframe, ymd) if args.s3 and args.dataframe \
        else args.dataframe
    csv_file = _pseudo_key(args.csv_file, ymd) if args.s3 and args.csv_file \
        else args.csv_file
    src_count_file = _pseudo_key(args.src_counts, ymd) if args.s3 and \
        args.src_counts else args.src_counts

    reload = args.reload
    if reload:
        logger.info('Reloading the database content from the database')
    else:
        logger.info('Loading cached database content from %s' % db_res_file)

    reconvert = args.reconvert
    if reconvert:
        logger.info('Reconverting database content into pandas dataframe')
    else:
        logger.info('Loading cached dataframe from %s' % df_file)

    for f in [db_res_file, df_file, csv_file, src_count_file]:
        if f:
            logger.info('Using file name %s' % f)

    dump_sif(df_file, db_res_file, csv_file, src_count_file, reload, reconvert,
             get_db('primary') if args.principal else get_ro('primary'))
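# Programmatic usage sketch (illustrative, not the module's CLI): the same
# pipeline that main() drives can be invoked directly via dump_sif(). The
# positional arguments mirror the call in main() above; the file names are
# placeholders, and dump_sif()'s parameter names are not assumed here.
def _example_dump_locally():
    dump_sif('stmt_df.pkl', 'db_content.pkl', 'stmts.csv',
             'source_counts.pkl', True, True, get_ro('primary'))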
def load_db_content(ns_list, pkl_filename=None, ro=None, reload=False):
    """Get preassembled stmt metadata from the DB for export.

    Queries the NameMeta, TextMeta, and OtherMeta tables as needed to get
    agent/stmt metadata for agents from the given namespaces.

    Parameters
    ----------
    ns_list : list of str
        List of agent namespaces to include in the metadata query.
    pkl_filename : str
        Name of pickle file to save to (if reloading) or load from (if not
        reloading). If an S3 path is given (i.e., pkl_filename starts with
        `s3:`), the file is saved to/loaded from S3. If not given,
        automatically reloads the content (overriding reload).
    ro : ReadonlyDatabaseManager
        Readonly database to load the content from. If not given, calls
        `get_ro('primary')` to get the primary readonly DB.
    reload : bool
        Whether to re-query the database for content or to load the content
        from `pkl_filename`. Note that even if `reload` is False, if no
        `pkl_filename` is given, data will be reloaded anyway.

    Returns
    -------
    set of tuples
        Set of tuples containing statement information organized by agent.
        Tuples contain (stmt_hash, agent_ns, agent_id, agent_num,
        evidence_count, stmt_type).
    """
    if isinstance(pkl_filename, str) and pkl_filename.startswith('s3:'):
        pkl_filename = S3Path.from_string(pkl_filename)
    # Get the raw data
    if reload or not pkl_filename:
        if not ro:
            ro = get_ro('primary')
        logger.info("Querying the database for statement metadata...")
        results = {}
        for ns in ns_list:
            logger.info("Querying for {ns}".format(ns=ns))
            filters = []
            if ns == 'NAME':
                tbl = ro.NameMeta
            elif ns == 'TEXT':
                tbl = ro.TextMeta
            else:
                tbl = ro.OtherMeta
                filters.append(tbl.db_name.like(ns))
            filters.append(tbl.is_complex_dup == False)
            res = ro.select_all([tbl.mk_hash, tbl.db_id, tbl.ag_num,
                                 tbl.ev_count, tbl.type_num], *filters)
            results[ns] = res
        results = {(h, dbn, dbi, ag_num, ev_cnt, ro_type_map.get_str(tn))
                   for dbn, value_list in results.items()
                   for h, dbi, ag_num, ev_cnt, tn in value_list}
        if pkl_filename:
            if isinstance(pkl_filename, S3Path):
                upload_pickle_to_s3(results, pkl_filename)
            else:
                with open(pkl_filename, 'wb') as f:
                    pickle.dump(results, f)
    # Get a cached pickle
    else:
        logger.info("Loading database content from %s" % pkl_filename)
        # pkl_filename was converted to an S3Path above if it was an s3 path,
        # so check the type rather than the 's3:' prefix here.
        if isinstance(pkl_filename, S3Path):
            results = load_pickle_from_s3(pkl_filename)
        else:
            with open(pkl_filename, 'rb') as f:
                results = pickle.load(f)
    logger.info("{len} stmts loaded".format(len=len(results)))
    return results
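# Usage sketch (illustrative): build the agent-level metadata set for a few
# namespaces and cache it locally, then group the agent groundings by
# statement hash. The namespace list and file name are example values chosen
# for this sketch, not defaults of this module.
def _example_load_db_content():
    content = load_db_content(['NAME', 'HGNC', 'FPLX'],
                              pkl_filename='db_content.pkl', reload=True)
    # Each tuple is (stmt_hash, agent_ns, agent_id, agent_num, ev_count,
    # stmt_type); one statement yields one tuple per agent per namespace.
    agents_by_hash = {}
    for stmt_hash, ag_ns, ag_id, ag_num, ev_count, stmt_type in content:
        agents_by_hash.setdefault(stmt_hash, set()).add((ag_num, ag_ns, ag_id))
    return agents_by_hash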