def add_model_from_s3(self, model_id, config=None, number_of_updates=3,
                      bucket=EMMAA_BUCKET_NAME):
    """Add data for one model from S3 files.

    For each selected assembled-statements file, the statements are added
    via ``self.add_statements`` and, for every test corpus named in the
    model config, the path counts found in the matching results file are
    applied via ``self.update_statements_path_counts``.

    Parameters
    ----------
    model_id : str
        Name of the model whose files should be loaded.
    config : dict or None
        Model configuration. If not given, it is loaded from S3 using the
        same ``bucket`` as the rest of the files.
    number_of_updates : int
        How many entries from the start of the date-sorted list of
        statement files to process (presumably the most recent ones;
        confirm against ``sort_s3_files_by_date_str``).
    bucket : str
        Name of the S3 bucket to read from.

    Raises
    ------
    ClientError
        Any S3 client error other than a missing results file.
    """
    if not config:
        # Read the config from the same bucket as the statement and result
        # files so that a non-default bucket is honoured consistently.
        config = load_config_from_s3(model_id, bucket=bucket)
    test_corpora = config['test']['test_corpus']
    if isinstance(test_corpora, str):
        test_corpora = [test_corpora]
    stmt_files = sort_s3_files_by_date_str(
        bucket, f'assembled/{model_id}/statements_', '.gz')
    for stmt_file in stmt_files[:number_of_updates]:
        date = strip_out_date(stmt_file, 'date')
        dt = strip_out_date(stmt_file, 'datetime')
        # First get and add statements
        stmt_jsons = load_gzip_json_from_s3(bucket, stmt_file)
        self.add_statements(model_id, date, stmt_jsons)
        # Also update the path counts from each test corpus
        for test_corpus in test_corpora:
            key = f'results/{model_id}/results_{test_corpus}_{dt}.json'
            # Only the S3 read is guarded: a missing results file is
            # expected and skipped, anything else is propagated.
            try:
                results = load_json_from_s3(bucket, key)
            except ClientError as e:
                if e.response['Error']['Code'] == 'NoSuchKey':
                    logger.warning(f'No results file for {key}, skipping')
                    continue
                raise
            path_counts = results[0].get('path_stmt_counts')
            if path_counts:
                self.update_statements_path_counts(
                    model_id, date, path_counts)
def load_from_s3(klass, model_name, bucket=EMMAA_BUCKET_NAME):
    """Build a model instance from the latest state stored on S3.

    The config, the latest statements and the paper IDs mapping of the same
    date are loaded from S3. If the paper IDs mapping cannot be loaded (or
    is empty), it is derived from the loaded statements instead.

    Parameters
    ----------
    model_name : str
        Name of model to load.
    bucket : str
        Name of the S3 bucket to load the files from.

    Returns
    -------
    emmaa.model.EmmaaModel
        Instance of the model with the given name, loaded from S3.
    """
    config = load_config_from_s3(model_name, bucket=bucket)
    stmts, stmts_key = load_stmts_from_s3(model_name, bucket=bucket)
    # The paper IDs file is looked up by the date of the statements file
    stmts_date = strip_out_date(stmts_key)
    papers_key = f'papers/{model_name}/paper_ids_{stmts_date}.json'
    paper_ids = None
    try:
        paper_ids = load_json_from_s3(bucket, papers_key)
    except ClientError as e:
        logger.warning(f'Could not find paper IDs mapping due to: {e}')
    instance = klass(model_name, config, paper_ids)
    instance.stmts = stmts
    if not paper_ids:
        # Fall back to reconstructing the mapping from the statements
        instance.paper_ids = instance.get_paper_ids_from_stmts(stmts)
    return instance
def get_assembled_statements(model, bucket=EMMAA_BUCKET_NAME):
    """Load the latest assembled statements of a model from S3.

    Parameters
    ----------
    model : str
        Name of the model.
    bucket : str
        Name of the S3 bucket to look in.

    Returns
    -------
    tuple
        ``(stmts, key)`` where ``stmts`` is the result of
        ``stmts_from_json`` on the loaded JSON and ``key`` is the S3 key it
        was loaded from, or ``(None, None)`` if no JSON file was found.
    """
    prefix = f'assembled/{model}/statements_'
    latest_file_key = find_latest_s3_file(bucket, prefix, '.json')
    if latest_file_key:
        logger.info(f'Loading assembled statements for {model} from S3.')
        return (stmts_from_json(load_json_from_s3(bucket, latest_file_key)),
                latest_file_key)
    logger.info(f'No assembled statements found for {model}.')
    return None, None
def load_latest_statements(model):
    """Return the latest statements JSON of a model and a link to the file.

    A ``latest_statements_{model}`` file is preferred; otherwise the latest
    dated ``statements_`` JSON file is used.

    Parameters
    ----------
    model : str
        Name of the model.

    Returns
    -------
    dict
        Dictionary with key 'statements' (the loaded JSON, or an empty list
        if no file was found) and key 'link' (URL of the file on S3, or an
        empty string).
    """
    fkey = None
    if does_exist(EMMAA_BUCKET_NAME,
                  f'assembled/{model}/latest_statements_{model}'):
        fkey = f'assembled/{model}/latest_statements_{model}.json'
    elif does_exist(EMMAA_BUCKET_NAME, f'assembled/{model}/statements_'):
        fkey = find_latest_s3_file(
            EMMAA_BUCKET_NAME, f'assembled/{model}/statements_', '.json')
    # Nothing to load: report an empty result
    if not fkey:
        return {'statements': [], 'link': ''}
    stmt_jsons = load_json_from_s3(EMMAA_BUCKET_NAME, fkey)
    return {
        'statements': stmt_jsons,
        'link': f'https://{EMMAA_BUCKET_NAME}.s3.amazonaws.com/{fkey}',
    }
def load_config_from_s3(model_name, bucket=EMMAA_BUCKET_NAME):
    """Load the configuration JSON of a model from S3.

    Parameters
    ----------
    model_name : str
        The name of the model whose config should be loaded.
    bucket : str
        Name of the S3 bucket holding the config file.

    Returns
    -------
    config : dict
        The content of 'models/{model_name}/config.json' loaded from S3.
    """
    config_key = f'models/{model_name}/config.json'
    logger.info(f'Loading model config from {config_key}')
    return load_json_from_s3(bucket, config_key)
def _get_previous_json_stats(self):
    """Load the statistics JSON of the previous round from S3.

    The latest stats file is used unless it belongs to the current round
    (its date equals ``self.latest_round.date_str``), in which case the
    second latest file is used. On success the date string of the chosen
    file is stored in ``self.previous_date_str``.

    Returns
    -------
    previous_json_stats : json or None
        The loaded statistics, or None if there is no earlier statistics
        file (``self.previous_date_str`` is left untouched in that case).
    """
    prefix = f'model_stats/{self.model_name}/model_stats_'
    key = find_latest_s3_file(self.bucket, prefix, '.json')
    # This is the first time statistics is generated for this model
    if key is None:
        logger.info('Could not find a key to the previous statistics ')
        return None
    # If stats for this date exists, previous stats is the second latest
    if strip_out_date(key) == self.latest_round.date_str:
        logger.info('Statistics for latest round already exists')
        key = find_nth_latest_s3_file(1, self.bucket, prefix, '.json')
        # The only stats file may be the one for the current round, leaving
        # no earlier file to load (assumes the lookup yields a falsy value
        # such as None in that case; confirm against the helper).
        if not key:
            logger.info('Could not find a key to the previous statistics ')
            return None
    # Store the date string to find previous round with it
    self.previous_date_str = strip_out_date(key)
    logger.info(f'Loading earlier statistics from {key}')
    return load_json_from_s3(self.bucket, key)
def load_from_statements(cls, model_name, mode='local', date=None,
                         bucket=EMMAA_BUCKET_NAME):
    """Create an instance from already assembled statements stored on S3.

    Parameters
    ----------
    model_name : str
        Name of the model to load.
    mode : str
        Passed through to the constructor of ``cls``.
    date : str or None
        If given, restricts the paper IDs and assembled statements files to
        those whose key contains this date; otherwise the latest files are
        used.
    bucket : str
        Name of the S3 bucket to load the files from.

    Returns
    -------
    An instance of ``cls`` wrapping an EmmaaModel whose
    ``assembled_stmts`` and ``date_str`` are set from the loaded file.
    """
    config = load_config_from_s3(model_name, bucket=bucket)
    paper_prefix = f'papers/{model_name}/paper_ids_'
    if date:
        paper_prefix = f'papers/{model_name}/paper_ids_{date}'
    paper_key = find_latest_s3_file(bucket, paper_prefix, 'json')
    paper_ids = load_json_from_s3(bucket, paper_key) if paper_key else None
    emmaa_model = EmmaaModel(model_name, config, paper_ids)
    # Loading assembled statements to avoid reassembly
    assembled, stmts_key = get_assembled_statements(model_name, date, bucket)
    emmaa_model.assembled_stmts = assembled
    emmaa_model.date_str = strip_out_date(stmts_key, 'datetime')
    return cls(emmaa_model, mode=mode)
def get_assembled_statements(model, date=None, bucket=EMMAA_BUCKET_NAME):
    """Load and return a list of assembled statements.

    Files are looked up in order of preference: '.gz', then '.zip' (both
    read as gzipped JSON), then plain '.json'.

    Parameters
    ----------
    model : str
        A name of a model.
    date : str or None
        Date in "YYYY-MM-DD" format for which to load the statements.
        If None, loads the latest available statements.
    bucket : str
        Name of S3 bucket to look for a file. Defaults to 'emmaa'.

    Returns
    -------
    stmts : list[indra.statements.Statement]
        A list of assembled statements, or None if no file was found.
    latest_file_key : str
        Key of a file with statements on s3, or None if no file was found.
    """
    prefix = f'assembled/{model}/statements_'
    if date:
        prefix = f'assembled/{model}/statements_{date}'
    # Compressed files first: gzip, possibly saved with .zip extension
    latest_file_key = (find_latest_s3_file(bucket, prefix, '.gz')
                       or find_latest_s3_file(bucket, prefix, '.zip'))
    if latest_file_key:
        return (stmts_from_json(
                    load_gzip_json_from_s3(bucket, latest_file_key)),
                latest_file_key)
    # Fall back to an uncompressed json file
    latest_file_key = find_latest_s3_file(bucket, prefix, '.json')
    if latest_file_key:
        return (stmts_from_json(load_json_from_s3(bucket, latest_file_key)),
                latest_file_key)
    # Didn't get gzip, zip or json
    logger.info(f'No assembled statements found for {model}.')
    return None, None
def load_from_s3_key(cls, key, bucket=EMMAA_BUCKET_NAME):
    """Create an instance from a JSON results file stored on S3.

    Parameters
    ----------
    key : str
        S3 key of the JSON file to load.
    bucket : str
        Name of the S3 bucket holding the file.

    Returns
    -------
    An instance of ``cls`` built from the loaded JSON and a date string.
    The date string is the 'date_str' entry of the first JSON element if
    present, otherwise the date extracted from ``key``.
    """
    logger.info(f'Loading json from {key}')
    json_results = load_json_from_s3(bucket, key)
    header = json_results[0]
    date_from_key = strip_out_date(key)
    return cls(json_results, header.get('date_str', date_from_key))
def load_custom_grounding_map(model, bucket=EMMAA_BUCKET_NAME):
    """Load the grounding map JSON of a model from S3.

    Parameters
    ----------
    model : str
        Name of the model.
    bucket : str
        Name of the S3 bucket holding the file.

    Returns
    -------
    The content of 'models/{model}/grounding_map.json' loaded from S3.
    """
    return load_json_from_s3(bucket, f'models/{model}/grounding_map.json')
def get_model_stats(model, mode, tests=None, date=None, extension='.json',
                    n=0, bucket=EMMAA_BUCKET_NAME):
    """Get the statistics for the given model and the key they came from.

    Parameters
    ----------
    model : str
        Model name to look for
    mode : str
        Type of stats to load, either 'model' or 'test'.
    tests : str
        A name of a test corpus. If not given, the model's default test
        corpus (as returned by ``_default_test``) is used.
    date : str or None
        Date for which the stats will be returned in "YYYY-MM-DD" format.
        If not given, it is determined with ``last_updated_date`` using
        ``n``.
    extension : str
        Extension of the file.
    n : int
        Index of the file in list of S3 files sorted by date (0-indexed).
        Only used when ``date`` is not given.
    bucket : str
        Name of bucket on S3.

    Returns
    -------
    model_data : json or None
        The json formatted data containing the statistics for the model,
        or None if no matching file was found.
    latest_file_key : str or None
        S3 key of the file the statistics were loaded from, or None if no
        matching file was found.

    Raises
    ------
    TypeError
        If ``mode`` is neither 'model' nor 'test'.
    """
    if not tests:
        tests = _default_test(model, bucket=bucket)
    # If date is not specified, get the latest or the nth
    if not date:
        if mode == 'model':
            date = last_updated_date(model, 'model_stats', 'date',
                                     extension=extension, n=n, bucket=bucket)
        elif mode == 'test':
            date = last_updated_date(model, 'test_stats', 'date',
                                     tests=tests, extension=extension, n=n,
                                     bucket=bucket)
        else:
            raise TypeError('Mode must be either model or tests')

    # Try find new formatted stats (separate for model and tests)
    if mode == 'model':
        # File name example:
        # model_stats/skcm/model_stats_2019-08-20-17-34-40.json
        prefix = f'model_stats/{model}/model_stats_{date}'
    elif mode == 'test':
        # File name example:
        # stats/skcm/test_stats_large_corpus_tests_2019-08-20-17-34-40.json
        prefix = f'stats/{model}/test_stats_{tests}_{date}'
    else:
        raise TypeError('Mode must be either model or tests')
    latest_file_key = find_latest_s3_file(bucket=bucket, prefix=prefix,
                                          extension=extension)

    # This might be an older file with model and test stats combined.
    # File name example: stats/skcm/stats_2019-08-20-17-34-40.json
    # The combined file only applies to model stats or to the default test
    # corpus; the default is looked up in the same bucket as everything
    # else in this function.
    if not latest_file_key and (
            mode == 'model'
            or tests == _default_test(model, bucket=bucket)):
        prefix = f'stats/{model}/stats_{date}'
        latest_file_key = find_latest_s3_file(bucket=bucket, prefix=prefix,
                                              extension=extension)
    # If we still didn't find the file it probably does not exist
    if not latest_file_key:
        return None, None
    return (load_json_from_s3(bucket, latest_file_key), latest_file_key)