def save_to_s3(self, bucket=EMMAA_BUCKET_NAME): """Dump the model state to S3.""" date_str = make_date_str() fname = f'models/{self.name}/model_{date_str}' # Dump as pickle save_pickle_to_s3(self.stmts, bucket, key=fname + '.pkl') # Dump as json save_json_to_s3(self.stmts, bucket, key=fname + '.json')
def save_to_s3(self, bucket=EMMAA_BUCKET_NAME): """Dump the model state to S3.""" fname = f'models/{self.name}/model_{self.date_str}' # Dump as pickle save_pickle_to_s3(self.stmts, bucket, key=fname+'.pkl') # Save ids to stmt hashes mapping as json id_fname = f'papers/{self.name}/paper_ids_{self.date_str}.json' save_json_to_s3(list(self.paper_ids), bucket, key=id_fname)
def upload_results(self, test_corpus='large_corpus_tests', test_data=None,
                   bucket=EMMAA_BUCKET_NAME):
    """Upload results to s3 bucket."""
    json_dict = self.results_to_json(test_data)
    result_key = (f'results/{self.model.name}/results_'
                  f'{test_corpus}_{self.date_str}.json')
    logger.info(f'Uploading test results to {result_key}')
    save_json_to_s3(json_dict, bucket, result_key)
def save_tests_to_s3(tests, bucket, key, save_format='pkl'):
    """Save tests in pkl, json or jsonl format."""
    if save_format == 'pkl':
        save_pickle_to_s3(tests, bucket, key)
    elif save_format in ['json', 'jsonl']:
        if isinstance(tests, list):
            stmts = [test.stmt for test in tests]
        elif isinstance(tests, dict):
            stmts = [test.stmt for test in tests['tests']]
        stmts_json = stmts_to_json(stmts)
        save_json_to_s3(stmts_json, bucket, key, save_format)
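# Usage sketch (not from the source above): saving a small test corpus in
# jsonl format. The bucket name, key, and the SimpleTest wrapper are
# hypothetical; the only assumption about test objects is that they expose
# a .stmt attribute holding an INDRA Statement, as save_tests_to_s3 expects.
from indra.statements import Activation, Agent

class SimpleTest:
    def __init__(self, stmt):
        self.stmt = stmt

example_tests = [SimpleTest(Activation(Agent('BRAF'), Agent('MAPK1')))]
save_tests_to_s3(example_tests, 'my-test-bucket',
                 'tests/example_tests.jsonl', save_format='jsonl')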
def save_assembled_statements(self, bucket=EMMAA_BUCKET_NAME): """Upload assembled statements jsons to S3 bucket.""" stmt_jsons = self.assembled_stmts_to_json() # Save a timestapmed version and a generic latest version of files key1 = f'assembled/{self.model.name}/statements_{self.date_str}.json' key2 = f'assembled/{self.model.name}/' \ f'latest_statements_{self.model.name}.json' logger.info(f'Uploading assembled statements to {key1}') save_json_to_s3(stmt_jsons, bucket, key1) logger.info(f'Uploading assembled statements to {key2}') save_json_to_s3(stmt_jsons, bucket, key2)
def save_config_to_s3(model_name, config, bucket=EMMAA_BUCKET_NAME):
    """Upload config settings for a model to S3.

    Parameters
    ----------
    model_name : str
        The name of the model whose config should be saved to S3.
    config : dict
        A JSON dict of configurations for the model.
    """
    base_key = f'models/{model_name}'
    config_key = f'{base_key}/config.json'
    logger.info(f'Saving model config to {config_key}')
    save_json_to_s3(config, bucket, config_key)
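# Usage sketch (not from the source above): uploading a minimal config dict
# for a hypothetical model name. The config keys shown are illustrative only,
# and the default EMMAA_BUCKET_NAME is used.
example_config = {
    'name': 'example_model',
    'human_readable_name': 'Example model',
    'search_terms': ['BRAF', 'MAPK1'],
}
save_config_to_s3('example_model', example_config)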
def save_stmts(stmts, model_name, date_str, bucket=EMMAA_BUCKET_NAME):
    # date_str and bucket are taken as explicit arguments since this helper
    # is a standalone function with no self in scope.
    stmts_json = stmts_to_json(stmts)
    # Save a timestamped version and a generic latest version of the files
    dated_key = f'assembled/{model_name}/statements_{date_str}'
    latest_key = f'assembled/{model_name}/' \
                 f'latest_statements_{model_name}'
    for ext in ('json', 'jsonl'):
        latest_obj_key = latest_key + '.' + ext
        logger.info(f'Uploading assembled statements to {latest_obj_key}')
        save_json_to_s3(stmts_json, bucket, latest_obj_key, ext)
    dated_jsonl = dated_key + '.jsonl'
    dated_zip = dated_key + '.gz'
    logger.info(f'Uploading assembled statements to {dated_jsonl}')
    save_json_to_s3(stmts_json, bucket, dated_jsonl, 'jsonl')
    logger.info(f'Uploading assembled statements to {dated_zip}')
    save_gzip_json_to_s3(stmts_json, bucket, dated_zip, 'json')
def save_assembled_statements(self, bucket=EMMAA_BUCKET_NAME): """Upload assembled statements jsons to S3 bucket.""" stmts = self.model.assembled_stmts stmts_json = stmts_to_json(stmts) # Save a timestapmed version and a generic latest version of files dated_key = f'assembled/{self.model.name}/statements_{self.date_str}' latest_key = f'assembled/{self.model.name}/' \ f'latest_statements_{self.model.name}' for ext in ('json', 'jsonl'): latest_obj_key = latest_key + '.' + ext logger.info(f'Uploading assembled statements to {latest_obj_key}') save_json_to_s3(stmts_json, bucket, latest_obj_key, ext) dated_jsonl = dated_key + '.jsonl' dated_zip = dated_key + '.gz' logger.info(f'Uploading assembled statements to {dated_jsonl}') save_json_to_s3(stmts_json, bucket, dated_jsonl, 'jsonl') logger.info(f'Uploading assembled statements to {dated_zip}') save_gzip_json_to_s3(stmts_json, bucket, dated_zip, 'json')
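# Usage sketch (not from the source above): a consumer can fetch the generic
# "latest" key written by the method above without knowing the upload date.
# boto3 is used directly here; the model name and bucket are hypothetical.
import json
import boto3

s3 = boto3.client('s3')
resp = s3.get_object(
    Bucket='emmaa',
    Key='assembled/example_model/latest_statements_example_model.json')
latest_stmt_jsons = json.loads(resp['Body'].read())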
def test_util_s3_intelligent_tiering():
    from emmaa.util import save_json_to_s3, get_s3_client, \
        get_s3_archive_status
    # Generate a file that's larger than 128kB
    large_file = ''.join(['a' for i in range(1024 * 129)])
    # Put it in a dict in order to serialize it as JSON
    large_file_dict = {'large_file': large_file}
    # Save it to S3
    client = get_s3_client()
    bucket = client.create_bucket(Bucket=TEST_BUCKET_NAME, ACL='public-read')
    key = 'intelligent_tiering_test/large_file.json'
    save_json_to_s3(obj=large_file_dict, bucket=TEST_BUCKET_NAME, key=key,
                    intelligent_tiering=True)
    # Check the archive status
    status = get_s3_archive_status(TEST_BUCKET_NAME, key)
    assert status['intelligent_tiering'] is True
    assert status['archived'] is False
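# Usage sketch (not from the source above): checking whether an object has
# been moved to an archive tier before reading it. Only the two status keys
# exercised by the test above ('intelligent_tiering' and 'archived') are
# assumed; the bucket and key are hypothetical.
status = get_s3_archive_status(
    'emmaa', 'assembled/example_model/latest_statements_example_model.json')
if status['archived']:
    logger.info('Object is archived; a restore request may be needed '
                'before it can be read.')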
def upload_results(self, test_corpus='large_corpus_tests', test_data=None, bucket=EMMAA_BUCKET_NAME): """Upload results to s3 bucket.""" json_dict, json_lines = self.results_to_json(test_data) result_key = (f'results/{self.model.name}/results_' f'{test_corpus}_{self.date_str}.json') paths_key = (f'paths/{self.model.name}/paths_{test_corpus}_' f'{self.date_str}.jsonl') latest_paths_key = (f'paths/{self.model.name}/{test_corpus}' '_latest_paths.jsonl') logger.info(f'Uploading test results to {result_key}') save_json_to_s3(json_dict, bucket, result_key) logger.info(f'Uploading test paths to {paths_key}') save_json_to_s3(json_lines, bucket, paths_key, save_format='jsonl') save_json_to_s3(json_lines, bucket, latest_paths_key, 'jsonl')
def upload_results(self, test_corpus='large_corpus_tests', test_data=None,
                   upload_to_db=True, bucket=EMMAA_BUCKET_NAME):
    """Upload results to s3 bucket."""
    json_dict, json_lines = self.results_to_json(test_data)
    result_key = (f'results/{self.model.name}/results_'
                  f'{test_corpus}_{self.date_str}.json')
    paths_key = (f'paths/{self.model.name}/paths_{test_corpus}_'
                 f'{self.date_str}.jsonl')
    latest_paths_key = (f'paths/{self.model.name}/{test_corpus}'
                        '_latest_paths.jsonl')
    logger.info(f'Uploading test results to {result_key}')
    save_json_to_s3(json_dict, bucket, result_key)
    logger.info(f'Uploading test paths to {paths_key}')
    save_json_to_s3(json_lines, bucket, paths_key, save_format='jsonl')
    save_json_to_s3(json_lines, bucket, latest_paths_key, 'jsonl')
    # Also save the path counts to the database if requested
    if upload_to_db:
        db = get_db('stmt')
        db.update_statements_path_counts(
            self.model.name, self.date_str[:10], self.path_stmt_counts)
def save_to_s3_key(self, stats_key):
    if self.json_stats:
        logger.info(f'Uploading statistics to {stats_key}')
        save_json_to_s3(self.json_stats, self.bucket, stats_key)