def test_util_find_on_s3_functions():
    """Check the S3 file lookup helpers of ``emmaa.util`` on the test bucket.

    The bucket is populated with one mm (pkl) file and one results (json)
    file, both stored under the results folder, and every helper is queried
    with a bare folder prefix, a longer key prefix and an extension filter.
    """
    # The import stays inside the test: local imports are recommended
    # when using moto.
    from emmaa.util import (
        sort_s3_files_by_date_str,
        find_latest_s3_file,
        find_nth_latest_s3_file,
        find_number_of_files_on_s3,
    )

    folder_prefix = 'results/test/'
    results_prefix = 'results/test/results_'

    client = setup_bucket(add_mm=True, add_results=True)

    # Folder prefix alone matches both files
    matched = sort_s3_files_by_date_str(TEST_BUCKET_NAME, folder_prefix)
    assert len(matched) == 2

    # Extension filter keeps only the json file
    matched = sort_s3_files_by_date_str(
        TEST_BUCKET_NAME, folder_prefix, '.json')
    assert len(matched) == 1

    # Longer prefix keeps only the results file
    matched = sort_s3_files_by_date_str(TEST_BUCKET_NAME, results_prefix)
    assert len(matched) == 1

    # Latest / nth-latest lookups: a single key matches the longer prefix,
    # so asking for position 1 under it yields nothing, while the folder
    # prefix (two keys) still has an entry there.
    assert find_latest_s3_file(TEST_BUCKET_NAME, results_prefix)
    assert not find_nth_latest_s3_file(1, TEST_BUCKET_NAME, results_prefix)
    assert find_nth_latest_s3_file(1, TEST_BUCKET_NAME, folder_prefix)

    # Counts agree with the listings above
    assert find_number_of_files_on_s3(TEST_BUCKET_NAME, folder_prefix) == 2
    assert find_number_of_files_on_s3(TEST_BUCKET_NAME, results_prefix) == 1
    assert find_number_of_files_on_s3(
        TEST_BUCKET_NAME, folder_prefix, '.json') == 1
def add_model_from_s3(self, model_id, config=None, number_of_updates=3,
                      bucket=EMMAA_BUCKET_NAME):
    """Add statements and path counts for one model from its S3 files.

    Parameters
    ----------
    model_id : str
        Model identifier; used to build the ``assembled/`` and ``results/``
        key prefixes.
    config : dict, optional
        Model configuration. It must provide ``config['test']['test_corpus']``
        as either a single corpus name or a list of names. When falsy, the
        configuration is loaded with ``load_config_from_s3(model_id)``.
    number_of_updates : int
        How many statement files to process, taken from the front of the
        listing returned by ``sort_s3_files_by_date_str``
        (NOTE(review): presumably newest first - confirm against the helper).
    bucket : str
        Bucket from which statement and results files are read.

    Raises
    ------
    ClientError
        Re-raised for any S3 client error other than ``NoSuchKey`` hit while
        handling a results file.
    """
    config = config or load_config_from_s3(model_id)

    # Accept a single corpus name as well as a list of names.
    configured = config['test']['test_corpus']
    test_corpora = [configured] if isinstance(configured, str) \
        else configured

    assembled_keys = sort_s3_files_by_date_str(
        bucket, f'assembled/{model_id}/statements_', '.gz')

    for assembled_key in assembled_keys[:number_of_updates]:
        date = strip_out_date(assembled_key, 'date')
        dt = strip_out_date(assembled_key, 'datetime')

        # Statements go in first ...
        statements = load_gzip_json_from_s3(bucket, assembled_key)
        self.add_statements(model_id, date, statements)

        # ... then the path counts reported by each test corpus for the
        # same datetime stamp.
        for test_corpus in test_corpora:
            key = f'results/{model_id}/results_{test_corpus}_{dt}.json'
            try:
                results = load_json_from_s3(bucket, key)
                counts = results[0].get('path_stmt_counts')
                if counts:
                    self.update_statements_path_counts(
                        model_id, date, counts)
            except ClientError as e:
                # Only a missing results file is tolerated.
                if e.response['Error']['Code'] != 'NoSuchKey':
                    raise e
                logger.warning(f'No results file for {key}, skipping')