Example #1
File: test_s3.py  Project: indralab/emmaa
def test_util_find_on_s3_functions():
    # Local imports are recommended when using moto
    from emmaa.util import sort_s3_files_by_date_str, find_latest_s3_file, \
        find_nth_latest_s3_file, find_number_of_files_on_s3
    # Bucket has mm (pkl) and results (json) files, both in results folder
    client = setup_bucket(add_mm=True, add_results=True)
    # Get both
    files = sort_s3_files_by_date_str(TEST_BUCKET_NAME, 'results/test/')
    assert len(files) == 2
    # Specific extension
    files = sort_s3_files_by_date_str(TEST_BUCKET_NAME, 'results/test/',
                                      '.json')
    assert len(files) == 1
    # Longer prefix
    files = sort_s3_files_by_date_str(TEST_BUCKET_NAME,
                                      'results/test/results_')
    assert len(files) == 1
    assert find_latest_s3_file(TEST_BUCKET_NAME, 'results/test/results_')
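    # Only one file matches the 'results_' prefix, so there is no 2nd-latest (n=1)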
    assert not find_nth_latest_s3_file(1, TEST_BUCKET_NAME,
                                       'results/test/results_')
    assert find_nth_latest_s3_file(1, TEST_BUCKET_NAME, 'results/test/')
    assert find_number_of_files_on_s3(TEST_BUCKET_NAME, 'results/test/') == 2
    assert find_number_of_files_on_s3(TEST_BUCKET_NAME,
                                      'results/test/results_') == 1
    assert find_number_of_files_on_s3(TEST_BUCKET_NAME, 'results/test/',
                                      '.json') == 1
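Example #1 leans on a module-level TEST_BUCKET_NAME constant and a setup_bucket helper defined elsewhere in test_s3.py; the local imports suggest the test function itself runs under moto's S3 mock. A minimal sketch of such a fixture, assuming moto's mock_s3 decorator on the calling test (moto 5.x renamed it mock_aws) and invented bucket and key names:

import boto3

TEST_BUCKET_NAME = 'test_bucket'  # assumed value; the real constant lives in test_s3.py

def setup_bucket(add_mm=False, add_results=False):
    """Create a mocked bucket with optional model-manager and results files.

    Call only inside a test decorated with moto's @mock_s3 (or @mock_aws),
    which intercepts all boto3 S3 traffic. The key layouts are assumptions
    chosen to satisfy the assertions above: two files under 'results/test/',
    exactly one of them a .json matching the 'results_' prefix.
    """
    client = boto3.client('s3', region_name='us-east-1')
    client.create_bucket(Bucket=TEST_BUCKET_NAME)
    if add_mm:
        # Model manager pickle under the shared results prefix (assumed key)
        client.put_object(Bucket=TEST_BUCKET_NAME, Body=b'',
                          Key='results/test/model_manager_2021-01-01-00-00-00.pkl')
    if add_results:
        # Results JSON carrying the same style of date stamp (assumed key)
        client.put_object(Bucket=TEST_BUCKET_NAME, Body=b'{}',
                          Key='results/test/results_2021-01-01-00-00-00.json')
    return client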
Example #2
File: manager.py  Project: bgyori/emmaa
def add_model_from_s3(self, model_id, config=None, number_of_updates=3,
                      bucket=EMMAA_BUCKET_NAME):
    """Add data for one model from S3 files."""
    if not config:
        config = load_config_from_s3(model_id)
    test_corpora = config['test']['test_corpus']
    if isinstance(test_corpora, str):
        test_corpora = [test_corpora]
    stmt_files = sort_s3_files_by_date_str(
        bucket, f'assembled/{model_id}/statements_', '.gz')
    stmt_files_to_use = stmt_files[:number_of_updates]
    for stmt_file in stmt_files_to_use:
        date = strip_out_date(stmt_file, 'date')
        dt = strip_out_date(stmt_file, 'datetime')
        # First get and add statements
        stmt_jsons = load_gzip_json_from_s3(bucket, stmt_file)
        self.add_statements(model_id, date, stmt_jsons)
        # Also update the path counts from each test corpus
        for test_corpus in test_corpora:
            key = f'results/{model_id}/results_{test_corpus}_{dt}.json'
            try:
                results = load_json_from_s3(bucket, key)
                path_counts = results[0].get('path_stmt_counts')
                if path_counts:
                    self.update_statements_path_counts(
                        model_id, date, path_counts)
            except ClientError as e:
                if e.response['Error']['Code'] == 'NoSuchKey':
                    logger.warning(f'No results file for {key}, skipping')
                    continue
                else:
                    raise
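Both examples rely on the same convention: a zero-padded timestamp is embedded in the S3 key itself, so keys can be ordered chronologically by plain string comparison, with no per-object metadata calls. A rough sketch of that pattern, assuming a %Y-%m-%d-%H-%M-%S stamp; the real sort_s3_files_by_date_str in emmaa.util may differ in detail:

import re
import boto3

DATE_RE = re.compile(r'\d{4}-\d{2}-\d{2}-\d{2}-\d{2}-\d{2}')  # assumed stamp format

def sort_keys_by_date_str(bucket, prefix, extension=None):
    """List keys under a prefix, newest first by the embedded date string."""
    client = boto3.client('s3')
    paginator = client.get_paginator('list_objects_v2')
    keys = []
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        for obj in page.get('Contents', []):
            key = obj['Key']
            if extension and not key.endswith(extension):
                continue
            if DATE_RE.search(key):
                keys.append(key)
    # Zero-padded stamps sort correctly as strings, so no datetime parsing is needed
    return sorted(keys, key=lambda k: DATE_RE.search(k).group(), reverse=True)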