Пример #1
0
def test_util_find_on_s3_functions():
    """Check the S3 lookup helpers of ``emmaa.util`` against the test bucket.

    The bucket is prepared by ``setup_bucket`` with an mm (pkl) file and a
    results (json) file, both located in the results folder.
    """
    # Local imports are recommended when using moto
    from emmaa.util import (
        find_latest_s3_file,
        find_nth_latest_s3_file,
        find_number_of_files_on_s3,
        sort_s3_files_by_date_str,
    )
    client = setup_bucket(add_mm=True, add_results=True)
    folder = 'results/test/'
    results_prefix = 'results/test/results_'

    # Without an extension filter both files are listed
    every_file = sort_s3_files_by_date_str(TEST_BUCKET_NAME, folder)
    assert len(every_file) == 2
    # Restricting by extension keeps only the json file
    json_files = sort_s3_files_by_date_str(TEST_BUCKET_NAME, folder, '.json')
    assert len(json_files) == 1
    # A longer prefix keeps only the results file
    result_files = sort_s3_files_by_date_str(TEST_BUCKET_NAME, results_prefix)
    assert len(result_files) == 1

    # A latest file exists for the longer prefix, but no second-latest one
    assert find_latest_s3_file(TEST_BUCKET_NAME, results_prefix)
    assert not find_nth_latest_s3_file(1, TEST_BUCKET_NAME, results_prefix)
    # With two files in the folder a second-latest file does exist
    assert find_nth_latest_s3_file(1, TEST_BUCKET_NAME, folder)

    # The counting helper agrees with the listings above
    assert find_number_of_files_on_s3(TEST_BUCKET_NAME, folder) == 2
    assert find_number_of_files_on_s3(TEST_BUCKET_NAME, results_prefix) == 1
    assert find_number_of_files_on_s3(TEST_BUCKET_NAME, folder, '.json') == 1
Пример #2
0
def get_model_stats(model, extension='.json'):
    """Load the latest statistics stored on S3 for the given model.

    Parameters
    ----------
    model : str
        Model name to look for
    extension : str
        Extension the statistics file needs to have. Default is '.json'

    Returns
    -------
    model_data : json
        The json formatted data containing the statistics for the model
    """
    s3_client = get_s3_client()

    # One json holds both the model meta data and the test statistics.
    # Example key: stats/skcm/stats_2019-08-20-17-34-40.json
    stats_key = find_latest_s3_file(bucket=EMMAA_BUCKET_NAME,
                                    prefix=f'stats/{model}/stats_',
                                    extension=extension)
    response = s3_client.get_object(Bucket=EMMAA_BUCKET_NAME, Key=stats_key)
    raw_bytes = response['Body'].read()
    return json.loads(raw_bytes.decode('utf8'))
Пример #3
0
def model_last_updated(model, extension='.pkl'):
    """Return the date string of the most recent model file on S3.

    Example file name:
    models/aml/model_2018-12-13-18-11-54.pkl

    Parameters
    ----------
    model : str
        Model name to look for
    extension : str
        The extension the model file needs to have. Default is '.pkl'

    Returns
    -------
    last_updated : str
        A string of the format "YYYY-MM-DD-HH-mm-ss", or an empty string
        if a TypeError is raised while looking up or parsing the key.
    """
    model_prefix = f'models/{model}/model_'
    try:
        newest_key = find_latest_s3_file(bucket=EMMAA_BUCKET_NAME,
                                         prefix=model_prefix,
                                         extension=extension)
        return strip_out_date(newest_key)
    except TypeError:
        # Presumably raised when no key was found (None is passed on to
        # strip_out_date); treated as "no update date available".
        logger.info('Could not find latest update date')
        return ''
Пример #4
0
def get_emmaa_model_statements(model):
    """Load the latest assembled statements of a model from the 'emmaa' bucket.

    Parameters
    ----------
    model : str
        Model name; used to build the key prefix 'assembled/<model>'.

    Returns
    -------
    stmts
        The result of ``stmts_from_json`` applied to the decoded JSON content
        of the latest file found under the prefix.
    """
    bucket_name = 'emmaa'
    # The bucket is read through an unsigned client
    s3_client = get_s3_client(unsigned=True)
    newest_key = find_latest_s3_file(bucket_name, 'assembled/%s' % model)
    response = s3_client.get_object(Bucket=bucket_name, Key=newest_key)
    payload = response['Body'].read().decode('utf-8')
    return stmts_from_json(json.loads(payload))
Пример #5
0
 def _get_latest_round(self):
     """Load the test round stored in the latest results json of this model.

     Returns
     -------
     The object returned by ``TestRound.load_from_s3_key`` for the latest
     results key in the 'emmaa' bucket, or None if no such key was found.
     """
     results_prefix = f'results/{self.model_name}/results_'
     newest_key = find_latest_s3_file('emmaa', results_prefix,
                                      extension='.json')
     if newest_key is not None:
         return TestRound.load_from_s3_key(newest_key)
     # Nothing stored yet for this model
     logger.info(f'Could not find a key to the latest test results '
                 f'for {self.model_name} model.')
     return None
Пример #6
0
def load_test_results_from_s3(model_name):
    """Load the latest test results json of a model from the 'emmaa' bucket.

    Parameters
    ----------
    model_name : str
        Model name; results are looked up under 'results/<model_name>/'.

    Returns
    -------
    test_results
        The decoded JSON content of the latest results file.
    """
    bucket_name = 'emmaa'
    results_prefix = f'results/{model_name}/results_'
    newest_key = find_latest_s3_file(bucket_name, results_prefix,
                                     extension='.json')
    s3_client = get_s3_client()
    logger.info(f'Loading test results from {newest_key}')
    response = s3_client.get_object(Bucket=bucket_name, Key=newest_key)
    raw_bytes = response['Body'].read()
    return json.loads(raw_bytes.decode('utf8'))
Пример #7
0
def get_assembled_statements(model, date=None, bucket=EMMAA_BUCKET_NAME):
    """Load and return a list of assembled statements.

    Files are searched in this order of extension: '.gz', '.zip', '.json'.
    The first two are read with ``load_gzip_json_from_s3``, the last one
    with ``load_json_from_s3``.

    Parameters
    ----------
    model : str
        A name of a model.
    date : str or None
        Date in "YYYY-MM-DD" format for which to load the statements. If None,
        loads the latest available statements.
    bucket : str
        Name of S3 bucket to look for a file. Defaults to EMMAA_BUCKET_NAME.

    Returns
    -------
    stmts : list[indra.statements.Statement]
        A list of assembled statements, or None if no file was found.
    latest_file_key : str
        Key of a file with statements on s3, or None if no file was found.
    """
    key_prefix = f'assembled/{model}/statements_'
    if date:
        # Narrow the search down to the requested date
        key_prefix = f'{key_prefix}{date}'

    # Compressed dumps first: gzip, which could be saved with .zip extension
    found_key = (find_latest_s3_file(bucket, key_prefix, '.gz')
                 or find_latest_s3_file(bucket, key_prefix, '.zip'))
    if found_key:
        raw_jsons = load_gzip_json_from_s3(bucket, found_key)
        return stmts_from_json(raw_jsons), found_key

    # Fall back to a plain json file
    found_key = find_latest_s3_file(bucket, key_prefix, '.json')
    if not found_key:
        # Didn't get gzip, zip or json
        logger.info(f'No assembled statements found for {model}.')
        return None, None
    raw_jsons = load_json_from_s3(bucket, found_key)
    return stmts_from_json(raw_jsons), found_key
Пример #8
0
def _load_stmts_from_cache(model):
    """Return assembled statements for a model, reloading them when stale.

    The cache entry stores the statements together with the S3 key they were
    loaded from. The entry is considered fresh when that key equals the key
    of the latest '.json' statements file currently on S3.

    Parameters
    ----------
    model : str
        Model name.

    Returns
    -------
    stmts
        The cached or freshly loaded statements. This is None when
        ``get_assembled_statements`` found nothing to load.
    """
    stmts, file_key = stmts_cache.get(model, (None, None))
    latest_on_s3 = find_latest_s3_file(EMMAA_BUCKET_NAME,
                                       f'assembled/{model}/statements_',
                                       '.json')
    if file_key == latest_on_s3:
        logger.info(f'Loaded assembled stmts for {model} from cache.')
        return stmts
    # Pass the bucket by keyword: with a signature of the form
    # (model, date=None, bucket=...) a positional bucket name would be
    # interpreted as the date.
    stmts, file_key = get_assembled_statements(model,
                                               bucket=EMMAA_BUCKET_NAME)
    stmts_cache[model] = (stmts, file_key)
    return stmts
Пример #9
0
def get_assembled_statements(model, bucket=EMMAA_BUCKET_NAME):
    """Load the latest assembled statements json of a model from S3.

    Parameters
    ----------
    model : str
        Model name.
    bucket : str
        Name of the S3 bucket to search. Defaults to EMMAA_BUCKET_NAME.

    Returns
    -------
    stmts
        The result of ``stmts_from_json`` on the loaded file, or None if no
        file was found.
    latest_file_key : str or None
        Key of the loaded file, or None if no file was found.
    """
    statements_prefix = f'assembled/{model}/statements_'
    found_key = find_latest_s3_file(bucket, statements_prefix, '.json')
    if found_key:
        logger.info(f'Loading assembled statements for {model} from S3.')
        raw_jsons = load_json_from_s3(bucket, found_key)
        return stmts_from_json(raw_jsons), found_key
    logger.info(f'No assembled statements found for {model}.')
    return None, None
Пример #10
0
 def _get_latest_round(self):
     """Load the model round from the latest model manager pickle on S3.

     Returns
     -------
     The object returned by ``ModelRound.load_from_s3_key`` for the latest
     model manager key in ``self.bucket``, or None if no key was found.
     """
     manager_prefix = f'results/{self.model_name}/model_manager_'
     newest_key = find_latest_s3_file(self.bucket, manager_prefix,
                                      extension='.pkl')
     if newest_key is not None:
         logger.info(f'Loading latest round from {newest_key}')
         return ModelRound.load_from_s3_key(newest_key, bucket=self.bucket)
     # No model manager stored yet for this model
     logger.info(f'Could not find a key to the latest model manager '
                 f'for {self.model_name} model.')
     return None
Пример #11
0
def get_latest_statements_url(model):
    """Build the public URL of the latest statements json of a model.

    The non-dated 'latest_statements_<model>.json' file is preferred; if it
    does not exist, the latest dated 'statements_*.json' file is used.

    Parameters
    ----------
    model : str
        Model name.

    Returns
    -------
    dict
        A dictionary with a single key 'link' whose value is the URL, or
        None when no suitable file was found.
    """
    fkey = None
    if does_exist(EMMAA_BUCKET_NAME,
                  f'assembled/{model}/latest_statements_{model}'):
        fkey = f'assembled/{model}/latest_statements_{model}.json'
    elif does_exist(EMMAA_BUCKET_NAME, f'assembled/{model}/statements_'):
        # The existence check matches any extension, so the json lookup can
        # still come back empty (e.g. only compressed dumps are stored).
        fkey = find_latest_s3_file(EMMAA_BUCKET_NAME,
                                   f'assembled/{model}/statements_', '.json')
    # Only build a link for an actual key; otherwise the URL would end
    # in the literal text "None".
    if fkey:
        link = f'https://{EMMAA_BUCKET_NAME}.s3.amazonaws.com/{fkey}'
    else:
        link = None
    return {'link': link}
Пример #12
0
 def _get_latest_round(self):
     """Load the latest test round of this model for ``self.test_corpus``.

     Returns
     -------
     The object returned by ``TestRound.load_from_s3_key`` for the latest
     results key in ``self.bucket``, or None if no key was found.
     """
     results_prefix = f'results/{self.model_name}/results_{self.test_corpus}'
     newest_key = find_latest_s3_file(self.bucket, results_prefix,
                                      extension='.json')
     if newest_key is not None:
         logger.info(f'Loading latest round from {newest_key}')
         return TestRound.load_from_s3_key(newest_key, bucket=self.bucket)
     # No results stored yet for this model and test corpus
     logger.info(f'Could not find a key to the latest test results '
                 f'for {self.model_name} model.')
     return None
Пример #13
0
def _load_tests_from_cache(test_corpus):
    """Return the tests of a corpus, reloading them from S3 when stale.

    The cache entry stores the tests together with the S3 key they were
    loaded from; it is fresh when that key equals the latest '.pkl' key
    found under 'tests/<test_corpus>'.

    Parameters
    ----------
    test_corpus : str
        Name of the test corpus.

    Returns
    -------
    tests
        The cached or freshly loaded tests.
    """
    cached_tests, cached_key = tests_cache.get(test_corpus, (None, None))
    newest_key = find_latest_s3_file(EMMAA_BUCKET_NAME,
                                     f'tests/{test_corpus}', '.pkl')
    if cached_key == newest_key:
        logger.info(f'Loaded {test_corpus} from cache.')
        return cached_tests
    loaded, loaded_key = load_tests_from_s3(test_corpus, EMMAA_BUCKET_NAME)
    # A loaded dict carries the actual tests under its 'tests' key
    if isinstance(loaded, dict):
        loaded = loaded['tests']
    tests_cache[test_corpus] = (loaded, loaded_key)
    return loaded
Пример #14
0
 def _get_previous_json_stats(self):
     """Load the latest stored statistics json of this model.

     Returns
     -------
     The decoded JSON content of the latest stats file in the 'emmaa'
     bucket, or None if no stats key was found.
     """
     stats_prefix = f'stats/{self.model_name}/stats_'
     stats_key = find_latest_s3_file('emmaa', stats_prefix,
                                     extension='.json')
     if stats_key is None:
         logger.info(f'Could not find a key to the previous statistics '
                     f'for {self.model_name} model.')
         return None
     s3_client = get_s3_client()
     logger.info(f'Loading earlier statistics from {stats_key}')
     response = s3_client.get_object(Bucket='emmaa', Key=stats_key)
     raw_bytes = response['Body'].read()
     return json.loads(raw_bytes.decode('utf8'))
Пример #15
0
def load_model_manager_from_cache(model_name, bucket=EMMAA_BUCKET_NAME):
    """Return the model manager of a model, using the cache when it is fresh.

    A cached manager is reused when its ``date_str`` occurs in the key of
    the latest versioned model manager pickle on S3. Otherwise the manager
    is loaded with ``load_model_manager_from_s3`` and stored in the cache.

    Parameters
    ----------
    model_name : str
        Model name.
    bucket : str
        Name of the S3 bucket. Defaults to EMMAA_BUCKET_NAME.

    Returns
    -------
    model_manager
        The cached or freshly loaded model manager (whatever
        ``load_model_manager_from_s3`` returns in the latter case).
    """
    model_manager = model_manager_cache.get(model_name)
    if model_manager:
        latest_on_s3 = find_latest_s3_file(
            bucket, f'results/{model_name}/model_manager_', '.pkl')
        cached_date = model_manager.date_str
        logger.info(f'Found model manager cached on {cached_date} and '
                    f'latest file on S3 is {latest_on_s3}')
        # No versioned key may be found (None); a substring test against
        # None would raise TypeError, so treat that as a stale cache and
        # fall through to the S3 loader.
        if latest_on_s3 and cached_date in latest_on_s3:
            logger.info(f'Loaded model manager for {model_name} from cache.')
            return model_manager
    logger.info(f'Loading model manager for {model_name} from S3.')
    model_manager = load_model_manager_from_s3(model_name=model_name,
                                               bucket=bucket)
    model_manager_cache[model_name] = model_manager
    return model_manager
Пример #16
0
def load_latest_statements(model):
    """Load the latest statements json of a model together with its URL.

    The non-dated 'latest_statements_<model>.json' file is preferred; if it
    does not exist, the latest dated 'statements_*.json' file is used.

    Parameters
    ----------
    model : str
        Model name.

    Returns
    -------
    dict
        A dictionary with keys 'statements' (the loaded JSON, or an empty
        list) and 'link' (the URL of the file, or an empty string).
    """
    latest_prefix = f'assembled/{model}/latest_statements_{model}'
    dated_prefix = f'assembled/{model}/statements_'
    if does_exist(EMMAA_BUCKET_NAME, latest_prefix):
        fkey = f'{latest_prefix}.json'
    elif does_exist(EMMAA_BUCKET_NAME, dated_prefix):
        fkey = find_latest_s3_file(EMMAA_BUCKET_NAME, dated_prefix, '.json')
    else:
        fkey = None

    if not fkey:
        # Nothing to load: empty payload and empty link
        return {'statements': [], 'link': ''}
    stmt_jsons = load_json_from_s3(EMMAA_BUCKET_NAME, fkey)
    link = f'https://{EMMAA_BUCKET_NAME}.s3.amazonaws.com/{fkey}'
    return {'statements': stmt_jsons, 'link': link}
Пример #17
0
def load_model_manager_from_s3(model_name=None,
                               key=None,
                               bucket=EMMAA_BUCKET_NAME):
    """Load a model manager from S3 by key or by model name.

    With a key, the pickle at that key is loaded; if its model has no
    assembled statements, they are loaded separately for the manager's date.
    If anything in that step fails, the manager is rebuilt with
    ``ModelManager.load_from_statements`` for the date taken from the key.
    Without a key, the latest versioned key of the model is looked up (with
    the non-versioned 'latest_model_manager.pkl' as fallback) and the
    function calls itself with that key.

    Parameters
    ----------
    model_name : str or None
        Model name. When only a key is given, the name is taken from the
        second '/'-separated component of the key if it is needed.
    key : str or None
        S3 key of a model manager pickle.
    bucket : str
        Name of the S3 bucket. Defaults to EMMAA_BUCKET_NAME.

    Returns
    -------
    model_manager
        The loaded model manager, or None if it could not be loaded.
    """
    if not key:
        if not model_name:
            # Could not find either from key or from model name.
            logger.info('Could not find the model manager.')
            return None
        # Versioned key first, non-versioned key as fallback
        resolved_key = find_latest_s3_file(
            bucket, f'results/{model_name}/model_manager_', '.pkl')
        if resolved_key is None:
            resolved_key = f'results/{model_name}/latest_model_manager.pkl'
        return load_model_manager_from_s3(model_name=model_name,
                                          key=resolved_key,
                                          bucket=bucket)

    # A key is available: try to load the pickle directly
    try:
        manager = load_pickle_from_s3(bucket, key)
        if not manager.model.assembled_stmts:
            stored_name = manager.model.name
            stored_date = strip_out_date(manager.date_str, 'date')
            stmts, _ = get_assembled_statements(stored_name, stored_date,
                                                bucket=bucket)
            manager.model.assembled_stmts = stmts
        return manager
    except Exception as load_err:
        logger.info('Could not load the model manager directly')
        logger.info(load_err)
        if not model_name:
            model_name = key.split('/')[1]
        key_date = strip_out_date(key, 'date')
        logger.info('Trying to load model manager from statements')
        try:
            return ModelManager.load_from_statements(
                model_name, date=key_date, bucket=bucket)
        except Exception as rebuild_err:
            logger.info('Could not load the model manager from '
                        'statements')
            logger.info(rebuild_err)
            return None
Пример #18
0
 def _get_previous_json_stats(self):
     """Load the model statistics json that precedes the latest round.

     The latest model stats file is used unless its date equals the date of
     ``self.latest_round``; in that case the second latest file is used.
     On success the date string of the chosen file is stored in
     ``self.previous_date_str``.

     Returns
     -------
     The JSON content loaded from the chosen stats file, or None if there
     is no earlier statistics file.
     """
     stats_prefix = f'model_stats/{self.model_name}/model_stats_'
     key = find_latest_s3_file(self.bucket, stats_prefix, '.json')
     # This is the first time statistics is generated for this model
     if key is None:
         logger.info('Could not find a key to the previous statistics ')
         return None
     # If stats for this date exists, previous stats is the second latest
     if strip_out_date(key) == self.latest_round.date_str:
         logger.info('Statistics for latest round already exists')
         key = find_nth_latest_s3_file(1, self.bucket, stats_prefix, '.json')
         # With a single stats file there is no second latest one; without
         # this check the empty key would be passed on to strip_out_date
         # and load_json_from_s3.
         if not key:
             logger.info('Could not find statistics earlier than the '
                         'latest round')
             return None
     # Store the date string to find previous round with it
     self.previous_date_str = strip_out_date(key)
     logger.info(f'Loading earlier statistics from {key}')
     previous_json_stats = load_json_from_s3(self.bucket, key)
     return previous_json_stats
Пример #19
0
def load_stmts_from_s3(model_name, bucket=EMMAA_BUCKET_NAME):
    """Return the list of EMMAA Statements constituting the latest model.

    Parameters
    ----------
    model_name : str
        The name of the model whose latest pickle should be loaded.
    bucket : str
        Name of the S3 bucket. Defaults to EMMAA_BUCKET_NAME.

    Returns
    -------
    stmts : list of emmaa.statements.EmmaaStatement
        The list of EMMAA Statements in the latest model version.
    latest_model_key : str
        The S3 key the statements were loaded from.
    """
    model_prefix = f'models/{model_name}/model_'
    newest_key = find_latest_s3_file(bucket, model_prefix, extension='.pkl')
    logger.info(f'Loading model state from {newest_key}')
    return load_pickle_from_s3(bucket, newest_key), newest_key
Пример #20
0
 def load_from_statements(cls, model_name, mode='local', date=None,
                          bucket=EMMAA_BUCKET_NAME):
     """Build an instance of ``cls`` from stored config, papers and statements.

     Parameters
     ----------
     model_name : str
         Model name.
     mode : str
         Passed on to ``cls`` as the ``mode`` keyword. Default is 'local'.
     date : str or None
         If given, paper ids and assembled statements are looked up for
         this date; otherwise the latest ones are used.
     bucket : str
         Name of the S3 bucket. Defaults to EMMAA_BUCKET_NAME.

     Returns
     -------
     mm
         The instance created by ``cls(model, mode=mode)``.
     """
     config = load_config_from_s3(model_name, bucket=bucket)
     papers_prefix = f'papers/{model_name}/paper_ids_'
     if date:
         papers_prefix = f'{papers_prefix}{date}'
     paper_key = find_latest_s3_file(bucket, papers_prefix, 'json')
     # Paper ids are optional: the model is built without them if missing
     paper_ids = load_json_from_s3(bucket, paper_key) if paper_key else None
     model = EmmaaModel(model_name, config, paper_ids)
     # Loading assembled statements to avoid reassembly
     assembled, stmts_key = get_assembled_statements(model_name, date, bucket)
     model.assembled_stmts = assembled
     model.date_str = strip_out_date(stmts_key, 'datetime')
     return cls(model, mode=mode)
Пример #21
0
def load_stmts_from_s3(model_name):
    """Return the list of EMMAA Statements constituting the latest model.

    Parameters
    ----------
    model_name : str
        The name of the model whose latest pickle should be loaded.

    Returns
    -------
    stmts : list of emmaa.statements.EmmaaStatement
        The list of EMMAA Statements in the latest model version.
    """
    bucket_name = 'emmaa'
    s3_client = get_s3_client()
    model_prefix = f'models/{model_name}/model_'
    newest_key = find_latest_s3_file(bucket_name, model_prefix,
                                     extension='.pkl')
    logger.info(f'Loading model state from {newest_key}')
    response = s3_client.get_object(Bucket=bucket_name, Key=newest_key)
    # NOTE(review): unpickling executes code from the payload; this is only
    # safe as long as the bucket content is trusted.
    return pickle.loads(response['Body'].read())
Пример #22
0
def load_tests_from_s3(test_name, bucket=EMMAA_BUCKET_NAME):
    """Load Emmaa Tests with the given name from S3.

    Parameters
    ----------
    test_name : str
        Looks for a test file in the bucket on S3 with a key starting with
        'tests/{test_name}'.
    bucket : str
        Name of the S3 bucket. Defaults to EMMAA_BUCKET_NAME.

    Returns
    -------
    tests
        The unpickled content of the test file (per the original
        documentation, a list of EmmaaTest objects).
    test_key : str
        The S3 key the tests were loaded from.
    """
    tests_prefix = f'tests/{test_name}'
    try:
        found_key = find_latest_s3_file(bucket, tests_prefix, '.pkl')
    except ValueError:
        # Fall back to the plain key without a date; presumably the lookup
        # fails this way when the file name carries no date - TODO confirm
        found_key = f'tests/{test_name}.pkl'
    logger.info(f'Loading tests from {found_key}')
    return load_pickle_from_s3(bucket, found_key), found_key
Пример #23
0
def load_model_manager_from_s3(model_name=None,
                               key=None,
                               bucket=EMMAA_BUCKET_NAME):
    """Load a model manager pickle from S3 by key or by model name.

    A given key is tried first. If it is missing or cannot be loaded and a
    model name is given, the latest versioned key of that model is looked
    up (with the non-versioned 'latest_model_manager.pkl' as fallback) and
    the function calls itself with that key only.

    Parameters
    ----------
    model_name : str or None
        Model name.
    key : str or None
        S3 key of a model manager pickle.
    bucket : str
        Name of the S3 bucket. Defaults to EMMAA_BUCKET_NAME.

    Returns
    -------
    model_manager
        The unpickled model manager, or None if it could not be loaded.
    """
    # First try find the file from specified key
    if key:
        try:
            return load_pickle_from_s3(bucket, key)
        except Exception as load_err:
            logger.info('Could not load the model manager')
            logger.info(load_err)
    if not model_name:
        # Could not find either from key or from model name.
        logger.info('Could not find the model manager.')
        return None
    # Versioned key first, non-versioned key as fallback
    resolved_key = find_latest_s3_file(
        bucket, f'results/{model_name}/model_manager_', '.pkl')
    if resolved_key is None:
        resolved_key = f'results/{model_name}/latest_model_manager.pkl'
    return load_model_manager_from_s3(key=resolved_key, bucket=bucket)
Пример #24
0
def get_model_stats(model,
                    mode,
                    tests=None,
                    date=None,
                    extension='.json',
                    n=0,
                    bucket=EMMAA_BUCKET_NAME):
    """Gets the latest statistics for the given model

    Parameters
    ----------
    model : str
        Model name to look for
    mode : str
        Type of stats to load, either 'model' or 'test'.
    tests : str
        A name of a test corpus. If not given, the value returned by
        ``_default_test`` for the model and bucket is used.
    date : str or None
        Date for which the stats will be returned in "YYYY-MM-DD" format.
        If not given, it is determined with ``last_updated_date``.
    extension : str
        Extension of the file.
    n : int
        Index of the file in list of S3 files sorted by date (0-indexed).
        Only used when ``date`` is not given.
    bucket : str
        Name of bucket on S3.

    Returns
    -------
    model_data : json or None
        The json formatted data containing the statistics for the model,
        or None if no stats file was found.
    latest_file_key : str or None
        The S3 key the statistics were loaded from, or None if no stats
        file was found.

    Raises
    ------
    TypeError
        If ``mode`` is neither 'model' nor 'test'.
    """
    if not tests:
        tests = _default_test(model, bucket=bucket)
    # If date is not specified, get the latest or the nth
    if not date:
        if mode == 'model':
            date = last_updated_date(model,
                                     'model_stats',
                                     'date',
                                     extension=extension,
                                     n=n,
                                     bucket=bucket)
        elif mode == 'test':
            date = last_updated_date(model,
                                     'test_stats',
                                     'date',
                                     tests=tests,
                                     extension=extension,
                                     n=n,
                                     bucket=bucket)
        else:
            # NOTE(review): the message says "tests" while the accepted
            # value is 'test'; left unchanged in case callers match on it.
            raise TypeError('Mode must be either model or tests')

    # Try find new formatted stats (separate for model and tests)
    if mode == 'model':
        # File name example:
        # model_stats/skcm/model_stats_2019-08-20-17-34-40.json
        prefix = f'model_stats/{model}/model_stats_{date}'
    elif mode == 'test':
        # File name example:
        # stats/skcm/test_stats_large_corpus_tests_2019-08-20-17-34-40.json
        prefix = f'stats/{model}/test_stats_{tests}_{date}'
    else:
        raise TypeError('Mode must be either model or tests')
    latest_file_key = find_latest_s3_file(bucket=bucket,
                                          prefix=prefix,
                                          extension=extension)
    # This might be an older file with model and test stats combined.
    # File name example: stats/skcm/stats_2019-08-20-17-34-40.json
    # The default test corpus is looked up in the same bucket as above, so
    # that the comparison is also right for a non-default bucket.
    if not latest_file_key and (mode == 'model' or
                                (mode == 'test'
                                 and tests == _default_test(model,
                                                            bucket=bucket))):
        prefix = f'stats/{model}/stats_{date}'
        latest_file_key = find_latest_s3_file(bucket=bucket,
                                              prefix=prefix,
                                              extension=extension)
    # If we still didn't find the file it probably does not exist
    if not latest_file_key:
        return None, None
    return (load_json_from_s3(bucket, latest_file_key), latest_file_key)
Пример #25
0
def get_all_statements_page(model):
    """Render one page of the table listing all statements of a model.

    Request arguments read: 'sort_by' ('evidence' by default, or 'paths'),
    'page' (1-based, default 1) and 'filter_curated' ('true' to hide
    statements that have curations).

    Parameters
    ----------
    model : str
        Model name.

    Returns
    -------
    The result of rendering 'evidence_template.html'.
    """
    page_size = 1000
    sort_by = request.args.get('sort_by', 'evidence')
    page = int(request.args.get('page', 1))
    filter_curated = request.args.get('filter_curated', False)
    filter_curated = (filter_curated == 'true')
    offset = (page - 1) * page_size
    # The cache helper may hand back None when nothing is stored for the
    # model; show an empty table instead of failing on iteration.
    stmts = _load_stmts_from_cache(model) or []
    stmts_by_hash = {str(stmt.get_hash()): stmt for stmt in stmts}
    msg = None
    curations = get_curations()
    cur_counts = _count_curations(curations, stmts_by_hash)
    if filter_curated:
        stmts = [
            stmt for stmt in stmts if str(stmt.get_hash()) not in cur_counts
        ]
    # Test stats may be missing (None); path counts are then unavailable.
    test_stats, _ = get_model_stats(model, 'test')
    if test_stats:
        stmt_counts = test_stats['test_round_summary'].get(
            'path_stmt_counts', [])
    else:
        stmt_counts = []
    stmt_counts_dict = dict(stmt_counts)

    # Number of pages, rounded up
    total_pages = (len(stmts) + page_size - 1) // page_size
    next_page = page + 1 if page + 1 <= total_pages else None
    prev_page = page - 1 if page != 1 else None

    def _page_by_evidence(all_stmts):
        # Current page of statements, most evidence first
        ranked = sorted(all_stmts, key=lambda x: len(x.evidence),
                        reverse=True)
        return ranked[offset:offset + page_size]

    if sort_by == 'evidence':
        stmts = _page_by_evidence(stmts)
    elif sort_by == 'paths':
        if not stmt_counts:
            msg = 'Sorting by paths is not available, sorting by evidence'
            stmts = _page_by_evidence(stmts)
        else:
            # Follow the order of the path counts; hashes that are not in
            # the model are skipped.
            # NOTE(review): this branch draws from all statements, so the
            # filter_curated selection above is not applied here - confirm
            # whether that is intended.
            stmts = [
                stmts_by_hash[stmt_hash]
                for (stmt_hash, count)
                in stmt_counts[offset:offset + page_size]
                if stmt_hash in stmts_by_hash
            ]
    stmt_rows = []
    for stmt in stmts:
        stmt_row = _get_stmt_row(stmt, 'model_statement', model, cur_counts,
                                 None, stmt_counts_dict)
        stmt_rows.append(stmt_row)
    table_title = f'All statements in {model.upper()} model.'

    fkey = None
    if does_exist(EMMAA_BUCKET_NAME,
                  f'assembled/{model}/latest_statements_{model}'):
        fkey = f'assembled/{model}/latest_statements_{model}.json'
    elif does_exist(EMMAA_BUCKET_NAME, f'assembled/{model}/statements_'):
        # The existence check matches any extension, so the json lookup can
        # still come back empty.
        fkey = find_latest_s3_file(EMMAA_BUCKET_NAME,
                                   f'assembled/{model}/statements_', '.json')
    # Only build a link for an actual key; otherwise the URL would end
    # in the literal text "None".
    if fkey:
        link = f'https://{EMMAA_BUCKET_NAME}.s3.amazonaws.com/{fkey}'
    else:
        link = None
    return render_template('evidence_template.html',
                           stmt_rows=stmt_rows,
                           model=model,
                           source='model_statement',
                           table_title=table_title,
                           msg=msg,
                           is_all_stmts=True,
                           prev=prev_page,
                           next=next_page,
                           filter_curated=filter_curated,
                           sort_by=sort_by,
                           link=link)