def get_all_stats(model_name, test_corpora, date, bucket=EMMAA_BUCKET_NAME):
    """Get all stats for a model and its test corpora.

    Parameters
    ----------
    model_name : str
        A name of the model to get the updates for.
    test_corpora : list[str]
        A list of test corpora names to get the test updates for.
    date : str
        A date for which the updates should be generated.
    bucket : str
        A name of S3 bucket where the stats files are stored.

    Returns
    -------
    model_stats : dict
        A dictionary containing the stats for the given model.
    test_stats_by_corpus : dict
        A dictionary of test statistics keyed by test corpus name.
    """
    # Forward the bucket so callers can read stats from a non-default
    # location (consistent with get_model_deltas).
    model_stats, _ = get_model_stats(model_name, 'model', date=date,
                                     bucket=bucket)
    test_stats_by_corpus = {}
    for test_corpus in test_corpora:
        test_stats, _ = get_model_stats(model_name, 'test', tests=test_corpus,
                                        date=date, bucket=bucket)
        if not test_stats:
            # Best effort: keep going and record the missing corpus as None
            # so the caller can see which corpora had no stats.
            logger.info(f'Could not find test stats for {test_corpus}')
        test_stats_by_corpus[test_corpus] = test_stats
    return model_stats, test_stats_by_corpus
def get_statement_evidence_page():
    """Render evidence and curations for the requested statement hashes.

    Reads from the query string:
    - ``stmt_hash`` (repeatable): hashes of the statements to show.
    - ``source``: either 'model_statement' or 'test' — selects which cache
      the statements are loaded from.
    - ``model``: model name (used to load cached statements/stats).
    - ``test_corpus``: required when ``source == 'test'``.
    - ``format``: 'html' (default) renders the evidence template; any other
      value returns the first matched statement as JSON.
    """
    stmt_hashes = request.args.getlist('stmt_hash')
    source = request.args.get('source')
    model = request.args.get('model')
    test_corpus = request.args.get('test_corpus', '')
    display_format = request.args.get('format', 'html')
    stmts = []
    if source == 'model_statement':
        # Path statement counts come from the latest test stats; used later
        # to annotate each statement row.
        test_stats, _ = get_model_stats(model, 'test')
        stmt_counts = test_stats['test_round_summary'].get(
            'path_stmt_counts', [])
        stmt_counts_dict = dict(stmt_counts)
        all_stmts = _load_stmts_from_cache(model)
        # Compare hashes as strings since query args arrive as strings.
        for stmt in all_stmts:
            for stmt_hash in stmt_hashes:
                if str(stmt.get_hash()) == str(stmt_hash):
                    stmts.append(stmt)
    elif source == 'test':
        if not test_corpus:
            abort(Response(f'Need test corpus name to load evidence', 404))
        tests = _load_tests_from_cache(test_corpus)
        # No path counts are available for test statements.
        stmt_counts_dict = None
        for t in tests:
            for stmt_hash in stmt_hashes:
                if str(t.stmt.get_hash()) == str(stmt_hash):
                    stmts.append(t.stmt)
    else:
        abort(Response(f'Source should be model_statement or test', 404))
    if display_format == 'html':
        stmt_rows = []
        stmts_by_hash = {}
        for stmt in stmts:
            stmts_by_hash[str(stmt.get_hash())] = stmt
        # Fetch existing curations for these hashes and group them by
        # (pa_hash, source_hash) pair for per-evidence display.
        curations = get_curations(pa_hash=stmt_hashes)
        cur_dict = defaultdict(list)
        for cur in curations:
            cur_dict[(cur.pa_hash, cur.source_hash)].append(
                {'error_type': cur.tag})
        cur_counts = _count_curations(curations, stmts_by_hash)
        for stmt in stmts:
            stmt_row = _get_stmt_row(stmt, source, model, cur_counts,
                                     test_corpus, stmt_counts_dict,
                                     cur_dict, True)
            stmt_rows.append(stmt_row)
    else:
        # JSON format: only the first matched statement is returned.
        # NOTE(review): raises IndexError if no statement matched — presumably
        # callers always pass a valid hash; verify upstream.
        stmt_json = json.dumps(stmts[0].to_json(), indent=1)
        return Response(stmt_json, mimetype='application/json')
    return render_template('evidence_template.html', stmt_rows=stmt_rows,
                           model=model, source=source,
                           test_corpus=test_corpus if test_corpus else None,
                           table_title='Statement Evidence and Curation',
                           msg=None, is_all_stmts=False)
def test_get_model_statistics():
    """Exercise get_model_stats against a mocked S3 bucket.

    Checks latest model stats, latest test stats, a date with no stats
    file, and the same date after the file has been uploaded.
    """
    # Local imports are recommended when using moto
    from emmaa.model import get_model_stats
    s3_client = setup_bucket(add_model=True, add_model_stats=True,
                             add_test_stats=True)

    # Latest model stats should resolve to a model_stats key
    latest_model_stats, stats_key = get_model_stats(
        'test', 'model', bucket=TEST_BUCKET_NAME)
    assert isinstance(latest_model_stats, dict)
    assert stats_key.startswith('model_stats/test/model_stats_')

    # Latest test stats should resolve to a test_stats key
    latest_test_stats, stats_key = get_model_stats(
        'test', 'test', 'simple_tests', bucket=TEST_BUCKET_NAME)
    assert isinstance(latest_test_stats, dict)
    assert stats_key.startswith('stats/test/test_stats_')

    # A date with no stats file yields empty results
    dated_stats, stats_key = get_model_stats(
        'test', 'model', date='2020-01-01', bucket=TEST_BUCKET_NAME)
    assert not dated_stats
    assert not stats_key

    # After uploading the missing file, the same date resolves
    s3_client.put_object(
        Body=json.dumps(previous_model_stats, indent=1),
        Bucket=TEST_BUCKET_NAME,
        Key=f'model_stats/test/model_stats_2020-01-01-00-00-00.json')
    dated_stats, stats_key = get_model_stats(
        'test', 'model', date='2020-01-01', bucket=TEST_BUCKET_NAME)
    assert dated_stats
    assert isinstance(dated_stats, dict)
    assert stats_key == 'model_stats/test/model_stats_2020-01-01-00-00-00.json'
def get_all_statements_page(model):
    """Render a paginated table of all statements in a model.

    Query-string parameters: ``sort_by`` ('evidence' default, or 'paths'),
    ``page`` (1-based), ``filter_curated`` ('true' hides statements that
    already have curations). Pages are fixed at 1000 statements.
    """
    sort_by = request.args.get('sort_by', 'evidence')
    page = int(request.args.get('page', 1))
    # The query arg is a string; only the literal 'true' enables filtering.
    filter_curated = request.args.get('filter_curated', False)
    filter_curated = (filter_curated == 'true')
    offset = (page - 1) * 1000
    stmts = _load_stmts_from_cache(model)
    stmts_by_hash = {}
    for stmt in stmts:
        stmts_by_hash[str(stmt.get_hash())] = stmt
    msg = None
    curations = get_curations()
    cur_counts = _count_curations(curations, stmts_by_hash)
    if filter_curated:
        # Drop statements that already have at least one curation.
        stmts = [
            stmt for stmt in stmts
            if str(stmt.get_hash()) not in cur_counts
        ]
    # Path statement counts from the latest test stats drive the 'paths'
    # sort order and per-row annotations.
    test_stats, _ = get_model_stats(model, 'test')
    stmt_counts = test_stats['test_round_summary'].get('path_stmt_counts', [])
    stmt_counts_dict = dict(stmt_counts)
    # Ceiling division by 1000 to get the page count.
    if len(stmts) % 1000 == 0:
        total_pages = len(stmts) // 1000
    else:
        total_pages = len(stmts) // 1000 + 1
    if page + 1 <= total_pages:
        next_page = page + 1
    else:
        next_page = None
    if page != 1:
        prev_page = page - 1
    else:
        prev_page = None
    if sort_by == 'evidence':
        # Most evidence first, then slice out the current page.
        stmts = sorted(stmts, key=lambda x: len(x.evidence),
                       reverse=True)[offset:offset + 1000]
    elif sort_by == 'paths':
        if not stmt_counts:
            # No path counts available; fall back to evidence sort and
            # surface a message in the template.
            msg = 'Sorting by paths is not available, sorting by evidence'
            stmts = sorted(stmts, key=lambda x: len(x.evidence),
                           reverse=True)[offset:offset + 1000]
        else:
            # stmt_counts is assumed ordered by count; skip hashes that are
            # not in the (possibly filtered) statement cache.
            stmts = []
            for (stmt_hash, count) in stmt_counts[offset:offset + 1000]:
                try:
                    stmts.append(stmts_by_hash[stmt_hash])
                except KeyError:
                    continue
    stmt_rows = []
    for stmt in stmts:
        stmt_row = _get_stmt_row(stmt, 'model_statement', model, cur_counts,
                                 None, stmt_counts_dict)
        stmt_rows.append(stmt_row)
    table_title = f'All statements in {model.upper()} model.'
    # Build a public S3 download link for the assembled statements, if the
    # dump exists (prefer the 'latest' alias, then the newest dated file).
    if does_exist(EMMAA_BUCKET_NAME,
                  f'assembled/{model}/latest_statements_{model}'):
        fkey = f'assembled/{model}/latest_statements_{model}.json'
        link = f'https://{EMMAA_BUCKET_NAME}.s3.amazonaws.com/{fkey}'
    elif does_exist(EMMAA_BUCKET_NAME, f'assembled/{model}/statements_'):
        fkey = find_latest_s3_file(EMMAA_BUCKET_NAME,
                                   f'assembled/{model}/statements_', '.json')
        link = f'https://{EMMAA_BUCKET_NAME}.s3.amazonaws.com/{fkey}'
    else:
        link = None
    return render_template('evidence_template.html', stmt_rows=stmt_rows,
                           model=model, source='model_statement',
                           table_title=table_title, msg=msg,
                           is_all_stmts=True, prev=prev_page, next=next_page,
                           filter_curated=filter_curated, sort_by=sort_by,
                           link=link)
def get_model_tests_page(model):
    """Render the results page for a single test of a model.

    Query-string parameters: ``model_type`` (must be in ALL_MODEL_TYPES),
    ``test_hash`` (key into the test stats), ``test_corpus`` (required) and
    ``date`` (optional; defaults to latest stats). Aborts with 404 on any
    missing/unknown input or missing stats.
    """
    model_type = request.args.get('model_type')
    test_hash = request.args.get('test_hash')
    test_corpus = request.args.get('test_corpus')
    if not test_corpus:
        abort(Response('Test corpus has to be provided', 404))
    date = request.args.get('date')
    if model_type not in ALL_MODEL_TYPES:
        abort(Response(f'Model type {model_type} does not exist', 404))
    # file_key is the S3 key of the stats file; it is used below to find
    # this file's position among all dated stats files.
    test_stats, file_key = get_model_stats(model, 'test', tests=test_corpus,
                                           date=date)
    if not test_stats:
        abort(Response(f'Data for {model} for {date} was not found', 404))
    try:
        current_test = \
            test_stats['test_round_summary']['all_test_results'][test_hash]
    except KeyError:
        abort(Response(f'Result for this test does not exist for {date}',
                       404))
    current_model_types = [
        mt for mt in ALL_MODEL_TYPES if mt in test_stats['test_round_summary']
    ]
    test = current_test["test"]
    test_status, path_list = current_test[model_type]
    # Correct/incorrect curation hashes, used to badge statements in paths.
    correct, incorrect = _label_curations()
    if isinstance(path_list, list):
        # Each path statement entry starts with a URL; if it carries
        # stmt_hash query params, attach the matching curation badge.
        # NOTE(review): mutates the stats dict entries in place.
        for path in path_list:
            for edge in path['edge_list']:
                for stmt in edge['stmts']:
                    cur = ''
                    url = stmt[0]
                    if 'stmt_hash' in url:
                        stmt_hashes = parse.parse_qs(
                            parse.urlparse(url).query)['stmt_hash']
                        cur = _set_curation(stmt_hashes, correct, incorrect)
                    stmt.append(cur)
    latest_date = get_latest_available_date(model, test_corpus)
    # Locate this stats file among all dated files to derive prev/next
    # navigation dates (index 0 is the most recent file).
    prefix = f'stats/{model}/test_stats_{test_corpus}_'
    cur_ix = find_index_of_s3_file(file_key, EMMAA_BUCKET_NAME, prefix)
    if test_hash in test_stats['tests_delta']['applied_hashes_delta']['added']:
        # Test was first applied on this date, so there is no earlier result.
        prev_date = None
    elif (cur_ix + 1) < find_number_of_files_on_s3(EMMAA_BUCKET_NAME, prefix,
                                                   '.json'):
        prev_date = last_updated_date(model, 'test_stats', 'date',
                                      tests=test_corpus, extension='.json',
                                      n=(cur_ix + 1),
                                      bucket=EMMAA_BUCKET_NAME)
    else:
        prev_date = None
    if cur_ix > 0:
        next_date = last_updated_date(model, 'test_stats', 'date',
                                      tests=test_corpus, extension='.json',
                                      n=(cur_ix - 1),
                                      bucket=EMMAA_BUCKET_NAME)
    else:
        next_date = None
    return render_template('tests_template.html', link_list=link_list,
                           model=model, model_type=model_type,
                           all_model_types=current_model_types,
                           test_hash=test_hash, test=test,
                           test_status=test_status, path_list=path_list,
                           formatted_names=FORMATTED_TYPE_NAMES, date=date,
                           latest_date=latest_date, prev=prev_date,
                           next=next_date)
def get_model_dashboard(model):
    """Render the main dashboard page for a model.

    Query-string parameters: ``test_corpus`` (defaults to the model's
    configured default corpus), ``date`` (defaults to latest available) and
    ``tab`` ('model' default). Aborts with 404 if the corpus cannot be
    determined or stats for the requested date are missing.
    """
    test_corpus = request.args.get(
        'test_corpus', _default_test(model, get_model_config(model)))
    if not test_corpus:
        abort(Response('Could not identify test corpus', 404))
    date = request.args.get('date',
                            get_latest_available_date(model, test_corpus))
    tab = request.args.get('tab', 'model')
    # Resolve the logged-in user (if any) for curation attribution.
    user, roles = resolve_auth(dict(request.args))
    model_meta_data = _get_model_meta_data()
    model_stats, _ = get_model_stats(model, 'model', date=date)
    test_stats, _ = get_model_stats(model, 'test', tests=test_corpus,
                                    date=date)
    if not model_stats or not test_stats:
        abort(
            Response(
                f'Data for {model} and {test_corpus} for {date} '
                f'was not found', 404))
    # Pull the Ndex network id and description from this model's metadata.
    ndex_id = 'None available'
    description = 'None available'
    for mid, mmd in model_meta_data:
        if mid == model:
            ndex_id = mmd['ndex']['network']
            description = mmd['description']
    if ndex_id == 'None available':
        logger.warning(f'No ndex ID found for {model}')
    available_tests = _get_test_corpora(model)
    latest_date = get_latest_available_date(model, test_corpus)
    # Rows of (link, text, tooltip) triples rendered in the info table.
    model_info_contents = [
        [('', 'Model Description', ''), ('', description, '')],
        [('', 'Latest Data Available', ''), ('', latest_date, '')],
        [('', 'Data Displayed', ''),
         ('', date,
          'Click on the point on time graph to see earlier results')],
        [('', 'Network on Ndex', ''),
         (f'http://www.ndexbio.org/#/network/{ndex_id}', ndex_id,
          'Click to see network on Ndex')]
    ]
    test_data = test_stats['test_round_summary'].get('test_data')
    test_info_contents = None
    if test_data:
        test_info_contents = [[('', k.capitalize(), ''), ('', v, '')]
                              for k, v in test_data.items()]
    current_model_types = [
        mt for mt in ALL_MODEL_TYPES if mt in test_stats['test_round_summary']
    ]
    # Get correct and incorrect curation hashes to pass it per stmt
    correct, incorrect = _label_curations()
    # Filter out rows with all tests == 'n_a'
    # NOTE(review): this mutates the test_stats entries in place by
    # appending the curation badge to each test row.
    all_tests = []
    for k, v in test_stats['test_round_summary']['all_test_results'].items():
        cur = _set_curation(k, correct, incorrect)
        v['test'].append(cur)
        if all(v[mt][0].lower() == 'n_a' for mt in current_model_types):
            continue
        else:
            all_tests.append((k, v))
    # Rewrite each statement entry to link to the evidence page and attach
    # its curation badge (also mutates model_stats in place).
    all_stmts = model_stats['model_summary']['all_stmts']
    for st_hash, st_value in all_stmts.items():
        url_param = parse.urlencode({
            'stmt_hash': st_hash,
            'source': 'model_statement',
            'model': model
        })
        st_value[0] = f'/evidence?{url_param}'
        st_value[2] = stmt_db_link_msg
        cur = _set_curation(st_hash, correct, incorrect)
        st_value.append(cur)
    # Top 10 statements by evidence count; assumes stmts_by_evidence is
    # already sorted most-supported first.
    most_supported = model_stats['model_summary']['stmts_by_evidence'][:10]
    top_stmts_counts = [((all_stmts[h]), ('', str(c), ''))
                        for h, c in most_supported]
    added_stmts_hashes = \
        model_stats['model_delta']['statements_hashes_delta']['added']
    if len(added_stmts_hashes) > 0:
        added_stmts = [((all_stmts[h]), ) for h in added_stmts_hashes]
    else:
        added_stmts = 'No new statements were added'
    return render_template(
        'model_template.html', model=model, model_data=model_meta_data,
        model_stats_json=model_stats, test_stats_json=test_stats,
        test_corpus=test_corpus, available_tests=available_tests,
        link_list=link_list, user_email=user.email if user else "",
        stmts_counts=top_stmts_counts, added_stmts=added_stmts,
        model_info_contents=model_info_contents,
        test_info_contents=test_info_contents,
        model_types=[
            "Test",
            *[FORMATTED_TYPE_NAMES[mt] for mt in current_model_types]
        ],
        new_applied_tests=_new_applied_tests(test_stats, current_model_types,
                                             model, date, test_corpus),
        all_test_results=_format_table_array(all_tests, current_model_types,
                                             model, date, test_corpus),
        new_passed_tests=_new_passed_tests(model, test_stats,
                                           current_model_types, date,
                                           test_corpus),
        date=date, latest_date=latest_date, tab=tab)
def get_model_deltas(model_name, test_corpora, date, bucket=EMMAA_BUCKET_NAME):
    """Get deltas from model and test stats for further use in tweets and
    email notifications.

    Parameters
    ----------
    model_name : str
        A name of the model to get the updates for.
    test_corpora : list[str]
        A list of test corpora names to get the test updates for.
    date : str
        A date for which the updates should be generated.
    bucket : str
        A name of S3 bucket where the stats files are stored.

    Returns
    -------
    deltas : dict
        A dictionary containing the deltas for the given model and test
        corpora. Empty if no stats could be found.
    """
    deltas = {}
    # Bug fix: the bucket argument was previously accepted but never
    # forwarded, so a non-default bucket was silently ignored.
    model_stats, _ = get_model_stats(model_name, 'model', date=date,
                                     bucket=bucket)
    test_stats_by_corpus = {}
    for test_corpus in test_corpora:
        test_stats, _ = get_model_stats(model_name, 'test', tests=test_corpus,
                                        date=date, bucket=bucket)
        if not test_stats:
            logger.info(f'Could not find test stats for {test_corpus}')
        test_stats_by_corpus[test_corpus] = test_stats
    if not model_stats or not test_stats_by_corpus:
        logger.warning('Stats are not found, cannot generate deltas')
        return deltas
    deltas['model_name'] = model_name
    deltas['date'] = date
    # Model deltas: newly added statement hashes and new raw paper count.
    stmts_delta = model_stats['model_delta']['statements_hashes_delta']
    paper_delta = model_stats['paper_delta']['raw_paper_ids_delta']
    new_papers = len(paper_delta['added'])
    deltas['stmts_delta'] = stmts_delta
    deltas['new_papers'] = new_papers
    # Test deltas: per corpus, the newly applied tests and, per model type,
    # the newly passed test hashes.
    deltas['tests'] = {}
    for test_corpus, test_stats in test_stats_by_corpus.items():
        test_deltas = {}
        test_name = None
        test_data = test_stats['test_round_summary'].get('test_data')
        if test_data:
            test_name = test_data.get('name')
        test_deltas['name'] = test_name
        test_deltas['passed'] = {}
        # tests_delta maps 'applied_hashes_delta' plus one key per model
        # checking type, each holding a 'passed_hashes_delta'.
        for k, v in test_stats['tests_delta'].items():
            if k == 'applied_hashes_delta':
                applied_delta = v
                test_deltas['applied_tests'] = applied_delta
            else:
                mc_type = k
                passed_delta = v['passed_hashes_delta']
                test_deltas['passed'][mc_type] = passed_delta
        deltas['tests'][test_corpus] = test_deltas
    return deltas