def test_uniform_distribution_entity_id_index():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        init_engine(db_engine)
        model = ModelFactory()
        feature_importances = [
            FeatureImportanceFactory(model_rel=model, feature='feature_{}'.format(i))
            for i in range(0, 10)
        ]
        data_dict = {'entity_id': [1, 2]}
        for imp in feature_importances:
            data_dict[imp.feature] = [0.5, 0.5]
        test_store = InMemoryMatrixStore(
            matrix=pandas.DataFrame.from_dict(data_dict),
            metadata=sample_metadata())
        session.commit()
        results = uniform_distribution(db_engine,
                                       model_id=model.model_id,
                                       as_of_date='2016-01-01',
                                       test_matrix_store=test_store,
                                       n_ranks=5)

        assert len(results) == 10  # 5 features x 2 entities
        for result in results:
            assert 'entity_id' in result
            assert 'feature_name' in result
            assert 'score' in result
            assert 'feature_value' in result
            assert result['feature_value'] == 0.5
            assert result['score'] >= 0
            assert result['score'] <= 1
            assert isinstance(result['feature_name'], str)
            assert result['entity_id'] in [1, 2]
def test_uniform_distribution():
    with rig_engines() as (db_engine, project_storage):
        model = ModelFactory()
        feature_importances = [
            FeatureImportanceFactory(model_rel=model, feature="feature_{}".format(i))
            for i in range(0, 10)
        ]
        data_dict = {
            "entity_id": [1, 1],
            "as_of_date": ["2016-01-01", "2017-01-01"],
            "label": [0, 1],
        }
        for imp in feature_importances:
            data_dict[imp.feature] = [0.5, 0.5]
        metadata = matrix_metadata_creator()
        test_store = get_matrix_store(
            project_storage,
            pandas.DataFrame.from_dict(data_dict),
            metadata,
        )
        results = uniform_distribution(
            db_engine,
            model_id=model.model_id,
            as_of_date=datetime.date(2016, 1, 1),
            test_matrix_store=test_store,
            n_ranks=5,
        )

        assert len(results) == 5  # 5 features x 1 entity for this as_of_date
        for result in results:
            assert "entity_id" in result
            assert "feature_name" in result
            assert "score" in result
            assert "feature_value" in result
            assert result["feature_value"] == 0.5
            assert result["score"] >= 0
            assert result["score"] <= 1
            assert isinstance(result["feature_name"], str)
            assert result["entity_id"] in [1, 2]
def test_uniform_distribution_entity_id_index():
    with rig_engines() as (db_engine, project_storage):
        model = ModelFactory()
        feature_importances = [
            FeatureImportanceFactory(model_rel=model, feature='feature_{}'.format(i))
            for i in range(0, 10)
        ]
        data_dict = {'entity_id': [1, 2]}
        for imp in feature_importances:
            data_dict[imp.feature] = [0.5, 0.5]
        metadata = matrix_metadata_creator(indices='entity_id')
        test_store = get_matrix_store(
            project_storage,
            pandas.DataFrame.from_dict(data_dict).set_index(metadata['indices']),
            metadata)
        results = uniform_distribution(db_engine,
                                       model_id=model.model_id,
                                       as_of_date='2016-01-01',
                                       test_matrix_store=test_store,
                                       n_ranks=5)

        assert len(results) == 10  # 5 features x 2 entities
        for result in results:
            assert 'entity_id' in result
            assert 'feature_name' in result
            assert 'score' in result
            assert 'feature_value' in result
            assert result['feature_value'] == 0.5
            assert result['score'] >= 0
            assert result['score'] <= 1
            assert isinstance(result['feature_name'], str)
            assert result['entity_id'] in [1, 2]
def update_ranks_test(predictor, entities_scores_labels, rank_col, expected_result,
                      model_random_seed=12345, need_seed_data=True):
    """Not a test in itself but rather a utility called by many of the ranking tests"""
    ensure_db(predictor.db_engine)
    init_engine(predictor.db_engine)
    model_id = 5
    matrix_uuid = "4567"
    matrix_type = "test"
    as_of_date = datetime.datetime(2012, 1, 1)
    if need_seed_data:
        matrix = MatrixFactory(matrix_uuid=matrix_uuid)
        model = ModelFactory(model_id=model_id, random_seed=model_random_seed)
        for entity_id, score, label in entities_scores_labels:
            PredictionFactory(model_rel=model,
                              matrix_rel=matrix,
                              as_of_date=as_of_date,
                              entity_id=entity_id,
                              score=score,
                              label_value=int(label))
        factory_session.commit()
    predictor.update_db_with_ranks(
        model_id=model_id,
        matrix_uuid=matrix_uuid,
        matrix_type=TestMatrixType,
    )
    ranks = tuple(row for row in predictor.db_engine.execute(
        f'''select entity_id, {rank_col}::float
        from {matrix_type}_results.predictions
        where as_of_date = %s and model_id = %s and matrix_uuid = %s
        order by {rank_col} asc''',
        (as_of_date, model_id, matrix_uuid)))
    assert ranks == expected_result

    # Test that the predictions metadata table is populated
    metadata_records = [
        row for row in predictor.db_engine.execute(
            f"""select tiebreaker_ordering, prediction_metadata.random_seed, models.random_seed
            from {matrix_type}_results.prediction_metadata
            join triage_metadata.models using (model_id)
            join triage_metadata.matrices using (matrix_uuid)
            """)
    ]
    assert len(metadata_records) == 1
    tiebreaker_ordering, random_seed, received_model_random_seed = metadata_records[0]
    if tiebreaker_ordering == 'random':
        assert random_seed == model_random_seed
    else:
        assert not random_seed
    assert tiebreaker_ordering == predictor.rank_order
    assert received_model_random_seed == model_random_seed
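
# Hypothetical usage sketch (not part of the original tests): one way a ranking
# test might call the update_ranks_test utility above. The Predictor setup mirrors
# test_prediction_ranks_multiple_dates; the entity ids, scores, and expected ranks
# here are illustrative only and assume rank_abs_no_ties orders by descending score.
def test_rank_abs_no_ties_sketch(project_storage, db_engine):
    predictor = Predictor(project_storage.model_storage_engine(), db_engine, 'worst')
    update_ranks_test(
        predictor,
        # no within-date ties, so the tiebreaker ordering does not affect the result
        entities_scores_labels=((1, 0.9, True), (2, 0.6, False)),
        rank_col='rank_abs_no_ties',
        expected_result=((1, 1.0), (2, 2.0)),
    )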
def test_Audition():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        init_engine(db_engine)

        num_model_groups = 10
        model_types = [
            "classifier type {}".format(i) for i in range(0, num_model_groups)
        ]
        model_groups = [
            ModelGroupFactory(model_type=model_type) for model_type in model_types
        ]
        train_end_times = [
            datetime(2013, 1, 1),
            datetime(2014, 1, 1),
            datetime(2015, 1, 1),
            datetime(2016, 1, 1),
        ]
        models = [
            ModelFactory(model_group_rel=model_group, train_end_time=train_end_time)
            for model_group in model_groups
            for train_end_time in train_end_times
        ]
        metrics = [
            ("precision@", "100_abs"),
            ("recall@", "100_abs"),
            ("precision@", "50_abs"),
            ("recall@", "50_abs"),
            ("fpr@", "10_pct"),
        ]

        class ImmediateEvalFactory(EvaluationFactory):
            evaluation_start_time = factory.LazyAttribute(
                lambda o: o.model_rel.train_end_time)

        for model in models:
            for (metric, parameter) in metrics:
                ImmediateEvalFactory(model_rel=model, metric=metric, parameter=parameter)
        session.commit()

        with tempfile.TemporaryDirectory() as td:
            with mock.patch('os.getcwd') as mock_getcwd:
                mock_getcwd.return_value = td
                AuditionRunner(config_dict=config, db_engine=db_engine, directory=td).run()
                assert len(os.listdir(os.getcwd())) == 6
def test_ModelEvaluator_needs_evaluation_with_bias_audit(db_engine_with_results_schema):
    # test that if a bias audit config is passed, and there are no matching bias audits
    # in the database, needs_evaluation returns true
    # this all assumes that evaluations are populated. those tests are in the 'no_bias_audit' test
    model_evaluator = ModelEvaluator(
        testing_metric_groups=[
            {
                "metrics": ["precision@"],
                "thresholds": {"top_n": [3]},
            },
        ],
        training_metric_groups=[],
        bias_config={'thresholds': {'top_n': [2]}},
        db_engine=db_engine_with_results_schema,
    )
    model_with_evaluations = ModelFactory()

    eval_time = datetime.datetime(2016, 1, 1)
    as_of_date_frequency = "3d"
    for subset_hash in [""]:
        EvaluationFactory(
            model_rel=model_with_evaluations,
            evaluation_start_time=eval_time,
            evaluation_end_time=eval_time,
            as_of_date_frequency=as_of_date_frequency,
            metric="precision@",
            parameter="3_abs",
            subset_hash=subset_hash,
        )
    session.commit()

    # make a test matrix to pass in
    metadata_overrides = {
        'as_of_date_frequency': as_of_date_frequency,
        'as_of_times': [eval_time],
    }
    test_matrix_store = MockMatrixStore(
        "test", "1234", 5, db_engine_with_results_schema,
        metadata_overrides=metadata_overrides)
    assert model_evaluator.needs_evaluations(
        matrix_store=test_matrix_store,
        model_id=model_with_evaluations.model_id,
        subset_hash="",
    )
def test_prediction_ranks_multiple_dates(project_storage, db_engine):
    """make sure that multiple as-of-dates in a single matrix are handled correctly.

    keep the other variables simple by making no within-date ties that would end up
    testing the tiebreaker logic; just use data for two dates that could theoretically
    confound a bad ranking method:
    - a different order for entities in both dates
    - each date has some entities not in the other
    """
    ensure_db(db_engine)
    init_engine(db_engine)
    predictor = Predictor(project_storage.model_storage_engine(), db_engine, 'worst')
    model_id = 5
    matrix_uuid = "4567"
    matrix_type = "test"
    entities_dates_and_scores = (
        (23, datetime.datetime(2012, 1, 1), 0.95),
        (34, datetime.datetime(2012, 1, 1), 0.94),
        (45, datetime.datetime(2013, 1, 1), 0.92),
        (23, datetime.datetime(2013, 1, 1), 0.45),
    )
    expected_result = (
        (23, datetime.datetime(2012, 1, 1), 1),
        (34, datetime.datetime(2012, 1, 1), 2),
        (45, datetime.datetime(2013, 1, 1), 3),
        (23, datetime.datetime(2013, 1, 1), 4),
    )
    matrix = MatrixFactory(matrix_uuid=matrix_uuid)
    model = ModelFactory(model_id=model_id)
    for entity_id, as_of_date, score in entities_dates_and_scores:
        PredictionFactory(model_rel=model,
                          matrix_rel=matrix,
                          as_of_date=as_of_date,
                          entity_id=entity_id,
                          score=score)
    factory_session.commit()
    predictor.update_db_with_ranks(
        model_id=model_id,
        matrix_uuid=matrix_uuid,
        matrix_type=TestMatrixType,
    )
    ranks = tuple(row for row in predictor.db_engine.execute(
        f'''select entity_id, as_of_date, rank_abs_no_ties
        from {matrix_type}_results.predictions
        where model_id = %s and matrix_uuid = %s
        order by rank_abs_no_ties''',
        (model_id, matrix_uuid)))
    assert ranks == expected_result
def test_uniform_distribution_entity_id_index():
    with rig_engines() as (db_engine, project_storage):
        model = ModelFactory()
        feature_importances = [
            FeatureImportanceFactory(model_rel=model, feature="feature_{}".format(i))
            for i in range(0, 10)
        ]
        data_dict = {"entity_id": [1, 2]}
        for imp in feature_importances:
            data_dict[imp.feature] = [0.5, 0.5]
        metadata = matrix_metadata_creator(indices="entity_id")
        test_store = get_matrix_store(
            project_storage,
            pandas.DataFrame.from_dict(data_dict).set_index(metadata["indices"]),
            metadata,
        )
        results = uniform_distribution(
            db_engine,
            model_id=model.model_id,
            as_of_date="2016-01-01",
            test_matrix_store=test_store,
            n_ranks=5,
        )

        assert len(results) == 10  # 5 features x 2 entities
        for result in results:
            assert "entity_id" in result
            assert "feature_name" in result
            assert "score" in result
            assert "feature_value" in result
            assert result["feature_value"] == 0.5
            assert result["score"] >= 0
            assert result["score"] <= 1
            assert isinstance(result["feature_name"], str)
            assert result["entity_id"] in [1, 2]
def test_ModelEvaluator_needs_evaluation_no_bias_audit(db_engine_with_results_schema):
    # TEST SETUP:
    # create two models: one that has zero evaluations,
    # one that has an evaluation for precision@100_abs
    # both overall and for each subset
    model_with_evaluations = ModelFactory()
    model_without_evaluations = ModelFactory()

    eval_time = datetime.datetime(2016, 1, 1)
    as_of_date_frequency = "3d"
    for subset_hash in [""] + [filename_friendly_hash(subset) for subset in SUBSETS]:
        EvaluationFactory(
            model_rel=model_with_evaluations,
            evaluation_start_time=eval_time,
            evaluation_end_time=eval_time,
            as_of_date_frequency=as_of_date_frequency,
            metric="precision@",
            parameter="100_abs",
            subset_hash=subset_hash,
        )
    session.commit()

    # make a test matrix to pass in
    metadata_overrides = {
        "as_of_date_frequency": as_of_date_frequency,
        "as_of_times": [eval_time],
    }
    test_matrix_store = MockMatrixStore(
        "test",
        "1234",
        5,
        db_engine_with_results_schema,
        metadata_overrides=metadata_overrides,
    )
    train_matrix_store = MockMatrixStore(
        "train",
        "2345",
        5,
        db_engine_with_results_schema,
        metadata_overrides=metadata_overrides,
    )

    # the evaluated model has test evaluations for precision, but not recall,
    # so this needs evaluations
    for subset in SUBSETS:
        if not subset:
            subset_hash = ""
        else:
            subset_hash = filename_friendly_hash(subset)

        assert ModelEvaluator(
            testing_metric_groups=[
                {
                    "metrics": ["precision@", "recall@"],
                    "thresholds": {"top_n": [100]},
                }
            ],
            training_metric_groups=[],
            db_engine=db_engine_with_results_schema,
        ).needs_evaluations(
            matrix_store=test_matrix_store,
            model_id=model_with_evaluations.model_id,
            subset_hash=subset_hash,
        )

    # the evaluated model has test evaluations for precision,
    # so this should not need evaluations
    for subset in SUBSETS:
        if not subset:
            subset_hash = ""
        else:
            subset_hash = filename_friendly_hash(subset)

        assert not ModelEvaluator(
            testing_metric_groups=[
                {
                    "metrics": ["precision@"],
                    "thresholds": {"top_n": [100]},
                }
            ],
            training_metric_groups=[],
            db_engine=db_engine_with_results_schema,
        ).needs_evaluations(
            matrix_store=test_matrix_store,
            model_id=model_with_evaluations.model_id,
            subset_hash=subset_hash,
        )

    # the non-evaluated model has no evaluations,
    # so this should need evaluations
    for subset in SUBSETS:
        if not subset:
            subset_hash = ""
        else:
            subset_hash = filename_friendly_hash(subset)

        assert ModelEvaluator(
            testing_metric_groups=[
                {
                    "metrics": ["precision@"],
                    "thresholds": {"top_n": [100]},
                }
            ],
            training_metric_groups=[],
            db_engine=db_engine_with_results_schema,
        ).needs_evaluations(
            matrix_store=test_matrix_store,
            model_id=model_without_evaluations.model_id,
            subset_hash=subset_hash,
        )

    # the evaluated model has no *train* evaluations,
    # so the train matrix should need evaluations
    for subset in SUBSETS:
        if not subset:
            subset_hash = ""
        else:
            subset_hash = filename_friendly_hash(subset)

        assert ModelEvaluator(
            testing_metric_groups=[
                {
                    "metrics": ["precision@"],
                    "thresholds": {"top_n": [100]},
                }
            ],
            training_metric_groups=[
                {
                    "metrics": ["precision@"],
                    "thresholds": {"top_n": [100]},
                }
            ],
            db_engine=db_engine_with_results_schema,
        ).needs_evaluations(
            matrix_store=train_matrix_store,
            model_id=model_with_evaluations.model_id,
            subset_hash=subset_hash,
        )

    session.close()
    session.remove()
def test_Auditioner():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        init_engine(db_engine)

        # set up data, randomly generated by the factories but conforming
        # generally to what we expect triage_metadata schema data to look like
        num_model_groups = 10
        model_types = [
            "classifier type {}".format(i) for i in range(0, num_model_groups)
        ]
        model_groups = [
            ModelGroupFactory(model_type=model_type) for model_type in model_types
        ]
        train_end_times = [
            datetime(2013, 1, 1),
            datetime(2014, 1, 1),
            datetime(2015, 1, 1),
            datetime(2016, 1, 1),
        ]
        models = [
            ModelFactory(model_group_rel=model_group, train_end_time=train_end_time)
            for model_group in model_groups
            for train_end_time in train_end_times
        ]
        metrics = [
            ("precision@", "100_abs"),
            ("recall@", "100_abs"),
            ("precision@", "50_abs"),
            ("recall@", "50_abs"),
            ("fpr@", "10_pct"),
        ]

        class ImmediateEvalFactory(EvaluationFactory):
            evaluation_start_time = factory.LazyAttribute(
                lambda o: o.model_rel.train_end_time)

        for model in models:
            for (metric, parameter) in metrics:
                ImmediateEvalFactory(model_rel=model, metric=metric, parameter=parameter)
        session.commit()

        # define a very loose filtering that should admit all model groups
        no_filtering = [
            {
                "metric": "precision@",
                "parameter": "100_abs",
                "max_from_best": 1.0,
                "threshold_value": 0.0,
            },
            {
                "metric": "recall@",
                "parameter": "100_abs",
                "max_from_best": 1.0,
                "threshold_value": 0.0,
            },
        ]
        model_group_ids = [mg.model_group_id for mg in model_groups]
        auditioner = Auditioner(db_engine, model_group_ids, train_end_times, no_filtering)
        assert len(auditioner.thresholded_model_group_ids) == num_model_groups
        auditioner.plot_model_groups()

        # here, we pick thresholding rules that should definitely remove
        # all model groups from contention because they are too strict.
        remove_all = [
            {
                "metric": "precision@",
                "parameter": "100_abs",
                "max_from_best": 0.0,
                "threshold_value": 1.1,
            },
            {
                "metric": "recall@",
                "parameter": "100_abs",
                "max_from_best": 0.0,
                "threshold_value": 1.1,
            },
        ]
        auditioner.update_metric_filters(new_filters=remove_all)
        assert len(auditioner.thresholded_model_group_ids) == 0

        # pass the argument instead and remove all model groups
        auditioner.set_one_metric_filter(
            metric="precision@",
            parameter="100_abs",
            max_from_best=0.0,
            threshold_value=1.1,
        )
        assert len(auditioner.thresholded_model_group_ids) == 0

        # one potential place for bugs would be when we pull back the rules
        # for being too restrictive. we want to make sure that the original list is
        # always used for thresholding, or else such a move would be impossible
        auditioner.update_metric_filters(new_filters=no_filtering)
        assert len(auditioner.thresholded_model_group_ids) == num_model_groups

        # pass the argument instead and let all model groups pass
        auditioner.set_one_metric_filter(
            metric="precision@",
            parameter="100_abs",
            max_from_best=1.0,
            threshold_value=0.0,
        )
        assert len(auditioner.thresholded_model_group_ids) == num_model_groups

        # now, we want to take this partially thresholded list and run it through
        # a grid of selection rules, meant to pick winners by a variety of user-defined
        # criteria
        rule_grid = [
            {
                "shared_parameters": [
                    {"metric": "precision@", "parameter": "100_abs"},
                    {"metric": "recall@", "parameter": "100_abs"},
                ],
                "selection_rules": [
                    {
                        "name": "most_frequent_best_dist",
                        "dist_from_best_case": [0.1, 0.2, 0.3],
                        "n": 1,
                    },
                    {"name": "best_current_value", "n": 1},
                ],
            },
            {
                "shared_parameters": [{"metric1": "precision@", "parameter1": "100_abs"}],
                "selection_rules": [
                    {
                        "name": "best_average_two_metrics",
                        "metric2": ["recall@"],
                        "parameter2": ["100_abs"],
                        "metric1_weight": [0.4, 0.5, 0.6],
                        "n": 1,
                    }
                ],
            },
        ]
        auditioner.register_selection_rule_grid(rule_grid, plot=False)
        final_model_group_ids = auditioner.selection_rule_model_group_ids

        # we expect the result to be a mapping of selection rule name to model group id
        assert isinstance(final_model_group_ids, dict)
        # we expect that there is one winner for each selection rule
        assert sorted(final_model_group_ids.keys()) == sorted(
            [rule.descriptive_name for rule in auditioner.selection_rules])
def test_PreAudition():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        init_engine(db_engine)

        # set up data, randomly generated by the factories but conforming
        # generally to what we expect triage_metadata schema data to look like
        num_model_groups = 10
        model_types = [
            "classifier type {}".format(i) for i in range(0, num_model_groups)
        ]
        model_configs = [
            {"label_definition": "label_1"} if i % 2 == 0 else {"label_definition": "label_2"}
            for i in range(0, num_model_groups)
        ]
        model_groups = [
            ModelGroupFactory(model_type=model_type, model_config=model_config)
            for model_type, model_config in zip(model_types, model_configs)
        ]
        train_end_times = [
            datetime(2013, 1, 1),
            datetime(2013, 7, 1),
            datetime(2014, 1, 1),
            datetime(2014, 7, 1),
            datetime(2015, 1, 1),
            datetime(2015, 7, 1),
            datetime(2016, 7, 1),
            datetime(2016, 1, 1),
        ]
        models = [
            ModelFactory(model_group_rel=model_group, train_end_time=train_end_time)
            for model_group in model_groups
            for train_end_time in train_end_times
        ]
        metrics = [
            ("precision@", "100_abs"),
            ("recall@", "100_abs"),
            ("precision@", "50_abs"),
            ("recall@", "50_abs"),
            ("fpr@", "10_pct"),
        ]

        class ImmediateEvalFactory(EvaluationFactory):
            evaluation_start_time = factory.LazyAttribute(
                lambda o: o.model_rel.train_end_time
            )

        for model in models:
            for (metric, parameter) in metrics:
                ImmediateEvalFactory(
                    model_rel=model, metric=metric, parameter=parameter
                )
        session.commit()

        pre_aud = PreAudition(db_engine)

        # Expect the number of model groups with label_1
        assert len(pre_aud.get_model_groups_from_label("label_1")['model_groups']) == sum(
            [x["label_definition"] == "label_1" for x in model_configs]
        )

        # Expect no baseline model groups
        assert len(pre_aud.get_model_groups_from_label("label_1")['baseline_model_groups']) == 0

        # Expect the number of model groups with a certain experiment_hash
        experiment_hash = list(
            pd.read_sql(
                """SELECT experiment_hash
                FROM triage_metadata.models
                JOIN triage_metadata.experiment_models using (model_hash)
                limit 1""",
                con=db_engine,
            )["experiment_hash"]
        )[0]
        assert len(pre_aud.get_model_groups_from_experiment(experiment_hash)['model_groups']) == 1

        # Expect the number of model groups for custom SQL
        query = """
            SELECT DISTINCT(model_group_id)
            FROM triage_metadata.models
            JOIN triage_metadata.experiment_models using (model_hash)
            WHERE train_end_time >= '2013-01-01'
            AND experiment_hash = '{}'
        """.format(experiment_hash)
        assert len(pre_aud.get_model_groups(query)) == 1

        # Expect the number of train_end_times after 2014-01-01
        assert len(pre_aud.get_train_end_times(after="2014-01-01")) == 6

        query = """
            SELECT DISTINCT train_end_time
            FROM triage_metadata.models
            WHERE model_group_id IN ({})
                AND train_end_time >= '2014-01-01'
            ORDER BY train_end_time
        """.format(", ".join(map(str, pre_aud.model_groups)))
        assert len(pre_aud.get_train_end_times(query=query)) == 6
def test_PreAudition():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        init_engine(db_engine)

        # set up data, randomly generated by the factories but conforming
        # generally to what we expect results schema data to look like
        num_model_groups = 10
        model_types = [
            'classifier type {}'.format(i) for i in range(0, num_model_groups)
        ]
        model_configs = [
            {'label_definition': 'label_1'} if i % 2 == 0 else {'label_definition': 'label_2'}
            for i in range(0, num_model_groups)
        ]
        model_groups = [
            ModelGroupFactory(model_type=model_type, model_config=model_config)
            for model_type, model_config in zip(model_types, model_configs)
        ]
        train_end_times = [
            datetime(2013, 1, 1),
            datetime(2013, 7, 1),
            datetime(2014, 1, 1),
            datetime(2014, 7, 1),
            datetime(2015, 1, 1),
            datetime(2015, 7, 1),
            datetime(2016, 7, 1),
            datetime(2016, 1, 1),
        ]
        models = [
            ModelFactory(model_group_rel=model_group, train_end_time=train_end_time)
            for model_group in model_groups
            for train_end_time in train_end_times
        ]
        metrics = [
            ('precision@', '100_abs'),
            ('recall@', '100_abs'),
            ('precision@', '50_abs'),
            ('recall@', '50_abs'),
            ('fpr@', '10_pct'),
        ]

        class ImmediateEvalFactory(EvaluationFactory):
            evaluation_start_time = factory.LazyAttribute(
                lambda o: o.model_rel.train_end_time)

        for model in models:
            for (metric, parameter) in metrics:
                ImmediateEvalFactory(model_rel=model, metric=metric, parameter=parameter)
        session.commit()

        pre_aud = PreAudition(db_engine)

        # Expect the number of model groups with label_1
        assert len(pre_aud.get_model_groups_from_label("label_1")) == \
            sum([x['label_definition'] == 'label_1' for x in model_configs])

        # Expect the number of model groups with a certain experiment_hash
        experiment_hash = list(
            pd.read_sql("SELECT experiment_hash FROM results.models limit 1",
                        con=db_engine)['experiment_hash'])[0]
        assert len(pre_aud.get_model_groups_from_experiment(experiment_hash)) == 1

        # Expect the number of model groups for custom SQL
        query = """
            SELECT DISTINCT(model_group_id)
            FROM results.models
            WHERE train_end_time >= '2013-01-01'
            AND experiment_hash = '{}'
        """.format(experiment_hash)
        assert len(pre_aud.get_model_groups(query)) == 1

        # Expect the number of train_end_times after 2014-01-01
        assert len(pre_aud.get_train_end_times(after='2014-01-01')) == 6

        query = """
            SELECT DISTINCT train_end_time
            FROM results.models
            WHERE model_group_id IN ({})
                AND train_end_time >= '2014-01-01'
            ORDER BY train_end_time
        """.format(', '.join(map(str, pre_aud.model_groups)))
        assert len(pre_aud.get_train_end_times(query=query)) == 6
def filter_train_end_times(self, engine, train_end_times):
    ensure_db(engine)
    init_engine(engine)
    mg1 = ModelGroupFactory(model_group_id=1, model_type="modelType1")
    mg2 = ModelGroupFactory(model_group_id=2, model_type="modelType2")
    mg3 = ModelGroupFactory(model_group_id=3, model_type="modelType3")
    mg4 = ModelGroupFactory(model_group_id=4, model_type="modelType4")
    mg5 = ModelGroupFactory(model_group_id=5, model_type="modelType5")
    # model group 1
    ModelFactory(model_group_rel=mg1, train_end_time=datetime(2014, 1, 1))
    ModelFactory(model_group_rel=mg1, train_end_time=datetime(2015, 1, 1))
    ModelFactory(model_group_rel=mg1, train_end_time=datetime(2016, 1, 1))
    ModelFactory(model_group_rel=mg1, train_end_time=datetime(2017, 1, 1))
    # model group 2 only has one timestamp
    ModelFactory(model_group_rel=mg2, train_end_time=datetime(2014, 1, 1))
    # model group 3
    ModelFactory(model_group_rel=mg3, train_end_time=datetime(2014, 1, 1))
    ModelFactory(model_group_rel=mg3, train_end_time=datetime(2015, 1, 1))
    ModelFactory(model_group_rel=mg3, train_end_time=datetime(2016, 1, 1))
    ModelFactory(model_group_rel=mg3, train_end_time=datetime(2017, 1, 1))
    # model group 4 only has two timestamps
    ModelFactory(model_group_rel=mg4, train_end_time=datetime(2015, 1, 1))
    ModelFactory(model_group_rel=mg4, train_end_time=datetime(2016, 1, 1))
    # model group 5 only has three timestamps
    ModelFactory(model_group_rel=mg5, train_end_time=datetime(2014, 1, 1))
    ModelFactory(model_group_rel=mg5, train_end_time=datetime(2015, 1, 1))
    ModelFactory(model_group_rel=mg5, train_end_time=datetime(2016, 1, 1))
    session.commit()
    model_groups = [1, 2, 3, 4, 5]
    model_group_ids = model_groups_filter(
        train_end_times=train_end_times,
        initial_model_group_ids=model_groups,
        models_table="models",
        db_engine=engine,
    )
    return model_group_ids
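
# Hypothetical usage sketch (not part of the original tests): how a test method in
# this class might call filter_train_end_times, assuming a throwaway postgres engine
# as in the other tests. With these two train_end_times, model group 2 (2014 only)
# would be expected to drop out while groups 1, 3, 4, and 5 all have models at both
# 2015-01-01 and 2016-01-01.
def test_model_groups_filter_train_end_times_sketch(self):
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        model_group_ids = self.filter_train_end_times(
            engine, train_end_times=["2015-01-01", "2016-01-01"]
        )
        assert set(model_group_ids) == {1, 3, 4, 5}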
def test_Auditioner():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        init_engine(db_engine)

        # set up data, randomly generated by the factories but conforming
        # generally to what we expect model_metadata schema data to look like
        num_model_groups = 10
        model_types = [
            'classifier type {}'.format(i) for i in range(0, num_model_groups)
        ]
        model_groups = [
            ModelGroupFactory(model_type=model_type) for model_type in model_types
        ]
        train_end_times = [
            datetime(2013, 1, 1),
            datetime(2014, 1, 1),
            datetime(2015, 1, 1),
            datetime(2016, 1, 1),
        ]
        models = [
            ModelFactory(model_group_rel=model_group, train_end_time=train_end_time)
            for model_group in model_groups
            for train_end_time in train_end_times
        ]
        metrics = [
            ('precision@', '100_abs'),
            ('recall@', '100_abs'),
            ('precision@', '50_abs'),
            ('recall@', '50_abs'),
            ('fpr@', '10_pct'),
        ]

        class ImmediateEvalFactory(EvaluationFactory):
            evaluation_start_time = factory.LazyAttribute(
                lambda o: o.model_rel.train_end_time)

        for model in models:
            for (metric, parameter) in metrics:
                ImmediateEvalFactory(model_rel=model, metric=metric, parameter=parameter)
        session.commit()

        # define a very loose filtering that should admit all model groups
        no_filtering = [{
            'metric': 'precision@',
            'parameter': '100_abs',
            'max_from_best': 1.0,
            'threshold_value': 0.0
        }, {
            'metric': 'recall@',
            'parameter': '100_abs',
            'max_from_best': 1.0,
            'threshold_value': 0.0
        }]
        model_group_ids = [mg.model_group_id for mg in model_groups]
        auditioner = Auditioner(
            db_engine,
            model_group_ids,
            train_end_times,
            no_filtering,
        )
        assert len(auditioner.thresholded_model_group_ids) == num_model_groups
        auditioner.plot_model_groups()

        # here, we pick thresholding rules that should definitely remove
        # all model groups from contention because they are too strict.
        remove_all = [{
            'metric': 'precision@',
            'parameter': '100_abs',
            'max_from_best': 0.0,
            'threshold_value': 1.1
        }, {
            'metric': 'recall@',
            'parameter': '100_abs',
            'max_from_best': 0.0,
            'threshold_value': 1.1
        }]
        auditioner.update_metric_filters(new_filters=remove_all)
        assert len(auditioner.thresholded_model_group_ids) == 0

        # pass the argument instead and remove all model groups
        auditioner.set_one_metric_filter(metric='precision@',
                                         parameter='100_abs',
                                         max_from_best=0.0,
                                         threshold_value=1.1)
        assert len(auditioner.thresholded_model_group_ids) == 0

        # one potential place for bugs would be when we pull back the rules
        # for being too restrictive. we want to make sure that the original list is
        # always used for thresholding, or else such a move would be impossible
        auditioner.update_metric_filters(new_filters=no_filtering)
        assert len(auditioner.thresholded_model_group_ids) == num_model_groups

        # pass the argument instead and let all model groups pass
        auditioner.set_one_metric_filter(metric='precision@',
                                         parameter='100_abs',
                                         max_from_best=1.0,
                                         threshold_value=0.0)
        assert len(auditioner.thresholded_model_group_ids) == num_model_groups

        # now, we want to take this partially thresholded list and run it through
        # a grid of selection rules, meant to pick winners by a variety of user-defined
        # criteria
        rule_grid = [{
            'shared_parameters': [
                {'metric': 'precision@', 'parameter': '100_abs'},
                {'metric': 'recall@', 'parameter': '100_abs'},
            ],
            'selection_rules': [{
                'name': 'most_frequent_best_dist',
                'dist_from_best_case': [0.1, 0.2, 0.3],
                'n': 1
            }, {
                'name': 'best_current_value',
                'n': 1
            }]
        }, {
            'shared_parameters': [
                {'metric1': 'precision@', 'parameter1': '100_abs'},
            ],
            'selection_rules': [
                {
                    'name': 'best_average_two_metrics',
                    'metric2': ['recall@'],
                    'parameter2': ['100_abs'],
                    'metric1_weight': [0.4, 0.5, 0.6],
                    'n': 1
                },
            ]
        }]
        auditioner.register_selection_rule_grid(rule_grid, plot=False)
        final_model_group_ids = auditioner.selection_rule_model_group_ids

        # we expect the result to be a mapping of selection rule name to model group id
        assert isinstance(final_model_group_ids, dict)
        # we expect that there is one winner for each selection rule
        assert sorted(final_model_group_ids.keys()) == \
            sorted([rule.descriptive_name for rule in auditioner.selection_rules])

        # we expect that the results written to the yaml file are the
        # chosen model groups and their rules
        # however because the source data is randomly generated we could have a
        # different list on consecutive runs
        # and don't want to introduce non-determinism to the test
        with tempfile.NamedTemporaryFile() as tf:
            auditioner.write_tyra_config(tf.name)
            assert sorted(yaml.load(tf)['selection_rule_model_groups'].keys()) == \
                sorted(final_model_group_ids.keys())
def filter_same_train_end_times(self, engine):
    ensure_db(engine)
    init_engine(engine)
    mg1 = ModelGroupFactory(model_group_id=1, model_type='modelType1')
    mg2 = ModelGroupFactory(model_group_id=2, model_type='modelType2')
    mg3 = ModelGroupFactory(model_group_id=3, model_type='modelType3')
    mg4 = ModelGroupFactory(model_group_id=4, model_type='modelType4')
    # model group 1
    ModelFactory(model_group_rel=mg1, train_end_time=datetime(2014, 1, 1))
    ModelFactory(model_group_rel=mg1, train_end_time=datetime(2015, 1, 1))
    ModelFactory(model_group_rel=mg1, train_end_time=datetime(2016, 1, 1))
    ModelFactory(model_group_rel=mg1, train_end_time=datetime(2017, 1, 1))
    # model group 2 only has one timestamp, should not pass
    ModelFactory(model_group_rel=mg2, train_end_time=datetime(2014, 1, 1))
    # model group 3
    ModelFactory(model_group_rel=mg3, train_end_time=datetime(2014, 1, 1))
    ModelFactory(model_group_rel=mg3, train_end_time=datetime(2015, 1, 1))
    ModelFactory(model_group_rel=mg3, train_end_time=datetime(2016, 1, 1))
    ModelFactory(model_group_rel=mg3, train_end_time=datetime(2017, 1, 1))
    # model group 4 only has two timestamps, should not pass
    ModelFactory(model_group_rel=mg4, train_end_time=datetime(2015, 1, 1))
    ModelFactory(model_group_rel=mg4, train_end_time=datetime(2016, 1, 1))
    session.commit()
    train_end_times = [
        '2014-01-01', '2015-01-01', '2016-01-01', '2017-01-01'
    ]
    model_groups = [1, 2, 3, 4]
    model_group_ids = model_groups_filter(
        train_end_times=train_end_times,
        initial_model_group_ids=model_groups,
        models_table='models',
        db_engine=engine)
    return model_group_ids
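
# Hypothetical usage sketch (not part of the original tests): exercising
# filter_same_train_end_times with a throwaway postgres engine. Given the seed data
# above, only model groups 1 and 3 have models at all four train_end_times, so they
# are the only ones expected to survive the filter.
def test_model_groups_filter_same_train_end_times_sketch(self):
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        model_group_ids = self.filter_same_train_end_times(engine)
        assert set(model_group_ids) == {1, 3}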
def test_ModelEvaluator_needs_evaluation(db_engine):
    ensure_db(db_engine)
    init_engine(db_engine)
    # TEST SETUP:
    # create two models: one that has zero evaluations,
    # one that has an evaluation for precision@100_abs
    model_with_evaluations = ModelFactory()
    model_without_evaluations = ModelFactory()

    eval_time = datetime.datetime(2016, 1, 1)
    as_of_date_frequency = "3d"
    EvaluationFactory(model_rel=model_with_evaluations,
                      evaluation_start_time=eval_time,
                      evaluation_end_time=eval_time,
                      as_of_date_frequency=as_of_date_frequency,
                      metric="precision@",
                      parameter="100_abs")
    session.commit()

    # make a test matrix to pass in
    metadata_overrides = {
        'as_of_date_frequency': as_of_date_frequency,
        'end_time': eval_time,
    }
    test_matrix_store = MockMatrixStore("test", "1234", 5, db_engine,
                                        metadata_overrides=metadata_overrides)
    train_matrix_store = MockMatrixStore("train", "2345", 5, db_engine,
                                         metadata_overrides=metadata_overrides)

    # the evaluated model has test evaluations for precision, but not recall,
    # so this needs evaluations
    assert ModelEvaluator(
        testing_metric_groups=[{
            "metrics": ["precision@", "recall@"],
            "thresholds": {"top_n": [100]},
        }],
        training_metric_groups=[],
        db_engine=db_engine,
    ).needs_evaluations(
        matrix_store=test_matrix_store,
        model_id=model_with_evaluations.model_id,
    )

    # the evaluated model has test evaluations for precision,
    # so this should not need evaluations
    assert not ModelEvaluator(
        testing_metric_groups=[{
            "metrics": ["precision@"],
            "thresholds": {"top_n": [100]},
        }],
        training_metric_groups=[],
        db_engine=db_engine,
    ).needs_evaluations(
        matrix_store=test_matrix_store,
        model_id=model_with_evaluations.model_id,
    )

    # the non-evaluated model has no evaluations,
    # so this should need evaluations
    assert ModelEvaluator(
        testing_metric_groups=[{
            "metrics": ["precision@"],
            "thresholds": {"top_n": [100]},
        }],
        training_metric_groups=[],
        db_engine=db_engine,
    ).needs_evaluations(
        matrix_store=test_matrix_store,
        model_id=model_without_evaluations.model_id,
    )

    # the evaluated model has no *train* evaluations,
    # so the train matrix should need evaluations
    assert ModelEvaluator(
        testing_metric_groups=[{
            "metrics": ["precision@"],
            "thresholds": {"top_n": [100]},
        }],
        training_metric_groups=[{
            "metrics": ["precision@"],
            "thresholds": {"top_n": [100]},
        }],
        db_engine=db_engine,
    ).needs_evaluations(
        matrix_store=train_matrix_store,
        model_id=model_with_evaluations.model_id,
    )

    session.close()
    session.remove()