def test_predictor_save_predictions(matrix_type, predict_setup_args):
    """Test the save_predictions flag being set to False

    We still want to return predict_proba, but not save data to the DB
    """
    (project_storage, db_engine, model_id) = predict_setup_args

    # if save_predictions is set to False, don't save
    predictor = Predictor(
        project_storage.model_storage_engine(),
        db_engine,
        rank_order='worst',
        save_predictions=False,
    )
    matrix_store = get_matrix_store(project_storage)
    train_matrix_columns = matrix_store.columns()

    predict_proba = predictor.predict(
        model_id,
        matrix_store,
        misc_db_parameters=dict(),
        train_matrix_columns=train_matrix_columns,
    )

    # assert
    # 1. that the returned predictions are of the desired length
    assert len(predict_proba) == 2

    # 2. that no predictions table entries were saved, since
    # save_predictions was set to False
    assert not table_has_data(f"{matrix_type}_predictions", db_engine)
def test_retry_recovery(self):
    db_engine = None
    trainer = None
    port = None
    with rig_engines() as (db_engine, project_storage):
        port = db_engine.url.port
        trainer = ModelTrainer(
            experiment_hash=None,
            model_storage_engine=project_storage.model_storage_engine(),
            db_engine=db_engine,
            model_grouper=ModelGrouper(),
        )
        matrix_store = get_matrix_store(project_storage)

    # start without a database server
    # then bring it back up after the first sleep
    # use self so it doesn't go out of scope too early and shut down
    self.new_server = None

    def replace_db(arg):
        self.new_server = testing.postgresql.Postgresql(port=port)
        db_engine = create_engine(self.new_server.url())
        ensure_db(db_engine)
        init_engine(db_engine)
        get_matrix_store(project_storage)

    with patch("time.sleep") as time_mock:
        time_mock.side_effect = replace_db
        try:
            trainer.train_models(grid_config(), dict(), matrix_store)
        finally:
            if self.new_server is not None:
                self.new_server.stop()
        assert len(time_mock.mock_calls) == 1
def test_n_jobs_not_new_model(default_model_trainer):
    grid_config = {
        "sklearn.ensemble.AdaBoostClassifier": {"n_estimators": [10, 100, 1000]},
        "sklearn.ensemble.RandomForestClassifier": {
            "n_estimators": [10, 100],
            "max_features": ["sqrt", "log2"],
            "max_depth": [5, 10, 15, 20],
            "criterion": ["gini", "entropy"],
            "n_jobs": [12, 24],
        },
    }

    trainer = default_model_trainer
    project_storage = trainer.model_storage_engine.project_storage
    db_engine = trainer.db_engine

    train_tasks = trainer.generate_train_tasks(
        grid_config, dict(), get_matrix_store(project_storage)
    )

    # 32 RandomForest combinations + 3 AdaBoost; this would be (32 * 2) + 3
    # if we didn't remove the duplicate tasks that differ only by n_jobs
    assert len(train_tasks) == 35
    assert len([task for task in train_tasks if "n_jobs" in task["parameters"]]) == 32

    for train_task in train_tasks:
        trainer.process_train_task(**train_task)

    # since n_jobs is a runtime attribute of the model, it should not
    # make it into the model group
    for row in db_engine.execute(
        "select hyperparameters from model_metadata.model_groups"
    ):
        assert "n_jobs" not in row[0]
def test_custom_groups(grid_config, db_engine_with_results_schema, project_storage):
    model_storage_engine = project_storage.model_storage_engine()
    experiment_hash = save_experiment_and_get_hash(
        config={'foo': 'bar'},
        db_engine=db_engine_with_results_schema,
    )
    run_id = initialize_tracking_and_get_run_id(
        experiment_hash,
        experiment_class_path="",
        random_seed=5,
        experiment_kwargs={},
        db_engine=db_engine_with_results_schema,
    )
    trainer = ModelTrainer(
        experiment_hash=experiment_hash,
        model_storage_engine=model_storage_engine,
        model_grouper=ModelGrouper(["class_path"]),
        db_engine=db_engine_with_results_schema,
        run_id=run_id,
    )
    # create training set
    model_ids = trainer.train_models(
        grid_config=grid_config,
        misc_db_parameters=dict(),
        matrix_store=get_matrix_store(project_storage),
    )
    # since we grouped only on class_path, expect only one model group
    records = [
        row[0]
        for row in db_engine_with_results_schema.execute(
            "select distinct model_group_id from triage_metadata.models"
        )
    ]
    assert len(records) == 1
    assert records[0] == model_ids[0]
def test_uniform_distribution():
    with rig_engines() as (db_engine, project_storage):
        model = ModelFactory()
        feature_importances = [
            FeatureImportanceFactory(model_rel=model, feature="feature_{}".format(i))
            for i in range(0, 10)
        ]
        data_dict = {
            "entity_id": [1, 1],
            "as_of_date": ["2016-01-01", "2017-01-01"],
            "label": [0, 1],
        }
        for imp in feature_importances:
            data_dict[imp.feature] = [0.5, 0.5]
        metadata = matrix_metadata_creator()
        test_store = get_matrix_store(
            project_storage,
            pandas.DataFrame.from_dict(data_dict),
            metadata,
        )
        results = uniform_distribution(
            db_engine,
            model_id=model.model_id,
            as_of_date=datetime.date(2016, 1, 1),
            test_matrix_store=test_store,
            n_ranks=5,
        )

        assert len(results) == 5  # 5 features x 1 entity for this as_of_date
        for result in results:
            assert "entity_id" in result
            assert "feature_name" in result
            assert "score" in result
            assert "feature_value" in result
            assert result["feature_value"] == 0.5
            assert result["score"] >= 0
            assert result["score"] <= 1
            assert isinstance(result["feature_name"], str)
            assert result["entity_id"] == 1  # only entity 1 is present on this date
def test_predictor_save_predictions(matrix_type, predict_setup_args):
    (project_storage, db_engine, model_id) = predict_setup_args

    # if save_predictions is set to False, don't save
    predictor = Predictor(
        project_storage.model_storage_engine(), db_engine, save_predictions=False
    )
    matrix = matrix_creator(index="entity_id")
    metadata = matrix_metadata_creator(
        end_time=AS_OF_DATE, matrix_type=matrix_type, indices=["entity_id"]
    )
    matrix_store = get_matrix_store(project_storage, matrix, metadata)
    train_matrix_columns = matrix.columns[0:-1].tolist()

    predict_proba = predictor.predict(
        model_id,
        matrix_store,
        misc_db_parameters=dict(),
        train_matrix_columns=train_matrix_columns,
    )

    # assert
    # 1. that the returned predictions are of the desired length
    assert len(predict_proba) == 2

    # 2. that no predictions table entries were saved, since
    # save_predictions was set to False
    assert not table_has_data(f"{matrix_type}_predictions", db_engine)
def test_predictor_get_train_columns():
    with prepare() as (project_storage, db_engine, model_id):
        predictor = Predictor(project_storage.model_storage_engine(), db_engine)

        train_store = get_matrix_store(
            project_storage=project_storage,
            matrix=matrix_creator(),
            metadata=matrix_metadata_creator(matrix_type="train"),
        )

        # flip the order of some feature columns in the test matrix
        other_order_matrix = matrix_creator()
        order = other_order_matrix.columns.tolist()
        order[0], order[1] = order[1], order[0]
        other_order_matrix = other_order_matrix[order]
        test_store = get_matrix_store(
            project_storage=project_storage,
            matrix=other_order_matrix,
            metadata=matrix_metadata_creator(matrix_type="test"),
        )

        # Runs the same test for training and testing predictions
        for store, mat_type in zip((train_store, test_store), ("train", "test")):
            predict_proba = predictor.predict(
                model_id,
                store,
                misc_db_parameters=dict(),
                train_matrix_columns=train_store.columns(),
            )

            # assert
            # 1. that we calculated predictions
            assert len(predict_proba) > 0

            # 2. that the predictions table entries are present and
            # can be linked to the original models
            records = [
                row
                for row in db_engine.execute(
                    """select entity_id, as_of_date
                    from {}_results.predictions
                    join model_metadata.models using (model_id)""".format(mat_type)
                )
            ]
            assert len(records) > 0
def test_predictor_retrieve():
    with prepare() as (project_storage, db_engine, model_id):
        predictor = Predictor(
            project_storage.model_storage_engine(), db_engine, replace=False
        )

        # create prediction set
        matrix = matrix_creator()
        metadata = matrix_metadata_creator()
        matrix_store = get_matrix_store(project_storage, matrix, metadata)

        predict_proba = predictor.predict(
            model_id,
            matrix_store,
            misc_db_parameters=dict(),
            train_matrix_columns=matrix.columns[0:-1].tolist(),
        )

        # When run again, the predictions retrieved from the database
        # should match.
        #
        # Some trickiness here. Let's explain:
        #
        # If we are not careful, retrieving predictions from the database and
        # presenting them as a numpy array can result in a bad ordering,
        # since the given matrix may not be 'ordered' by some criteria
        # that can be easily represented by an ORDER BY clause.
        #
        # It will sometimes work, because without ORDER BY you will get
        # it back in the table's physical order, which unless something has
        # happened to the table will be the order you inserted it,
        # which could very well be the order in the matrix.
        # So it's not a bug that would necessarily immediately show itself,
        # but when it does go wrong your scores will be garbage.
        #
        # So we simulate a table order mutation that can happen over time:
        # remove the first row and put it at the end.
        # If the Predictor doesn't explicitly reorder the results, this will fail.
        # Only running on TestPrediction because TrainPrediction behaves
        # the exact same way.
        reorder_session = sessionmaker(bind=db_engine)()
        obj = reorder_session.query(TestPrediction).first()
        reorder_session.delete(obj)
        reorder_session.commit()

        make_transient(obj)
        reorder_session = sessionmaker(bind=db_engine)()
        reorder_session.add(obj)
        reorder_session.commit()

        predictor.load_model = Mock()
        new_predict_proba = predictor.predict(
            model_id,
            matrix_store,
            misc_db_parameters=dict(),
            train_matrix_columns=matrix.columns[0:-1].tolist(),
        )
        assert_array_equal(new_predict_proba, predict_proba)
        assert not predictor.load_model.called
def test_calculate_and_save():
    with rig_engines() as (db_engine, project_storage):
        train_store = get_matrix_store(
            project_storage,
            matrix_creator(),
            matrix_metadata_creator(matrix_type='train'),
        )
        test_store = get_matrix_store(
            project_storage,
            matrix_creator(),
            matrix_metadata_creator(matrix_type='test'),
        )
        calculator = IndividualImportanceCalculator(
            db_engine, methods=['sample'], replace=False
        )

        # given a trained model and a test matrix
        _, model_id = fake_trained_model(
            db_engine, train_matrix_uuid=train_store.uuid
        )

        # i expect to be able to call calculate and save
        calculator.calculate_and_save_all_methods_and_dates(model_id, test_store)

        # and find individual importances in the results schema afterwards
        records = [
            row
            for row in db_engine.execute(
                '''select entity_id, as_of_date
                from test_results.individual_importances
                join model_metadata.models using (model_id)'''
            )
        ]
        assert len(records) > 0

        # and that when run again, it produces the same result
        calculator.calculate_and_save_all_methods_and_dates(model_id, test_store)
        new_records = [
            row
            for row in db_engine.execute(
                '''select entity_id, as_of_date
                from test_results.individual_importances
                join model_metadata.models using (model_id)'''
            )
        ]
        assert len(records) == len(new_records)
        assert records == new_records
def test_n_jobs_not_new_model(default_model_trainer):
    grid_config = {
        "sklearn.ensemble.AdaBoostClassifier": {"n_estimators": [10, 100, 1000]},
        "sklearn.ensemble.RandomForestClassifier": {
            "n_estimators": [10, 100],
            "max_features": ["sqrt", "log2"],
            "max_depth": [5, 10, 15, 20],
            "criterion": ["gini", "entropy"],
            "n_jobs": [12],
        },
    }

    trainer = default_model_trainer
    project_storage = trainer.model_storage_engine.project_storage
    db_engine = trainer.db_engine

    # generate train tasks, with a specific random seed so that we can compare
    # apples to apples later
    random.seed(5)
    train_tasks = trainer.generate_train_tasks(
        grid_config, dict(), get_matrix_store(project_storage)
    )

    for train_task in train_tasks:
        trainer.process_train_task(**train_task)

    # since n_jobs is a runtime attribute of the model, it should not make it
    # into the model group
    for row in db_engine.execute(
        "select hyperparameters from triage_metadata.model_groups"
    ):
        assert "n_jobs" not in row[0]

    hashes = set(task['model_hash'] for task in train_tasks)

    # generate the grid again with a different n_jobs (but the same random seed!)
    # and make sure that the hashes are the same as before
    random.seed(5)
    grid_config['sklearn.ensemble.RandomForestClassifier']['n_jobs'] = [24]
    new_train_tasks = trainer.generate_train_tasks(
        grid_config, dict(), get_matrix_store(project_storage)
    )
    assert hashes == set(task['model_hash'] for task in new_train_tasks)
def setup_model_train_tester(
    project_storage, replace, additional_bigtrain_classnames=None
):
    matrix_storage_engine = MatrixStorageEngine(project_storage)
    train_matrix_store = get_matrix_store(
        project_storage,
        metadata=matrix_metadata_creator(matrix_type="train"),
        write_to_db=False,
    )
    test_matrix_store = get_matrix_store(
        project_storage,
        metadata=matrix_metadata_creator(matrix_type="test"),
        write_to_db=False,
    )
    sample_train_kwargs = {
        'matrix_store': train_matrix_store,
        'class_path': None,
        'parameters': {},
        'model_hash': None,
        'misc_db_parameters': {},
    }
    train_test_task = {
        'train_kwargs': sample_train_kwargs,
        'train_store': train_matrix_store,
        'test_store': test_matrix_store,
    }

    predictor = MagicMock(spec_set=Predictor)
    trainer = MagicMock(spec_set=ModelTrainer)
    evaluator = MagicMock(spec_set=ModelEvaluator)
    individual_importance_calculator = MagicMock(
        spec_set=IndividualImportanceCalculator
    )
    protected_groups_generator = MagicMock(spec_set=ProtectedGroupsGenerator)
    train_tester = ModelTrainTester(
        matrix_storage_engine=matrix_storage_engine,
        model_trainer=trainer,
        model_evaluator=evaluator,
        individual_importance_calculator=individual_importance_calculator,
        predictor=predictor,
        subsets=[],
        replace=replace,
        protected_groups_generator=protected_groups_generator,
        additional_bigtrain_classnames=additional_bigtrain_classnames,
    )
    return train_tester, train_test_task
def test_ModelTrainTester_generate_tasks(
    db_engine_with_results_schema,
    project_storage,
    sample_timechop_splits,
    sample_grid_config,
):
    db_engine = db_engine_with_results_schema
    model_storage_engine = ModelStorageEngine(project_storage)
    matrix_storage_engine = MatrixStorageEngine(project_storage)
    sample_matrix_store = get_matrix_store(project_storage)
    experiment_hash = save_experiment_and_get_hash({}, db_engine)
    run_id = initialize_tracking_and_get_run_id(
        experiment_hash,
        experiment_class_path="",
        random_seed=5,
        experiment_kwargs={},
        db_engine=db_engine_with_results_schema,
    )
    # instantiate pipeline objects
    trainer = ModelTrainer(
        experiment_hash=experiment_hash,
        model_storage_engine=model_storage_engine,
        db_engine=db_engine,
        run_id=run_id,
    )
    train_tester = ModelTrainTester(
        matrix_storage_engine=matrix_storage_engine,
        model_trainer=trainer,
        model_evaluator=None,
        individual_importance_calculator=None,
        predictor=None,
        subsets=None,
        protected_groups_generator=None,
    )
    with patch.object(
        matrix_storage_engine, 'get_store', return_value=sample_matrix_store
    ):
        batches = train_tester.generate_task_batches(
            splits=sample_timechop_splits, grid_config=sample_grid_config
        )
        assert len(batches) == 3

        # we expect to have a task for each combination of split and classifier
        flattened_tasks = list(task for batch in batches for task in batch.tasks)
        assert len(flattened_tasks) == len(sample_timechop_splits) * len(
            list(flatten_grid_config(sample_grid_config))
        )

        # we also expect each task to match the call signature of process_task
        with patch.object(train_tester, 'process_task', autospec=True):
            for task in flattened_tasks:
                train_tester.process_task(**task)
def test_predictor_composite_index():
    with prepare() as (project_storage, db_engine, model_id):
        predictor = Predictor(project_storage.model_storage_engine(), db_engine)
        dayone = datetime.datetime(2011, 1, 1)
        daytwo = datetime.datetime(2011, 1, 2)
        source_dict = {
            "entity_id": [1, 2, 1, 2],
            "as_of_date": [dayone, dayone, daytwo, daytwo],
            "feature_one": [3, 4, 5, 6],
            "feature_two": [5, 6, 7, 8],
            "label": [7, 8, 8, 7],
        }

        # Runs the same test for training and testing predictions
        for mat_type in ("train", "test"):
            matrix = pandas.DataFrame.from_dict(source_dict).set_index(
                ["entity_id", "as_of_date"]
            )
            metadata = matrix_metadata_creator(matrix_type=mat_type)
            matrix_store = get_matrix_store(project_storage, matrix, metadata)

            predict_proba = predictor.predict(
                model_id,
                matrix_store,
                misc_db_parameters=dict(),
                train_matrix_columns=["feature_one", "feature_two"],
            )

            # assert
            # 1. that the returned predictions are of the desired length
            assert len(predict_proba) == 4

            # 2. that the predictions table entries are present and
            # can be linked to the original models
            records = [
                row
                for row in db_engine.execute(
                    """select entity_id, as_of_date
                    from {}_results.predictions
                    join model_metadata.models using (model_id)""".format(mat_type)
                )
            ]
            assert len(records) == 4
def test_retry_max(self):
    db_engine = None
    trainer = None
    # set up a basic model training run
    with rig_engines() as (db_engine, project_storage):
        trainer = ModelTrainer(
            experiment_hash=None,
            model_storage_engine=project_storage.model_storage_engine(),
            db_engine=db_engine,
            model_grouper=ModelGrouper(),
        )
        matrix_store = get_matrix_store(project_storage)

    # the postgres server goes out of scope here and thus no longer exists
    with patch('time.sleep') as time_mock:
        with self.assertRaises(sqlalchemy.exc.OperationalError):
            trainer.train_models(grid_config(), dict(), matrix_store)
        # we want to make sure that we are using the retrying module sanely,
        # as opposed to matching the exact number of calls specified by the code
        assert len(time_mock.mock_calls) > 5
def test_baseline_exception_handling():
    grid_config = {
        'triage.component.catwalk.baselines.rankers.PercentileRankOneFeature': {
            'feature': ['feature_one', 'feature_three']
        }
    }
    with rig_engines() as (db_engine, project_storage):
        trainer = ModelTrainer(
            experiment_hash=None,
            model_storage_engine=project_storage.model_storage_engine(),
            db_engine=db_engine,
            model_grouper=ModelGrouper(),
        )

        train_tasks = trainer.generate_train_tasks(
            grid_config, dict(), get_matrix_store(project_storage)
        )

        model_ids = []
        for train_task in train_tasks:
            model_ids.append(trainer.process_train_task(**train_task))
        # 'feature_three' is not in the matrix, so the second baseline fails
        # to train; the exception is handled and None is returned in its place
        assert model_ids == [1, None]
def test_custom_groups(grid_config):
    with rig_engines() as (db_engine, project_storage):
        # create training set
        model_storage_engine = project_storage.model_storage_engine()
        trainer = ModelTrainer(
            experiment_hash=None,
            model_storage_engine=model_storage_engine,
            model_grouper=ModelGrouper(['class_path']),
            db_engine=db_engine,
        )
        model_ids = trainer.train_models(
            grid_config=grid_config,
            misc_db_parameters=dict(),
            matrix_store=get_matrix_store(project_storage),
        )
        # expect only one model group now
        records = [
            row[0]
            for row in db_engine.execute(
                'select distinct model_group_id from model_metadata.models'
            )
        ]
        assert len(records) == 1
        assert records[0] == model_ids[0]
def test_predictor_needs_predictions(matrix_type, predict_setup_args):
    """Test the logic that figures out whether predictions are needed
    for a given model/matrix"""
    (project_storage, db_engine, model_id) = predict_setup_args

    # if not all of the predictions for the given model id and matrix are
    # present in the db, needs_predictions should return True; else, False
    predictor = Predictor(project_storage.model_storage_engine(), db_engine, 'worst')
    metadata = matrix_metadata_creator(matrix_type=matrix_type)
    matrix_store = get_matrix_store(project_storage, metadata=metadata)
    train_matrix_columns = matrix_store.columns()

    # we haven't done anything yet, this should definitely need predictions
    assert predictor.needs_predictions(matrix_store, model_id)

    predictor.predict(
        model_id,
        matrix_store,
        misc_db_parameters=dict(),
        train_matrix_columns=train_matrix_columns,
    )

    # now that predictions have been made, this should no longer need predictions
    assert not predictor.needs_predictions(matrix_store, model_id)
def test_n_jobs_not_new_model():
    grid_config = {
        'sklearn.ensemble.AdaBoostClassifier': {'n_estimators': [10, 100, 1000]},
        'sklearn.ensemble.RandomForestClassifier': {
            'n_estimators': [10, 100],
            'max_features': ['sqrt', 'log2'],
            'max_depth': [5, 10, 15, 20],
            'criterion': ['gini', 'entropy'],
            'n_jobs': [12, 24],
        },
    }
    with rig_engines() as (db_engine, project_storage):
        model_storage_engine = project_storage.model_storage_engine()
        trainer = ModelTrainer(
            experiment_hash=None,
            model_storage_engine=model_storage_engine,
            db_engine=db_engine,
            model_grouper=ModelGrouper(),
        )
        train_tasks = trainer.generate_train_tasks(
            grid_config,
            dict(),
            get_matrix_store(project_storage),
        )

        # 32 RandomForest combinations + 3 AdaBoost; this would be (32 * 2) + 3
        # if we didn't remove the duplicate tasks that differ only by n_jobs
        assert len(train_tasks) == 35
        assert len([
            task for task in train_tasks if 'n_jobs' in task['parameters']
        ]) == 32

        for train_task in train_tasks:
            trainer.process_train_task(**train_task)

        # since n_jobs is a runtime attribute of the model, it should not
        # make it into the model group
        for row in db_engine.execute(
            'select hyperparameters from model_metadata.model_groups'
        ):
            assert 'n_jobs' not in row[0]
def test_uniform_distribution_entity_id_index():
    with rig_engines() as (db_engine, project_storage):
        model = ModelFactory()
        feature_importances = [
            FeatureImportanceFactory(model_rel=model, feature="feature_{}".format(i))
            for i in range(0, 10)
        ]
        data_dict = {"entity_id": [1, 2]}
        for imp in feature_importances:
            data_dict[imp.feature] = [0.5, 0.5]
        metadata = matrix_metadata_creator(indices="entity_id")
        test_store = get_matrix_store(
            project_storage,
            pandas.DataFrame.from_dict(data_dict).set_index(metadata["indices"]),
            metadata,
        )
        results = uniform_distribution(
            db_engine,
            model_id=model.model_id,
            as_of_date="2016-01-01",
            test_matrix_store=test_store,
            n_ranks=5,
        )

        assert len(results) == 10  # 5 features x 2 entities
        for result in results:
            assert "entity_id" in result
            assert "feature_name" in result
            assert "score" in result
            assert "feature_value" in result
            assert result["feature_value"] == 0.5
            assert result["score"] >= 0
            assert result["score"] <= 1
            assert isinstance(result["feature_name"], str)
            assert result["entity_id"] in [1, 2]
def prediction_results(matrix_type, predictor, predict_setup_args):
    (project_storage, db_engine, model_id) = predict_setup_args

    dayone = datetime.datetime(2011, 1, 1)
    daytwo = datetime.datetime(2011, 1, 2)
    source_dict = {
        "entity_id": [1, 2, 3, 1, 2, 3],
        "as_of_date": [dayone, dayone, dayone, daytwo, daytwo, daytwo],
        "feature_one": [3] * 6,
        "feature_two": [5] * 6,
        "label": [True, False] * 3,
    }
    matrix = pd.DataFrame.from_dict(source_dict)
    metadata = matrix_metadata_creator(matrix_type=matrix_type)
    matrix_store = get_matrix_store(project_storage, matrix, metadata)
    predict_proba = predictor.predict(
        model_id,
        matrix_store,
        misc_db_parameters=dict(),
        train_matrix_columns=["feature_one", "feature_two"],
    )
    return predict_proba
def test_predictor_needs_predictions(matrix_type, predict_setup_args):
    (project_storage, db_engine, model_id) = predict_setup_args

    # if not all of the predictions for the given model id and matrix are
    # present in the db, needs_predictions should return True; else, False
    predictor = Predictor(project_storage.model_storage_engine(), db_engine)
    matrix = matrix_creator(index="entity_id")
    metadata = matrix_metadata_creator(
        end_time=AS_OF_DATE, matrix_type=matrix_type, indices=["entity_id"]
    )
    matrix_store = get_matrix_store(project_storage, matrix, metadata)
    train_matrix_columns = matrix.columns[0:-1].tolist()

    # we haven't done anything yet, this should definitely need predictions
    assert predictor.needs_predictions(matrix_store, model_id)

    predictor.predict(
        model_id,
        matrix_store,
        misc_db_parameters=dict(),
        train_matrix_columns=train_matrix_columns,
    )

    # now that predictions have been made, this should no longer need predictions
    assert not predictor.needs_predictions(matrix_store, model_id)
def test_predictor_entity_index():
    with prepare() as (project_storage, db_engine, model_id):
        predictor = Predictor(project_storage.model_storage_engine(), db_engine)

        # Runs the same test for training and testing predictions
        for mat_type in ("train", "test"):
            matrix = matrix_creator(index="entity_id")
            metadata = matrix_metadata_creator(
                end_time=AS_OF_DATE, matrix_type=mat_type, indices=["entity_id"]
            )
            matrix_store = get_matrix_store(project_storage, matrix, metadata)
            train_matrix_columns = matrix.columns[0:-1].tolist()

            predict_proba = predictor.predict(
                model_id,
                matrix_store,
                misc_db_parameters=dict(),
                train_matrix_columns=train_matrix_columns,
            )

            # assert
            # 1. that the returned predictions are of the desired length
            assert len(predict_proba) == 2

            # 2. that the predictions table entries are present and
            # can be linked to the original models
            records = [
                row
                for row in db_engine.execute(
                    """select entity_id, as_of_date
                    from {}_results.predictions
                    join model_metadata.models using (model_id)""".format(mat_type)
                )
            ]
            assert len(records) == 2

            # 3. that the contained as_of_dates match what we sent in
            for record in records:
                assert record[1].date() == AS_OF_DATE

            # 4. that the entity ids match the given dataset
            assert sorted([record[0] for record in records]) == [1, 2]

        # 5. running with the same model_id and a different as-of date, then
        # with the same as-of date, only replaces the records with the same date
        #
        # Runs the same test for training and testing predictions
        for mat_type in ("train", "test"):
            new_matrix = matrix_creator(index="entity_id")
            new_metadata = matrix_metadata_creator(
                end_time=AS_OF_DATE + datetime.timedelta(days=1),
                matrix_type=mat_type,
                indices=["entity_id"],
            )
            new_matrix_store = get_matrix_store(
                project_storage, new_matrix, new_metadata
            )
            predictor.predict(
                model_id,
                new_matrix_store,
                misc_db_parameters=dict(),
                train_matrix_columns=train_matrix_columns,
            )
            predictor.predict(
                model_id,
                matrix_store,
                misc_db_parameters=dict(),
                train_matrix_columns=train_matrix_columns,
            )
            records = [
                row
                for row in db_engine.execute(
                    """select entity_id, as_of_date
                    from {}_results.predictions
                    join model_metadata.models using (model_id)""".format(mat_type)
                )
            ]
            assert len(records) == 4

        # 6. that we can delete the model when done predicting with it
        predictor.delete_model(model_id)
        assert predictor.load_model(model_id) is None
def test_model_trainer(grid_config):
    with rig_engines() as (db_engine, project_storage):
        # creates a matrix entry in the matrices table with uuid from the metadata
        model_storage_engine = project_storage.model_storage_engine()
        trainer = ModelTrainer(
            experiment_hash=None,
            model_storage_engine=model_storage_engine,
            model_grouper=ModelGrouper(),
            db_engine=db_engine,
        )
        model_ids = trainer.train_models(
            grid_config=grid_config,
            misc_db_parameters=dict(),
            matrix_store=get_matrix_store(project_storage),
        )

        # assert
        # 1. that the models and feature importances table entries are present
        records = [
            row
            for row in db_engine.execute(
                'select * from train_results.feature_importances'
            )
        ]
        assert len(records) == 4 * 2  # 4 models x 2 features

        records = [
            row
            for row in db_engine.execute(
                'select model_hash from model_metadata.models'
            )
        ]
        assert len(records) == 4
        hashes = [row[0] for row in records]

        # 2. that the model groups are distinct
        records = [
            row
            for row in db_engine.execute(
                'select distinct model_group_id from model_metadata.models'
            )
        ]
        assert len(records) == 4

        # 3. that the model sizes are saved in the table and all are < 1 kB
        records = [
            row
            for row in db_engine.execute(
                'select model_size from model_metadata.models'
            )
        ]
        assert len(records) == 4
        for row in records:
            size = row[0]
            assert size < 1

        # 4. that all four models are cached
        model_pickles = [
            model_storage_engine.load(model_hash) for model_hash in hashes
        ]
        assert len(model_pickles) == 4
        assert len([x for x in model_pickles if x is not None]) == 4

        # 5. that predictions can be made with each of them
        test_matrix = pandas.DataFrame.from_dict({
            'entity_id': [3, 4],
            'feature_one': [4, 4],
            'feature_two': [6, 5],
        }).set_index('entity_id')

        for model_pickle in model_pickles:
            predictions = model_pickle.predict(test_matrix)
            assert len(predictions) == 2

        # 6. when run again, the same models are returned
        new_model_ids = trainer.train_models(
            grid_config=grid_config,
            misc_db_parameters=dict(),
            matrix_store=get_matrix_store(project_storage),
        )
        assert len([
            row
            for row in db_engine.execute(
                'select model_hash from model_metadata.models'
            )
        ]) == 4
        assert model_ids == new_model_ids

        # 7. if replace is set, update non-unique attributes and feature importances
        max_batch_run_time = [
            row[0]
            for row in db_engine.execute(
                'select max(batch_run_time) from model_metadata.models'
            )
        ][0]
        trainer = ModelTrainer(
            experiment_hash=None,
            model_storage_engine=model_storage_engine,
            model_grouper=ModelGrouper(
                model_group_keys=['label_name', 'label_timespan']
            ),
            db_engine=db_engine,
            replace=True,
        )
        new_model_ids = trainer.train_models(
            grid_config=grid_config,
            misc_db_parameters=dict(),
            matrix_store=get_matrix_store(project_storage),
        )
        assert model_ids == new_model_ids
        assert [
            row['model_id']
            for row in db_engine.execute(
                'select model_id from model_metadata.models order by 1 asc'
            )
        ] == model_ids
        new_max_batch_run_time = [
            row[0]
            for row in db_engine.execute(
                'select max(batch_run_time) from model_metadata.models'
            )
        ][0]
        assert new_max_batch_run_time > max_batch_run_time

        records = [
            row
            for row in db_engine.execute(
                'select * from train_results.feature_importances'
            )
        ]
        assert len(records) == 4 * 2  # 4 models x 2 features

        # 8. if the cache is missing but the metadata is still there,
        # reuse the metadata
        for row in db_engine.execute(
            'select model_hash from model_metadata.models'
        ):
            model_storage_engine.delete(row[0])
        new_model_ids = trainer.train_models(
            grid_config=grid_config,
            misc_db_parameters=dict(),
            matrix_store=get_matrix_store(project_storage),
        )
        assert model_ids == sorted(new_model_ids)

        # 9. that the generator interface works the same way
        new_model_ids = trainer.generate_trained_models(
            grid_config=grid_config,
            misc_db_parameters=dict(),
            matrix_store=get_matrix_store(project_storage),
        )
        assert model_ids == sorted([model_id for model_id in new_model_ids])
def test_reuse_model_random_seeds(grid_config, default_model_trainer):
    trainer = default_model_trainer
    db_engine = trainer.db_engine
    project_storage = trainer.model_storage_engine.project_storage
    model_storage_engine = trainer.model_storage_engine

    # re-using the random seeds requires the association between experiments
    # and models to exist, which we're not getting in these tests since we
    # aren't using the experiment architecture, so back-fill these
    # associations after each train_models() run
    def update_experiment_models(db_engine):
        sql = """
            INSERT INTO triage_metadata.experiment_models (experiment_hash, model_hash)
            SELECT er.run_hash, m.model_hash
            FROM triage_metadata.models m
            LEFT JOIN triage_metadata.triage_runs er
                ON m.built_in_triage_run = er.id
            LEFT JOIN triage_metadata.experiment_models em
                ON m.model_hash = em.model_hash
                AND er.run_hash = em.experiment_hash
            WHERE em.experiment_hash IS NULL
        """
        db_engine.execute(sql)
        db_engine.execute('COMMIT;')

    random.seed(5)
    model_ids = trainer.train_models(
        grid_config=grid_config,
        misc_db_parameters=dict(),
        matrix_store=get_matrix_store(project_storage),
    )
    update_experiment_models(db_engine)

    # simulate running a new experiment where the experiment hash has changed
    # (e.g. because the model grid is different), but the experiment seed is
    # the same, so previously-trained models should not get new seeds
    experiment_hash = save_experiment_and_get_hash(
        config={'baz': 'qux'},
        db_engine=db_engine,
    )
    run_id = initialize_tracking_and_get_run_id(
        experiment_hash,
        experiment_class_path="",
        random_seed=5,
        experiment_kwargs={},
        db_engine=db_engine,
    )
    trainer = ModelTrainer(
        experiment_hash=experiment_hash,
        model_storage_engine=model_storage_engine,
        db_engine=db_engine,
        model_grouper=ModelGrouper(),
        run_id=run_id,
    )
    new_grid = grid_config.copy()
    new_grid['sklearn.tree.DecisionTreeClassifier']['min_samples_split'] = [3, 10, 100]
    random.seed(5)
    new_model_ids = trainer.train_models(
        grid_config=new_grid,
        misc_db_parameters=dict(),
        matrix_store=get_matrix_store(project_storage),
    )
    update_experiment_models(db_engine)

    # should have received 6 models
    assert len(new_model_ids) == 6
    # all the original model ids should be in the new set
    assert len(set(new_model_ids) & set(model_ids)) == len(model_ids)

    # however, we should NOT re-use the random seeds (and so should get new
    # model_ids) if the experiment-level seed is different
    experiment_hash = save_experiment_and_get_hash(
        config={'lorem': 'ipsum'},
        db_engine=db_engine,
    )
    run_id = initialize_tracking_and_get_run_id(
        experiment_hash,
        experiment_class_path="",
        random_seed=42,
        experiment_kwargs={},
        db_engine=db_engine,
    )
    trainer = ModelTrainer(
        experiment_hash=experiment_hash,
        model_storage_engine=model_storage_engine,
        db_engine=db_engine,
        model_grouper=ModelGrouper(),
        run_id=run_id,
    )
    random.seed(42)  # different from above
    newer_model_ids = trainer.train_models(
        grid_config=new_grid,
        misc_db_parameters=dict(),
        matrix_store=get_matrix_store(project_storage),
    )
    update_experiment_models(db_engine)

    # should get entirely new models now (different IDs)
    assert len(newer_model_ids) == 6
    assert len(set(new_model_ids) & set(newer_model_ids)) == 0