def test_retry_recovery(self):
    db_engine = None
    trainer = None
    port = None
    with rig_engines() as (db_engine, project_storage):
        port = db_engine.url.port
        trainer = ModelTrainer(
            experiment_hash=None,
            model_storage_engine=project_storage.model_storage_engine(),
            db_engine=db_engine,
            model_grouper=ModelGrouper(),
        )
        matrix_store = get_matrix_store(project_storage)

    # start without a database server
    # then bring it back up after the first sleep
    # use self so it doesn't go out of scope too early and shut down
    self.new_server = None

    def replace_db(arg):
        self.new_server = testing.postgresql.Postgresql(port=port)
        db_engine = create_engine(self.new_server.url())
        ensure_db(db_engine)
        init_engine(db_engine)
        get_matrix_store(project_storage)

    with patch("time.sleep") as time_mock:
        time_mock.side_effect = replace_db
        try:
            trainer.train_models(grid_config(), dict(), matrix_store)
        finally:
            if self.new_server is not None:
                self.new_server.stop()
        assert len(time_mock.mock_calls) == 1

def test_uniform_distribution():
    with rig_engines() as (db_engine, project_storage):
        model = ModelFactory()
        feature_importances = [
            FeatureImportanceFactory(model_rel=model, feature="feature_{}".format(i))
            for i in range(0, 10)
        ]
        data_dict = {
            "entity_id": [1, 1],
            "as_of_date": ["2016-01-01", "2017-01-01"],
            "label": [0, 1],
        }
        for imp in feature_importances:
            data_dict[imp.feature] = [0.5, 0.5]
        metadata = matrix_metadata_creator()
        test_store = get_matrix_store(
            project_storage,
            pandas.DataFrame.from_dict(data_dict),
            metadata,
        )
        results = uniform_distribution(
            db_engine,
            model_id=model.model_id,
            as_of_date=datetime.date(2016, 1, 1),
            test_matrix_store=test_store,
            n_ranks=5,
        )

        assert len(results) == 5  # 5 features x 1 entity for this as_of_date
        for result in results:
            assert "entity_id" in result
            assert "feature_name" in result
            assert "score" in result
            assert "feature_value" in result
            assert result["feature_value"] == 0.5
            assert result["score"] >= 0
            assert result["score"] <= 1
            assert isinstance(result["feature_name"], str)
            assert result["entity_id"] in [1, 2]

def test_retry_max(self):
    db_engine = None
    trainer = None
    # set up a basic model training run
    with rig_engines() as (db_engine, project_storage):
        trainer = ModelTrainer(
            experiment_hash=None,
            model_storage_engine=project_storage.model_storage_engine(),
            db_engine=db_engine,
            model_grouper=ModelGrouper(),
        )
        matrix_store = get_matrix_store(project_storage)

    # the postgres server goes out of scope here and thus no longer exists
    with patch('time.sleep') as time_mock:
        with self.assertRaises(sqlalchemy.exc.OperationalError):
            trainer.train_models(grid_config(), dict(), matrix_store)
        # we want to make sure that we are using the retrying module sanely
        # as opposed to matching the exact # of calls specified by the code
        assert len(time_mock.mock_calls) > 5

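# For context on the retry tests above (test_retry_recovery and test_retry_max):
# the comment in test_retry_max says the trainer relies on the `retrying`
# module, which waits between attempts via time.sleep -- patching time.sleep is
# therefore how these tests count attempts and inject recovery. The decorator
# below is an illustrative sketch with made-up parameter values and a
# hypothetical helper name, not catwalk's actual configuration.
import sqlalchemy.exc
from retrying import retry

@retry(
    retry_on_exception=lambda exc: isinstance(exc, sqlalchemy.exc.OperationalError),
    wait_fixed=1000,  # milliseconds; retrying waits via time.sleep
    stop_max_attempt_number=10,
)
def _write_with_retry(db_engine, statement):
    """Hypothetical helper: execute a statement, retrying on connection loss."""
    db_engine.execute(statement)
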
def test_calculate_and_save():
    with rig_engines() as (db_engine, project_storage):
        train_store = get_matrix_store(
            project_storage,
            matrix_creator(),
            matrix_metadata_creator(matrix_type='train'),
        )
        test_store = get_matrix_store(
            project_storage,
            matrix_creator(),
            matrix_metadata_creator(matrix_type='test'),
        )
        calculator = IndividualImportanceCalculator(
            db_engine, methods=['sample'], replace=False
        )
        # given a trained model
        # and a test matrix
        _, model_id = fake_trained_model(
            db_engine, train_matrix_uuid=train_store.uuid
        )

        # i expect to be able to call calculate and save
        calculator.calculate_and_save_all_methods_and_dates(model_id, test_store)

        # and find individual importances in the results schema afterwards
        records = [
            row
            for row in db_engine.execute(
                '''select entity_id, as_of_date
                from test_results.individual_importances
                join model_metadata.models using (model_id)'''
            )
        ]
        assert len(records) > 0

        # and that when run again, has the same result
        calculator.calculate_and_save_all_methods_and_dates(model_id, test_store)
        new_records = [
            row
            for row in db_engine.execute(
                '''select entity_id, as_of_date
                from test_results.individual_importances
                join model_metadata.models using (model_id)'''
            )
        ]
        assert len(records) == len(new_records)
        assert records == new_records

def prepare():
    with rig_engines() as (db_engine, project_storage):
        train_matrix_uuid = "1234"
        try:
            session = sessionmaker(db_engine)()
            session.add(Matrix(matrix_uuid=train_matrix_uuid))

            # Create the fake trained model and store in db
            trained_model = MockTrainedModel()
            model_hash = "abcd"
            project_storage.model_storage_engine().write(trained_model, model_hash)
            db_model = Model(
                model_hash=model_hash,
                train_matrix_uuid=train_matrix_uuid,
                random_seed=MODEL_RANDOM_SEED,
            )
            session.add(db_model)
            session.commit()
            yield project_storage, db_engine, db_model.model_id
        finally:
            session.close()

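# A sketch of how a yield-based helper like prepare() is typically consumed:
# registered as a pytest fixture so the code after the yield runs as teardown.
# The fixture registration and the sample test below are assumptions for
# illustration, not part of the original suite.
import pytest

@pytest.fixture(name="prepared_model")
def prepared_model_fixture():
    yield from prepare()

def test_prepared_model_exists(prepared_model):
    project_storage, db_engine, model_id = prepared_model
    assert model_id is not None
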
def test_custom_groups(grid_config):
    with rig_engines() as (db_engine, project_storage):
        # create training set
        model_storage_engine = project_storage.model_storage_engine()
        trainer = ModelTrainer(
            experiment_hash=None,
            model_storage_engine=model_storage_engine,
            model_grouper=ModelGrouper(['class_path']),
            db_engine=db_engine,
        )
        model_ids = trainer.train_models(
            grid_config=grid_config,
            misc_db_parameters=dict(),
            matrix_store=get_matrix_store(project_storage),
        )
        # expect only one model group now
        records = [
            row[0]
            for row in db_engine.execute(
                'select distinct model_group_id from model_metadata.models'
            )
        ]
        assert len(records) == 1
        assert records[0] == model_ids[0]

def test_baseline_exception_handling():
    grid_config = {
        'triage.component.catwalk.baselines.rankers.PercentileRankOneFeature': {
            'feature': ['feature_one', 'feature_three']
        }
    }
    with rig_engines() as (db_engine, project_storage):
        trainer = ModelTrainer(
            experiment_hash=None,
            model_storage_engine=project_storage.model_storage_engine(),
            db_engine=db_engine,
            model_grouper=ModelGrouper(),
        )
        train_tasks = trainer.generate_train_tasks(
            grid_config, dict(), get_matrix_store(project_storage)
        )

        model_ids = []
        for train_task in train_tasks:
            model_ids.append(trainer.process_train_task(**train_task))
        # the first task trains successfully; the second asks the baseline for
        # feature_three, which the matrix lacks, so it fails and records None
        assert model_ids == [1, None]

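# PercentileRankOneFeature (exercised above) scores rows by the percentile rank
# of a single feature, so asking it to train on feature_three, which the test
# matrix lacks, raises and yields a None model id. A simplified sketch of that
# scoring rule, assumed behavior rather than the catwalk source:
import pandas

def _percentile_rank_scores(matrix, feature):
    """Score each row by the percentile rank of one feature; fail fast if absent."""
    if feature not in matrix.columns:
        raise KeyError("feature {} not in matrix".format(feature))
    return matrix[feature].rank(pct=True)
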
def test_n_jobs_not_new_model():
    grid_config = {
        "sklearn.ensemble.AdaBoostClassifier": {
            "n_estimators": [10, 100, 1000]
        },
        "sklearn.ensemble.RandomForestClassifier": {
            "n_estimators": [10, 100],
            "max_features": ["sqrt", "log2"],
            "max_depth": [5, 10, 15, 20],
            "criterion": ["gini", "entropy"],
            "n_jobs": [12, 24],
        },
    }
    with rig_engines() as (db_engine, project_storage):
        model_storage_engine = project_storage.model_storage_engine()
        trainer = ModelTrainer(
            experiment_hash=None,
            model_storage_engine=model_storage_engine,
            db_engine=db_engine,
            model_grouper=ModelGrouper(),
        )
        train_tasks = trainer.generate_train_tasks(
            grid_config, dict(), get_matrix_store(project_storage)
        )

        assert len(train_tasks) == 35  # 32+3, would be (32*2)+3 if we didn't remove
        assert len(
            [task for task in train_tasks if "n_jobs" in task["parameters"]]
        ) == 32

        for train_task in train_tasks:
            trainer.process_train_task(**train_task)

        for row in db_engine.execute(
            "select hyperparameters from model_metadata.model_groups"
        ):
            assert "n_jobs" not in row[0]

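# Rough illustration of the invariant test_n_jobs_not_new_model checks:
# execution-only parameters such as n_jobs should be stripped before
# hyperparameters are used to define a model group, so the same model trained
# with different parallelism is not treated as new. The names below are
# hypothetical stand-ins, not catwalk's implementation.
NON_GROUPING_PARAMETERS = frozenset(["n_jobs"])

def _grouping_hyperparameters(parameters):
    """Drop parameters that affect execution but not the learned model."""
    return {
        key: value
        for key, value in parameters.items()
        if key not in NON_GROUPING_PARAMETERS
    }

assert "n_jobs" not in _grouping_hyperparameters({"n_estimators": 10, "n_jobs": 12})
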
def test_uniform_distribution_entity_id_index():
    with rig_engines() as (db_engine, project_storage):
        model = ModelFactory()
        feature_importances = [
            FeatureImportanceFactory(model_rel=model, feature="feature_{}".format(i))
            for i in range(0, 10)
        ]
        data_dict = {"entity_id": [1, 2]}
        for imp in feature_importances:
            data_dict[imp.feature] = [0.5, 0.5]
        metadata = matrix_metadata_creator(indices="entity_id")
        test_store = get_matrix_store(
            project_storage,
            pandas.DataFrame.from_dict(data_dict).set_index(metadata["indices"]),
            metadata,
        )
        results = uniform_distribution(
            db_engine,
            model_id=model.model_id,
            as_of_date="2016-01-01",
            test_matrix_store=test_store,
            n_ranks=5,
        )

        assert len(results) == 10  # 5 features x 2 entities
        for result in results:
            assert "entity_id" in result
            assert "feature_name" in result
            assert "score" in result
            assert "feature_value" in result
            assert result["feature_value"] == 0.5
            assert result["score"] >= 0
            assert result["score"] <= 1
            assert isinstance(result["feature_name"], str)
            assert result["entity_id"] in [1, 2]

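# The "5 features x 2 entities" count above reflects n_ranks: each entity gets
# one result per top-ranked feature. A simplified stand-in for that selection
# step (assumed semantics, not catwalk's uniform_distribution):
def _top_ranked_features(importance_by_feature, n_ranks):
    """Return the n_ranks feature names with the largest stored importance."""
    ranked = sorted(
        importance_by_feature.items(), key=lambda pair: pair[1], reverse=True
    )
    return [name for name, _ in ranked[:n_ranks]]
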
def test_model_trainer(grid_config):
    with rig_engines() as (db_engine, project_storage):
        # Creates a matrix entry in the matrices table with uuid from metadata above
        model_storage_engine = project_storage.model_storage_engine()
        trainer = ModelTrainer(
            experiment_hash=None,
            model_storage_engine=model_storage_engine,
            model_grouper=ModelGrouper(),
            db_engine=db_engine,
        )
        model_ids = trainer.train_models(
            grid_config=grid_config,
            misc_db_parameters=dict(),
            matrix_store=get_matrix_store(project_storage),
        )

        # assert
        # 1. that the models and feature importances table entries are present
        records = [
            row
            for row in db_engine.execute(
                "select * from train_results.feature_importances"
            )
        ]
        assert len(records) == 4 * 2  # maybe exclude entity_id? yes

        records = [
            row
            for row in db_engine.execute(
                "select model_hash from model_metadata.models"
            )
        ]
        assert len(records) == 4
        hashes = [row[0] for row in records]

        # 2. that the model groups are distinct
        records = [
            row
            for row in db_engine.execute(
                "select distinct model_group_id from model_metadata.models"
            )
        ]
        assert len(records) == 4

        # 3. that the model sizes are saved in the table and all are < 1 kB
        records = [
            row
            for row in db_engine.execute(
                "select model_size from model_metadata.models"
            )
        ]
        assert len(records) == 4
        for i in records:
            size = i[0]
            assert size < 1

        # 4. that all four models are cached
        model_pickles = [
            model_storage_engine.load(model_hash) for model_hash in hashes
        ]
        assert len(model_pickles) == 4
        assert len([x for x in model_pickles if x is not None]) == 4

        # 5. that their results can have predictions made on it
        test_matrix = pandas.DataFrame.from_dict({
            "entity_id": [3, 4],
            "feature_one": [4, 4],
            "feature_two": [6, 5],
        }).set_index("entity_id")

        for model_pickle in model_pickles:
            predictions = model_pickle.predict(test_matrix)
            assert len(predictions) == 2

        # 6. when run again, same models are returned
        new_model_ids = trainer.train_models(
            grid_config=grid_config,
            misc_db_parameters=dict(),
            matrix_store=get_matrix_store(project_storage),
        )
        assert len([
            row
            for row in db_engine.execute(
                "select model_hash from model_metadata.models"
            )
        ]) == 4
        assert model_ids == new_model_ids

        # 7. if replace is set, update non-unique attributes and feature importances
        max_batch_run_time = [
            row[0]
            for row in db_engine.execute(
                "select max(batch_run_time) from model_metadata.models"
            )
        ][0]
        trainer = ModelTrainer(
            experiment_hash=None,
            model_storage_engine=model_storage_engine,
            model_grouper=ModelGrouper(
                model_group_keys=["label_name", "label_timespan"]
            ),
            db_engine=db_engine,
            replace=True,
        )
        new_model_ids = trainer.train_models(
            grid_config=grid_config,
            misc_db_parameters=dict(),
            matrix_store=get_matrix_store(project_storage),
        )
        assert model_ids == new_model_ids
        assert [
            row["model_id"]
            for row in db_engine.execute(
                "select model_id from model_metadata.models order by 1 asc"
            )
        ] == model_ids
        new_max_batch_run_time = [
            row[0]
            for row in db_engine.execute(
                "select max(batch_run_time) from model_metadata.models"
            )
        ][0]
        assert new_max_batch_run_time > max_batch_run_time

        records = [
            row
            for row in db_engine.execute(
                "select * from train_results.feature_importances"
            )
        ]
        assert len(records) == 4 * 2  # maybe exclude entity_id? yes

        # 8. if the cache is missing but the metadata is still there, reuse the metadata
        for row in db_engine.execute(
            "select model_hash from model_metadata.models"
        ):
            model_storage_engine.delete(row[0])
        new_model_ids = trainer.train_models(
            grid_config=grid_config,
            misc_db_parameters=dict(),
            matrix_store=get_matrix_store(project_storage),
        )
        assert model_ids == sorted(new_model_ids)

        # 9. that the generator interface works the same way
        new_model_ids = trainer.generate_trained_models(
            grid_config=grid_config,
            misc_db_parameters=dict(),
            matrix_store=get_matrix_store(project_storage),
        )
        assert model_ids == sorted([model_id for model_id in new_model_ids])

def test_integration():
    with rig_engines() as (db_engine, project_storage):
        train_store = get_matrix_store(
            project_storage,
            matrix_creator(),
            matrix_metadata_creator(matrix_type="train"),
        )
        as_of_dates = [datetime.date(2016, 12, 21), datetime.date(2017, 1, 21)]

        test_stores = []
        for as_of_date in as_of_dates:
            matrix_store = get_matrix_store(
                project_storage,
                pandas.DataFrame.from_dict({
                    "entity_id": [3],
                    "feature_one": [8],
                    "feature_two": [5],
                    "label": [0],
                }).set_index("entity_id"),
                matrix_metadata_creator(end_time=as_of_date, indices=["entity_id"]),
            )
            test_stores.append(matrix_store)

        model_storage_engine = ModelStorageEngine(project_storage)
        experiment_hash = save_experiment_and_get_hash({}, db_engine)

        # instantiate pipeline objects
        trainer = ModelTrainer(
            experiment_hash=experiment_hash,
            model_storage_engine=model_storage_engine,
            db_engine=db_engine,
        )
        predictor = Predictor(model_storage_engine, db_engine)
        model_evaluator = ModelEvaluator(
            [{"metrics": ["precision@"], "thresholds": {"top_n": [5]}}],
            [{}],
            db_engine,
        )

        # run the pipeline
        grid_config = {
            "sklearn.linear_model.LogisticRegression": {
                "C": [0.00001, 0.0001],
                "penalty": ["l1", "l2"],
                "random_state": [2193],
            }
        }
        model_ids = trainer.train_models(
            grid_config=grid_config,
            misc_db_parameters=dict(),
            matrix_store=train_store,
        )

        for model_id in model_ids:
            for as_of_date, test_store in zip(as_of_dates, test_stores):
                predictions_proba = predictor.predict(
                    model_id,
                    test_store,
                    misc_db_parameters=dict(),
                    train_matrix_columns=["feature_one", "feature_two"],
                )
                model_evaluator.evaluate(predictions_proba, test_store, model_id)

        # assert
        # 1. that the predictions table entries are present and
        # can be linked to the original models
        records = [
            row
            for row in db_engine.execute(
                """select entity_id, model_id, as_of_date
                from test_results.predictions
                join model_metadata.models using (model_id)
                order by 3, 2"""
            )
        ]
        assert records == [
            (3, 1, datetime.datetime(2016, 12, 21)),
            (3, 2, datetime.datetime(2016, 12, 21)),
            (3, 3, datetime.datetime(2016, 12, 21)),
            (3, 4, datetime.datetime(2016, 12, 21)),
            (3, 1, datetime.datetime(2017, 1, 21)),
            (3, 2, datetime.datetime(2017, 1, 21)),
            (3, 3, datetime.datetime(2017, 1, 21)),
            (3, 4, datetime.datetime(2017, 1, 21)),
        ]

        # that evaluations are there
        records = [
            row
            for row in db_engine.execute(
                """select model_id, evaluation_start_time, metric, parameter
                from test_results.evaluations order by 2, 1"""
            )
        ]
        assert records == [
            (1, datetime.datetime(2016, 12, 21), "precision@", "5_abs"),
            (2, datetime.datetime(2016, 12, 21), "precision@", "5_abs"),
            (3, datetime.datetime(2016, 12, 21), "precision@", "5_abs"),
            (4, datetime.datetime(2016, 12, 21), "precision@", "5_abs"),
            (1, datetime.datetime(2017, 1, 21), "precision@", "5_abs"),
            (2, datetime.datetime(2017, 1, 21), "precision@", "5_abs"),
            (3, datetime.datetime(2017, 1, 21), "precision@", "5_abs"),
            (4, datetime.datetime(2017, 1, 21), "precision@", "5_abs"),
        ]

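# For reference, the ("precision@", "5_abs") rows asserted above encode
# "precision at the top 5 rows by absolute count". A minimal sketch of that
# metric, assuming ties are broken by sort order (not catwalk's implementation):
def _precision_at_top_n(scores, labels, n):
    """Precision over the n highest-scoring rows."""
    ranked = sorted(zip(scores, labels), key=lambda pair: pair[0], reverse=True)
    top = ranked[:n]
    return sum(label for _, label in top) / len(top)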