def test_retry_max(self):
    """Training against a database that is gone retries and then fails.

    The trainer is built against a temporary Postgres server that has been
    torn down by the time ``train_models`` runs, so the call is expected to
    raise ``sqlalchemy.exc.OperationalError`` after sleeping between tries.
    """
    db_engine = trainer = None

    # set up a basic model training run
    # TODO abstract the setup of a basic model training run where
    # we don't worry about the specific values used? it would make
    # tests like this require a bit less noise to read past
    with testing.postgresql.Postgresql() as pg_server:
        db_engine = create_engine(pg_server.url())
        ensure_db(db_engine)
        init_engine(db_engine)
        storage = InMemoryModelStorageEngine(project_path='')
        trainer = ModelTrainer(
            project_path='econ-dev/inspections',
            experiment_hash=None,
            model_storage_engine=storage,
            db_engine=db_engine,
            model_grouper=ModelGrouper(),
        )

    # the postgres server went out of scope above and no longer exists
    with patch('time.sleep') as sleep_mock:
        with self.assertRaises(sqlalchemy.exc.OperationalError):
            trainer.train_models(grid_config(), {}, sample_matrix_store())

    # we only want to know that the retrying module is used sanely, not
    # to pin the exact number of attempts configured in the code
    sleep_count = len(sleep_mock.mock_calls)
    assert sleep_count > 5
def test_retry_recovery(self):
    """Training succeeds once the database comes back after one sleep.

    The rigged engine's server is gone when training starts; the patched
    ``time.sleep`` starts a replacement server on the same port, so exactly
    one sleep call is expected.
    """
    db_engine = trainer = port = None

    with rig_engines() as (db_engine, project_storage):
        port = db_engine.url.port
        storage = project_storage.model_storage_engine()
        trainer = ModelTrainer(
            experiment_hash=None,
            model_storage_engine=storage,
            db_engine=db_engine,
            model_grouper=ModelGrouper(),
        )
        matrix_store = get_matrix_store(project_storage)

    # start without a database server, then bring it back up after the
    # first sleep; keep it on self so it doesn't go out of scope too early
    # and shut down
    self.new_server = None

    def replace_db(_seconds):
        # Stand-in for time.sleep: boot a fresh server on the old port
        self.new_server = testing.postgresql.Postgresql(port=port)
        revived_engine = create_engine(self.new_server.url())
        ensure_db(revived_engine)
        init_engine(revived_engine)
        get_matrix_store(project_storage)

    with patch("time.sleep", side_effect=replace_db) as sleep_mock:
        try:
            trainer.train_models(grid_config(), {}, matrix_store)
        finally:
            replacement = self.new_server
            if replacement is not None:
                replacement.stop()

    sleep_count = len(sleep_mock.mock_calls)
    assert sleep_count == 1
def test_retry_recovery(self):
    """Training succeeds once the database comes back after one sleep.

    A trainer and an in-memory matrix store are prepared against a
    temporary Postgres server; the patched ``time.sleep`` then starts a
    new server on the same port, so exactly one sleep call is expected.
    """
    grid_config = {
        'sklearn.ensemble.AdaBoostClassifier': {'n_estimators': [10]},
    }
    engine = trainer = port = None

    with testing.postgresql.Postgresql() as pg_server:
        port = pg_server.settings['port']
        engine = create_engine(pg_server.url())
        ensure_db(engine)
        storage = InMemoryModelStorageEngine(project_path='')
        trainer = ModelTrainer(
            project_path='econ-dev/inspections',
            experiment_hash=None,
            model_storage_engine=storage,
            db_engine=engine,
            model_group_keys=['label_name', 'label_timespan'],
        )
        matrix = pandas.DataFrame.from_dict({
            'entity_id': [1, 2],
            'feature_one': [3, 4],
            'feature_two': [5, 6],
            'label': ['good', 'bad'],
        })
        matrix_metadata = {
            'label_timespan': '1d',
            'end_time': datetime.datetime.now(),
            'feature_start_time': datetime.date(2012, 12, 20),
            'label_name': 'label',
            'metta-uuid': '1234',
            'feature_names': ['ft1', 'ft2'],
            'indices': ['entity_id'],
        }
        matrix_store = InMemoryMatrixStore(matrix, matrix_metadata)

    # start without a database server, then bring it back up after the
    # first sleep; keep it on self so it doesn't go out of scope too early
    # and shut down
    self.new_server = None

    def replace_db(_seconds):
        # Stand-in for time.sleep: boot a fresh server on the old port
        self.new_server = testing.postgresql.Postgresql(port=port)
        revived_engine = create_engine(self.new_server.url())
        ensure_db(revived_engine)

    with patch('time.sleep', side_effect=replace_db) as sleep_mock:
        try:
            trainer.train_models(grid_config, {}, matrix_store)
        finally:
            replacement = self.new_server
            if replacement is not None:
                replacement.stop()

    sleep_count = len(sleep_mock.mock_calls)
    assert sleep_count == 1
def test_custom_groups(grid_config, db_engine_with_results_schema, project_storage):
    """Grouping models by ``class_path`` only yields a single model group."""
    db_engine = db_engine_with_results_schema
    model_storage_engine = project_storage.model_storage_engine()

    # register an experiment and a tracked run for the trainer to attach to
    experiment_hash = save_experiment_and_get_hash(
        config={'foo': 'bar'},
        db_engine=db_engine,
    )
    run_id = initialize_tracking_and_get_run_id(
        experiment_hash,
        experiment_class_path="",
        random_seed=5,
        experiment_kwargs={},
        db_engine=db_engine,
    )

    grouper = ModelGrouper(["class_path"])
    trainer = ModelTrainer(
        experiment_hash=experiment_hash,
        model_storage_engine=model_storage_engine,
        model_grouper=grouper,
        db_engine=db_engine,
        run_id=run_id,
    )

    # create training set
    training_matrix = get_matrix_store(project_storage)
    model_ids = trainer.train_models(
        grid_config=grid_config,
        misc_db_parameters={},
        matrix_store=training_matrix,
    )

    # expect only one model group now
    group_query = "select distinct model_group_id from triage_metadata.models"
    group_ids = [row[0] for row in db_engine.execute(group_query)]
    assert len(group_ids) == 1
    assert group_ids[0] == model_ids[0]
def test_custom_groups(sample_matrix_store, grid_config):
    """Grouping models by ``class_path`` only yields a single model group.

    Runs against a temporary Postgres server and a mocked S3 bucket.
    """
    with testing.postgresql.Postgresql() as pg_server:
        engine = create_engine(pg_server.url())
        ensure_db(engine)
        init_engine(engine)

        with mock_s3():
            bucket_resource = boto3.resource('s3')
            bucket_resource.create_bucket(Bucket='econ-dev')
            MatrixFactory(matrix_uuid="1234")
            session.commit()

            # create training set
            project_path = 'econ-dev/inspections'
            model_storage_engine = S3ModelStorageEngine(project_path)
            grouper = ModelGrouper(['class_path'])
            trainer = ModelTrainer(
                project_path=project_path,
                experiment_hash=None,
                model_storage_engine=model_storage_engine,
                model_grouper=grouper,
                db_engine=engine,
            )
            model_ids = trainer.train_models(
                grid_config=grid_config,
                misc_db_parameters={},
                matrix_store=sample_matrix_store,
            )

            # expect only one model group now
            group_query = 'select distinct model_group_id from model_metadata.models'
            group_ids = [row[0] for row in engine.execute(group_query)]
            assert len(group_ids) == 1
            assert group_ids[0] == model_ids[0]
def test_retry_max(self):
    """Training against a database that is gone retries and then fails.

    The trainer and an in-memory matrix store are built against a temporary
    Postgres server that no longer exists when ``train_models`` runs, so the
    call is expected to raise ``sqlalchemy.exc.OperationalError``.
    """
    grid_config = {
        'sklearn.ensemble.AdaBoostClassifier': {'n_estimators': [10]},
    }
    engine = trainer = None

    # set up a basic model training run
    # TODO abstract the setup of a basic model training run where
    # we don't worry about the specific values used? it would make
    # tests like this require a bit less noise to read past
    with testing.postgresql.Postgresql() as pg_server:
        engine = create_engine(pg_server.url())
        ensure_db(engine)
        storage = InMemoryModelStorageEngine(project_path='')
        trainer = ModelTrainer(
            project_path='econ-dev/inspections',
            experiment_hash=None,
            model_storage_engine=storage,
            db_engine=engine,
            model_group_keys=['label_name', 'label_timespan'],
        )
        matrix = pandas.DataFrame.from_dict({
            'entity_id': [1, 2],
            'feature_one': [3, 4],
            'feature_two': [5, 6],
            'label': ['good', 'bad'],
        })
        matrix_metadata = {
            'label_timespan': '1d',
            'end_time': datetime.datetime.now(),
            'feature_start_time': datetime.date(2012, 12, 20),
            'label_name': 'label',
            'metta-uuid': '1234',
            'feature_names': ['ft1', 'ft2'],
            'indices': ['entity_id'],
        }
        matrix_store = InMemoryMatrixStore(matrix, matrix_metadata)

    # the postgres server went out of scope above and no longer exists
    with patch('time.sleep') as sleep_mock:
        with self.assertRaises(sqlalchemy.exc.OperationalError):
            trainer.train_models(grid_config, {}, matrix_store)

    # we only want to know that the retrying module is used sanely, not
    # to pin the exact number of attempts configured in the code
    sleep_count = len(sleep_mock.mock_calls)
    assert sleep_count > 5
def test_retry_max(self):
    """Training against a database that is gone retries and then fails.

    The trainer is built inside ``rig_engines``; once that context exits the
    server no longer exists, so ``train_models`` is expected to raise
    ``sqlalchemy.exc.OperationalError`` after sleeping between tries.
    """
    db_engine = trainer = None

    # set up a basic model training run
    with rig_engines() as (db_engine, project_storage):
        storage = project_storage.model_storage_engine()
        trainer = ModelTrainer(
            experiment_hash=None,
            model_storage_engine=storage,
            db_engine=db_engine,
            model_grouper=ModelGrouper(),
        )
        matrix_store = get_matrix_store(project_storage)

    # the postgres server went out of scope above and no longer exists
    with patch('time.sleep') as sleep_mock:
        with self.assertRaises(sqlalchemy.exc.OperationalError):
            trainer.train_models(grid_config(), {}, matrix_store)

    # we only want to know that the retrying module is used sanely, not
    # to pin the exact number of attempts configured in the code
    sleep_count = len(sleep_mock.mock_calls)
    assert sleep_count > 5
def test_retry_recovery(self):
    """Training succeeds once the database comes back after one sleep.

    The trainer's original server is gone when training starts; the patched
    ``time.sleep`` starts a replacement on the same port and re-creates the
    matrix row, so exactly one sleep call is expected.
    """
    db_engine = trainer = port = None

    with testing.postgresql.Postgresql() as pg_server:
        port = pg_server.settings['port']
        db_engine = create_engine(pg_server.url())
        ensure_db(db_engine)
        init_engine(db_engine)
        storage = InMemoryModelStorageEngine(project_path='')
        trainer = ModelTrainer(
            project_path='econ-dev/inspections',
            experiment_hash=None,
            model_storage_engine=storage,
            db_engine=db_engine,
            model_grouper=ModelGrouper(),
        )

    # start without a database server, then bring it back up after the
    # first sleep; keep it on self so it doesn't go out of scope too early
    # and shut down
    self.new_server = None

    def replace_db(_seconds):
        # Stand-in for time.sleep: boot a fresh server on the old port
        self.new_server = testing.postgresql.Postgresql(port=port)
        revived_engine = create_engine(self.new_server.url())
        ensure_db(revived_engine)
        init_engine(revived_engine)
        # Creates a matrix entry in the matrices table with uuid "1234"
        MatrixFactory(matrix_uuid="1234")
        session.commit()

    with patch('time.sleep', side_effect=replace_db) as sleep_mock:
        try:
            trainer.train_models(grid_config(), {}, sample_matrix_store())
        finally:
            replacement = self.new_server
            if replacement is not None:
                replacement.stop()

    sleep_count = len(sleep_mock.mock_calls)
    assert sleep_count == 1
def test_custom_groups(grid_config):
    """Grouping models by ``class_path`` only yields a single model group."""
    with rig_engines() as (db_engine, project_storage):
        # create training set
        model_storage_engine = project_storage.model_storage_engine()
        grouper = ModelGrouper(['class_path'])
        trainer = ModelTrainer(
            experiment_hash=None,
            model_storage_engine=model_storage_engine,
            model_grouper=grouper,
            db_engine=db_engine,
        )
        training_matrix = get_matrix_store(project_storage)
        model_ids = trainer.train_models(
            grid_config=grid_config,
            misc_db_parameters={},
            matrix_store=training_matrix,
        )

        # expect only one model group now
        group_query = 'select distinct model_group_id from model_metadata.models'
        group_ids = [row[0] for row in db_engine.execute(group_query)]
        assert len(group_ids) == 1
        assert group_ids[0] == model_ids[0]
def test_custom_groups(grid_config, db_engine_with_results_schema, project_storage):
    """Grouping models by ``class_path`` only yields a single model group."""
    db_engine = db_engine_with_results_schema
    model_storage_engine = project_storage.model_storage_engine()
    grouper = ModelGrouper(["class_path"])
    trainer = ModelTrainer(
        experiment_hash=None,
        model_storage_engine=model_storage_engine,
        model_grouper=grouper,
        db_engine=db_engine,
    )

    # create training set
    training_matrix = get_matrix_store(project_storage)
    model_ids = trainer.train_models(
        grid_config=grid_config,
        misc_db_parameters={},
        matrix_store=training_matrix,
    )

    # expect only one model group now
    group_query = "select distinct model_group_id from triage_metadata.models"
    group_ids = [row[0] for row in db_engine.execute(group_query)]
    assert len(group_ids) == 1
    assert group_ids[0] == model_ids[0]
def test_integration():
    """Run train -> predict -> evaluate end to end and check the stored rows.

    Four logistic-regression models are trained from the grid, each is
    scored against two test matrices (one per as-of date), and the
    predictions and evaluations tables are compared with the expected rows.
    """
    with rig_engines() as (db_engine, project_storage):
        train_store = get_matrix_store(
            project_storage,
            matrix_creator(),
            matrix_metadata_creator(matrix_type="train"),
        )

        as_of_dates = [datetime.date(2016, 12, 21), datetime.date(2017, 1, 21)]
        # one single-entity test matrix per as-of date
        test_stores = [
            get_matrix_store(
                project_storage,
                pandas.DataFrame.from_dict({
                    "entity_id": [3],
                    "feature_one": [8],
                    "feature_two": [5],
                    "label": [0],
                }).set_index("entity_id"),
                matrix_metadata_creator(end_time=as_of_date, indices=["entity_id"]),
            )
            for as_of_date in as_of_dates
        ]

        model_storage_engine = ModelStorageEngine(project_storage)
        experiment_hash = save_experiment_and_get_hash({}, db_engine)

        # instantiate pipeline objects
        trainer = ModelTrainer(
            experiment_hash=experiment_hash,
            model_storage_engine=model_storage_engine,
            db_engine=db_engine,
        )
        predictor = Predictor(model_storage_engine, db_engine)
        metric_groups = [{"metrics": ["precision@"], "thresholds": {"top_n": [5]}}]
        model_evaluator = ModelEvaluator(metric_groups, [{}], db_engine)

        # run the pipeline
        grid_config = {
            "sklearn.linear_model.LogisticRegression": {
                "C": [0.00001, 0.0001],
                "penalty": ["l1", "l2"],
                "random_state": [2193],
            }
        }
        model_ids = trainer.train_models(
            grid_config=grid_config,
            misc_db_parameters={},
            matrix_store=train_store,
        )
        for model_id in model_ids:
            for _as_of_date, test_store in zip(as_of_dates, test_stores):
                predictions_proba = predictor.predict(
                    model_id,
                    test_store,
                    misc_db_parameters={},
                    train_matrix_columns=["feature_one", "feature_two"],
                )
                model_evaluator.evaluate(predictions_proba, test_store, model_id)

        expected_model_ids = [1, 2, 3, 4]
        expected_times = [
            datetime.datetime(2016, 12, 21),
            datetime.datetime(2017, 1, 21),
        ]

        # assert
        # 1. that the predictions table entries are present and
        # can be linked to the original models
        predictions_query = (
            "select entity_id, model_id, as_of_date "
            "from test_results.predictions "
            "join model_metadata.models using (model_id) "
            "order by 3, 2"
        )
        prediction_rows = list(db_engine.execute(predictions_query))
        assert prediction_rows == [
            (3, model_id, when)
            for when in expected_times
            for model_id in expected_model_ids
        ]

        # 2. that evaluations are there
        evaluations_query = (
            " select model_id, evaluation_start_time, metric, parameter "
            "from test_results.evaluations "
            "order by 2, 1"
        )
        evaluation_rows = list(db_engine.execute(evaluations_query))
        assert evaluation_rows == [
            (model_id, when, "precision@", "5_abs")
            for when in expected_times
            for model_id in expected_model_ids
        ]
def test_model_trainer(sample_matrix_store, grid_config):
    """Walk the trainer through training, caching, re-running and replacing.

    Runs against a temporary Postgres server and a mocked S3 bucket, and
    checks the metadata rows, the cached pickles, and that repeated runs
    hand back the same model ids.
    """
    with testing.postgresql.Postgresql() as pg_server:
        db_engine = create_engine(pg_server.url())
        ensure_db(db_engine)
        init_engine(db_engine)

        def fetch_all(sql):
            # Run a query and materialize every returned row
            return list(db_engine.execute(sql))

        def train_with(active_trainer):
            # Train the shared grid on the shared sample matrix
            return active_trainer.train_models(
                grid_config=grid_config,
                misc_db_parameters={},
                matrix_store=sample_matrix_store,
            )

        with mock_s3():
            bucket_resource = boto3.resource('s3')
            bucket_resource.create_bucket(Bucket='econ-dev')

            # Creates a matrix entry in the matrices table with uuid "1234"
            MatrixFactory(matrix_uuid="1234")
            session.commit()

            project_path = 'econ-dev/inspections'
            model_storage_engine = S3ModelStorageEngine(project_path)
            trainer = ModelTrainer(
                project_path=project_path,
                experiment_hash=None,
                model_storage_engine=model_storage_engine,
                model_grouper=ModelGrouper(),
                db_engine=db_engine,
            )
            model_ids = train_with(trainer)

            # assert
            # 1. that the models and feature importances table entries are present
            importance_rows = fetch_all(
                'select * from train_results.feature_importances')
            assert len(importance_rows) == 4 * 2  # maybe exclude entity_id? yes

            hash_rows = fetch_all('select model_hash from model_metadata.models')
            assert len(hash_rows) == 4
            hashes = [row[0] for row in hash_rows]

            # 2. that the model groups are distinct
            group_rows = fetch_all(
                'select distinct model_group_id from model_metadata.models')
            assert len(group_rows) == 4

            # 3. that the model sizes are saved in the table and all are < 1 kB
            size_rows = fetch_all('select model_size from model_metadata.models')
            assert len(size_rows) == 4
            for size_row in size_rows:
                assert size_row[0] < 1

            # 4. that all four models are cached
            model_pickles = []
            for model_hash in hashes:
                model_pickles.append(
                    model_storage_engine.get_store(model_hash).load())
            assert len(model_pickles) == 4
            cached_count = sum(1 for pickled in model_pickles if pickled is not None)
            assert cached_count == 4

            # 5. that their results can have predictions made on it
            raw_test_frame = pandas.DataFrame.from_dict({
                'entity_id': [3, 4],
                'feature_one': [4, 4],
                'feature_two': [6, 5],
            })
            test_matrix = InMemoryMatrixStore(
                matrix=raw_test_frame,
                metadata=sample_metadata(),
            ).matrix
            for model_pickle in model_pickles:
                predictions = model_pickle.predict(test_matrix)
                assert len(predictions) == 2

            # 6. when run again, same models are returned
            new_model_ids = train_with(trainer)
            rerun_hash_rows = fetch_all(
                'select model_hash from model_metadata.models')
            assert len(rerun_hash_rows) == 4
            assert model_ids == new_model_ids

            # 7. if replace is set, update non-unique attributes and feature importances
            batch_time_query = 'select max(batch_run_time) from model_metadata.models'
            max_batch_run_time = fetch_all(batch_time_query)[0][0]
            trainer = ModelTrainer(
                project_path=project_path,
                experiment_hash=None,
                model_storage_engine=model_storage_engine,
                model_grouper=ModelGrouper(
                    model_group_keys=['label_name', 'label_timespan']),
                db_engine=db_engine,
                replace=True,
            )
            new_model_ids = train_with(trainer)
            assert model_ids == new_model_ids
            stored_ids = [
                row['model_id']
                for row in db_engine.execute(
                    'select model_id from model_metadata.models order by 1 asc')
            ]
            assert stored_ids == model_ids
            new_max_batch_run_time = fetch_all(batch_time_query)[0][0]
            assert new_max_batch_run_time > max_batch_run_time

            importance_rows = fetch_all(
                'select * from train_results.feature_importances')
            assert len(importance_rows) == 4 * 2  # maybe exclude entity_id? yes

            # 8. if the cache is missing but the metadata is still there, reuse the metadata
            for row in db_engine.execute(
                    'select model_hash from model_metadata.models'):
                model_storage_engine.get_store(row[0]).delete()
            new_model_ids = train_with(trainer)
            assert model_ids == sorted(new_model_ids)

            # 9. that the generator interface works the same way
            new_model_ids = trainer.generate_trained_models(
                grid_config=grid_config,
                misc_db_parameters={},
                matrix_store=sample_matrix_store,
            )
            assert model_ids == sorted(new_model_ids)
def test_reuse_model_random_seeds(grid_config, default_model_trainer):
    """Check when previously trained models (and their seeds) get reused.

    Three training runs are made:

    1. a baseline run with the default trainer;
    2. a run under a new experiment hash but the same experiment seed (5)
       and a larger grid -- every baseline model id must reappear;
    3. a run under yet another experiment with a different seed (42) --
       none of the ids from run 2 may reappear.

    Args:
        grid_config: fixture providing the base model grid. It is deep-copied
            before being extended so the fixture value is never mutated.
        default_model_trainer: fixture providing a ready-made trainer whose
            db engine and model storage engine are reused by later trainers.
    """
    import copy  # local import: only needed to isolate the fixture's grid

    trainer = default_model_trainer
    db_engine = trainer.db_engine
    project_storage = trainer.model_storage_engine.project_storage
    model_storage_engine = trainer.model_storage_engine

    # re-using the random seeds requires the association between experiments and models
    # to exist, which we're not getting in these tests since we aren't using the experiment
    # architecture, so back-fill these associations after each train_models() run
    def update_experiment_models(db_engine):
        # Insert every (run hash, model hash) pair not yet in experiment_models
        sql = (
            " INSERT INTO triage_metadata.experiment_models(experiment_hash,model_hash) "
            "SELECT er.run_hash, m.model_hash "
            "FROM triage_metadata.models m "
            "LEFT JOIN triage_metadata.triage_runs er ON m.built_in_triage_run = er.id "
            "LEFT JOIN triage_metadata.experiment_models em "
            "ON m.model_hash = em.model_hash AND er.run_hash = em.experiment_hash "
            "WHERE em.experiment_hash IS NULL "
        )
        db_engine.execute(sql)
        db_engine.execute('COMMIT;')

    random.seed(5)
    model_ids = trainer.train_models(
        grid_config=grid_config,
        misc_db_parameters=dict(),
        matrix_store=get_matrix_store(project_storage),
    )
    update_experiment_models(db_engine)

    # simulate running a new experiment where the experiment hash has changed
    # (e.g. because the model grid is different), but experiment seed is the
    # same, so previously-trained models should not get new seeds
    experiment_hash = save_experiment_and_get_hash(
        config={'baz': 'qux'},
        db_engine=db_engine,
    )
    run_id = initialize_tracking_and_get_run_id(
        experiment_hash,
        experiment_class_path="",
        random_seed=5,
        experiment_kwargs={},
        db_engine=db_engine,
    )
    trainer = ModelTrainer(
        experiment_hash=experiment_hash,
        model_storage_engine=model_storage_engine,
        db_engine=db_engine,
        model_grouper=ModelGrouper(),
        run_id=run_id,
    )

    # dict.copy() is shallow: assigning into the nested classifier dict would
    # also rewrite the fixture's own grid, so take a deep copy instead
    new_grid = copy.deepcopy(grid_config)
    new_grid['sklearn.tree.DecisionTreeClassifier']['min_samples_split'] = [3, 10, 100]

    random.seed(5)
    new_model_ids = trainer.train_models(
        grid_config=new_grid,
        misc_db_parameters=dict(),
        matrix_store=get_matrix_store(project_storage),
    )
    update_experiment_models(db_engine)

    # should have received 6 models
    assert len(new_model_ids) == 6

    # all the original model ids should be in the new set
    assert len(set(new_model_ids) & set(model_ids)) == len(model_ids)

    # however, we should NOT re-use the random seeds (and so get new model_ids)
    # if the experiment-level seed is different
    experiment_hash = save_experiment_and_get_hash(
        config={'lorem': 'ipsum'},
        db_engine=db_engine,
    )
    run_id = initialize_tracking_and_get_run_id(
        experiment_hash,
        experiment_class_path="",
        random_seed=42,
        experiment_kwargs={},
        db_engine=db_engine,
    )
    trainer = ModelTrainer(
        experiment_hash=experiment_hash,
        model_storage_engine=model_storage_engine,
        db_engine=db_engine,
        model_grouper=ModelGrouper(),
        run_id=run_id,
    )
    random.seed(42)  # different from above
    newer_model_ids = trainer.train_models(
        grid_config=new_grid,
        misc_db_parameters=dict(),
        matrix_store=get_matrix_store(project_storage),
    )
    update_experiment_models(db_engine)

    # should get entirely new models now (different IDs)
    assert len(newer_model_ids) == 6
    assert len(set(new_model_ids) & set(newer_model_ids)) == 0
def test_integration():
    """Run train -> predict -> evaluate end to end and check the stored rows.

    Four logistic-regression models are trained from the grid, each is
    scored against two test matrices (one per as-of date), and the
    predictions and evaluations tables are compared with the expected rows.
    """
    with rig_engines() as (db_engine, project_storage):
        train_store = get_matrix_store(
            project_storage,
            matrix_creator(),
            matrix_metadata_creator(matrix_type='train'),
        )

        as_of_dates = [datetime.date(2016, 12, 21), datetime.date(2017, 1, 21)]
        # one single-entity test matrix per as-of date
        test_stores = [
            get_matrix_store(
                project_storage,
                pandas.DataFrame.from_dict({
                    'entity_id': [3],
                    'feature_one': [8],
                    'feature_two': [5],
                    'label': [0],
                }).set_index('entity_id'),
                matrix_metadata_creator(end_time=as_of_date, indices=['entity_id']),
            )
            for as_of_date in as_of_dates
        ]

        model_storage_engine = ModelStorageEngine(project_storage)
        experiment_hash = save_experiment_and_get_hash({}, db_engine)

        # instantiate pipeline objects
        trainer = ModelTrainer(
            experiment_hash=experiment_hash,
            model_storage_engine=model_storage_engine,
            db_engine=db_engine,
        )
        predictor = Predictor(model_storage_engine, db_engine)
        metric_groups = [{'metrics': ['precision@'], 'thresholds': {'top_n': [5]}}]
        model_evaluator = ModelEvaluator(metric_groups, [{}], db_engine)

        # run the pipeline
        grid_config = {
            'sklearn.linear_model.LogisticRegression': {
                'C': [0.00001, 0.0001],
                'penalty': ['l1', 'l2'],
                'random_state': [2193],
            }
        }
        model_ids = trainer.train_models(
            grid_config=grid_config,
            misc_db_parameters={},
            matrix_store=train_store,
        )
        for model_id in model_ids:
            for _as_of_date, test_store in zip(as_of_dates, test_stores):
                predictions_proba = predictor.predict(
                    model_id,
                    test_store,
                    misc_db_parameters={},
                    train_matrix_columns=['feature_one', 'feature_two'],
                )
                model_evaluator.evaluate(predictions_proba, test_store, model_id)

        expected_model_ids = [1, 2, 3, 4]
        expected_times = [
            datetime.datetime(2016, 12, 21),
            datetime.datetime(2017, 1, 21),
        ]

        # assert
        # 1. that the predictions table entries are present and
        # can be linked to the original models
        predictions_query = (
            'select entity_id, model_id, as_of_date '
            'from test_results.predictions '
            'join model_metadata.models using (model_id) '
            'order by 3, 2'
        )
        prediction_rows = list(db_engine.execute(predictions_query))
        assert prediction_rows == [
            (3, model_id, when)
            for when in expected_times
            for model_id in expected_model_ids
        ]

        # 2. that evaluations are there
        evaluations_query = (
            ' select model_id, evaluation_start_time, metric, parameter '
            'from test_results.evaluations '
            'order by 2, 1'
        )
        evaluation_rows = list(db_engine.execute(evaluations_query))
        assert evaluation_rows == [
            (model_id, when, 'precision@', '5_abs')
            for when in expected_times
            for model_id in expected_model_ids
        ]
def test_model_trainer(grid_config, default_model_trainer):
    """Walk the trainer through training, caching, re-running and replacing.

    Every training call is preceded by re-seeding ``random`` with 5, and the
    test checks the metadata rows, the cached pickles, and that repeated
    runs hand back the same model ids.
    """
    trainer = default_model_trainer
    db_engine = trainer.db_engine
    model_storage_engine = trainer.model_storage_engine
    project_storage = model_storage_engine.project_storage

    def reseed():
        # Put the global RNG back to the seed shared by all runs below
        random.seed(5)

    def fetch_all(sql):
        # Run a query and materialize every returned row
        return list(db_engine.execute(sql))

    def train_with(active_trainer):
        # Train the shared grid on a freshly fetched matrix store
        return active_trainer.train_models(
            grid_config=grid_config,
            misc_db_parameters={},
            matrix_store=get_matrix_store(project_storage),
        )

    reseed()
    model_ids = train_with(trainer)

    # assert
    # 1. that the models and feature importances table entries are present
    importance_rows = fetch_all("select * from train_results.feature_importances")
    assert len(importance_rows) == 4 * 2  # maybe exclude entity_id? yes

    hash_rows = fetch_all("select model_hash from triage_metadata.models")
    assert len(hash_rows) == 4
    hashes = [row[0] for row in hash_rows]

    # 2a. that the model groups are distinct
    group_rows = fetch_all(
        "select distinct model_group_id from triage_metadata.models")
    assert len(group_rows) == 4

    # 2b. that the random seeds are distinct
    seed_rows = fetch_all("select distinct random_seed from triage_metadata.models")
    assert len(seed_rows) == 4

    # 3. that the model sizes are saved in the table and all are < 1 kB
    size_rows = fetch_all("select model_size from triage_metadata.models")
    assert len(size_rows) == 4
    for size_row in size_rows:
        assert size_row[0] < 1

    # 4. that all four models are cached
    model_pickles = []
    for model_hash in hashes:
        model_pickles.append(model_storage_engine.load(model_hash))
    assert len(model_pickles) == 4
    cached_count = sum(1 for pickled in model_pickles if pickled is not None)
    assert cached_count == 4

    # 5. that their results can have predictions made on it
    raw_test_frame = pd.DataFrame.from_dict({
        "entity_id": [3, 4],
        "feature_one": [4, 4],
        "feature_two": [6, 5],
    })
    test_matrix = raw_test_frame.set_index("entity_id")
    for model_pickle in model_pickles:
        predictions = model_pickle.predict(test_matrix)
        assert len(predictions) == 2

    # 6. when run again with the same starting seed, same models are returned
    reseed()
    new_model_ids = train_with(trainer)
    rerun_hash_rows = fetch_all("select model_hash from triage_metadata.models")
    assert len(rerun_hash_rows) == 4
    assert model_ids == new_model_ids

    # 7. if replace is set, update non-unique attributes and feature importances
    batch_time_query = "select max(batch_run_time) from triage_metadata.models"
    max_batch_run_time = fetch_all(batch_time_query)[0][0]
    trainer = ModelTrainer(
        experiment_hash=None,
        model_storage_engine=model_storage_engine,
        model_grouper=ModelGrouper(
            model_group_keys=["label_name", "label_timespan"]),
        db_engine=db_engine,
        replace=True,
    )
    reseed()
    new_model_ids = train_with(trainer)
    assert model_ids == new_model_ids
    stored_ids = [
        row["model_id"]
        for row in db_engine.execute(
            "select model_id from triage_metadata.models order by 1 asc")
    ]
    assert stored_ids == model_ids
    new_max_batch_run_time = fetch_all(batch_time_query)[0][0]
    assert new_max_batch_run_time > max_batch_run_time

    importance_rows = fetch_all("select * from train_results.feature_importances")
    assert len(importance_rows) == 4 * 2  # maybe exclude entity_id? yes

    # 8. if the cache is missing but the metadata is still there, reuse the metadata
    reseed()
    for row in db_engine.execute("select model_hash from triage_metadata.models"):
        model_storage_engine.delete(row[0])
    new_model_ids = train_with(trainer)
    assert model_ids == sorted(new_model_ids)

    # 9. that the generator interface works the same way
    reseed()
    new_model_ids = trainer.generate_trained_models(
        grid_config=grid_config,
        misc_db_parameters={},
        matrix_store=get_matrix_store(project_storage),
    )
    assert model_ids == sorted(new_model_ids)
def test_integration():
    """Run train -> predict -> evaluate end to end and check the stored rows.

    Uses a temporary Postgres server and a mocked S3 bucket. Four
    logistic-regression models are trained from the grid, each is scored
    against two in-memory test matrices (one per as-of date), and the
    test_predictions and test_evaluations tables are compared with the
    expected rows.
    """
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        init_engine(db_engine)

        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')
            project_path = 'econ-dev/inspections'

            # create train and test matrices
            train_matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': [7, 8]
            }).set_index('entity_id')

            # Creates a matrix entry in the matrices table with uuid "1234"
            MatrixFactory(matrix_uuid="1234")
            session.commit()

            # the train store's metadata comes from sample_metadata(); no
            # separate metadata dict is needed here
            train_store = InMemoryMatrixStore(train_matrix, sample_metadata())

            as_of_dates = [
                datetime.date(2016, 12, 21),
                datetime.date(2017, 1, 21)
            ]

            # one single-entity test matrix per as-of date
            test_stores = [
                InMemoryMatrixStore(
                    pandas.DataFrame.from_dict({
                        'entity_id': [3],
                        'feature_one': [8],
                        'feature_two': [5],
                        'label': [5]
                    }),
                    {
                        'label_name': 'label',
                        'label_timespan': '1y',
                        'end_time': as_of_date,
                        'metta-uuid': '1234',
                        'indices': ['entity_id'],
                        'matrix_type': 'test',
                        'as_of_date_frequency': '1month'
                    })
                for as_of_date in as_of_dates
            ]

            model_storage_engine = S3ModelStorageEngine(project_path)
            experiment_hash = save_experiment_and_get_hash({}, db_engine)

            # instantiate pipeline objects
            trainer = ModelTrainer(
                project_path=project_path,
                experiment_hash=experiment_hash,
                model_storage_engine=model_storage_engine,
                db_engine=db_engine,
            )
            predictor = Predictor(project_path, model_storage_engine, db_engine)
            model_evaluator = ModelEvaluator([{
                'metrics': ['precision@'],
                'thresholds': {
                    'top_n': [5]
                }
            }], [{}], db_engine)

            # run the pipeline
            grid_config = {
                'sklearn.linear_model.LogisticRegression': {
                    'C': [0.00001, 0.0001],
                    'penalty': ['l1', 'l2'],
                    'random_state': [2193]
                }
            }
            model_ids = trainer.train_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=train_store)

            for model_id in model_ids:
                for as_of_date, test_store in zip(as_of_dates, test_stores):
                    predictions_proba = predictor.predict(
                        model_id,
                        test_store,
                        misc_db_parameters=dict(),
                        train_matrix_columns=['feature_one', 'feature_two'])

                    model_evaluator.evaluate(
                        predictions_proba,
                        test_store,
                        model_id,
                    )

            # assert
            # 1. that the predictions table entries are present and
            # can be linked to the original models
            records = [
                row for row in db_engine.execute(
                    '''select entity_id, model_id, as_of_date from test_results.test_predictions join model_metadata.models using (model_id) order by 3, 2''')
            ]
            assert records == [
                (3, 1, datetime.datetime(2016, 12, 21)),
                (3, 2, datetime.datetime(2016, 12, 21)),
                (3, 3, datetime.datetime(2016, 12, 21)),
                (3, 4, datetime.datetime(2016, 12, 21)),
                (3, 1, datetime.datetime(2017, 1, 21)),
                (3, 2, datetime.datetime(2017, 1, 21)),
                (3, 3, datetime.datetime(2017, 1, 21)),
                (3, 4, datetime.datetime(2017, 1, 21)),
            ]

            # 2. that evaluations are there
            records = [
                row for row in db_engine.execute(
                    ''' select model_id, evaluation_start_time, metric, parameter from test_results.test_evaluations order by 2, 1''')
            ]
            assert records == [
                (1, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (2, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (3, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (4, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (1, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
                (2, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
                (3, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
                (4, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
            ]
def test_model_trainer(grid_config):
    """Walk ModelTrainer through training, retraining, replacing and caching.

    Runs against the engines supplied by rig_engines() and checks, in order:
    rows written for models and feature importances, distinct model groups,
    stored model sizes, loadable and usable pickles, idempotent retraining,
    the replace=True path, recovery when cached pickles are deleted, and the
    generator interface.
    """
    with rig_engines() as (db_engine, project_storage):
        model_storage_engine = project_storage.model_storage_engine()

        def fetch(query):
            # Materialise a result set so it can be measured and indexed.
            return list(db_engine.execute(query))

        def train(model_trainer):
            # Every training call in this test uses the same grid and a
            # freshly fetched matrix store.
            return model_trainer.train_models(
                grid_config=grid_config,
                misc_db_parameters={},
                matrix_store=get_matrix_store(project_storage),
            )

        trainer = ModelTrainer(
            experiment_hash=None,
            model_storage_engine=model_storage_engine,
            model_grouper=ModelGrouper(),
            db_engine=db_engine,
        )
        model_ids = train(trainer)

        # 1. model and feature importance rows were written
        importances = fetch('select * from train_results.feature_importances')
        assert len(importances) == 4 * 2  # 4 models x 2 feature columns

        hash_rows = fetch('select model_hash from model_metadata.models')
        assert len(hash_rows) == 4
        hashes = [row[0] for row in hash_rows]

        # 2. each model landed in its own model group
        group_rows = fetch(
            'select distinct model_group_id from model_metadata.models')
        assert len(group_rows) == 4

        # 3. model sizes are stored and each is below 1
        #    (kB according to the original note on this check)
        size_rows = fetch('select model_size from model_metadata.models')
        assert len(size_rows) == 4
        for row in size_rows:
            assert row[0] < 1

        # 4. all four models can be loaded back from storage
        model_pickles = [
            model_storage_engine.load(model_hash) for model_hash in hashes
        ]
        assert len(model_pickles) == 4
        assert sum(1 for loaded in model_pickles if loaded is not None) == 4

        # 5. the loaded models can make predictions
        test_matrix = pandas.DataFrame.from_dict({
            'entity_id': [3, 4],
            'feature_one': [4, 4],
            'feature_two': [6, 5],
        }).set_index('entity_id')
        for model_pickle in model_pickles:
            assert len(model_pickle.predict(test_matrix)) == 2

        # 6. training again returns the same models without adding rows
        new_model_ids = train(trainer)
        assert len(fetch('select model_hash from model_metadata.models')) == 4
        assert model_ids == new_model_ids

        # 7. with replace=True the ids stay put while non-unique attributes
        #    and feature importances are refreshed
        max_batch_run_time = fetch(
            'select max(batch_run_time) from model_metadata.models')[0][0]
        trainer = ModelTrainer(
            experiment_hash=None,
            model_storage_engine=model_storage_engine,
            model_grouper=ModelGrouper(
                model_group_keys=['label_name', 'label_timespan']),
            db_engine=db_engine,
            replace=True,
        )
        new_model_ids = train(trainer)
        assert model_ids == new_model_ids
        stored_ids = [
            row['model_id']
            for row in db_engine.execute(
                'select model_id from model_metadata.models order by 1 asc')
        ]
        assert stored_ids == model_ids
        new_max_batch_run_time = fetch(
            'select max(batch_run_time) from model_metadata.models')[0][0]
        assert new_max_batch_run_time > max_batch_run_time
        importances = fetch('select * from train_results.feature_importances')
        assert len(importances) == 4 * 2  # 4 models x 2 feature columns

        # 8. if the cached pickles vanish but metadata remains, the metadata
        #    is reused
        for row in db_engine.execute(
                'select model_hash from model_metadata.models'):
            model_storage_engine.delete(row[0])
        new_model_ids = train(trainer)
        assert model_ids == sorted(new_model_ids)

        # 9. the generator interface yields the same ids
        new_model_ids = trainer.generate_trained_models(
            grid_config=grid_config,
            misc_db_parameters={},
            matrix_store=get_matrix_store(project_storage),
        )
        assert model_ids == sorted(new_model_ids)
def test_model_trainer():
    """Walk ModelTrainer through training, retraining, replacing and caching.

    Backed by a throwaway Postgres server and a mocked S3 bucket. Checks, in
    order: rows written for models and feature importances, distinct model
    groups, loadable and usable pickles, idempotent retraining, the
    replace=True path, recovery when cached pickles are deleted, and the
    generator interface.
    """
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        ensure_db(engine)

        grid_config = {
            'sklearn.linear_model.LogisticRegression': {
                'C': [0.00001, 0.0001],
                'penalty': ['l1', 'l2'],
                'random_state': [2193],
            },
        }

        def fetch(query):
            # Materialise a result set so it can be measured and indexed.
            return list(engine.execute(query))

        with mock_s3():
            boto3.resource('s3').create_bucket(Bucket='econ-dev')

            # training set
            matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': ['good', 'bad'],
            })
            metadata = {
                'feature_start_time': datetime.date(2012, 12, 20),
                'end_time': datetime.date(2016, 12, 20),
                'label_name': 'label',
                'label_timespan': '1y',
                'metta-uuid': '1234',
                'feature_names': ['ft1', 'ft2'],
                'indices': ['entity_id'],
            }
            project_path = 'econ-dev/inspections'
            model_storage_engine = S3ModelStorageEngine(project_path)
            trainer = ModelTrainer(
                project_path=project_path,
                experiment_hash=None,
                model_storage_engine=model_storage_engine,
                db_engine=engine,
                model_group_keys=['label_name', 'label_timespan'],
            )
            matrix_store = InMemoryMatrixStore(matrix, metadata)

            def train(model_trainer):
                # Every training call in this test shares grid and store.
                return model_trainer.train_models(
                    grid_config=grid_config,
                    misc_db_parameters={},
                    matrix_store=matrix_store,
                )

            model_ids = train(trainer)

            # 1. model and feature importance rows were written
            importances = fetch('select * from results.feature_importances')
            assert len(importances) == 4 * 2  # 4 models x 2 feature columns

            hash_rows = fetch('select model_hash from results.models')
            assert len(hash_rows) == 4
            hashes = [row[0] for row in hash_rows]

            # 2. each model landed in its own model group
            group_rows = fetch(
                'select distinct model_group_id from results.models')
            assert len(group_rows) == 4

            # 3. all four models can be loaded back from storage
            model_pickles = [
                model_storage_engine.get_store(model_hash).load()
                for model_hash in hashes
            ]
            assert len(model_pickles) == 4
            assert sum(1 for loaded in model_pickles if loaded is not None) == 4

            # 4. the loaded models can make predictions
            raw_test_frame = pandas.DataFrame.from_dict({
                'entity_id': [3, 4],
                'feature_one': [4, 4],
                'feature_two': [6, 5],
            })
            # Route the frame through a matrix store and take its .matrix so
            # the models see it the way the store exposes it.
            test_matrix = InMemoryMatrixStore(
                matrix=raw_test_frame, metadata=metadata).matrix
            for model_pickle in model_pickles:
                assert len(model_pickle.predict(test_matrix)) == 2

            # 5. training again returns the same models without adding rows
            new_model_ids = train(trainer)
            assert len(fetch('select model_hash from results.models')) == 4
            assert model_ids == new_model_ids

            # 6. with replace=True the ids stay put while non-unique
            #    attributes and feature importances are refreshed
            max_batch_run_time = fetch(
                'select max(batch_run_time) from results.models')[0][0]
            trainer = ModelTrainer(
                project_path=project_path,
                experiment_hash=None,
                model_storage_engine=model_storage_engine,
                db_engine=engine,
                model_group_keys=['label_name', 'label_timespan'],
                replace=True,
            )
            new_model_ids = train(trainer)
            assert model_ids == new_model_ids
            stored_ids = [
                row['model_id']
                for row in engine.execute(
                    'select model_id from results.models order by 1 asc')
            ]
            assert stored_ids == model_ids
            new_max_batch_run_time = fetch(
                'select max(batch_run_time) from results.models')[0][0]
            assert new_max_batch_run_time > max_batch_run_time
            importances = fetch('select * from results.feature_importances')
            assert len(importances) == 4 * 2  # 4 models x 2 feature columns

            # 7. if the cached pickles vanish but metadata remains, the
            #    metadata is reused
            for row in engine.execute('select model_hash from results.models'):
                model_storage_engine.get_store(row[0]).delete()
            new_model_ids = train(trainer)
            assert model_ids == sorted(new_model_ids)

            # 8. the generator interface yields the same ids
            new_model_ids = trainer.generate_trained_models(
                grid_config=grid_config,
                misc_db_parameters={},
                matrix_store=matrix_store,
            )
            assert model_ids == sorted(new_model_ids)