def test_baseline_exception_handling(sample_matrix_store):
    grid_config = {
        'triage.component.catwalk.baselines.rankers.PercentileRankOneFeature': {
            'feature': ['feature_one', 'feature_three']
        }
    }
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        project_path = 'econ-dev/inspections'
        model_storage_engine = S3ModelStorageEngine(project_path)
        ensure_db(db_engine)
        init_engine(db_engine)
        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')
            trainer = ModelTrainer(
                project_path='econ-dev/inspections',
                experiment_hash=None,
                model_storage_engine=model_storage_engine,
                db_engine=db_engine,
                model_grouper=ModelGrouper())

            train_tasks = trainer.generate_train_tasks(
                grid_config, dict(), sample_matrix_store)

            # Creates a matrix entry in the matrices table with uuid from train_metadata
            MatrixFactory(matrix_uuid="1234")
            session.commit()

            model_ids = []
            for train_task in train_tasks:
                model_ids.append(trainer.process_train_task(**train_task))
            assert model_ids == [1, None]
def test_custom_groups(sample_matrix_store, grid_config):
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        ensure_db(engine)
        init_engine(engine)
        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')
            MatrixFactory(matrix_uuid="1234")
            session.commit()
            # create training set
            project_path = 'econ-dev/inspections'
            model_storage_engine = S3ModelStorageEngine(project_path)
            trainer = ModelTrainer(
                project_path=project_path,
                experiment_hash=None,
                model_storage_engine=model_storage_engine,
                model_grouper=ModelGrouper(['class_path']),
                db_engine=engine,
            )
            model_ids = trainer.train_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=sample_matrix_store)
            # expect only one model group now
            records = [
                row[0] for row in engine.execute(
                    'select distinct model_group_id from model_metadata.models')
            ]
            assert len(records) == 1
            assert records[0] == model_ids[0]
def test_retry_max(self):
    db_engine = None
    trainer = None
    # set up a basic model training run
    # TODO abstract the setup of a basic model training run where
    # we don't worry about the specific values used? it would make
    # tests like this require a bit less noise to read past
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        init_engine(db_engine)
        trainer = ModelTrainer(
            project_path='econ-dev/inspections',
            experiment_hash=None,
            model_storage_engine=InMemoryModelStorageEngine(project_path=''),
            db_engine=db_engine,
            model_grouper=ModelGrouper())

    # the postgres server goes out of scope here and thus no longer exists
    with patch('time.sleep') as time_mock:
        with self.assertRaises(sqlalchemy.exc.OperationalError):
            trainer.train_models(grid_config(), dict(), sample_matrix_store())
        # we want to make sure that we are using the retrying module sanely
        # as opposed to matching the exact # of calls specified by the code
        assert len(time_mock.mock_calls) > 5
def test_retry_recovery(self):
    db_engine = None
    trainer = None
    port = None
    with rig_engines() as (db_engine, project_storage):
        port = db_engine.url.port
        trainer = ModelTrainer(
            experiment_hash=None,
            model_storage_engine=project_storage.model_storage_engine(),
            db_engine=db_engine,
            model_grouper=ModelGrouper(),
        )
        matrix_store = get_matrix_store(project_storage)

    # start without a database server
    # then bring it back up after the first sleep
    # use self so it doesn't go out of scope too early and shut down
    self.new_server = None

    def replace_db(arg):
        self.new_server = testing.postgresql.Postgresql(port=port)
        db_engine = create_engine(self.new_server.url())
        ensure_db(db_engine)
        init_engine(db_engine)
        get_matrix_store(project_storage)

    with patch("time.sleep") as time_mock:
        time_mock.side_effect = replace_db
        try:
            trainer.train_models(grid_config(), dict(), matrix_store)
        finally:
            if self.new_server is not None:
                self.new_server.stop()
        assert len(time_mock.mock_calls) == 1
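# Context for both retry tests: ModelTrainer wraps its database writes with
# the `retrying` package, so transient OperationalErrors are retried with
# exponential backoff (each backoff calls time.sleep, which the tests patch).
# A minimal sketch of that pattern. The parameter values and function name
# here are illustrative assumptions, not catwalk's actual settings:
import sqlalchemy
from retrying import retry


def retry_if_db_error(exception):
    return isinstance(exception, sqlalchemy.exc.OperationalError)


@retry(
    retry_on_exception=retry_if_db_error,
    wait_exponential_multiplier=1000,  # back off 1s, 2s, 4s, ... between attempts
    stop_max_attempt_number=14,        # then give up and re-raise
)
def save_to_db(db_engine, insert_statement):
    # any OperationalError raised here triggers a sleep and a retry
    db_engine.execute(insert_statement)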
def test_custom_groups(grid_config, db_engine_with_results_schema, project_storage):
    model_storage_engine = project_storage.model_storage_engine()
    experiment_hash = save_experiment_and_get_hash(
        config={'foo': 'bar'},
        db_engine=db_engine_with_results_schema,
    )
    run_id = initialize_tracking_and_get_run_id(
        experiment_hash,
        experiment_class_path="",
        random_seed=5,
        experiment_kwargs={},
        db_engine=db_engine_with_results_schema,
    )
    trainer = ModelTrainer(
        experiment_hash=experiment_hash,
        model_storage_engine=model_storage_engine,
        model_grouper=ModelGrouper(["class_path"]),
        db_engine=db_engine_with_results_schema,
        run_id=run_id,
    )
    # create training set
    model_ids = trainer.train_models(
        grid_config=grid_config,
        misc_db_parameters=dict(),
        matrix_store=get_matrix_store(project_storage),
    )
    # expect only one model group now
    records = [
        row[0] for row in db_engine_with_results_schema.execute(
            "select distinct model_group_id from triage_metadata.models")
    ]
    assert len(records) == 1
    assert records[0] == model_ids[0]
@pytest.fixture
def default_model_trainer(db_engine_with_results_schema, project_storage):
    model_storage_engine = project_storage.model_storage_engine()
    trainer = ModelTrainer(
        experiment_hash=None,
        model_storage_engine=model_storage_engine,
        db_engine=db_engine_with_results_schema,
        model_grouper=ModelGrouper(),
    )
    yield trainer
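# The grid_config fixture consumed throughout these tests is defined in the
# shared test utilities rather than shown here. A plausible sketch, assuming
# values consistent with the assertions elsewhere in this file (four parameter
# combinations in test_model_trainer, and a DecisionTreeClassifier entry whose
# min_samples_split list test_reuse_model_random_seeds extends):
import pytest


@pytest.fixture
def grid_config():
    return {
        "sklearn.tree.DecisionTreeClassifier": {
            "min_samples_split": [10, 100],  # 2 values
            "max_depth": [3, 5],             # x 2 values = 4 combinations
            "criterion": ["gini"],
        }
    }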
def test_model_grouping_default_config(sample_metadata):
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        ensure_db(engine)
        model_grouper = ModelGrouper()

        # get the basic first model group with our default matrix
        assert model_grouper.get_model_group_id(
            "module.Classifier", {"param1": "val1"}, sample_metadata, engine) == 1

        # the end time is not by default a model group key so changing it
        # should still get us the same group
        metadata_new_end_time = copy(sample_metadata)
        metadata_new_end_time["end_time"] = datetime.date(2017, 3, 20)
        assert model_grouper.get_model_group_id(
            "module.Classifier", {"param1": "val1"}, metadata_new_end_time, engine) == 1

        # max_training_history is a default key,
        # so it should trigger a new group
        metadata_train_history = copy(sample_metadata)
        metadata_train_history["max_training_history"] = "3y"
        assert model_grouper.get_model_group_id(
            "module.Classifier", {"param1": "val1"}, metadata_train_history, engine) == 2

        # classifier is of course a default key as well
        assert model_grouper.get_model_group_id(
            "module.OtherClassifier", {"param1": "val1"}, sample_metadata, engine) == 3
def test_model_grouping_custom_config(sample_metadata):
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        ensure_db(engine)
        model_grouper = ModelGrouper(
            model_group_keys=['feature_names', 'as_of_date_frequency'])

        # get the basic first model group with our default matrix
        assert model_grouper.get_model_group_id(
            'module.Classifier', {'param1': 'val1'}, sample_metadata, engine) == 1

        # classifier is now not a key, so changing it should not get a new id
        assert model_grouper.get_model_group_id(
            'module.OtherClassifier', {'param1': 'val1'}, sample_metadata, engine) == 1

        # as_of_date_frequency is a key,
        # so it should trigger a new group
        metadata_frequency = copy(sample_metadata)
        metadata_frequency['as_of_date_frequency'] = '2w'
        assert model_grouper.get_model_group_id(
            'module.Classifier', {'param1': 'val1'}, metadata_frequency, engine) == 2

        # testing feature names may seem redundant but it is on a separate
        # code path so make sure its logic works
        metadata_features = copy(sample_metadata)
        metadata_features['feature_names'] = ['ft1', 'ft3']
        assert model_grouper.get_model_group_id(
            'module.Classifier', {'param1': 'val1'}, metadata_features, engine) == 3
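# A hypothetical stand-in for the sample_metadata fixture the grouping tests
# depend on. The real fixture carries full matrix metadata; only the keys the
# assertions above touch are sketched here:
import datetime


def sample_metadata():
    return {
        "end_time": datetime.date(2016, 12, 20),  # not a default group key
        "max_training_history": "5y",             # a default group key
        "as_of_date_frequency": "1w",             # custom key in the test above
        "feature_names": ["ft1", "ft2"],          # handled on its own code path
        # ...plus the other matrix metadata ModelGrouper's default keys expect
    }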
def test_n_jobs_not_new_model(sample_matrix_store):
    grid_config = {
        'sklearn.ensemble.AdaBoostClassifier': {
            'n_estimators': [10, 100, 1000]
        },
        'sklearn.ensemble.RandomForestClassifier': {
            'n_estimators': [10, 100],
            'max_features': ['sqrt', 'log2'],
            'max_depth': [5, 10, 15, 20],
            'criterion': ['gini', 'entropy'],
            'n_jobs': [12, 24],
        }
    }
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        init_engine(db_engine)
        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')
            trainer = ModelTrainer(
                project_path='econ-dev/inspections',
                experiment_hash=None,
                model_storage_engine=S3ModelStorageEngine('econ-dev/inspections'),
                db_engine=db_engine,
                model_grouper=ModelGrouper())

            train_tasks = trainer.generate_train_tasks(
                grid_config,
                dict(),
                sample_matrix_store,
            )
            # Creates a matrix entry in the matrices table with uuid from train_metadata
            MatrixFactory(matrix_uuid="1234")
            session.commit()

            assert len(train_tasks) == 35  # 32+3, would be (32*2)+3 if we didn't remove
            assert len([
                task for task in train_tasks if 'n_jobs' in task['parameters']
            ]) == 32

            for train_task in train_tasks:
                trainer.process_train_task(**train_task)

            for row in db_engine.execute(
                    'select model_parameters from model_metadata.model_groups'):
                assert 'n_jobs' not in row[0]
def test_retry_max(self):
    db_engine = None
    trainer = None
    # set up a basic model training run
    with rig_engines() as (db_engine, project_storage):
        trainer = ModelTrainer(
            experiment_hash=None,
            model_storage_engine=project_storage.model_storage_engine(),
            db_engine=db_engine,
            model_grouper=ModelGrouper())
        matrix_store = get_matrix_store(project_storage)

    # the postgres server goes out of scope here and thus no longer exists
    with patch('time.sleep') as time_mock:
        with self.assertRaises(sqlalchemy.exc.OperationalError):
            trainer.train_models(grid_config(), dict(), matrix_store)
        # we want to make sure that we are using the retrying module sanely
        # as opposed to matching the exact # of calls specified by the code
        assert len(time_mock.mock_calls) > 5
def test_retry_recovery(self):
    db_engine = None
    trainer = None
    port = None
    with testing.postgresql.Postgresql() as postgresql:
        port = postgresql.settings['port']
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        init_engine(db_engine)
        trainer = ModelTrainer(
            project_path='econ-dev/inspections',
            experiment_hash=None,
            model_storage_engine=InMemoryModelStorageEngine(project_path=''),
            db_engine=db_engine,
            model_grouper=ModelGrouper())

    # start without a database server
    # then bring it back up after the first sleep
    # use self so it doesn't go out of scope too early and shut down
    self.new_server = None

    def replace_db(arg):
        self.new_server = testing.postgresql.Postgresql(port=port)
        db_engine = create_engine(self.new_server.url())
        ensure_db(db_engine)
        init_engine(db_engine)
        # Creates a matrix entry in the matrices table with uuid from train_metadata
        MatrixFactory(matrix_uuid="1234")
        session.commit()

    with patch('time.sleep') as time_mock:
        time_mock.side_effect = replace_db
        try:
            trainer.train_models(grid_config(), dict(), sample_matrix_store())
        finally:
            if self.new_server is not None:
                self.new_server.stop()
        assert len(time_mock.mock_calls) == 1
def test_baseline_exception_handling():
    grid_config = {
        'triage.component.catwalk.baselines.rankers.PercentileRankOneFeature': {
            'feature': ['feature_one', 'feature_three']
        }
    }
    with rig_engines() as (db_engine, project_storage):
        trainer = ModelTrainer(
            experiment_hash=None,
            model_storage_engine=project_storage.model_storage_engine(),
            db_engine=db_engine,
            model_grouper=ModelGrouper())

        train_tasks = trainer.generate_train_tasks(
            grid_config, dict(), get_matrix_store(project_storage))

        model_ids = []
        for train_task in train_tasks:
            model_ids.append(trainer.process_train_task(**train_task))
        assert model_ids == [1, None]
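# rig_engines and get_matrix_store come from the shared test utilities. A
# minimal sketch of what rig_engines is assumed to do, based on how the tests
# above unpack and use it (a throwaway Postgres plus a temporary ProjectStorage):
from contextlib import contextmanager
from tempfile import TemporaryDirectory

import testing.postgresql
from sqlalchemy import create_engine

from triage.component.catwalk.storage import ProjectStorage


@contextmanager
def rig_engines():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        init_engine(db_engine)
        with TemporaryDirectory() as temp_dir:
            yield db_engine, ProjectStorage(temp_dir)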
def test_custom_groups(grid_config):
    with rig_engines() as (db_engine, project_storage):
        # create training set
        model_storage_engine = project_storage.model_storage_engine()
        trainer = ModelTrainer(
            experiment_hash=None,
            model_storage_engine=model_storage_engine,
            model_grouper=ModelGrouper(['class_path']),
            db_engine=db_engine,
        )
        model_ids = trainer.train_models(
            grid_config=grid_config,
            misc_db_parameters=dict(),
            matrix_store=get_matrix_store(project_storage))
        # expect only one model group now
        records = [
            row[0] for row in db_engine.execute(
                'select distinct model_group_id from model_metadata.models')
        ]
        assert len(records) == 1
        assert records[0] == model_ids[0]
def test_custom_groups(grid_config, db_engine_with_results_schema, project_storage):
    model_storage_engine = project_storage.model_storage_engine()
    trainer = ModelTrainer(
        experiment_hash=None,
        model_storage_engine=model_storage_engine,
        model_grouper=ModelGrouper(["class_path"]),
        db_engine=db_engine_with_results_schema,
    )
    # create training set
    model_ids = trainer.train_models(
        grid_config=grid_config,
        misc_db_parameters=dict(),
        matrix_store=get_matrix_store(project_storage),
    )
    # expect only one model group now
    records = [
        row[0] for row in db_engine_with_results_schema.execute(
            "select distinct model_group_id from triage_metadata.models")
    ]
    assert len(records) == 1
    assert records[0] == model_ids[0]
def test_n_jobs_not_new_model():
    grid_config = {
        'sklearn.ensemble.AdaBoostClassifier': {
            'n_estimators': [10, 100, 1000]
        },
        'sklearn.ensemble.RandomForestClassifier': {
            'n_estimators': [10, 100],
            'max_features': ['sqrt', 'log2'],
            'max_depth': [5, 10, 15, 20],
            'criterion': ['gini', 'entropy'],
            'n_jobs': [12, 24],
        }
    }
    with rig_engines() as (db_engine, project_storage):
        model_storage_engine = project_storage.model_storage_engine()
        trainer = ModelTrainer(
            experiment_hash=None,
            model_storage_engine=model_storage_engine,
            db_engine=db_engine,
            model_grouper=ModelGrouper())
        train_tasks = trainer.generate_train_tasks(
            grid_config,
            dict(),
            get_matrix_store(project_storage),
        )

        assert len(train_tasks) == 35  # 32+3, would be (32*2)+3 if we didn't remove
        assert len([
            task for task in train_tasks if 'n_jobs' in task['parameters']
        ]) == 32

        for train_task in train_tasks:
            trainer.process_train_task(**train_task)

        for row in db_engine.execute(
                'select hyperparameters from model_metadata.model_groups'):
            assert 'n_jobs' not in row[0]
@pytest.fixture
def default_model_trainer(db_engine_with_results_schema, project_storage):
    model_storage_engine = project_storage.model_storage_engine()
    experiment_hash = save_experiment_and_get_hash(
        config={'foo': 'bar'},
        db_engine=db_engine_with_results_schema,
    )
    run_id = initialize_tracking_and_get_run_id(
        experiment_hash,
        experiment_class_path="",
        random_seed=5,
        experiment_kwargs={},
        db_engine=db_engine_with_results_schema,
    )
    trainer = ModelTrainer(
        experiment_hash=experiment_hash,
        model_storage_engine=model_storage_engine,
        db_engine=db_engine_with_results_schema,
        model_grouper=ModelGrouper(),
        run_id=run_id,
    )
    yield trainer
def test_n_jobs_not_new_model():
    grid_config = {
        "sklearn.ensemble.AdaBoostClassifier": {
            "n_estimators": [10, 100, 1000]
        },
        "sklearn.ensemble.RandomForestClassifier": {
            "n_estimators": [10, 100],
            "max_features": ["sqrt", "log2"],
            "max_depth": [5, 10, 15, 20],
            "criterion": ["gini", "entropy"],
            "n_jobs": [12, 24],
        },
    }
    with rig_engines() as (db_engine, project_storage):
        model_storage_engine = project_storage.model_storage_engine()
        trainer = ModelTrainer(
            experiment_hash=None,
            model_storage_engine=model_storage_engine,
            db_engine=db_engine,
            model_grouper=ModelGrouper(),
        )
        train_tasks = trainer.generate_train_tasks(
            grid_config, dict(), get_matrix_store(project_storage))

        assert len(train_tasks) == 35  # 32+3, would be (32*2)+3 if we didn't remove
        assert len([
            task for task in train_tasks if "n_jobs" in task["parameters"]
        ]) == 32

        for train_task in train_tasks:
            trainer.process_train_task(**train_task)

        for row in db_engine.execute(
                "select hyperparameters from model_metadata.model_groups"):
            assert "n_jobs" not in row[0]
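# Why these tests expect 35 tasks and n_jobs-free model groups: n_jobs only
# controls training parallelism, not the fitted model, so tasks differing only
# by n_jobs collapse into one, and n_jobs is excluded from the stored group
# hyperparameters (while staying in the task parameters so training can use
# it). A hypothetical sketch of that filtering; the real logic lives inside
# catwalk's ModelTrainer/ModelGrouper:
def strip_performance_parameters(parameters, ignored=("n_jobs",)):
    """Drop parameters that affect training speed but not the learned model."""
    return {key: value for key, value in parameters.items() if key not in ignored}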
def initialize_components(self):
    split_config = self.config['temporal_config']

    self.chopper = Timechop(
        feature_start_time=dt_from_str(split_config['feature_start_time']),
        feature_end_time=dt_from_str(split_config['feature_end_time']),
        label_start_time=dt_from_str(split_config['label_start_time']),
        label_end_time=dt_from_str(split_config['label_end_time']),
        model_update_frequency=split_config['model_update_frequency'],
        training_label_timespans=split_config['training_label_timespans'],
        test_label_timespans=split_config['test_label_timespans'],
        training_as_of_date_frequencies=split_config['training_as_of_date_frequencies'],
        test_as_of_date_frequencies=split_config['test_as_of_date_frequencies'],
        max_training_histories=split_config['max_training_histories'],
        test_durations=split_config['test_durations'],
    )

    cohort_config = self.config.get('cohort_config', {})
    if 'query' in cohort_config:
        self.state_table_generator = StateTableGeneratorFromQuery(
            experiment_hash=self.experiment_hash,
            db_engine=self.db_engine,
            query=cohort_config['query'])
    elif 'entities_table' in cohort_config:
        self.state_table_generator = StateTableGeneratorFromEntities(
            experiment_hash=self.experiment_hash,
            db_engine=self.db_engine,
            entities_table=cohort_config['entities_table'])
    elif 'dense_states' in cohort_config:
        self.state_table_generator = StateTableGeneratorFromDense(
            experiment_hash=self.experiment_hash,
            db_engine=self.db_engine,
            dense_state_table=cohort_config['dense_states']['table_name'])
    else:
        raise ValueError('Cohort config missing or unrecognized')

    self.label_generator = LabelGenerator(
        label_name=self.config['label_config'].get('name', None),
        query=self.config['label_config']['query'],
        db_engine=self.db_engine,
    )

    self.feature_dictionary_creator = FeatureDictionaryCreator(
        features_schema_name=self.features_schema_name,
        db_engine=self.db_engine,
    )

    self.feature_generator = FeatureGenerator(
        features_schema_name=self.features_schema_name,
        replace=self.replace,
        db_engine=self.db_engine,
        feature_start_time=split_config['feature_start_time'])

    self.feature_group_creator = FeatureGroupCreator(
        self.config.get('feature_group_definition', {'all': [True]}))

    self.feature_group_mixer = FeatureGroupMixer(
        self.config.get('feature_group_strategies', ['all']))

    self.planner = Planner(
        feature_start_time=dt_from_str(split_config['feature_start_time']),
        label_names=[
            self.config.get('label_config', {}).get('name', DEFAULT_LABEL_NAME)
        ],
        label_types=['binary'],
        matrix_directory=self.matrices_directory,
        cohort_name=self.config.get('cohort_config', {}).get('name', None),
        states=self.config.get('cohort_config', {})
        .get('dense_states', {})
        .get('state_filters', []),
        user_metadata=self.config.get('user_metadata', {}),
    )

    self.matrix_builder = HighMemoryCSVBuilder(
        db_config={
            'features_schema_name': self.features_schema_name,
            'labels_schema_name': 'public',
            'labels_table_name': self.labels_table_name,
            # TODO: have planner/builder take state table later on, so we
            # can grab it from the StateTableGenerator instead of
            # duplicating it here
            'sparse_state_table_name':
                'tmp_sparse_states_{}'.format(self.experiment_hash),
        },
        matrix_directory=self.matrices_directory,
        include_missing_labels_in_train_as=self.config['label_config'].get(
            'include_missing_labels_in_train_as', None),
        engine=self.db_engine,
        replace=self.replace)

    self.trainer = ModelTrainer(
        project_path=self.project_path,
        experiment_hash=self.experiment_hash,
        model_storage_engine=self.model_storage_engine,
        model_grouper=ModelGrouper(self.config.get('model_group_keys', [])),
        db_engine=self.db_engine,
        replace=self.replace)

    self.tester = ModelTester(
        model_storage_engine=self.model_storage_engine,
        project_path=self.project_path,
        replace=self.replace,
        db_engine=self.db_engine,
        individual_importance_config=self.config.get('individual_importance', {}),
        evaluator_config=self.config.get('scoring', {}))
def test_model_trainer(sample_matrix_store, grid_config):
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        init_engine(db_engine)
        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')

            # Creates a matrix entry in the matrices table with uuid from metadata above
            MatrixFactory(matrix_uuid="1234")
            session.commit()

            project_path = 'econ-dev/inspections'
            model_storage_engine = S3ModelStorageEngine(project_path)
            trainer = ModelTrainer(
                project_path=project_path,
                experiment_hash=None,
                model_storage_engine=model_storage_engine,
                model_grouper=ModelGrouper(),
                db_engine=db_engine,
            )
            model_ids = trainer.train_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=sample_matrix_store)

            # assert
            # 1. that the models and feature importances table entries are present
            records = [
                row for row in db_engine.execute(
                    'select * from train_results.feature_importances')
            ]
            assert len(records) == 4 * 2  # maybe exclude entity_id? yes

            records = [
                row for row in db_engine.execute(
                    'select model_hash from model_metadata.models')
            ]
            assert len(records) == 4
            hashes = [row[0] for row in records]

            # 2. that the model groups are distinct
            records = [
                row for row in db_engine.execute(
                    'select distinct model_group_id from model_metadata.models')
            ]
            assert len(records) == 4

            # 3. that the model sizes are saved in the table and all are < 1 kB
            records = [
                row for row in db_engine.execute(
                    'select model_size from model_metadata.models')
            ]
            assert len(records) == 4
            for i in records:
                size = i[0]
                assert size < 1

            # 4. that all four models are cached
            model_pickles = [
                model_storage_engine.get_store(model_hash).load()
                for model_hash in hashes
            ]
            assert len(model_pickles) == 4
            assert len([x for x in model_pickles if x is not None]) == 4

            # 5. that their results can have predictions made on it
            test_matrix = pandas.DataFrame.from_dict({
                'entity_id': [3, 4],
                'feature_one': [4, 4],
                'feature_two': [6, 5],
            })
            test_matrix = InMemoryMatrixStore(
                matrix=test_matrix, metadata=sample_metadata()).matrix
            for model_pickle in model_pickles:
                predictions = model_pickle.predict(test_matrix)
                assert len(predictions) == 2

            # 6. when run again, same models are returned
            new_model_ids = trainer.train_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=sample_matrix_store)
            assert len([
                row for row in db_engine.execute(
                    'select model_hash from model_metadata.models')
            ]) == 4
            assert model_ids == new_model_ids

            # 7. if replace is set, update non-unique attributes and feature importances
            max_batch_run_time = [
                row[0] for row in db_engine.execute(
                    'select max(batch_run_time) from model_metadata.models')
            ][0]
            trainer = ModelTrainer(
                project_path=project_path,
                experiment_hash=None,
                model_storage_engine=model_storage_engine,
                model_grouper=ModelGrouper(
                    model_group_keys=['label_name', 'label_timespan']),
                db_engine=db_engine,
                replace=True)
            new_model_ids = trainer.train_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=sample_matrix_store,
            )
            assert model_ids == new_model_ids
            assert [
                row['model_id'] for row in db_engine.execute(
                    'select model_id from model_metadata.models order by 1 asc')
            ] == model_ids
            new_max_batch_run_time = [
                row[0] for row in db_engine.execute(
                    'select max(batch_run_time) from model_metadata.models')
            ][0]
            assert new_max_batch_run_time > max_batch_run_time

            records = [
                row for row in db_engine.execute(
                    'select * from train_results.feature_importances')
            ]
            assert len(records) == 4 * 2  # maybe exclude entity_id? yes

            # 8. if the cache is missing but the metadata is still there, reuse the metadata
            for row in db_engine.execute(
                    'select model_hash from model_metadata.models'):
                model_storage_engine.get_store(row[0]).delete()
            new_model_ids = trainer.train_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=sample_matrix_store)
            assert model_ids == sorted(new_model_ids)

            # 9. that the generator interface works the same way
            new_model_ids = trainer.generate_trained_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=sample_matrix_store)
            assert model_ids == sorted([model_id for model_id in new_model_ids])
def test_reuse_model_random_seeds(grid_config, default_model_trainer):
    trainer = default_model_trainer
    db_engine = trainer.db_engine
    project_storage = trainer.model_storage_engine.project_storage
    model_storage_engine = trainer.model_storage_engine

    # re-using the random seeds requires the association between experiments and models
    # to exist, which we're not getting in these tests since we aren't using the experiment
    # architecture, so back-fill these associations after each train_models() run
    def update_experiment_models(db_engine):
        sql = """
            INSERT INTO triage_metadata.experiment_models (experiment_hash, model_hash)
            SELECT er.run_hash, m.model_hash
            FROM triage_metadata.models m
            LEFT JOIN triage_metadata.triage_runs er
                ON m.built_in_triage_run = er.id
            LEFT JOIN triage_metadata.experiment_models em
                ON m.model_hash = em.model_hash
                AND er.run_hash = em.experiment_hash
            WHERE em.experiment_hash IS NULL
        """
        db_engine.execute(sql)
        db_engine.execute('COMMIT;')

    random.seed(5)
    model_ids = trainer.train_models(
        grid_config=grid_config,
        misc_db_parameters=dict(),
        matrix_store=get_matrix_store(project_storage),
    )
    update_experiment_models(db_engine)

    # simulate running a new experiment where the experiment hash has changed
    # (e.g. because the model grid is different), but experiment seed is the
    # same, so previously-trained models should not get new seeds
    experiment_hash = save_experiment_and_get_hash(
        config={'baz': 'qux'},
        db_engine=db_engine,
    )
    run_id = initialize_tracking_and_get_run_id(
        experiment_hash,
        experiment_class_path="",
        random_seed=5,
        experiment_kwargs={},
        db_engine=db_engine,
    )
    trainer = ModelTrainer(
        experiment_hash=experiment_hash,
        model_storage_engine=model_storage_engine,
        db_engine=db_engine,
        model_grouper=ModelGrouper(),
        run_id=run_id,
    )
    new_grid = grid_config.copy()
    new_grid['sklearn.tree.DecisionTreeClassifier']['min_samples_split'] = [3, 10, 100]
    random.seed(5)
    new_model_ids = trainer.train_models(
        grid_config=new_grid,
        misc_db_parameters=dict(),
        matrix_store=get_matrix_store(project_storage),
    )
    update_experiment_models(db_engine)

    # should have received 6 models
    assert len(new_model_ids) == 6

    # all the original model ids should be in the new set
    assert len(set(new_model_ids) & set(model_ids)) == len(model_ids)

    # however, we should NOT re-use the random seeds (and so get new model_ids)
    # if the experiment-level seed is different
    experiment_hash = save_experiment_and_get_hash(
        config={'lorem': 'ipsum'},
        db_engine=db_engine,
    )
    run_id = initialize_tracking_and_get_run_id(
        experiment_hash,
        experiment_class_path="",
        random_seed=42,
        experiment_kwargs={},
        db_engine=db_engine,
    )
    trainer = ModelTrainer(
        experiment_hash=experiment_hash,
        model_storage_engine=model_storage_engine,
        db_engine=db_engine,
        model_grouper=ModelGrouper(),
        run_id=run_id,
    )
    random.seed(42)  # different from above
    newer_model_ids = trainer.train_models(
        grid_config=new_grid,
        misc_db_parameters=dict(),
        matrix_store=get_matrix_store(project_storage),
    )
    update_experiment_models(db_engine)

    # should get entirely new models now (different IDs)
    assert len(newer_model_ids) == 6
    assert len(set(new_model_ids) & set(newer_model_ids)) == 0
def test_model_trainer(grid_config, default_model_trainer):
    trainer = default_model_trainer
    db_engine = trainer.db_engine
    project_storage = trainer.model_storage_engine.project_storage
    model_storage_engine = trainer.model_storage_engine

    def set_test_seed():
        random.seed(5)

    set_test_seed()
    model_ids = trainer.train_models(
        grid_config=grid_config,
        misc_db_parameters=dict(),
        matrix_store=get_matrix_store(project_storage),
    )

    # assert
    # 1. that the models and feature importances table entries are present
    records = [
        row for row in db_engine.execute(
            "select * from train_results.feature_importances")
    ]
    assert len(records) == 4 * 2  # maybe exclude entity_id? yes

    records = [
        row for row in db_engine.execute(
            "select model_hash from triage_metadata.models")
    ]
    assert len(records) == 4
    hashes = [row[0] for row in records]

    # 2. that the model groups are distinct
    records = [
        row for row in db_engine.execute(
            "select distinct model_group_id from triage_metadata.models")
    ]
    assert len(records) == 4

    # 2b. that the random seeds are distinct
    records = [
        row for row in db_engine.execute(
            "select distinct random_seed from triage_metadata.models")
    ]
    assert len(records) == 4

    # 3. that the model sizes are saved in the table and all are < 1 kB
    records = [
        row for row in db_engine.execute(
            "select model_size from triage_metadata.models")
    ]
    assert len(records) == 4
    for i in records:
        size = i[0]
        assert size < 1

    # 4. that all four models are cached
    model_pickles = [
        model_storage_engine.load(model_hash) for model_hash in hashes
    ]
    assert len(model_pickles) == 4
    assert len([x for x in model_pickles if x is not None]) == 4

    # 5. that their results can have predictions made on it
    test_matrix = pd.DataFrame.from_dict({
        "entity_id": [3, 4],
        "feature_one": [4, 4],
        "feature_two": [6, 5],
    }).set_index("entity_id")
    for model_pickle in model_pickles:
        predictions = model_pickle.predict(test_matrix)
        assert len(predictions) == 2

    # 6. when run again with the same starting seed, same models are returned
    set_test_seed()
    new_model_ids = trainer.train_models(
        grid_config=grid_config,
        misc_db_parameters=dict(),
        matrix_store=get_matrix_store(project_storage),
    )
    assert len([
        row for row in db_engine.execute(
            "select model_hash from triage_metadata.models")
    ]) == 4
    assert model_ids == new_model_ids

    # 7. if replace is set, update non-unique attributes and feature importances
    max_batch_run_time = [
        row[0] for row in db_engine.execute(
            "select max(batch_run_time) from triage_metadata.models")
    ][0]
    trainer = ModelTrainer(
        experiment_hash=None,
        model_storage_engine=model_storage_engine,
        model_grouper=ModelGrouper(
            model_group_keys=["label_name", "label_timespan"]),
        db_engine=db_engine,
        replace=True,
    )
    set_test_seed()
    new_model_ids = trainer.train_models(
        grid_config=grid_config,
        misc_db_parameters=dict(),
        matrix_store=get_matrix_store(project_storage),
    )
    assert model_ids == new_model_ids
    assert [
        row["model_id"] for row in db_engine.execute(
            "select model_id from triage_metadata.models order by 1 asc")
    ] == model_ids
    new_max_batch_run_time = [
        row[0] for row in db_engine.execute(
            "select max(batch_run_time) from triage_metadata.models")
    ][0]
    assert new_max_batch_run_time > max_batch_run_time

    records = [
        row for row in db_engine.execute(
            "select * from train_results.feature_importances")
    ]
    assert len(records) == 4 * 2  # maybe exclude entity_id? yes

    # 8. if the cache is missing but the metadata is still there, reuse the metadata
    set_test_seed()
    for row in db_engine.execute(
            "select model_hash from triage_metadata.models"):
        model_storage_engine.delete(row[0])
    new_model_ids = trainer.train_models(
        grid_config=grid_config,
        misc_db_parameters=dict(),
        matrix_store=get_matrix_store(project_storage),
    )
    assert model_ids == sorted(new_model_ids)

    # 9. that the generator interface works the same way
    set_test_seed()
    new_model_ids = trainer.generate_trained_models(
        grid_config=grid_config,
        misc_db_parameters=dict(),
        matrix_store=get_matrix_store(project_storage),
    )
    assert model_ids == sorted([model_id for model_id in new_model_ids])
def test_model_trainer(grid_config):
    with rig_engines() as (db_engine, project_storage):
        # Creates a matrix entry in the matrices table with uuid from metadata above
        model_storage_engine = project_storage.model_storage_engine()
        trainer = ModelTrainer(
            experiment_hash=None,
            model_storage_engine=model_storage_engine,
            model_grouper=ModelGrouper(),
            db_engine=db_engine,
        )
        model_ids = trainer.train_models(
            grid_config=grid_config,
            misc_db_parameters=dict(),
            matrix_store=get_matrix_store(project_storage),
        )

        # assert
        # 1. that the models and feature importances table entries are present
        records = [
            row for row in db_engine.execute(
                'select * from train_results.feature_importances')
        ]
        assert len(records) == 4 * 2  # maybe exclude entity_id? yes

        records = [
            row for row in db_engine.execute(
                'select model_hash from model_metadata.models')
        ]
        assert len(records) == 4
        hashes = [row[0] for row in records]

        # 2. that the model groups are distinct
        records = [
            row for row in db_engine.execute(
                'select distinct model_group_id from model_metadata.models')
        ]
        assert len(records) == 4

        # 3. that the model sizes are saved in the table and all are < 1 kB
        records = [
            row for row in db_engine.execute(
                'select model_size from model_metadata.models')
        ]
        assert len(records) == 4
        for i in records:
            size = i[0]
            assert size < 1

        # 4. that all four models are cached
        model_pickles = [
            model_storage_engine.load(model_hash) for model_hash in hashes
        ]
        assert len(model_pickles) == 4
        assert len([x for x in model_pickles if x is not None]) == 4

        # 5. that their results can have predictions made on it
        test_matrix = pandas.DataFrame.from_dict({
            'entity_id': [3, 4],
            'feature_one': [4, 4],
            'feature_two': [6, 5],
        }).set_index('entity_id')
        for model_pickle in model_pickles:
            predictions = model_pickle.predict(test_matrix)
            assert len(predictions) == 2

        # 6. when run again, same models are returned
        new_model_ids = trainer.train_models(
            grid_config=grid_config,
            misc_db_parameters=dict(),
            matrix_store=get_matrix_store(project_storage))
        assert len([
            row for row in db_engine.execute(
                'select model_hash from model_metadata.models')
        ]) == 4
        assert model_ids == new_model_ids

        # 7. if replace is set, update non-unique attributes and feature importances
        max_batch_run_time = [
            row[0] for row in db_engine.execute(
                'select max(batch_run_time) from model_metadata.models')
        ][0]
        trainer = ModelTrainer(
            experiment_hash=None,
            model_storage_engine=model_storage_engine,
            model_grouper=ModelGrouper(
                model_group_keys=['label_name', 'label_timespan']),
            db_engine=db_engine,
            replace=True)
        new_model_ids = trainer.train_models(
            grid_config=grid_config,
            misc_db_parameters=dict(),
            matrix_store=get_matrix_store(project_storage))
        assert model_ids == new_model_ids
        assert [
            row['model_id'] for row in db_engine.execute(
                'select model_id from model_metadata.models order by 1 asc')
        ] == model_ids
        new_max_batch_run_time = [
            row[0] for row in db_engine.execute(
                'select max(batch_run_time) from model_metadata.models')
        ][0]
        assert new_max_batch_run_time > max_batch_run_time

        records = [
            row for row in db_engine.execute(
                'select * from train_results.feature_importances')
        ]
        assert len(records) == 4 * 2  # maybe exclude entity_id? yes

        # 8. if the cache is missing but the metadata is still there, reuse the metadata
        for row in db_engine.execute(
                'select model_hash from model_metadata.models'):
            model_storage_engine.delete(row[0])
        new_model_ids = trainer.train_models(
            grid_config=grid_config,
            misc_db_parameters=dict(),
            matrix_store=get_matrix_store(project_storage))
        assert model_ids == sorted(new_model_ids)

        # 9. that the generator interface works the same way
        new_model_ids = trainer.generate_trained_models(
            grid_config=grid_config,
            misc_db_parameters=dict(),
            matrix_store=get_matrix_store(project_storage))
        assert model_ids == sorted([model_id for model_id in new_model_ids])
def initialize_components(self):
    split_config = self.config['temporal_config']
    self.chopper = Timechop(**split_config)

    cohort_config = self.config.get('cohort_config', {})
    if 'query' in cohort_config:
        self.state_table_generator = StateTableGeneratorFromQuery(
            experiment_hash=self.experiment_hash,
            db_engine=self.db_engine,
            query=cohort_config['query']
        )
    elif 'entities_table' in cohort_config:
        self.state_table_generator = StateTableGeneratorFromEntities(
            experiment_hash=self.experiment_hash,
            db_engine=self.db_engine,
            entities_table=cohort_config['entities_table']
        )
    elif 'dense_states' in cohort_config:
        self.state_table_generator = StateTableGeneratorFromDense(
            experiment_hash=self.experiment_hash,
            db_engine=self.db_engine,
            dense_state_table=cohort_config['dense_states']['table_name']
        )
    else:
        logging.warning(
            'cohort_config missing or unrecognized. Without a cohort, you '
            'will not be able to make matrices or perform feature imputation.'
        )
        self.state_table_generator = StateTableGeneratorNoOp()

    if 'label_config' in self.config:
        self.label_generator = LabelGenerator(
            label_name=self.config['label_config'].get('name', None),
            query=self.config['label_config']['query'],
            db_engine=self.db_engine,
        )
    else:
        self.label_generator = LabelGeneratorNoOp()
        logging.warning(
            'label_config missing or unrecognized. Without labels, you '
            'will not be able to make matrices.'
        )

    self.feature_dictionary_creator = FeatureDictionaryCreator(
        features_schema_name=self.features_schema_name,
        db_engine=self.db_engine,
    )

    self.feature_generator = FeatureGenerator(
        features_schema_name=self.features_schema_name,
        replace=self.replace,
        db_engine=self.db_engine,
        feature_start_time=split_config['feature_start_time']
    )

    self.feature_group_creator = FeatureGroupCreator(
        self.config.get('feature_group_definition', {'all': [True]})
    )

    self.feature_group_mixer = FeatureGroupMixer(
        self.config.get('feature_group_strategies', ['all'])
    )

    self.planner = Planner(
        feature_start_time=dt_from_str(split_config['feature_start_time']),
        label_names=[
            self.config.get('label_config', {}).get('name', DEFAULT_LABEL_NAME)
        ],
        label_types=['binary'],
        cohort_name=self.config.get('cohort_config', {}).get('name', None),
        states=self.config.get('cohort_config', {})
        .get('dense_states', {})
        .get('state_filters', []),
        user_metadata=self.config.get('user_metadata', {}),
    )

    self.matrix_builder = MatrixBuilder(
        db_config={
            'features_schema_name': self.features_schema_name,
            'labels_schema_name': 'public',
            'labels_table_name': self.labels_table_name,
            # TODO: have planner/builder take state table later on, so we
            # can grab it from the StateTableGenerator instead of
            # duplicating it here
            'sparse_state_table_name': self.sparse_states_table_name,
        },
        matrix_storage_engine=self.matrix_storage_engine,
        include_missing_labels_in_train_as=self.config.get('label_config', {})
        .get('include_missing_labels_in_train_as', None),
        engine=self.db_engine,
        replace=self.replace
    )

    self.trainer = ModelTrainer(
        experiment_hash=self.experiment_hash,
        model_storage_engine=self.model_storage_engine,
        model_grouper=ModelGrouper(self.config.get('model_group_keys', [])),
        db_engine=self.db_engine,
        replace=self.replace
    )

    self.tester = ModelTester(
        model_storage_engine=self.model_storage_engine,
        matrix_storage_engine=self.matrix_storage_engine,
        replace=self.replace,
        db_engine=self.db_engine,
        individual_importance_config=self.config.get('individual_importance', {}),
        evaluator_config=self.config.get('scoring', {})
    )
def initialize_components(self):
    split_config = self.config["temporal_config"]
    self.chopper = Timechop(**split_config)

    cohort_config = self.config.get("cohort_config", {})
    if "query" in cohort_config:
        self.cohort_table_generator = CohortTableGenerator(
            cohort_table_name=self.cohort_table_name,
            db_engine=self.db_engine,
            query=cohort_config["query"],
            replace=self.replace,
        )
    else:
        logging.warning(
            "cohort_config missing or unrecognized. Without a cohort, "
            "you will not be able to make matrices or perform feature imputation."
        )
        self.cohort_table_generator = CohortTableGeneratorNoOp()

    if "label_config" in self.config:
        self.label_generator = LabelGenerator(
            label_name=self.config["label_config"].get("name", None),
            query=self.config["label_config"]["query"],
            replace=self.replace,
            db_engine=self.db_engine,
        )
    else:
        self.label_generator = LabelGeneratorNoOp()
        logging.warning(
            "label_config missing or unrecognized. Without labels, "
            "you will not be able to make matrices."
        )

    self.feature_dictionary_creator = FeatureDictionaryCreator(
        features_schema_name=self.features_schema_name, db_engine=self.db_engine
    )

    self.feature_generator = FeatureGenerator(
        features_schema_name=self.features_schema_name,
        replace=self.replace,
        db_engine=self.db_engine,
        feature_start_time=split_config["feature_start_time"],
        materialize_subquery_fromobjs=self.materialize_subquery_fromobjs,
    )

    self.feature_group_creator = FeatureGroupCreator(
        self.config.get("feature_group_definition", {"all": [True]})
    )

    self.feature_group_mixer = FeatureGroupMixer(
        self.config.get("feature_group_strategies", ["all"])
    )

    self.planner = Planner(
        feature_start_time=dt_from_str(split_config["feature_start_time"]),
        label_names=[
            self.config.get("label_config", {}).get("name", DEFAULT_LABEL_NAME)
        ],
        label_types=["binary"],
        cohort_names=[self.config.get("cohort_config", {}).get("name", None)],
        user_metadata=self.config.get("user_metadata", {}),
    )

    self.matrix_builder = MatrixBuilder(
        db_config={
            "features_schema_name": self.features_schema_name,
            "labels_schema_name": "public",
            "labels_table_name": self.labels_table_name,
            "cohort_table_name": self.cohort_table_name,
        },
        matrix_storage_engine=self.matrix_storage_engine,
        experiment_hash=self.experiment_hash,
        include_missing_labels_in_train_as=self.config.get("label_config", {}).get(
            "include_missing_labels_in_train_as", None
        ),
        engine=self.db_engine,
        replace=self.replace,
    )

    self.trainer = ModelTrainer(
        experiment_hash=self.experiment_hash,
        model_storage_engine=self.model_storage_engine,
        model_grouper=ModelGrouper(self.config.get("model_group_keys", [])),
        db_engine=self.db_engine,
        replace=self.replace,
    )

    self.tester = ModelTester(
        model_storage_engine=self.model_storage_engine,
        matrix_storage_engine=self.matrix_storage_engine,
        replace=self.replace,
        db_engine=self.db_engine,
        individual_importance_config=self.config.get("individual_importance", {}),
        evaluator_config=self.config.get("scoring", {}),
    )
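# A minimal temporal_config that the Timechop(**split_config) call above would
# accept. The keys mirror the explicit keyword arguments in the oldest version
# of initialize_components; the values are illustrative only:
temporal_config = {
    "feature_start_time": "2010-01-01",
    "feature_end_time": "2018-01-01",
    "label_start_time": "2012-01-01",
    "label_end_time": "2018-01-01",
    "model_update_frequency": "1y",
    "training_label_timespans": ["6month"],
    "test_label_timespans": ["6month"],
    "training_as_of_date_frequencies": ["1day"],
    "test_as_of_date_frequencies": ["3month"],
    "max_training_histories": ["1y"],
    "test_durations": ["3month"],
}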