def __init__(
    self,
    config,
    db_engine,
    project_path=None,
    matrix_storage_class=CSVMatrixStore,
    replace=True,
    cleanup=False,
    cleanup_timeout=None,
    materialize_subquery_fromobjs=True,
    features_ignore_cohort=False,
    profile=False,
    save_predictions=True,
):
    self._check_config_version(config)
    self.config = config

    self.project_storage = ProjectStorage(project_path)
    self.model_storage_engine = ModelStorageEngine(self.project_storage)
    self.matrix_storage_engine = MatrixStorageEngine(
        self.project_storage, matrix_storage_class)
    self.project_path = project_path
    self.replace = replace
    self.save_predictions = save_predictions
    self.db_engine = db_engine
    upgrade_db(db_engine=self.db_engine)

    self.features_schema_name = "features"
    self.materialize_subquery_fromobjs = materialize_subquery_fromobjs
    self.features_ignore_cohort = features_ignore_cohort
    self.experiment_hash = save_experiment_and_get_hash(
        self.config, self.db_engine)
    self.initialize_components()

    self.cleanup = cleanup
    if self.cleanup:
        logging.info(
            "cleanup is set to True, so intermediate tables (labels and cohort) "
            "will be removed after matrix creation and subset tables will be "
            "removed after model training and testing")
    else:
        logging.info(
            "cleanup is set to False, so intermediate tables (labels, cohort, and subsets) "
            "will not be removed")
    self.cleanup_timeout = (self.cleanup_timeout
                            if cleanup_timeout is None else cleanup_timeout)
    self.profile = profile
    logging.info("Generate profiling stats? (profile option): %s", self.profile)
def test_ModelStorageEngine_caching(project_storage):
    mse = ModelStorageEngine(project_storage)
    with mse.cache_models():
        mse.write("testobject", "myhash")
        with mock.patch.object(mse, "_get_store") as get_store_mock:
            assert mse.load("myhash") == "testobject"
            assert not get_store_mock.called
        assert "myhash" in mse.cache
    # when cache_models goes out of scope the cache should be empty
    assert "myhash" not in mse.cache
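# --- Hedged usage sketch (not from the source) -------------------------------
# The test above exercises ModelStorageEngine.cache_models(), which keeps loaded
# models in memory for the duration of the context manager. A minimal sketch of
# how that might be used when scoring repeatedly with one persisted model,
# assuming the import path triage.component.catwalk.storage and the write/load
# API shown in the tests; the project path and model hash are placeholders.
import numpy as np
from sklearn.linear_model import LogisticRegression

from triage.component.catwalk.storage import ModelStorageEngine, ProjectStorage

X = np.array([[0.0, 1.0], [1.0, 0.0], [1.0, 1.0], [0.0, 0.0]])
y = np.array([0, 1, 1, 0])
model = LogisticRegression().fit(X, y)

project_storage = ProjectStorage("/tmp/triage-example")  # placeholder path
mse = ModelStorageEngine(project_storage)
mse.write(model, "myhash")  # persist the model once

with mse.cache_models():
    loaded = mse.load("myhash")  # first load populates the in-memory cache
    loaded = mse.load("myhash")  # later loads are served from the cache
    print(loaded.predict_proba(X))
# the cache is emptied once the context manager exits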
def __init__(
    self,
    config,
    db_engine,
    project_path=None,
    matrix_storage_class=CSVMatrixStore,
    replace=True,
    cleanup=False,
    cleanup_timeout=None,
):
    self._check_config_version(config)
    self.config = config

    if isinstance(db_engine, Engine):
        logging.warning(
            "Raw, unserializable SQLAlchemy engine passed. "
            "URL will be used, other options may be lost in multi-process environments"
        )
        self.db_engine = create_engine(db_engine.url)
    else:
        self.db_engine = db_engine

    self.project_storage = ProjectStorage(project_path)
    self.model_storage_engine = ModelStorageEngine(self.project_storage)
    self.matrix_storage_engine = MatrixStorageEngine(
        self.project_storage, matrix_storage_class)
    self.project_path = project_path
    self.replace = replace
    upgrade_db(db_engine=self.db_engine)

    self.features_schema_name = "features"
    self.experiment_hash = save_experiment_and_get_hash(
        self.config, self.db_engine)
    self.labels_table_name = "labels_{}".format(self.experiment_hash)
    self.initialize_components()

    self.cleanup = cleanup
    if self.cleanup:
        logging.info(
            "cleanup is set to True, so intermediate tables (labels and states) "
            "will be removed after matrix creation")
    else:
        logging.info(
            "cleanup is set to False, so intermediate tables (labels and states) "
            "will not be removed after matrix creation")
    self.cleanup_timeout = (self.cleanup_timeout
                            if cleanup_timeout is None else cleanup_timeout)
def test_ModelTrainTester_generate_tasks(db_engine_with_results_schema,
                                         project_storage,
                                         sample_timechop_splits,
                                         sample_grid_config):
    db_engine = db_engine_with_results_schema
    model_storage_engine = ModelStorageEngine(project_storage)
    matrix_storage_engine = MatrixStorageEngine(project_storage)
    sample_matrix_store = get_matrix_store(project_storage)
    experiment_hash = save_experiment_and_get_hash({}, db_engine)
    run_id = initialize_tracking_and_get_run_id(
        experiment_hash,
        experiment_class_path="",
        random_seed=5,
        experiment_kwargs={},
        db_engine=db_engine_with_results_schema)

    # instantiate pipeline objects
    trainer = ModelTrainer(
        experiment_hash=experiment_hash,
        model_storage_engine=model_storage_engine,
        db_engine=db_engine,
        run_id=run_id,
    )
    train_tester = ModelTrainTester(
        matrix_storage_engine=matrix_storage_engine,
        model_trainer=trainer,
        model_evaluator=None,
        individual_importance_calculator=None,
        predictor=None,
        subsets=None,
        protected_groups_generator=None,
    )
    with patch.object(matrix_storage_engine, 'get_store',
                      return_value=sample_matrix_store):
        batches = train_tester.generate_task_batches(
            splits=sample_timechop_splits,
            grid_config=sample_grid_config)
        assert len(batches) == 3
        # we expect to have a task for each combination of split and classifier
        flattened_tasks = list(task for batch in batches for task in batch.tasks)
        assert len(flattened_tasks) == \
            len(sample_timechop_splits) * len(list(flatten_grid_config(sample_grid_config)))
        # we also expect each task to match the call signature of process_task
        with patch.object(train_tester, 'process_task', autospec=True):
            for task in flattened_tasks:
                train_tester.process_task(**task)
def __init__(
    self,
    config,
    db_engine,
    project_path=None,
    matrix_storage_class=CSVMatrixStore,
    replace=True,
    cleanup=False,
    cleanup_timeout=None,
    materialize_subquery_fromobjs=True,
    features_ignore_cohort=False,
    profile=False,
    save_predictions=True,
    skip_validation=False,
    partial_run=False,
):
    # For a partial run, skip validation and avoid cleaning up
    # we'll also skip filling default config values below
    if partial_run:
        cleanup = False
        skip_validation = True

    experiment_kwargs = bind_kwargs(
        self.__class__,
        **{
            key: value
            for (key, value) in locals().items()
            if key not in {'db_engine', 'config', 'self'}
        })

    self._check_config_version(config)
    self.config = config

    self.config['random_seed'] = self.config.get('random_seed',
                                                 random.randint(1, 1e7))
    random.seed(self.config['random_seed'])

    self.project_storage = ProjectStorage(project_path)
    self.model_storage_engine = ModelStorageEngine(self.project_storage)
    self.matrix_storage_engine = MatrixStorageEngine(
        self.project_storage, matrix_storage_class)
    self.project_path = project_path
    self.replace = replace
    self.save_predictions = save_predictions
    self.skip_validation = skip_validation
    self.db_engine = db_engine
    results_schema.upgrade_if_clean(dburl=self.db_engine.url)

    self.features_schema_name = "features"
    self.materialize_subquery_fromobjs = materialize_subquery_fromobjs
    self.features_ignore_cohort = features_ignore_cohort

    # only fill default values for full runs
    if not partial_run:
        ## Defaults to sane values
        self.config['temporal_config'] = fill_timechop_config_missing(
            self.config, self.db_engine)
        ## Defaults to all the entities found in the features_aggregation's from_obj
        self.config['cohort_config'] = fill_cohort_config_missing(self.config)
        ## Defaults to all the feature_aggregation's prefixes
        self.config['feature_group_definition'] = fill_feature_group_definition(
            self.config)

    grid_config = fill_model_grid_presets(self.config)
    self.config.pop('model_grid_preset', None)
    if grid_config is not None:
        self.config['grid_config'] = grid_config

    ###################### RUBICON ######################

    self.experiment_hash = save_experiment_and_get_hash(
        self.config, self.db_engine)
    self.run_id = initialize_tracking_and_get_run_id(
        self.experiment_hash,
        experiment_class_path=classpath(self.__class__),
        experiment_kwargs=experiment_kwargs,
        db_engine=self.db_engine)
    self.initialize_components()

    self.cleanup = cleanup
    if self.cleanup:
        logging.info(
            "cleanup is set to True, so intermediate tables (labels and cohort) "
            "will be removed after matrix creation and subset tables will be "
            "removed after model training and testing")
    else:
        logging.info(
            "cleanup is set to False, so intermediate tables (labels, cohort, and subsets) "
            "will not be removed")
    self.cleanup_timeout = (self.cleanup_timeout
                            if cleanup_timeout is None else cleanup_timeout)
    self.profile = profile
    logging.info("Generate profiling stats? (profile option): %s", self.profile)
def test_ModelStorageEngine_nocaching(project_storage):
    mse = ModelStorageEngine(project_storage)
    mse.write("testobject", "myhash")
    assert mse.exists("myhash")
    assert mse.load("myhash") == "testobject"
    assert "myhash" not in mse.cache
def __init__(
    self,
    config,
    db_engine,
    project_path=None,
    matrix_storage_class=CSVMatrixStore,
    replace=True,
    cleanup=False,
    cleanup_timeout=None,
    materialize_subquery_fromobjs=True,
    features_ignore_cohort=False,
    additional_bigtrain_classnames=None,
    profile=False,
    save_predictions=True,
    skip_validation=False,
    partial_run=False,
):
    # For a partial run, skip validation and avoid cleaning up
    # we'll also skip filling default config values below
    if partial_run:
        cleanup = False
        skip_validation = True

    experiment_kwargs = bind_kwargs(
        self.__class__,
        **{
            key: value
            for (key, value) in locals().items()
            if key not in {"db_engine", "config", "self"}
        },
    )

    self._check_config_version(config)
    self.config = config

    if self.config.get("cohort_config") is not None:
        self.config["cohort_config"] = load_query_if_needed(
            self.config["cohort_config"]
        )
    if self.config.get("label_config") is not None:
        self.config["label_config"] = load_query_if_needed(
            self.config["label_config"]
        )

    self.project_storage = ProjectStorage(project_path)
    self.model_storage_engine = ModelStorageEngine(self.project_storage)
    self.matrix_storage_engine = MatrixStorageEngine(
        self.project_storage, matrix_storage_class
    )
    self.project_path = project_path
    logger.verbose(
        f"Matrices and trained models will be saved in {self.project_path}"
    )

    self.replace = replace
    if self.replace:
        logger.notice(
            f"Replace flag is set to true. Matrices, models, "
            "evaluations and predictions (if they exist) will be replaced"
        )

    self.save_predictions = save_predictions
    if not self.save_predictions:
        logger.notice(
            f"Save predictions flag is set to false. "
            "Individual predictions won't be stored in the predictions "
            "table. This will decrease both the running time "
            "of an experiment and also decrease the space needed in the db"
        )

    self.skip_validation = skip_validation
    if self.skip_validation:
        logger.notice(
            f"Warning: Skip validation flag is set to true. "
            "The experiment config file specified won't be validated. "
            "This will reduce (a little) the running time of the experiment, "
            "but has some potential risks, e.g. the experiment could fail "
            "after some time due to some misconfiguration. Proceed with care."
        )

    self.db_engine = db_engine
    results_schema.upgrade_if_clean(dburl=self.db_engine.url)

    self.features_schema_name = "features"

    self.materialize_subquery_fromobjs = materialize_subquery_fromobjs
    if not self.materialize_subquery_fromobjs:
        logger.notice(
            "Materialize from_objs is set to false. "
            "The from_objs will be calculated on the fly every time."
        )

    self.features_ignore_cohort = features_ignore_cohort
    if self.features_ignore_cohort:
        logger.notice(
            "Features will be calculated for all the entities "
            "(i.e. ignoring cohort) this setting will have the effect "
            "that more db space will be used, but potentially could save "
            "time if you are running several similar experiments with "
            "different cohorts."
        )

    self.additional_bigtrain_classnames = additional_bigtrain_classnames

    # only fill default values for full runs
    if not partial_run:
        ## Defaults to sane values
        self.config["temporal_config"] = fill_timechop_config_missing(
            self.config, self.db_engine
        )
        ## Defaults to all the entities found in the features_aggregation's from_obj
        self.config["cohort_config"] = fill_cohort_config_missing(self.config)
        ## Defaults to all the feature_aggregation's prefixes
        self.config["feature_group_definition"] = fill_feature_group_definition(
            self.config
        )

    grid_config = fill_model_grid_presets(self.config)
    self.config.pop("model_grid_preset", None)
    if grid_config is not None:
        self.config["grid_config"] = grid_config

    if not self.config.get("random_seed", None):
        logger.notice(
            "Random seed not specified. A random seed will be provided. "
            "This could have interesting side effects, "
            "e.g. new models per model group are trained, "
            "tested and evaluated everytime that you run this experiment configuration"
        )

    self.random_seed = self.config.pop("random_seed", random.randint(1, 1e7))
    logger.verbose(
        f"Using random seed [{self.random_seed}] for running the experiment"
    )
    random.seed(self.random_seed)

    ###################### RUBICON ######################

    self.experiment_hash = save_experiment_and_get_hash(self.config, self.db_engine)
    logger.debug(f"Experiment hash [{self.experiment_hash}] assigned")
    self.run_id = initialize_tracking_and_get_run_id(
        self.experiment_hash,
        experiment_class_path=classpath(self.__class__),
        random_seed=self.random_seed,
        experiment_kwargs=experiment_kwargs,
        db_engine=self.db_engine,
    )
    logger.debug(f"Experiment run id [{self.run_id}] assigned")
    self.initialize_components()

    self.cleanup = cleanup
    if self.cleanup:
        logger.notice(
            "Cleanup is set to true, so intermediate tables (labels and cohort) "
            "will be removed after matrix creation and subset tables will be "
            "removed after model training and testing"
        )
    self.cleanup_timeout = (
        self.cleanup_timeout if cleanup_timeout is None else cleanup_timeout
    )
    self.profile = profile
    if self.profile:
        logger.spam("Profiling will be stored using cProfile")
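# --- Hedged usage sketch (not from the source) -------------------------------
# The constructor above is normally reached through a concrete Experiment
# subclass. A minimal sketch, assuming the documented triage entry point
# (triage.experiments.SingleThreadedExperiment) with a placeholder config file
# and database URL; the keyword arguments simply mirror the options defined above.
import yaml
from sqlalchemy import create_engine

from triage.experiments import SingleThreadedExperiment

with open("experiment_config.yaml") as f:  # placeholder config file
    experiment_config = yaml.safe_load(f)

experiment = SingleThreadedExperiment(
    config=experiment_config,
    db_engine=create_engine("postgresql://user:pass@localhost/triage"),  # placeholder URL
    project_path="/tmp/triage-project",  # where matrices and models are written
    replace=False,          # reuse existing matrices/models instead of rebuilding
    save_predictions=True,  # store individual predictions in the predictions table
    cleanup=True,           # drop intermediate labels/cohort tables when done
)
experiment.run()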
def __init__(
    self,
    config,
    db_engine,
    project_path=None,
    matrix_storage_class=CSVMatrixStore,
    replace=True,
    cleanup=False,
    cleanup_timeout=None,
    materialize_subquery_fromobjs=True,
    features_ignore_cohort=False,
    profile=False,
    save_predictions=True,
    skip_validation=False,
):
    experiment_kwargs = bind_kwargs(
        self.__class__,
        **{
            key: value
            for (key, value) in locals().items()
            if key not in {'db_engine', 'config', 'self'}
        })

    self._check_config_version(config)
    self.config = config

    random.seed(config['random_seed'])

    self.project_storage = ProjectStorage(project_path)
    self.model_storage_engine = ModelStorageEngine(self.project_storage)
    self.matrix_storage_engine = MatrixStorageEngine(
        self.project_storage, matrix_storage_class)
    self.project_path = project_path
    self.replace = replace
    self.save_predictions = save_predictions
    self.skip_validation = skip_validation
    self.db_engine = db_engine
    results_schema.upgrade_if_clean(dburl=self.db_engine.url)

    self.features_schema_name = "features"
    self.materialize_subquery_fromobjs = materialize_subquery_fromobjs
    self.features_ignore_cohort = features_ignore_cohort
    self.experiment_hash = save_experiment_and_get_hash(
        self.config, self.db_engine)
    self.run_id = initialize_tracking_and_get_run_id(
        self.experiment_hash,
        experiment_class_path=classpath(self.__class__),
        experiment_kwargs=experiment_kwargs,
        db_engine=self.db_engine)
    self.initialize_components()

    self.cleanup = cleanup
    if self.cleanup:
        logging.info(
            "cleanup is set to True, so intermediate tables (labels and cohort) "
            "will be removed after matrix creation and subset tables will be "
            "removed after model training and testing")
    else:
        logging.info(
            "cleanup is set to False, so intermediate tables (labels, cohort, and subsets) "
            "will not be removed")
    self.cleanup_timeout = (self.cleanup_timeout
                            if cleanup_timeout is None else cleanup_timeout)
    self.profile = profile
    logging.info("Generate profiling stats? (profile option): %s", self.profile)
def __init__(self, db_engine, project_path, model_group_id):
    self.retrain_hash = None
    self.db_engine = db_engine
    upgrade_db(db_engine=self.db_engine)
    self.project_storage = ProjectStorage(project_path)
    self.model_group_id = model_group_id
    self.model_group_info = get_model_group_info(self.db_engine,
                                                 self.model_group_id)
    self.matrix_storage_engine = self.project_storage.matrix_storage_engine()
    self.triage_run_id, self.experiment_config = experiment_config_from_model_group_id(
        self.db_engine, self.model_group_id)

    # This feels like it needs some refactoring since in some edge cases at least the test matrix temporal parameters
    # might differ across models in the model group (the training ones shouldn't), but this should probably work for
    # the vast majority of use cases...
    self.experiment_config['temporal_config'].update(
        temporal_params_from_matrix_metadata(
            self.db_engine, self.model_group_info['model_id_last_split']))

    # Since "testing" here is predicting forward to a single new date, the test_duration should always be '0day'
    # (regardless of what it may have been before)
    self.experiment_config['temporal_config']['test_durations'] = ['0day']

    # These lists should now only contain one item (the value actually used for the last model in this group)
    self.training_label_timespan = self.experiment_config['temporal_config'][
        'training_label_timespans'][0]
    self.test_label_timespan = self.experiment_config['temporal_config'][
        'test_label_timespans'][0]
    self.test_duration = self.experiment_config['temporal_config'][
        'test_durations'][0]
    self.feature_start_time = self.experiment_config['temporal_config'][
        'feature_start_time']

    self.label_name = self.experiment_config['label_config']['name']
    self.cohort_name = self.experiment_config['cohort_config']['name']
    self.user_metadata = self.experiment_config['user_metadata']

    self.feature_dictionary_creator = FeatureDictionaryCreator(
        features_schema_name='triage_production', db_engine=self.db_engine)
    self.label_generator = LabelGenerator(
        label_name=self.experiment_config['label_config'].get("name", None),
        query=self.experiment_config['label_config']["query"],
        replace=True,
        db_engine=self.db_engine,
    )
    self.labels_table_name = "labels_{}_{}_production".format(
        self.experiment_config['label_config'].get('name', 'default'),
        filename_friendly_hash(
            self.experiment_config['label_config']['query']))
    self.feature_generator = FeatureGenerator(
        db_engine=self.db_engine,
        features_schema_name="triage_production",
        feature_start_time=self.feature_start_time,
    )
    self.model_trainer = ModelTrainer(
        experiment_hash=None,
        model_storage_engine=ModelStorageEngine(self.project_storage),
        db_engine=self.db_engine,
        replace=True,
        run_id=self.triage_run_id,
    )
def test_integration():
    with rig_engines() as (db_engine, project_storage):
        train_store = get_matrix_store(
            project_storage,
            matrix_creator(),
            matrix_metadata_creator(matrix_type="train"),
        )
        as_of_dates = [datetime.date(2016, 12, 21), datetime.date(2017, 1, 21)]

        test_stores = []
        for as_of_date in as_of_dates:
            matrix_store = get_matrix_store(
                project_storage,
                pandas.DataFrame.from_dict({
                    "entity_id": [3],
                    "feature_one": [8],
                    "feature_two": [5],
                    "label": [0],
                }).set_index("entity_id"),
                matrix_metadata_creator(end_time=as_of_date, indices=["entity_id"]),
            )
            test_stores.append(matrix_store)

        model_storage_engine = ModelStorageEngine(project_storage)

        experiment_hash = save_experiment_and_get_hash({}, db_engine)

        # instantiate pipeline objects
        trainer = ModelTrainer(
            experiment_hash=experiment_hash,
            model_storage_engine=model_storage_engine,
            db_engine=db_engine,
        )
        predictor = Predictor(model_storage_engine, db_engine)
        model_evaluator = ModelEvaluator([{
            "metrics": ["precision@"],
            "thresholds": {
                "top_n": [5]
            }
        }], [{}], db_engine)

        # run the pipeline
        grid_config = {
            "sklearn.linear_model.LogisticRegression": {
                "C": [0.00001, 0.0001],
                "penalty": ["l1", "l2"],
                "random_state": [2193],
            }
        }
        model_ids = trainer.train_models(grid_config=grid_config,
                                         misc_db_parameters=dict(),
                                         matrix_store=train_store)

        for model_id in model_ids:
            for as_of_date, test_store in zip(as_of_dates, test_stores):
                predictions_proba = predictor.predict(
                    model_id,
                    test_store,
                    misc_db_parameters=dict(),
                    train_matrix_columns=["feature_one", "feature_two"],
                )
                model_evaluator.evaluate(predictions_proba, test_store, model_id)

        # assert
        # 1. that the predictions table entries are present and
        # can be linked to the original models
        records = [
            row for row in db_engine.execute(
                """select entity_id, model_id, as_of_date
                from test_results.predictions
                join model_metadata.models using (model_id)
                order by 3, 2""")
        ]
        assert records == [
            (3, 1, datetime.datetime(2016, 12, 21)),
            (3, 2, datetime.datetime(2016, 12, 21)),
            (3, 3, datetime.datetime(2016, 12, 21)),
            (3, 4, datetime.datetime(2016, 12, 21)),
            (3, 1, datetime.datetime(2017, 1, 21)),
            (3, 2, datetime.datetime(2017, 1, 21)),
            (3, 3, datetime.datetime(2017, 1, 21)),
            (3, 4, datetime.datetime(2017, 1, 21)),
        ]

        # that evaluations are there
        records = [
            row for row in db_engine.execute("""
                select model_id, evaluation_start_time, metric, parameter
                from test_results.evaluations order by 2, 1""")
        ]
        assert records == [
            (1, datetime.datetime(2016, 12, 21), "precision@", "5_abs"),
            (2, datetime.datetime(2016, 12, 21), "precision@", "5_abs"),
            (3, datetime.datetime(2016, 12, 21), "precision@", "5_abs"),
            (4, datetime.datetime(2016, 12, 21), "precision@", "5_abs"),
            (1, datetime.datetime(2017, 1, 21), "precision@", "5_abs"),
            (2, datetime.datetime(2017, 1, 21), "precision@", "5_abs"),
            (3, datetime.datetime(2017, 1, 21), "precision@", "5_abs"),
            (4, datetime.datetime(2017, 1, 21), "precision@", "5_abs"),
        ]