def test_filename_friendly_hash():
    """The hash of a dict is a word-character string, independent of key order."""
    payload = {
        "stuff": "stuff",
        "other_stuff": "more_stuff",
        "a_datetime": datetime.datetime(2015, 1, 1),
        "a_date": datetime.date(2016, 1, 1),
        "a_number": 5.0,
    }
    hashed = filename_friendly_hash(payload)
    assert isinstance(hashed, str)
    assert re.match(r"^[\w]+$", hashed) is not None

    # Reordering the keys must not change the hash.
    reordered = filename_friendly_hash({
        "other_stuff": "more_stuff",
        "stuff": "stuff",
        "a_datetime": datetime.datetime(2015, 1, 1),
        "a_date": datetime.date(2016, 1, 1),
        "a_number": 5.0,
    })
    assert reordered == hashed

    # Different data must hash to something different.
    different = filename_friendly_hash({"stuff": "stuff", "a_number": 5.0})
    assert different != hashed
def test_filename_friendly_hash():
    """filename_friendly_hash returns a word-character string that is stable
    under key reordering and sensitive to the data being hashed."""
    data = {
        'stuff': 'stuff',
        'other_stuff': 'more_stuff',
        'a_datetime': datetime.datetime(2015, 1, 1),
        'a_date': datetime.date(2016, 1, 1),
        'a_number': 5.0
    }
    output = filename_friendly_hash(data)
    assert isinstance(output, str)
    # raw string: '\w' in a plain literal is an invalid escape sequence
    # (SyntaxWarning on modern CPython, eventually an error)
    assert re.match(r'^[\w]+$', output) is not None
    # make sure ordering keys differently doesn't change the hash
    new_output = filename_friendly_hash({
        'other_stuff': 'more_stuff',
        'stuff': 'stuff',
        'a_datetime': datetime.datetime(2015, 1, 1),
        'a_date': datetime.date(2016, 1, 1),
        'a_number': 5.0
    })
    assert new_output == output
    # make sure new data hashes to something different
    new_output = filename_friendly_hash({'stuff': 'stuff', 'a_number': 5.0})
    assert new_output != output
def test_filename_friendly_hash_stability():
    """The digest must be reproducible across runs and nested-key orderings."""
    nested = {"one": "two", "three": {"four": "five", "six": "seven"}}
    first = filename_friendly_hash(nested)
    # 1. we want to make sure this is stable across different runs
    # so hardcode an expected value
    assert first == "9a844a7ebbfd821010b1c2c13f7391e6"
    # 2. reordering keys inside a nested dict must not change the digest
    shuffled = {"one": "two", "three": {"six": "seven", "four": "five"}}
    assert filename_friendly_hash(shuffled) == first
def test_filename_friendly_hash_stability():
    """Hashing must yield the same digest across runs and nested orderings."""
    data = {'one': 'two', 'three': {'four': 'five', 'six': 'seven'}}
    digest = filename_friendly_hash(data)
    # 1. we want to make sure this is stable across different runs
    # so hardcode an expected value
    assert digest == '9a844a7ebbfd821010b1c2c13f7391e6'
    # keys of the inner dict reordered: digest must be unchanged
    shuffled = {
        'one': 'two',
        'three': {
            'six': 'seven',
            'four': 'five'
        }
    }
    assert digest == filename_friendly_hash(shuffled)
def save_retrain_and_get_hash(config, db_engine):
    """Persist a Retrain record keyed by the hash of its config.

    Args:
        config (dict): retrain configuration to hash and store
        db_engine (sqlalchemy.engine.Engine): database engine

    Returns:
        str: the filename-friendly hash of the config
    """
    retrain_hash = filename_friendly_hash(config)
    session = sessionmaker(bind=db_engine)()
    try:
        # merge (not add) so reruns with the same config are idempotent
        session.merge(Retrain(retrain_hash=retrain_hash, config=config))
        session.commit()
    finally:
        # close even when commit raises, so the connection isn't leaked
        session.close()
    return retrain_hash
def get_matrix_store(project_storage, matrix=None, metadata=None, write_to_db=True):
    """Return a matrix store associated with the given project storage.
    Also adds an entry in the matrices table if it doesn't exist already

    Args:
        project_storage (triage.component.catwalk.storage.ProjectStorage) A project's storage
        matrix (dataframe, optional): A matrix to store. Defaults to the output of matrix_creator()
        metadata (dict, optional): matrix metadata.
            defaults to the output of matrix_metadata_creator()
        write_to_db (bool, optional): whether to record the matrix row in the
            matrices table (defaults to True)
    """
    if matrix is None:
        matrix = matrix_creator()
    if not metadata:
        metadata = matrix_metadata_creator()
    # normalize as_of_date to pandas Timestamps before setting the index;
    # NOTE(review): this mutates the caller's dataframe in place
    matrix["as_of_date"] = matrix["as_of_date"].apply(pd.Timestamp)
    matrix.set_index(MatrixStore.indices, inplace=True)
    # the store's identifier is the hash of the metadata
    matrix_store = project_storage.matrix_storage_engine().get_store(
        filename_friendly_hash(metadata))
    matrix_store.metadata = metadata
    # split the label column out of the feature matrix
    new_matrix = matrix.copy()
    labels = new_matrix.pop(matrix_store.label_column_name)
    matrix_store.matrix_label_tuple = new_matrix, labels
    matrix_store.save()
    matrix_store.clear_cache()
    if write_to_db:
        # NOTE(review): relies on a module-level `session`; only insert the
        # matrices-table row when one with this uuid doesn't already exist
        if (session.query(Matrix).filter(
                Matrix.matrix_uuid == matrix_store.uuid).count() == 0):
            MatrixFactory(matrix_uuid=matrix_store.uuid)
            session.commit()
    return matrix_store
def test_test_matrix(self):
    """End-to-end check that a 'test'-type matrix builds with the expected row count."""
    with testing.postgresql.Postgresql() as postgresql:
        # create an engine and generate a table with fake feature data
        engine = create_engine(postgresql.url())
        ensure_db(engine)
        create_schemas(
            engine=engine,
            features_tables=features_tables,
            labels=labels,
            states=states,
        )

        with get_matrix_storage_engine() as matrix_storage_engine:
            builder = MatrixBuilder(
                db_config=db_config,
                matrix_storage_engine=matrix_storage_engine,
                experiment_hash=experiment_hash,
                engine=engine,
            )
            # the matrix uuid is the hash of its metadata
            uuid = filename_friendly_hash(self.good_metadata)
            builder.build_matrix(
                as_of_times=self.good_dates,
                label_name="booking",
                label_type="binary",
                feature_dictionary=self.good_feature_dictionary,
                matrix_metadata=self.good_metadata,
                matrix_uuid=uuid,
                matrix_type="test",
            )

            assert len(matrix_storage_engine.get_store(uuid).design_matrix) == 5
def test_nullcheck(self):
    """Building a matrix over feature rows containing nulls must raise ValueError."""
    # dedupe the pre-built feature rows by (entity, date) key
    f0_dict = {(r[0], r[1]): r for r in features0_pre}
    f1_dict = {(r[0], r[1]): r for r in features1_pre}

    # deterministic ordering: sort by (date, entity)
    features0 = sorted(f0_dict.values(), key=lambda x: (x[1], x[0]))
    features1 = sorted(f1_dict.values(), key=lambda x: (x[1], x[0]))

    features_tables = [features0, features1]

    with testing.postgresql.Postgresql() as postgresql:
        # create an engine and generate a table with fake feature data
        engine = create_engine(postgresql.url())
        create_schemas(
            engine=engine,
            features_tables=features_tables,
            labels=labels,
            states=states,
        )

        dates = [
            datetime.datetime(2016, 1, 1, 0, 0),
            datetime.datetime(2016, 2, 1, 0, 0),
            datetime.datetime(2016, 3, 1, 0, 0),
        ]

        with get_matrix_storage_engine() as matrix_storage_engine:
            builder = MatrixBuilder(
                db_config=db_config,
                matrix_storage_engine=matrix_storage_engine,
                experiment_hash=experiment_hash,
                engine=engine,
            )

            feature_dictionary = {
                "features0": ["f1", "f2"],
                "features1": ["f3", "f4"],
            }
            matrix_metadata = {
                "matrix_id": "hi",
                "state": "active",
                "label_name": "booking",
                "end_time": datetime.datetime(2016, 3, 1, 0, 0),
                "feature_start_time": datetime.datetime(2016, 1, 1, 0, 0),
                "label_timespan": "1 month",
                "test_duration": "1 month",
                "indices": ["entity_id", "as_of_date"],
            }
            uuid = filename_friendly_hash(matrix_metadata)
            # null feature values must be rejected during the build
            with self.assertRaises(ValueError):
                builder.build_matrix(
                    as_of_times=dates,
                    label_name="booking",
                    label_type="binary",
                    feature_dictionary=feature_dictionary,
                    matrix_metadata=matrix_metadata,
                    matrix_uuid=uuid,
                    matrix_type="test",
                )
def audit_models(experiment, metric, rules):
    """Audit an experiment's models against a set of selection rules.

    Args:
        experiment: experiment object whose ``config`` identifies it
        metric (str): metric@threshold string, e.g. "precision@100_abs"
        rules (str): filename of a selection-rules yaml file under
            /triage/selection_rules
    """
    click.echo("Auditing experiment")
    experiment_hash = filename_friendly_hash(experiment.config)
    with open(f"/triage/selection_rules/{rules}") as f:
        # yaml.load() with no Loader is unsafe on old PyYAML and a TypeError
        # on PyYAML >= 6; the rules file is plain config, so safe_load is the
        # appropriate replacement. Use a distinct name so the `rules`
        # filename parameter isn't shadowed by the parsed contents.
        selection_rules = yaml.safe_load(f)
    metric, k = metric.split('@')
    audit_experiment(experiment_hash, f"{metric}@", k, selection_rules)
def test_replace_true_rerun(self):
    """With replace=True, rebuilding the same matrix uuid must succeed both times."""
    with testing.postgresql.Postgresql() as postgresql:
        # create an engine and generate a table with fake feature data
        engine = create_engine(postgresql.url())
        ensure_db(engine)
        create_schemas(
            engine=engine,
            features_tables=features_tables,
            labels=labels,
            states=states,
        )
        matrix_metadata = matrix_metadata_creator(state="active",
                                                  test_duration="1month",
                                                  label_name="booking")

        dates = [
            datetime.datetime(2016, 1, 1, 0, 0),
            datetime.datetime(2016, 2, 1, 0, 0),
            datetime.datetime(2016, 3, 1, 0, 0),
        ]

        feature_dictionary = {
            "features0": ["f1", "f2"],
            "features1": ["f3", "f4"]
        }
        # the matrix uuid is derived from its metadata
        uuid = filename_friendly_hash(matrix_metadata)
        build_args = dict(
            as_of_times=dates,
            label_name="booking",
            label_type="binary",
            feature_dictionary=feature_dictionary,
            matrix_metadata=matrix_metadata,
            matrix_uuid=uuid,
            matrix_type="test",
        )

        with get_matrix_storage_engine() as matrix_storage_engine:
            builder = MatrixBuilder(
                db_config=db_config,
                matrix_storage_engine=matrix_storage_engine,
                experiment_hash=experiment_hash,
                engine=engine,
                replace=True,
            )

            builder.build_matrix(**build_args)

            assert len(
                matrix_storage_engine.get_store(uuid).design_matrix) == 5
            assert builder.sessionmaker().query(Matrix).get(uuid)
            # rerun
            builder.build_matrix(**build_args)
            assert len(
                matrix_storage_engine.get_store(uuid).design_matrix) == 5
            assert builder.sessionmaker().query(Matrix).get(uuid)
def generate_tasks(self, subset_configs):
    """Build one subset-table-creation task per non-empty subset config.

    Args:
        subset_configs (list): subset configuration dicts; falsy entries
            (e.g. None for the overall/no-subset case) are skipped

    Returns:
        list: dicts carrying the config, its hash, and a table generator
    """
    logging.info("Generating subset table creation tasks")
    tasks = []
    for config in subset_configs:
        if not config:
            continue
        generator = EntityDateTableGenerator(
            entity_date_table_name=get_subset_table_name(config),
            db_engine=self.db_engine,
            query=config["query"],
            replace=self.replace,
        )
        tasks.append({
            "subset_config": config,
            "subset_hash": filename_friendly_hash(config),
            "subset_table_generator": generator,
        })
    return tasks
def test_ModelEvaluator_needs_evaluation_no_bias_audit(db_engine_with_results_schema):
    """needs_evaluations should be True exactly when a requested metric is missing."""
    # TEST SETUP:
    # create two models: one that has zero evaluations,
    # one that has an evaluation for precision@100_abs
    # both overall and for each subset
    model_with_evaluations = ModelFactory()
    model_without_evaluations = ModelFactory()

    eval_time = datetime.datetime(2016, 1, 1)
    as_of_date_frequency = "3d"
    # "" is the subset_hash for the overall (no-subset) evaluation
    for subset_hash in [""] + [filename_friendly_hash(subset) for subset in SUBSETS]:
        EvaluationFactory(
            model_rel=model_with_evaluations,
            evaluation_start_time=eval_time,
            evaluation_end_time=eval_time,
            as_of_date_frequency=as_of_date_frequency,
            metric="precision@",
            parameter="100_abs",
            subset_hash=subset_hash,
        )
    session.commit()

    # make a test matrix to pass in
    metadata_overrides = {
        "as_of_date_frequency": as_of_date_frequency,
        "as_of_times": [eval_time],
    }
    test_matrix_store = MockMatrixStore(
        "test",
        "1234",
        5,
        db_engine_with_results_schema,
        metadata_overrides=metadata_overrides,
    )
    train_matrix_store = MockMatrixStore(
        "train",
        "2345",
        5,
        db_engine_with_results_schema,
        metadata_overrides=metadata_overrides,
    )

    # the evaluated model has test evaluations for precision, but not recall,
    # so this needs evaluations
    for subset in SUBSETS:
        if not subset:
            subset_hash = ""
        else:
            subset_hash = filename_friendly_hash(subset)

        assert ModelEvaluator(
            testing_metric_groups=[
                {
                    "metrics": ["precision@", "recall@"],
                    "thresholds": {"top_n": [100]},
                }
            ],
            training_metric_groups=[],
            db_engine=db_engine_with_results_schema,
        ).needs_evaluations(
            matrix_store=test_matrix_store,
            model_id=model_with_evaluations.model_id,
            subset_hash=subset_hash,
        )

    # the evaluated model has test evaluations for precision,
    # so this should not need evaluations
    for subset in SUBSETS:
        if not subset:
            subset_hash = ""
        else:
            subset_hash = filename_friendly_hash(subset)

        assert not ModelEvaluator(
            testing_metric_groups=[
                {
                    "metrics": ["precision@"],
                    "thresholds": {"top_n": [100]},
                }
            ],
            training_metric_groups=[],
            db_engine=db_engine_with_results_schema,
        ).needs_evaluations(
            matrix_store=test_matrix_store,
            model_id=model_with_evaluations.model_id,
            subset_hash=subset_hash,
        )

    # the non-evaluated model has no evaluations,
    # so this should need evaluations
    for subset in SUBSETS:
        if not subset:
            subset_hash = ""
        else:
            subset_hash = filename_friendly_hash(subset)

        assert ModelEvaluator(
            testing_metric_groups=[
                {
                    "metrics": ["precision@"],
                    "thresholds": {"top_n": [100]},
                }
            ],
            training_metric_groups=[],
            db_engine=db_engine_with_results_schema,
        ).needs_evaluations(
            matrix_store=test_matrix_store,
            model_id=model_without_evaluations.model_id,
            subset_hash=subset_hash,
        )

    # the evaluated model has no *train* evaluations,
    # so the train matrix should need evaluations
    for subset in SUBSETS:
        if not subset:
            subset_hash = ""
        else:
            subset_hash = filename_friendly_hash(subset)

        assert ModelEvaluator(
            testing_metric_groups=[
                {
                    "metrics": ["precision@"],
                    "thresholds": {"top_n": [100]},
                }
            ],
            training_metric_groups=[
                {
                    "metrics": ["precision@"],
                    "thresholds": {"top_n": [100]},
                }
            ],
            db_engine=db_engine_with_results_schema,
        ).needs_evaluations(
            matrix_store=train_matrix_store,
            model_id=model_with_evaluations.model_id,
            subset_hash=subset_hash,
        )

    session.close()
    session.remove()
def test_evaluating_early_warning(db_engine_with_results_schema):
    """Evaluate a fake model overall and per subset and check that every
    configured metric/parameter combination lands in the results tables.

    NOTE(review): several expected metric strings in this test had been
    mangled by an email-address obfuscation pass (e.g. "[email protected]");
    they are reconstructed here from the configured metric groups
    (percentiles 5.0/10.0 -> "@5.0_pct"/"@10.0_pct", top_n -> "_abs",
    fbeta betas 0.75/1.25) and the sorted order the test asserts.
    """
    num_entities = 10
    labels = [0, 1, 0, 1, 0, 1, 0, 1, 0, 1]

    # Set up testing configuration parameters
    testing_metric_groups = [
        {
            "metrics": [
                "precision@",
                "recall@",
                "true positives@",
                "true negatives@",
                "false positives@",
                "false negatives@",
            ],
            "thresholds": {"percentiles": [5.0, 10.0], "top_n": [5, 10]},
        },
        {
            "metrics": [
                "f1",
                "mediocre",
                "accuracy",
                "roc_auc",
                "average precision score",
            ]
        },
        {"metrics": ["fbeta@"], "parameters": [{"beta": 0.75}, {"beta": 1.25}]},
    ]

    training_metric_groups = [{"metrics": ["accuracy", "roc_auc"]}]

    custom_metrics = {"mediocre": always_half}

    # Acquire fake data and objects to be used in the tests
    model_evaluator = ModelEvaluator(
        testing_metric_groups,
        training_metric_groups,
        db_engine_with_results_schema,
        custom_metrics=custom_metrics,
    )

    fake_test_matrix_store = MockMatrixStore(
        matrix_type="test",
        matrix_uuid="efgh",
        label_count=num_entities,
        db_engine=db_engine_with_results_schema,
        init_labels=pd.DataFrame(
            {
                "label_value": labels,
                "entity_id": list(range(num_entities)),
                "as_of_date": [TRAIN_END_TIME] * num_entities,
            }
        )
        .set_index(["entity_id", "as_of_date"])
        .label_value,
        init_as_of_dates=[TRAIN_END_TIME],
    )
    fake_train_matrix_store = MockMatrixStore(
        matrix_type="train",
        matrix_uuid="1234",
        label_count=num_entities,
        db_engine=db_engine_with_results_schema,
        init_labels=pd.DataFrame(
            {
                "label_value": labels,
                "entity_id": list(range(num_entities)),
                "as_of_date": [TRAIN_END_TIME] * num_entities,
            }
        )
        .set_index(["entity_id", "as_of_date"])
        .label_value,
        init_as_of_dates=[TRAIN_END_TIME],
    )

    trained_model, model_id = fake_trained_model(
        db_engine_with_results_schema,
        train_end_time=TRAIN_END_TIME,
    )

    # ensure that the matrix uuid is present
    matrix_uuids = [
        row[0]
        for row in db_engine_with_results_schema.execute(
            "select matrix_uuid from test_results.evaluations"
        )
    ]
    assert all(matrix_uuid == "efgh" for matrix_uuid in matrix_uuids)

    # Evaluate the training metrics and test
    model_evaluator.evaluate(
        trained_model.predict_proba(labels)[:, 1], fake_train_matrix_store, model_id
    )
    records = [
        row[0]
        for row in db_engine_with_results_schema.execute(
            """select distinct(metric || parameter) from train_results.evaluations where model_id = %s and evaluation_start_time = %s order by 1""",
            (model_id, fake_train_matrix_store.as_of_dates[0]),
        )
    ]
    assert records == ["accuracy", "roc_auc"]

    # Run tests for overall and subset evaluations
    for subset in SUBSETS:
        if subset is None:
            where_hash = ""
        else:
            populate_subset_data(
                db_engine_with_results_schema, subset, list(range(num_entities))
            )
            SubsetFactory(subset_hash=filename_friendly_hash(subset))
            session.commit()
            where_hash = f"and subset_hash = '{filename_friendly_hash(subset)}'"

        # Evaluate the testing metrics and test for all of them.
        model_evaluator.evaluate(
            trained_model.predict_proba(labels)[:, 1],
            fake_test_matrix_store,
            model_id,
            subset=subset,
        )

        records = [
            row[0]
            for row in db_engine_with_results_schema.execute(
                f"""select distinct(metric || parameter) from test_results.evaluations where model_id = %s and evaluation_start_time = %s {where_hash} order by 1""",
                (model_id, fake_test_matrix_store.as_of_dates[0]),
            )
        ]
        assert records == [
            "accuracy",
            "average precision score",
            "f1",
            "false negatives@10.0_pct",
            "false negatives@10_abs",
            "false negatives@5.0_pct",
            "false negatives@5_abs",
            "false positives@10.0_pct",
            "false positives@10_abs",
            "false positives@5.0_pct",
            "false positives@5_abs",
            "fbeta@0.75_beta",
            "fbeta@1.25_beta",
            "mediocre",
            "precision@10.0_pct",
            "precision@10_abs",
            "precision@5.0_pct",
            "precision@5_abs",
            "recall@10.0_pct",
            "recall@10_abs",
            "recall@5.0_pct",
            "recall@5_abs",
            "roc_auc",
            "true negatives@10.0_pct",
            "true negatives@10_abs",
            "true negatives@5.0_pct",
            "true negatives@5_abs",
            "true positives@10.0_pct",
            "true positives@10_abs",
            "true positives@5.0_pct",
            "true positives@5_abs",
        ]

        # Evaluate the training metrics and test
        model_evaluator.evaluate(
            trained_model.predict_proba(labels)[:, 1],
            fake_train_matrix_store,
            model_id,
            subset=subset,
        )

        records = [
            row[0]
            for row in db_engine_with_results_schema.execute(
                f"""select distinct(metric || parameter) from train_results.evaluations where model_id = %s and evaluation_start_time = %s {where_hash} order by 1""",
                (model_id, fake_train_matrix_store.as_of_dates[0]),
            )
        ]
        assert records == ["accuracy", "roc_auc"]

    # ensure that the matrix uuid is present
    matrix_uuids = [
        row[0]
        for row in db_engine_with_results_schema.execute(
            "select matrix_uuid from train_results.evaluations"
        )
    ]
    assert all(matrix_uuid == "1234" for matrix_uuid in matrix_uuids)
def initialize_components(self):
    """Construct every experiment component (cohort, labels, bias audit,
    features, matrix planning/building, training, prediction, evaluation)
    from self.config, falling back to no-op components where a config
    section is missing."""
    split_config = self.config["temporal_config"]

    self.chopper = Timechop(**split_config)

    cohort_config = self.config.get("cohort_config", {})
    if "query" in cohort_config:
        # cohort table name embeds the configured name plus the query hash
        self.cohort_table_name = "cohort_{}_{}".format(
            cohort_config.get('name', 'default'), self.cohort_hash)
        self.cohort_table_generator = EntityDateTableGenerator(
            entity_date_table_name=self.cohort_table_name,
            db_engine=self.db_engine,
            query=cohort_config["query"],
            replace=self.replace)
    else:
        logging.warning(
            "cohort_config missing or unrecognized. Without a cohort, "
            "you will not be able to make matrices, perform feature imputation, "
            "or save time by only computing features for that cohort.")
        self.features_ignore_cohort = True
        self.cohort_table_name = "cohort_{}".format(self.experiment_hash)
        self.cohort_table_generator = EntityDateTableGeneratorNoOp()

    # None represents the overall (no-subset) evaluation
    self.subsets = [None] + self.config.get("scoring", {}).get(
        "subsets", [])

    if "label_config" in self.config:
        label_config = self.config["label_config"]
        # labels table name embeds the label name plus its query hash
        self.labels_table_name = "labels_{}_{}".format(
            label_config.get('name', 'default'),
            filename_friendly_hash(label_config['query']))
        self.label_generator = LabelGenerator(
            label_name=label_config.get("name", None),
            query=label_config["query"],
            replace=self.replace,
            db_engine=self.db_engine,
        )
    else:
        self.labels_table_name = "labels_{}".format(self.experiment_hash)
        self.label_generator = LabelGeneratorNoOp()
        logging.warning(
            "label_config missing or unrecognized. Without labels, "
            "you will not be able to make matrices.")

    if "bias_audit_config" in self.config:
        bias_config = self.config["bias_audit_config"]
        self.bias_hash = filename_friendly_hash(bias_config)
        self.protected_groups_table_name = f"protected_groups_{self.bias_hash}"
        self.protected_groups_generator = ProtectedGroupsGenerator(
            db_engine=self.db_engine,
            from_obj=parse_from_obj(bias_config, 'bias_from_obj'),
            attribute_columns=bias_config.get("attribute_columns", None),
            entity_id_column=bias_config.get("entity_id_column", None),
            knowledge_date_column=bias_config.get("knowledge_date_column", None),
            protected_groups_table_name=self.protected_groups_table_name,
            replace=self.replace)
    else:
        self.protected_groups_generator = ProtectedGroupsGeneratorNoOp()
        logging.warning(
            "bias_audit_config missing or unrecognized. Without protected groups, "
            "you will not audit your models for bias and fairness.")

    self.feature_dictionary_creator = FeatureDictionaryCreator(
        features_schema_name=self.features_schema_name,
        db_engine=self.db_engine)

    self.feature_generator = FeatureGenerator(
        features_schema_name=self.features_schema_name,
        replace=self.replace,
        db_engine=self.db_engine,
        feature_start_time=split_config["feature_start_time"],
        materialize_subquery_fromobjs=self.materialize_subquery_fromobjs,
        features_ignore_cohort=self.features_ignore_cohort)

    self.feature_group_creator = FeatureGroupCreator(
        self.config.get("feature_group_definition", {"all": [True]}))

    self.feature_group_mixer = FeatureGroupMixer(
        self.config.get("feature_group_strategies", ["all"]))

    self.planner = Planner(
        feature_start_time=dt_from_str(split_config["feature_start_time"]),
        label_names=[
            self.config.get("label_config", {}).get("name", DEFAULT_LABEL_NAME)
        ],
        label_types=["binary"],
        cohort_names=[
            self.config.get("cohort_config", {}).get("name", None)
        ],
        user_metadata=self.config.get("user_metadata", {}),
    )

    self.matrix_builder = MatrixBuilder(
        db_config={
            "features_schema_name": self.features_schema_name,
            "labels_schema_name": "public",
            "labels_table_name": self.labels_table_name,
            "cohort_table_name": self.cohort_table_name,
        },
        matrix_storage_engine=self.matrix_storage_engine,
        experiment_hash=self.experiment_hash,
        include_missing_labels_in_train_as=self.config.get(
            "label_config", {}).get("include_missing_labels_in_train_as", None),
        engine=self.db_engine,
        replace=self.replace,
        run_id=self.run_id,
    )

    self.subsetter = Subsetter(db_engine=self.db_engine,
                               replace=self.replace,
                               as_of_times=self.all_as_of_times)

    self.trainer = ModelTrainer(
        experiment_hash=self.experiment_hash,
        model_storage_engine=self.model_storage_engine,
        model_grouper=ModelGrouper(self.config.get("model_group_keys", [])),
        db_engine=self.db_engine,
        replace=self.replace,
        run_id=self.run_id,
    )

    self.predictor = Predictor(
        db_engine=self.db_engine,
        model_storage_engine=self.model_storage_engine,
        save_predictions=self.save_predictions,
        replace=self.replace,
        rank_order=self.config.get("prediction", {}).get("rank_tiebreaker", "worst"),
    )

    self.individual_importance_calculator = IndividualImportanceCalculator(
        db_engine=self.db_engine,
        n_ranks=self.config.get("individual_importance", {}).get("n_ranks", 5),
        methods=self.config.get("individual_importance", {}).get("methods", ["uniform"]),
        replace=self.replace,
    )

    self.evaluator = ModelEvaluator(
        db_engine=self.db_engine,
        testing_metric_groups=self.config.get("scoring", {}).get(
            "testing_metric_groups", []),
        training_metric_groups=self.config.get("scoring", {}).get(
            "training_metric_groups", []),
        bias_config=self.config.get("bias_audit_config", {}))

    self.model_train_tester = ModelTrainTester(
        matrix_storage_engine=self.matrix_storage_engine,
        model_evaluator=self.evaluator,
        model_trainer=self.trainer,
        individual_importance_calculator=self.individual_importance_calculator,
        predictor=self.predictor,
        subsets=self.subsets,
        protected_groups_generator=self.protected_groups_generator,
        cohort_hash=self.cohort_hash)
def cohort_hash(self):
    """Hash of the configured cohort query, or None when no query is set."""
    cohort_config = self.config.get("cohort_config", {})
    if "query" not in cohort_config:
        return None
    return filename_friendly_hash(cohort_config["query"])
def generate_plans(self, matrix_set_definitions, feature_dictionaries):
    """Create build tasks and update the matrix definitions with UUIDs

    :param matrix_set_definitions: the temporal information needed to generate each matrix
    :param feature_dictionaries: combinations of features to include in matrices
    :type matrix_set_definitions: list
    :type feature_dictionaries: list
    :return: matrix set definitions (updated with matrix uuids) and build tasks
    :rtype: tuple (list, dict)
    """
    updated_definitions = []
    build_tasks = dict()
    for matrix_set in matrix_set_definitions:
        logger.debug("Making plans for matrix set %s", matrix_set)
        logger.debug(
            "Iterating over %s label names, %s label_types, %s cohort_names, "
            "%s feature dictionaries",
            len(self.label_names),
            len(self.label_types),
            len(self.cohort_names),
            len(feature_dictionaries),
        )
        train_matrix = matrix_set["train_matrix"]
        # one matrix plan per combination of label/cohort/feature choices
        for (
            label_name,
            label_type,
            cohort_name,
            feature_dictionary,
        ) in itertools.product(
            self.label_names, self.label_types, self.cohort_names, feature_dictionaries
        ):
            matrix_set_clone = copy.deepcopy(matrix_set)
            # get a uuid
            train_metadata = self._make_metadata(
                train_matrix,
                feature_dictionary,
                label_name,
                label_type,
                cohort_name,
                "train",
            )
            # the uuid is the metadata hash, so identical matrices dedupe
            train_uuid = filename_friendly_hash(train_metadata)
            logger.debug(
                "Matrix UUID %s found for train metadata %s",
                train_uuid,
                train_metadata,
            )
            if train_uuid not in build_tasks:
                build_tasks[train_uuid] = self._generate_build_task(
                    train_metadata, train_uuid, train_matrix, feature_dictionary
                )
                logger.debug(
                    "Train uuid %s not found in build tasks yet, "
                    "so added",
                    train_uuid,
                )
            else:
                logger.debug(
                    "Train uuid %s already found in build tasks", train_uuid
                )
            matrix_set_clone["train_uuid"] = train_uuid

            test_uuids = []
            for test_matrix in matrix_set_clone["test_matrices"]:
                test_metadata = self._make_metadata(
                    test_matrix,
                    feature_dictionary,
                    label_name,
                    label_type,
                    cohort_name,
                    "test",
                )
                test_uuid = filename_friendly_hash(test_metadata)
                logger.debug(
                    "Matrix UUID %s found for test metadata %s",
                    test_uuid,
                    test_metadata,
                )
                if test_uuid not in build_tasks:
                    build_tasks[test_uuid] = self._generate_build_task(
                        test_metadata, test_uuid, test_matrix, feature_dictionary
                    )
                    logger.debug(
                        "Test uuid %s not found in build tasks "
                        "yet, so added",
                        test_uuid,
                    )
                else:
                    logger.debug(
                        "Test uuid %s already found in build tasks", test_uuid
                    )
                test_uuids.append(test_uuid)
            matrix_set_clone["test_uuids"] = test_uuids
            updated_definitions.append(matrix_set_clone)

    logger.debug(
        "Planner is finished generating matrix plans. "
        "%s matrix definitions and %s unique build tasks found",
        len(updated_definitions),
        len(build_tasks.keys()),
    )
    logger.debug("Associated all tasks with experiment in database")
    return updated_definitions, build_tasks
def __init__(self, db_engine, project_path, model_group_id):
    """Set up retraining / forward prediction for an existing model group.

    Args:
        db_engine (sqlalchemy.engine.Engine): database engine
        project_path (str): root path for matrix/model storage
        model_group_id (int): model group to retrain and predict from
    """
    self.retrain_hash = None
    self.db_engine = db_engine
    upgrade_db(db_engine=self.db_engine)
    self.project_storage = ProjectStorage(project_path)
    self.model_group_id = model_group_id
    self.model_group_info = get_model_group_info(self.db_engine, self.model_group_id)
    self.matrix_storage_engine = self.project_storage.matrix_storage_engine()
    self.triage_run_id, self.experiment_config = experiment_config_from_model_group_id(
        self.db_engine, self.model_group_id)

    # This feels like it needs some refactoring since in some edge cases at
    # least the test matrix temporal parameters might differ across models in
    # the model group (the training ones shouldn't), but this should probably
    # work for the vast majority of use cases...
    self.experiment_config['temporal_config'].update(
        temporal_params_from_matrix_metadata(
            self.db_engine, self.model_group_info['model_id_last_split']))

    # Since "testing" here is predicting forward to a single new date, the
    # test_duration should always be '0day'
    # (regardless of what it may have been before)
    self.experiment_config['temporal_config']['test_durations'] = ['0day']

    # These lists should now only contain one item (the value actually used
    # for the last model in this group)
    self.training_label_timespan = self.experiment_config[
        'temporal_config']['training_label_timespans'][0]
    self.test_label_timespan = self.experiment_config['temporal_config'][
        'test_label_timespans'][0]
    self.test_duration = self.experiment_config['temporal_config'][
        'test_durations'][0]
    self.feature_start_time = self.experiment_config['temporal_config'][
        'feature_start_time']

    self.label_name = self.experiment_config['label_config']['name']
    self.cohort_name = self.experiment_config['cohort_config']['name']
    self.user_metadata = self.experiment_config['user_metadata']

    self.feature_dictionary_creator = FeatureDictionaryCreator(
        features_schema_name='triage_production', db_engine=self.db_engine)
    self.label_generator = LabelGenerator(
        label_name=self.experiment_config['label_config'].get(
            "name", None),
        query=self.experiment_config['label_config']["query"],
        replace=True,
        db_engine=self.db_engine,
    )
    # production labels table embeds the label name and its query hash
    self.labels_table_name = "labels_{}_{}_production".format(
        self.experiment_config['label_config'].get('name', 'default'),
        filename_friendly_hash(
            self.experiment_config['label_config']['query']))
    self.feature_generator = FeatureGenerator(
        db_engine=self.db_engine,
        features_schema_name="triage_production",
        feature_start_time=self.feature_start_time,
    )
    # experiment_hash is None: retrain runs aren't tied to an experiment
    self.model_trainer = ModelTrainer(
        experiment_hash=None,
        model_storage_engine=ModelStorageEngine(self.project_storage),
        db_engine=self.db_engine,
        replace=True,
        run_id=self.triage_run_id,
    )
def test_replace_false_rerun(self):
    """With replace=False, rerunning for an existing matrix must skip rebuilding."""
    with testing.postgresql.Postgresql() as postgresql:
        # create an engine and generate a table with fake feature data
        engine = create_engine(postgresql.url())
        ensure_db(engine)
        create_schemas(
            engine=engine,
            features_tables=features_tables,
            labels=labels,
            states=states,
        )

        dates = [
            datetime.datetime(2016, 1, 1, 0, 0),
            datetime.datetime(2016, 2, 1, 0, 0),
            datetime.datetime(2016, 3, 1, 0, 0),
        ]

        with get_matrix_storage_engine() as matrix_storage_engine:
            builder = MatrixBuilder(
                db_config=db_config,
                matrix_storage_engine=matrix_storage_engine,
                experiment_hash=experiment_hash,
                engine=engine,
                replace=False,
            )

            feature_dictionary = {
                "features0": ["f1", "f2"],
                "features1": ["f3", "f4"],
            }
            matrix_metadata = {
                "matrix_id": "hi",
                "state": "active",
                "label_name": "booking",
                "end_time": datetime.datetime(2016, 3, 1, 0, 0),
                "feature_start_time": datetime.datetime(2016, 1, 1, 0, 0),
                "label_timespan": "1 month",
                "test_duration": "1 month",
                "indices": ["entity_id", "as_of_date"],
            }
            uuid = filename_friendly_hash(matrix_metadata)
            builder.build_matrix(
                as_of_times=dates,
                label_name="booking",
                label_type="binary",
                feature_dictionary=feature_dictionary,
                matrix_metadata=matrix_metadata,
                matrix_uuid=uuid,
                matrix_type="test",
            )

            assert len(matrix_storage_engine.get_store(uuid).design_matrix) == 5
            # rerun
            # mock the entity-date step to prove it is not invoked again
            builder.make_entity_date_table = Mock()
            builder.build_matrix(
                as_of_times=dates,
                label_name="booking",
                label_type="binary",
                feature_dictionary=feature_dictionary,
                matrix_metadata=matrix_metadata,
                matrix_uuid=uuid,
                matrix_type="test",
            )
            assert not builder.make_entity_date_table.called
def retrain(self, prediction_date):
    """Retrain a model by going back one split from prediction_date, so the
    as_of_date for training would be (prediction_date - training_label_timespan).

    Args:
        prediction_date (str): date string like "YYYY-MM-DD"; Timechop chops
            backwards from this date to derive the single training split

    Returns:
        dict: keys 'retrain_model_comment' and 'retrain_model_id' for the
            newly trained model
    """
    # Retrain config and hash: identifies this retrain run so it can be saved
    # and later associated with the model it produces
    retrain_config = {
        "model_group_id": self.model_group_id,
        "prediction_date": prediction_date,
        "test_label_timespan": self.test_label_timespan,
        "test_duration": self.test_duration,
    }
    self.retrain_hash = save_retrain_and_get_hash(retrain_config, self.db_engine)

    with get_for_update(self.db_engine, Retrain, self.retrain_hash) as retrain:
        retrain.prediction_date = prediction_date

    # Timechop: the retrain temporal config is expected to yield exactly one
    # train split (asserted below); its last as-of time becomes the training
    # as_of_date
    prediction_date = dt_from_str(prediction_date)
    temporal_config = self.get_temporal_config_for_retrain(prediction_date)
    timechopper = Timechop(**temporal_config)
    chops = timechopper.chop_time()
    assert len(chops) == 1
    chops_train_matrix = chops[0]['train_matrix']
    as_of_date = datetime.strftime(chops_train_matrix['last_as_of_time'], "%Y-%m-%d")
    retrain_definition = {
        'first_as_of_time': chops_train_matrix['first_as_of_time'],
        'last_as_of_time': chops_train_matrix['last_as_of_time'],
        'matrix_info_end_time': chops_train_matrix['matrix_info_end_time'],
        # single-as-of-date matrix: only the last as-of time is used
        'as_of_times': [as_of_date],
        'training_label_timespan': chops_train_matrix['training_label_timespan'],
        'max_training_history': chops_train_matrix['max_training_history'],
        'training_as_of_date_frequency': chops_train_matrix['training_as_of_date_frequency'],
    }

    # Set ExperimentRun: record a TriageRun row capturing environment and
    # provenance for this retrain
    run = TriageRun(
        start_time=datetime.now(),
        git_hash=infer_git_hash(),
        triage_version=infer_triage_version(),
        python_version=infer_python_version(),
        run_type="retrain",
        run_hash=self.retrain_hash,
        last_updated_time=datetime.now(),
        current_status=TriageRunStatus.started,
        installed_libraries=infer_installed_libraries(),
        platform=platform.platform(),
        os_user=getpass.getuser(),
        working_directory=os.getcwd(),
        ec2_instance_type=infer_ec2_instance_type(),
        log_location=infer_log_location(),
        experiment_class_path=classpath(self.__class__),
        # reuse the random seed of the original experiment run for
        # reproducibility
        random_seed=retrieve_experiment_seed_from_run_id(
            self.db_engine, self.triage_run_id),
    )
    run_id = None
    with scoped_session(self.db_engine) as session:
        session.add(run)
        session.commit()
        # run_id is only populated after the commit assigns the row its key
        run_id = run.run_id
    if not run_id:
        raise ValueError("Failed to retrieve run_id from saved row")

    # set ModelTrainer's run_id and experiment_hash for Retrain run
    self.model_trainer.run_id = run_id
    self.model_trainer.experiment_hash = self.retrain_hash

    # 1. Generate all labels
    self.generate_all_labels(as_of_date)
    record_labels_table_name(run_id, self.db_engine, self.labels_table_name)

    # 2. Generate cohort
    cohort_table_name = f"triage_production.cohort_{self.experiment_config['cohort_config']['name']}_retrain"
    self.generate_entity_date_table(as_of_date, cohort_table_name)
    record_cohort_table_name(run_id, self.db_engine, cohort_table_name)

    # 3. Generate feature aggregations
    collate_aggregations = self.get_collate_aggregations(
        as_of_date, cohort_table_name)
    feature_aggregation_table_tasks = self.feature_generator.generate_all_table_tasks(
        collate_aggregations, task_type='aggregation')
    self.feature_generator.process_table_tasks(
        feature_aggregation_table_tasks)

    # 4. Reconstruct feature dictionary from feature_names and generate
    # imputation, based on the last split's model
    reconstructed_feature_dict, imputation_table_tasks = self.get_feature_dict_and_imputation_task(
        collate_aggregations,
        self.model_group_info['model_id_last_split'],
    )
    feature_group_creator = FeatureGroupCreator(
        self.experiment_config['feature_group_definition'])
    # "all" mixer: train on the full feature set rather than subsets
    feature_group_mixer = FeatureGroupMixer(["all"])
    feature_group_dict = feature_group_mixer.generate(
        feature_group_creator.subsets(reconstructed_feature_dict))[0]
    self.feature_generator.process_table_tasks(imputation_table_tasks)

    # 5. Build new matrix
    db_config = {
        "features_schema_name": "triage_production",
        "labels_schema_name": "public",
        "cohort_table_name": cohort_table_name,
        "labels_table_name": self.labels_table_name,
    }
    record_matrix_building_started(run_id, self.db_engine)
    matrix_builder = MatrixBuilder(
        db_config=db_config,
        matrix_storage_engine=self.matrix_storage_engine,
        engine=self.db_engine,
        experiment_hash=None,
        replace=True,
    )
    new_matrix_metadata = Planner.make_metadata(
        matrix_definition=retrain_definition,
        feature_dictionary=feature_group_dict,
        label_name=self.label_name,
        label_type='binary',
        cohort_name=self.cohort_name,
        matrix_type='train',
        feature_start_time=dt_from_str(self.feature_start_time),
        user_metadata=self.user_metadata,
    )
    new_matrix_metadata['matrix_id'] = "_".join([
        self.label_name,
        'binary',
        str(as_of_date),
        'retrain',
    ])
    # matrix uuid is a hash of the metadata, so the same definition always
    # maps to the same stored matrix
    matrix_uuid = filename_friendly_hash(new_matrix_metadata)
    matrix_builder.build_matrix(
        as_of_times=[as_of_date],
        label_name=self.label_name,
        label_type='binary',
        feature_dictionary=feature_group_dict,
        matrix_metadata=new_matrix_metadata,
        matrix_uuid=matrix_uuid,
        matrix_type="train",
    )
    retrain_model_comment = 'retrain_' + str(datetime.now())
    misc_db_parameters = {
        'train_end_time': dt_from_str(as_of_date),
        'test': False,
        'train_matrix_uuid': matrix_uuid,
        'training_label_timespan': self.training_label_timespan,
        'model_comment': retrain_model_comment,
    }

    # get the random seed from the last split so retraining is reproducible
    # relative to the original model
    last_split_train_matrix_uuid, last_split_matrix_metadata = train_matrix_info_from_model_id(
        self.db_engine,
        model_id=self.model_group_info['model_id_last_split'])
    random_seed = self.model_trainer.get_or_generate_random_seed(
        model_group_id=self.model_group_id,
        matrix_metadata=last_split_matrix_metadata,
        train_matrix_uuid=last_split_train_matrix_uuid)

    # create retrain model hash from matrix metadata + model type/params/seed
    retrain_model_hash = self.model_trainer._model_hash(
        self.matrix_storage_engine.get_store(matrix_uuid).metadata,
        class_path=self.model_group_info['model_type'],
        parameters=self.model_group_info['hyperparameters'],
        random_seed=random_seed,
    )

    associate_models_with_retrain(self.retrain_hash, (retrain_model_hash, ),
                                  self.db_engine)

    record_model_building_started(run_id, self.db_engine)
    retrain_model_id = self.model_trainer.process_train_task(
        matrix_store=self.matrix_storage_engine.get_store(matrix_uuid),
        class_path=self.model_group_info['model_type'],
        parameters=self.model_group_info['hyperparameters'],
        model_hash=retrain_model_hash,
        misc_db_parameters=misc_db_parameters,
        random_seed=random_seed,
        retrain=True,
        model_group_id=self.model_group_id)

    # stash results on self so predict() can locate the retrained model and
    # its training matrix
    self.retrain_model_hash = retrieve_model_hash_from_id(
        self.db_engine, retrain_model_id)
    self.retrain_matrix_uuid = matrix_uuid
    self.retrain_model_id = retrain_model_id
    return {
        'retrain_model_comment': retrain_model_comment,
        'retrain_model_id': retrain_model_id
    }
def predict(self, prediction_date):
    """Predict forward by creating a matrix using as_of_date = prediction_date
    and applying the retrain model on it.

    Requires retrain() to have run first: reads self.retrain_model_id and
    self.retrain_matrix_uuid set there.

    Args:
        prediction_date (str): date string like "YYYY-MM-DD"; the single
            as-of date of the production matrix
    """
    cohort_table_name = f"triage_production.cohort_{self.experiment_config['cohort_config']['name']}_predict"

    # 1. Generate cohort
    self.generate_entity_date_table(prediction_date, cohort_table_name)

    # 2. Generate feature aggregations
    collate_aggregations = self.get_collate_aggregations(
        prediction_date, cohort_table_name)
    self.feature_generator.process_table_tasks(
        self.feature_generator.generate_all_table_tasks(
            collate_aggregations, task_type='aggregation'))

    # 3. Reconstruct feature dictionary from feature_names and generate
    # imputation, based on the retrained model
    reconstructed_feature_dict, imputation_table_tasks = self.get_feature_dict_and_imputation_task(
        collate_aggregations, self.retrain_model_id)
    self.feature_generator.process_table_tasks(imputation_table_tasks)

    # 4. Build matrix
    db_config = {
        "features_schema_name": "triage_production",
        "labels_schema_name": "public",
        "cohort_table_name": cohort_table_name,
    }
    matrix_builder = MatrixBuilder(
        db_config=db_config,
        matrix_storage_engine=self.matrix_storage_engine,
        engine=self.db_engine,
        experiment_hash=None,
        replace=True,
    )
    # Use timechop to get the time definition for production, using the
    # test_duration/test_label_timespan recorded for the retrained model
    temporal_config = self.get_temporal_config_for_retrain(
        dt_from_str(prediction_date))
    timechopper = Timechop(**temporal_config)
    retrain_config = get_retrain_config_from_model_id(
        self.db_engine, self.retrain_model_id)
    prod_definitions = timechopper.define_test_matrices(
        train_test_split_time=dt_from_str(prediction_date),
        test_duration=retrain_config['test_duration'],
        test_label_timespan=retrain_config['test_label_timespan'])
    # only the most recent test-matrix definition is relevant for production
    last_split_definition = prod_definitions[-1]
    matrix_metadata = Planner.make_metadata(
        matrix_definition=last_split_definition,
        feature_dictionary=reconstructed_feature_dict,
        label_name=self.label_name,
        label_type='binary',
        cohort_name=self.cohort_name,
        matrix_type='production',
        feature_start_time=self.feature_start_time,
        user_metadata=self.user_metadata,
    )
    matrix_metadata['matrix_id'] = str(
        prediction_date
    ) + f'_model_id_{self.retrain_model_id}' + '_risklist'
    matrix_uuid = filename_friendly_hash(matrix_metadata)
    matrix_builder.build_matrix(
        as_of_times=[prediction_date],
        label_name=self.label_name,
        label_type='binary',
        feature_dictionary=reconstructed_feature_dict,
        matrix_metadata=matrix_metadata,
        matrix_uuid=matrix_uuid,
        matrix_type="production",
    )

    # 5. Predict the risk score for production
    predictor = Predictor(
        model_storage_engine=self.project_storage.model_storage_engine(),
        db_engine=self.db_engine,
        rank_order='best')
    predictor.predict(
        model_id=self.retrain_model_id,
        matrix_store=self.matrix_storage_engine.get_store(matrix_uuid),
        misc_db_parameters={},
        # column order must match the training matrix of the retrained model
        train_matrix_columns=self.matrix_storage_engine.get_store(
            self.retrain_matrix_uuid).columns(),
    )
    self.predict_matrix_uuid = matrix_uuid
def predict_forward_with_existed_model(db_engine, project_path, model_id, as_of_date):
    """Predict forward given model_id and as_of_date and store the prediction
    in the database.

    Args:
        db_engine (sqlalchemy.engine.Engine): database connection
        project_path (string): path to the project storage (local or s3);
            wrapped in a ProjectStorage below
        model_id (int): the id of a given model in the database
        as_of_date (string): a date string like "YYYY-MM-DD"
    """
    logger.spam("In PREDICT LIST................")
    upgrade_db(db_engine=db_engine)
    project_storage = ProjectStorage(project_path)
    matrix_storage_engine = project_storage.matrix_storage_engine()

    # 1. Get feature and cohort config from database, reconstructed from the
    # experiment that trained model_id
    (train_matrix_uuid,
     matrix_metadata) = train_matrix_info_from_model_id(db_engine, model_id)
    experiment_config = experiment_config_from_model_id(db_engine, model_id)

    # 2. Generate cohort
    cohort_table_name = f"triage_production.cohort_{experiment_config['cohort_config']['name']}"
    cohort_table_generator = EntityDateTableGenerator(
        db_engine=db_engine,
        query=experiment_config['cohort_config']['query'],
        entity_date_table_name=cohort_table_name)
    cohort_table_generator.generate_entity_date_table(
        as_of_dates=[dt_from_str(as_of_date)])

    # 3. Generate feature aggregations
    feature_generator = FeatureGenerator(
        db_engine=db_engine,
        features_schema_name="triage_production",
        feature_start_time=experiment_config['temporal_config']
        ['feature_start_time'],
    )
    collate_aggregations = feature_generator.aggregations(
        feature_aggregation_config=experiment_config['feature_aggregations'],
        feature_dates=[as_of_date],
        state_table=cohort_table_name)
    feature_generator.process_table_tasks(
        feature_generator.generate_all_table_tasks(collate_aggregations,
                                                   task_type='aggregation'))

    # 4. Reconstruct feature dictionary from feature_names and generate
    # imputation
    reconstructed_feature_dict = FeatureGroup()
    imputation_table_tasks = OrderedDict()
    for aggregation in collate_aggregations:
        feature_group, feature_names = get_feature_names(
            aggregation, matrix_metadata)
        reconstructed_feature_dict[feature_group] = feature_names
        # Make sure that the features imputed in training should also be
        # imputed in production
        features_imputed_in_train = get_feature_needs_imputation_in_train(
            aggregation, feature_names)
        features_imputed_in_production = get_feature_needs_imputation_in_production(
            aggregation, db_engine)
        total_impute_cols = set(features_imputed_in_production) | set(
            features_imputed_in_train)
        # '_imp' marks imputation-flag columns; everything else that isn't
        # imputed is passed through as-is
        total_nonimpute_cols = set(f for f in set(feature_names)
                                   if '_imp' not in f) - total_impute_cols
        task_generator = feature_generator._generate_imp_table_tasks_for
        imputation_table_tasks.update(
            task_generator(aggregation,
                           impute_cols=list(total_impute_cols),
                           nonimpute_cols=list(total_nonimpute_cols)))
    feature_generator.process_table_tasks(imputation_table_tasks)

    # 5. Build matrix
    db_config = {
        "features_schema_name": "triage_production",
        "labels_schema_name": "public",
        "cohort_table_name": cohort_table_name,
    }
    matrix_builder = MatrixBuilder(
        db_config=db_config,
        matrix_storage_engine=matrix_storage_engine,
        engine=db_engine,
        experiment_hash=None,
        replace=True,
    )
    feature_start_time = experiment_config['temporal_config'][
        'feature_start_time']
    label_name = experiment_config['label_config']['name']
    label_type = 'binary'
    cohort_name = experiment_config['cohort_config']['name']
    user_metadata = experiment_config['user_metadata']

    # Use timechop to get the time definition for production; temporal
    # parameters are overridden from the matrix metadata of model_id
    temporal_config = experiment_config["temporal_config"]
    temporal_config.update(
        temporal_params_from_matrix_metadata(db_engine, model_id))
    timechopper = Timechop(**temporal_config)
    prod_definitions = timechopper.define_test_matrices(
        train_test_split_time=dt_from_str(as_of_date),
        test_duration=temporal_config['test_durations'][0],
        test_label_timespan=temporal_config['test_label_timespans'][0])

    matrix_metadata = Planner.make_metadata(
        prod_definitions[-1],
        reconstructed_feature_dict,
        label_name,
        label_type,
        cohort_name,
        'production',
        feature_start_time,
        user_metadata,
    )
    matrix_metadata['matrix_id'] = str(
        as_of_date) + f'_model_id_{model_id}' + '_risklist'
    matrix_uuid = filename_friendly_hash(matrix_metadata)
    matrix_builder.build_matrix(
        as_of_times=[as_of_date],
        label_name=label_name,
        label_type=label_type,
        feature_dictionary=reconstructed_feature_dict,
        matrix_metadata=matrix_metadata,
        matrix_uuid=matrix_uuid,
        matrix_type="production",
    )

    # 6. Predict the risk score for production
    predictor = Predictor(
        model_storage_engine=project_storage.model_storage_engine(),
        db_engine=db_engine,
        rank_order='best')
    predictor.predict(
        model_id=model_id,
        matrix_store=matrix_storage_engine.get_store(matrix_uuid),
        misc_db_parameters={},
        # column order must match the matrix the model was trained on
        train_matrix_columns=matrix_storage_engine.get_store(
            train_matrix_uuid).columns())