def __call__(self, args):
    if args.validate_only:
        try:
            logger.info(f"Validating experiment [config file: {self.args.config}]")
            self.experiment.validate()
            logger.success(
                f"Experiment ({self.experiment.experiment_hash})'s configuration file is OK!"
            )
        except Exception:
            logger.exception("Validation failed!")
            logger.error(
                f"Experiment [config file: {self.args.config}] configuration file is incorrect"
            )
    elif args.show_timechop:
        experiment_name = os.path.splitext(os.path.basename(self.args.config))[0]
        project_storage = ProjectStorage(self.args.project_path)
        timechop_store = project_storage.get_store(
            ["images"], f"{experiment_name}.png"
        )
        with timechop_store.open('wb') as fd:
            visualize_chops(self.experiment.chopper, save_target=fd)
    else:
        try:
            logger.info(f"Running Experiment ({self.experiment.experiment_hash})")
            self.experiment.run()
            logger.success(
                f"Experiment ({self.experiment.experiment_hash}) ran through completion"
            )
        except Exception:
            logger.exception("Something went wrong")
            logger.critical(
                f"Experiment [config file: {self.args.config}] run failed!"
            )
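# A minimal usage sketch for the handler above, assuming the standard triage
# CLI wiring (the flag names are inferred from the `args` attributes checked
# above, not confirmed against the argument parser):
#
#   triage experiment my_experiment.yaml --project-path /tmp/triage-output
#   triage experiment my_experiment.yaml --validate-only
#   triage experiment my_experiment.yaml --show-timechop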
def matrix_stores(self):
    df = pd.DataFrame.from_dict(self.data_dict).set_index(["entity_id"])
    with tempfile.TemporaryDirectory() as tmpdir:
        project_storage = ProjectStorage(tmpdir)
        tmpcsv = os.path.join(tmpdir, "df.csv")
        tmpyaml = os.path.join(tmpdir, "df.yaml")
        tmphdf = os.path.join(tmpdir, "df.h5")
        with open(tmpyaml, "w") as outfile:
            yaml.dump(self.metadata, outfile, default_flow_style=False)
        df.to_csv(tmpcsv)
        df.to_hdf(tmphdf, "matrix")
        csv = CSVMatrixStore(project_storage, [], "df")
        hdf = HDFMatrixStore(project_storage, [], "df")
        assert csv.matrix.equals(hdf.matrix)
        yield from [csv, hdf]
def test_s3_save(self):
    with mock_s3():
        client = boto3.client("s3")
        client.create_bucket(Bucket="fake-matrix-bucket", ACL="public-read-write")
        example = next(self.matrix_stores())
        project_storage = ProjectStorage("s3://fake-matrix-bucket")
        tosave = CSVMatrixStore(project_storage, [], "test")
        tosave.matrix = example.matrix
        tosave.metadata = example.metadata
        tosave.save()
        tocheck = CSVMatrixStore(project_storage, [], "test")
        assert tocheck.metadata == example.metadata
        assert tocheck.matrix.to_dict() == example.matrix.to_dict()
def test_as_of_dates_entity_index(self):
    data = {
        'entity_id': [1, 2],
        'feature_one': [0.5, 0.6],
        'feature_two': [0.5, 0.6],
    }
    with tempfile.TemporaryDirectory() as tmpdir:
        project_storage = ProjectStorage(tmpdir)
        matrix_store = CSVMatrixStore(project_storage, [], 'test')
        matrix_store.matrix = pd.DataFrame.from_dict(data)
        matrix_store.metadata = {
            'end_time': '2016-01-01',
            'indices': ['entity_id'],
        }
        self.assertEqual(matrix_store.as_of_dates, ['2016-01-01'])
def test_as_of_dates_entity_date_index(self):
    data = {
        "entity_id": [1, 2, 1, 2],
        "feature_one": [0.5, 0.6, 0.5, 0.6],
        "feature_two": [0.5, 0.6, 0.5, 0.6],
        "as_of_date": ["2016-01-01", "2016-01-01", "2017-01-01", "2017-01-01"],
    }
    with tempfile.TemporaryDirectory() as tmpdir:
        project_storage = ProjectStorage(tmpdir)
        matrix_store = CSVMatrixStore(project_storage, [], "test")
        matrix_store.matrix = pd.DataFrame.from_dict(data).set_index(
            ["entity_id", "as_of_date"]
        )
        matrix_store.metadata = {"indices": ["entity_id", "as_of_date"]}
        self.assertEqual(matrix_store.as_of_dates, ["2016-01-01", "2017-01-01"])
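# The two tests above pin down the behavior of the MatrixStore.as_of_dates
# property. A minimal sketch of an implementation consistent with both tests
# (an illustration under those assumptions, not the library's actual code):
@property
def as_of_dates(self):
    """Distinct as-of dates covered by this matrix."""
    if "as_of_date" in self.metadata["indices"]:
        # (entity_id, as_of_date) MultiIndex: collect the distinct dates
        return sorted({date for _, date in self.matrix.index})
    # entity-only index: the metadata's end_time is the single as-of date
    return [self.metadata["end_time"]]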
def __init__(
    self,
    config,
    db_engine,
    project_path=None,
    matrix_storage_class=CSVMatrixStore,
    replace=True,
    cleanup=False,
    cleanup_timeout=None,
):
    self._check_config_version(config)
    self.config = config
    if isinstance(db_engine, Engine):
        logging.warning(
            "Raw, unserializable SQLAlchemy engine passed. "
            "URL will be used; other options may be lost in multi-process environments"
        )
        self.db_engine = create_engine(db_engine.url)
    else:
        self.db_engine = db_engine
    self.project_storage = ProjectStorage(project_path)
    self.model_storage_engine = ModelStorageEngine(self.project_storage)
    self.matrix_storage_engine = MatrixStorageEngine(
        self.project_storage, matrix_storage_class
    )
    self.project_path = project_path
    self.replace = replace
    upgrade_db(db_engine=self.db_engine)
    self.features_schema_name = "features"
    self.experiment_hash = save_experiment_and_get_hash(self.config, self.db_engine)
    self.labels_table_name = "labels_{}".format(self.experiment_hash)
    self.initialize_components()
    self.cleanup = cleanup
    if self.cleanup:
        logging.info(
            "cleanup is set to True, so intermediate tables (labels and states) "
            "will be removed after matrix creation"
        )
    else:
        logging.info(
            "cleanup is set to False, so intermediate tables (labels and states) "
            "will not be removed after matrix creation"
        )
    # falls back to the class-level cleanup_timeout default when the
    # constructor argument is not given
    self.cleanup_timeout = (
        self.cleanup_timeout if cleanup_timeout is None else cleanup_timeout
    )
def test_s3_save():
    with mock_s3():
        client = boto3.client("s3")
        client.create_bucket(Bucket="fake-matrix-bucket", ACL="public-read-write")
        for example in matrix_stores():
            if not isinstance(example, CSVMatrixStore):
                continue
            project_storage = ProjectStorage("s3://fake-matrix-bucket")
            tosave = CSVMatrixStore(project_storage, [], "test")
            tosave.metadata = example.metadata
            tosave.matrix_label_tuple = example.matrix_label_tuple
            tosave.save()
            tocheck = CSVMatrixStore(project_storage, [], "test")
            assert tocheck.metadata == example.metadata
            assert tocheck.design_matrix.to_dict() == example.design_matrix.to_dict()
def test_s3_save(self):
    with mock_s3():
        import boto3
        client = boto3.client('s3')
        client.create_bucket(Bucket='fake-matrix-bucket', ACL='public-read-write')
        example = next(self.matrix_stores())
        project_storage = ProjectStorage('s3://fake-matrix-bucket')
        tosave = CSVMatrixStore(project_storage, [], 'test')
        tosave.matrix = example.matrix
        tosave.metadata = example.metadata
        tosave.save()
        tocheck = CSVMatrixStore(project_storage, [], 'test')
        assert tocheck.metadata == example.metadata
        assert tocheck.matrix.to_dict() == example.matrix.to_dict()
def matrix_stores():
    df = pd.DataFrame.from_dict(DATA_DICT).set_index(MatrixStore.indices)
    with tempfile.TemporaryDirectory() as tmpdir:
        project_storage = ProjectStorage(tmpdir)
        tmpcsv = os.path.join(tmpdir, "df.csv.gz")
        tmpyaml = os.path.join(tmpdir, "df.yaml")
        with open(tmpyaml, "w") as outfile:
            yaml.dump(METADATA, outfile, default_flow_style=False)
        df.to_csv(tmpcsv, compression="gzip")
        csv = CSVMatrixStore(project_storage, [], "df")
        # first test with caching
        with csv.cache():
            yield csv
        # with the caching out of scope it will be nuked,
        # so this last version will not have any cache
        yield csv
def matrix_stores():
    df = pd.DataFrame.from_dict(DATA_DICT).set_index(["entity_id"])
    with tempfile.TemporaryDirectory() as tmpdir:
        project_storage = ProjectStorage(tmpdir)
        tmpcsv = os.path.join(tmpdir, "df.csv")
        tmpyaml = os.path.join(tmpdir, "df.yaml")
        tmphdf = os.path.join(tmpdir, "df.h5")
        with open(tmpyaml, "w") as outfile:
            yaml.dump(METADATA, outfile, default_flow_style=False)
        df.to_csv(tmpcsv)
        df.to_hdf(tmphdf, "matrix")
        csv = CSVMatrixStore(project_storage, [], "df")
        hdf = HDFMatrixStore(project_storage, [], "df")
        assert csv.design_matrix.equals(hdf.design_matrix)
        # first test with caching
        with csv.cache(), hdf.cache():
            yield csv
            yield hdf
        # with the caching out of scope they will be nuked
        # and these last two versions will not have any cache
        yield csv
        yield hdf
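# Usage sketch: because matrix_stores() is a generator, tests can run the same
# assertions against both the cached and uncached variants of each store
# (the test name and assertion below are illustrative, not from the source):
def test_design_matrix_index():
    for store in matrix_stores():
        assert list(store.design_matrix.index.names) == ["entity_id"]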
class ExperimentBase(ABC):
    """The base class for all Experiments.

    Subclasses must implement the following four methods:
        process_query_tasks
        process_matrix_build_tasks
        process_train_tasks
        process_model_test_tasks

    Look at singlethreaded.py for a reference implementation of each.

    Args:
        config (dict)
        db_engine (triage.util.db.SerializableDbEngine or sqlalchemy.engine.Engine)
        project_path (string)
        replace (bool)
        cleanup_timeout (int)
        materialize_subquery_fromobjs (bool, default True) Whether or not to create
            and index tables for feature "from objects" that are subqueries.
            Can speed up performance when building features for many as-of-dates.
        profile (bool)
    """

    cleanup_timeout = 60  # seconds

    def __init__(
        self,
        config,
        db_engine,
        project_path=None,
        matrix_storage_class=CSVMatrixStore,
        replace=True,
        cleanup=False,
        cleanup_timeout=None,
        materialize_subquery_fromobjs=True,
        profile=False,
    ):
        self._check_config_version(config)
        self.config = config
        self.project_storage = ProjectStorage(project_path)
        self.model_storage_engine = ModelStorageEngine(self.project_storage)
        self.matrix_storage_engine = MatrixStorageEngine(
            self.project_storage, matrix_storage_class
        )
        self.project_path = project_path
        self.replace = replace
        self.db_engine = db_engine
        upgrade_db(db_engine=self.db_engine)
        self.features_schema_name = "features"
        self.materialize_subquery_fromobjs = materialize_subquery_fromobjs
        self.experiment_hash = save_experiment_and_get_hash(self.config, self.db_engine)
        self.labels_table_name = "labels_{}".format(self.experiment_hash)
        self.cohort_table_name = "cohort_{}".format(self.experiment_hash)
        self.initialize_components()
        self.cleanup = cleanup
        if self.cleanup:
            logging.info(
                "cleanup is set to True, so intermediate tables (labels and states) "
                "will be removed after matrix creation"
            )
        else:
            logging.info(
                "cleanup is set to False, so intermediate tables (labels and states) "
                "will not be removed after matrix creation"
            )
        # falls back to the class-level default (60 seconds) unless overridden
        self.cleanup_timeout = (
            self.cleanup_timeout if cleanup_timeout is None else cleanup_timeout
        )
        self.profile = profile
        logging.info("Generate profiling stats? (profile option): %s", self.profile)

    def _check_config_version(self, config):
        if "config_version" in config:
            config_version = config["config_version"]
        else:
            logging.warning(
                "config_version key not found in experiment config. "
                "Assuming v1, which may not be correct"
            )
            config_version = "v1"
        if config_version != CONFIG_VERSION:
            raise ValueError(
                "Experiment config '{}' "
                "does not match current version '{}'. "
                "Will not run experiment.".format(config_version, CONFIG_VERSION)
            )

    def initialize_components(self):
        split_config = self.config["temporal_config"]
        self.chopper = Timechop(**split_config)

        cohort_config = self.config.get("cohort_config", {})
        if "query" in cohort_config:
            self.cohort_table_generator = CohortTableGenerator(
                cohort_table_name=self.cohort_table_name,
                db_engine=self.db_engine,
                query=cohort_config["query"],
                replace=self.replace,
            )
        else:
            logging.warning(
                "cohort_config missing or unrecognized. Without a cohort, "
                "you will not be able to make matrices or perform feature imputation."
            )
            self.cohort_table_generator = CohortTableGeneratorNoOp()

        if "label_config" in self.config:
            self.label_generator = LabelGenerator(
                label_name=self.config["label_config"].get("name", None),
                query=self.config["label_config"]["query"],
                replace=self.replace,
                db_engine=self.db_engine,
            )
        else:
            self.label_generator = LabelGeneratorNoOp()
            logging.warning(
                "label_config missing or unrecognized. Without labels, "
                "you will not be able to make matrices."
            )

        self.feature_dictionary_creator = FeatureDictionaryCreator(
            features_schema_name=self.features_schema_name, db_engine=self.db_engine
        )
        self.feature_generator = FeatureGenerator(
            features_schema_name=self.features_schema_name,
            replace=self.replace,
            db_engine=self.db_engine,
            feature_start_time=split_config["feature_start_time"],
            materialize_subquery_fromobjs=self.materialize_subquery_fromobjs,
        )
        self.feature_group_creator = FeatureGroupCreator(
            self.config.get("feature_group_definition", {"all": [True]})
        )
        self.feature_group_mixer = FeatureGroupMixer(
            self.config.get("feature_group_strategies", ["all"])
        )
        self.planner = Planner(
            feature_start_time=dt_from_str(split_config["feature_start_time"]),
            label_names=[
                self.config.get("label_config", {}).get("name", DEFAULT_LABEL_NAME)
            ],
            label_types=["binary"],
            cohort_names=[self.config.get("cohort_config", {}).get("name", None)],
            user_metadata=self.config.get("user_metadata", {}),
        )
        self.matrix_builder = MatrixBuilder(
            db_config={
                "features_schema_name": self.features_schema_name,
                "labels_schema_name": "public",
                "labels_table_name": self.labels_table_name,
                "cohort_table_name": self.cohort_table_name,
            },
            matrix_storage_engine=self.matrix_storage_engine,
            experiment_hash=self.experiment_hash,
            include_missing_labels_in_train_as=self.config.get("label_config", {}).get(
                "include_missing_labels_in_train_as", None
            ),
            engine=self.db_engine,
            replace=self.replace,
        )
        self.trainer = ModelTrainer(
            experiment_hash=self.experiment_hash,
            model_storage_engine=self.model_storage_engine,
            model_grouper=ModelGrouper(self.config.get("model_group_keys", [])),
            db_engine=self.db_engine,
            replace=self.replace,
        )
        self.tester = ModelTester(
            model_storage_engine=self.model_storage_engine,
            matrix_storage_engine=self.matrix_storage_engine,
            replace=self.replace,
            db_engine=self.db_engine,
            individual_importance_config=self.config.get("individual_importance", {}),
            evaluator_config=self.config.get("scoring", {}),
        )

    @cachedproperty
    def split_definitions(self):
        """Temporal splits based on the experiment's configuration

        Returns: (dict) temporal splits

        Example:
        ```
        {
            'feature_start_time': {datetime},
            'feature_end_time': {datetime},
            'label_start_time': {datetime},
            'label_end_time': {datetime},
            'train_matrix': {
                'first_as_of_time': {datetime},
                'last_as_of_time': {datetime},
                'matrix_info_end_time': {datetime},
                'training_label_timespan': {str},
                'training_as_of_date_frequency': {str},
                'max_training_history': {str},
                'as_of_times': [list of {datetime}s]
            },
            'test_matrices': [list of matrix defs similar to train_matrix]
        }
        ```

        (When updating/setting split definitions, matrices should have UUIDs.)
        """
        split_definitions = self.chopper.chop_time()
        logging.info("Computed and stored split definitions: %s", split_definitions)
        logging.info("\n----TIME SPLIT SUMMARY----\n")
        logging.info("Number of time splits: {}".format(len(split_definitions)))
        for split_index, split in enumerate(split_definitions):
            train_times = split["train_matrix"]["as_of_times"]
            test_times = [
                as_of_time
                for test_matrix in split["test_matrices"]
                for as_of_time in test_matrix["as_of_times"]
            ]
            logging.info(
                """Split index {}:
                Training as_of_time_range: {} to {} ({} total)
                Testing as_of_time range: {} to {} ({} total)\n\n""".format(
                    split_index,
                    min(train_times),
                    max(train_times),
                    len(train_times),
                    min(test_times),
                    max(test_times),
                    len(test_times),
                )
            )
        return split_definitions

    @cachedproperty
    def all_as_of_times(self):
        """All 'as of times' in the experiment config

        Used for label and feature generation.

        Returns: (list) of datetimes
        """
        all_as_of_times = []
        for split in self.split_definitions:
            all_as_of_times.extend(split["train_matrix"]["as_of_times"])
            logging.debug(
                "Adding as_of_times from train matrix: %s",
                split["train_matrix"]["as_of_times"],
            )
            for test_matrix in split["test_matrices"]:
                logging.debug(
                    "Adding as_of_times from test matrix: %s",
                    test_matrix["as_of_times"],
                )
                all_as_of_times.extend(test_matrix["as_of_times"])

        logging.info(
            "Computed %s total as_of_times for label and feature generation",
            len(all_as_of_times),
        )
        distinct_as_of_times = list(set(all_as_of_times))
        logging.info(
            "Computed %s distinct as_of_times for label and feature generation",
            len(distinct_as_of_times),
        )
        logging.info(
            "You can view all as_of_times by inspecting `.all_as_of_times` on this Experiment"
        )
        return distinct_as_of_times

    @cachedproperty
    def collate_aggregations(self):
        """Collation of ``Aggregation`` objects used by this experiment.

        Returns: (list) of ``collate.Aggregation`` objects
        """
        logging.info("Creating collate aggregations")
        if "feature_aggregations" not in self.config:
            logging.warning("No feature_aggregation config is available")
            return []
        return self.feature_generator.aggregations(
            feature_aggregation_config=self.config["feature_aggregations"],
            feature_dates=self.all_as_of_times,
            state_table=self.cohort_table_name,
        )

    @cachedproperty
    def feature_aggregation_table_tasks(self):
        """All feature table query tasks specified by this ``Experiment``.

        Returns: (dict) keys are group table names, values are themselves dicts,
            each with keys for different stages of table creation
            (prepare, inserts, finalize) and with values being lists of SQL commands
        """
        logging.info(
            "Calculating feature tasks for %s as_of_times", len(self.all_as_of_times)
        )
        return self.feature_generator.generate_all_table_tasks(
            self.collate_aggregations, task_type="aggregation"
        )

    @cachedproperty
    def feature_imputation_table_tasks(self):
        """All feature imputation query tasks specified by this ``Experiment``.

        Returns: (dict) keys are group table names, values are themselves dicts,
            each with keys for different stages of table creation
            (prepare, inserts, finalize) and with values being lists of SQL commands
        """
        logging.info(
            "Calculating feature tasks for %s as_of_times", len(self.all_as_of_times)
        )
        return self.feature_generator.generate_all_table_tasks(
            self.collate_aggregations, task_type="imputation"
        )

    @cachedproperty
    def master_feature_dictionary(self):
        """All possible features found in the database.

        Not all features will necessarily end up in matrices.

        Returns: (list) of dicts, keys being feature table names and values
            being lists of feature names
        """
        result = self.feature_dictionary_creator.feature_dictionary(
            feature_table_names=self.feature_imputation_table_tasks.keys(),
            index_column_lookup=self.feature_generator.index_column_lookup(
                self.collate_aggregations
            ),
        )
        logging.info("Computed master feature dictionary: %s", result)
        return result

    @property
    def feature_dicts(self):
        """Feature dictionaries, representing the feature tables and columns
        configured in this experiment after computing feature groups.

        Returns: (list) of dicts, keys being feature table names and values
            being lists of feature names
        """
        return self.feature_group_mixer.generate(
            self.feature_group_creator.subsets(self.master_feature_dictionary)
        )

    @cachedproperty
    def matrix_build_tasks(self):
        """Tasks for all matrices that need to be built as a part of this Experiment.

        Each task contains arguments understood by ``Architect.build_matrix``.

        Returns: (dict) of build tasks, keyed by matrix UUID
        """
        if not table_has_data(self.cohort_table_name, self.db_engine):
            logging.warning("cohort table is not populated, cannot build any matrices")
            return {}
        if not table_has_data(self.labels_table_name, self.db_engine):
            logging.warning("labels table is not populated, cannot build any matrices")
            return {}
        (updated_split_definitions, matrix_build_tasks) = self.planner.generate_plans(
            self.split_definitions, self.feature_dicts
        )
        self.full_matrix_definitions = updated_split_definitions
        return matrix_build_tasks

    @cachedproperty
    def full_matrix_definitions(self):
        """Full matrix definitions

        Returns: (list) temporal and feature information for each matrix
        """
        (updated_split_definitions, matrix_build_tasks) = self.planner.generate_plans(
            self.split_definitions, self.feature_dicts
        )
        self.matrix_build_tasks = matrix_build_tasks
        return updated_split_definitions

    @property
    def all_label_timespans(self):
        """All train and test label timespans

        Returns: (list) label timespans, in string form as they appeared in the
            experiment config
        """
        return list(
            set(
                self.config["temporal_config"]["training_label_timespans"]
                + self.config["temporal_config"]["test_label_timespans"]
            )
        )

    def generate_labels(self):
        """Generate labels based on experiment configuration

        Results are stored in the database, not returned
        """
        self.label_generator.generate_all_labels(
            self.labels_table_name, self.all_as_of_times, self.all_label_timespans
        )

    def generate_cohort(self):
        self.cohort_table_generator.generate_cohort_table(
            as_of_dates=self.all_as_of_times
        )

    def log_split(self, split_num, split):
        logging.info(
            "Starting train/test for %s out of %s: train range: %s to %s",
            split_num + 1,
            len(self.full_matrix_definitions),
            split["train_matrix"]["first_as_of_time"],
            split["train_matrix"]["matrix_info_end_time"],
        )

    @abstractmethod
    def process_train_tasks(self, train_tasks):
        pass

    @abstractmethod
    def process_query_tasks(self, query_tasks):
        pass

    @abstractmethod
    def process_matrix_build_tasks(self, matrix_build_tasks):
        pass

    def generate_preimputation_features(self):
        self.process_query_tasks(self.feature_aggregation_table_tasks)
        logging.info(
            "Finished running preimputation feature queries. The final results are in tables: %s",
            ",".join(agg.get_table_name() for agg in self.collate_aggregations),
        )

    def impute_missing_features(self):
        self.process_query_tasks(self.feature_imputation_table_tasks)
        logging.info(
            "Finished running postimputation feature queries. The final results are in tables: %s",
            ",".join(
                agg.get_table_name(imputed=True) for agg in self.collate_aggregations
            ),
        )

    def build_matrices(self):
        associate_matrices_with_experiment(
            self.experiment_hash, self.matrix_build_tasks.keys(), self.db_engine
        )
        self.process_matrix_build_tasks(self.matrix_build_tasks)

    def generate_matrices(self):
        logging.info("Creating cohort")
        self.generate_cohort()
        logging.info("Creating labels")
        self.generate_labels()
        logging.info("Creating feature aggregation tables")
        self.generate_preimputation_features()
        logging.info("Creating feature imputation tables")
        self.impute_missing_features()
        logging.info("Building all matrices")
        self.build_matrices()

    def train_and_test_models(self):
        if "grid_config" not in self.config:
            logging.warning(
                "No grid_config was passed in the experiment config. "
                "No models will be trained"
            )
            return

        for split_num, split in enumerate(self.full_matrix_definitions):
            self.log_split(split_num, split)
            train_store = self.matrix_storage_engine.get_store(split["train_uuid"])
            if train_store.empty:
                logging.warning(
                    "Train matrix for split %s was empty, "
                    "no point in training this model. Skipping",
                    split["train_uuid"],
                )
                continue
            if len(train_store.labels().unique()) == 1:
                logging.warning(
                    "Train matrix for split %s had only one unique value, "
                    "no point in training this model. Skipping",
                    split["train_uuid"],
                )
                continue

            logging.info("Training models")
            train_tasks = self.trainer.generate_train_tasks(
                grid_config=self.config["grid_config"],
                misc_db_parameters=dict(
                    test=False, model_comment=self.config.get("model_comment", None)
                ),
                matrix_store=train_store,
            )
            associate_models_with_experiment(
                self.experiment_hash,
                [train_task["model_hash"] for train_task in train_tasks],
                self.db_engine,
            )
            model_ids = self.process_train_tasks(train_tasks)
            logging.info("Done training models for split %s", split_num)

            test_tasks = self.tester.generate_model_test_tasks(
                split=split, train_store=train_store, model_ids=model_ids
            )
            logging.info(
                "Found %s non-empty test matrices for split %s",
                len(test_tasks),
                split_num,
            )
            self.process_model_test_tasks(test_tasks)

    def validate(self, strict=True):
        ExperimentValidator(self.db_engine, strict=strict).run(self.config)

    def _run(self):
        try:
            logging.info("Generating matrices")
            self.generate_matrices()
        finally:
            if self.cleanup:
                self.clean_up_tables()
        self.train_and_test_models()
        logging.info("Experiment complete")
        self._log_end_of_run_report()

    def _log_end_of_run_report(self):
        missing_models = missing_model_hashes(self.experiment_hash, self.db_engine)
        if len(missing_models) > 0:
            logging.info(
                "Found %s missing model hashes. "
                "This means that they were supposed to either be trained or reused "
                "by this experiment but are not present in the models table. "
                "Inspect the logs for any training errors. Full list: %s",
                len(missing_models),
                missing_models,
            )
        else:
            logging.info("All models that were supposed to be trained were trained. Awesome!")

        missing_matrices = missing_matrix_uuids(self.experiment_hash, self.db_engine)
        if len(missing_matrices) > 0:
            logging.info(
                "Found %s missing matrix uuids. "
                "This means that they were supposed to either be built or reused "
                "by this experiment but are not present in the matrices table. "
                "Inspect the logs for any matrix building errors. Full list: %s",
                len(missing_matrices),
                missing_matrices,
            )
        else:
            logging.info("All matrices that were supposed to be built were built. Awesome!")

    def clean_up_tables(self):
        logging.info("Cleaning up state and labels tables")
        with timeout(self.cleanup_timeout):
            self.cohort_table_generator.clean_up()
            self.label_generator.clean_up(self.labels_table_name)

    def _run_profile(self):
        cp = cProfile.Profile()
        cp.runcall(self._run)
        store = self.project_storage.get_store(
            ["profiling_stats"], f"{int(time.time())}.profile"
        )
        with store.open('wb') as fd:
            cp.create_stats()
            marshal.dump(cp.stats, fd)
        logging.info(
            "Profiling stats of this Triage run calculated and written to %s "
            "in cProfile format.",
            store,
        )

    def run(self):
        try:
            if self.profile:
                self._run_profile()
            else:
                self._run()
        except Exception:
            logging.exception("Run interrupted by uncaught exception")
            raise

    __call__ = run
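# A minimal single-process subclass sketch, modeled on the singlethreaded.py
# reference implementation the docstring points to. The per-task method names
# on the trainer and tester are assumptions except where they appear elsewhere
# in this codebase (process_table_tasks, build_all_matrices, process_train_task):
class SingleThreadedExperimentSketch(ExperimentBase):
    def process_query_tasks(self, query_tasks):
        self.feature_generator.process_table_tasks(query_tasks)

    def process_matrix_build_tasks(self, matrix_build_tasks):
        self.matrix_builder.build_all_matrices(matrix_build_tasks)

    def process_train_tasks(self, train_tasks):
        return [self.trainer.process_train_task(**task) for task in train_tasks]

    def process_model_test_tasks(self, test_tasks):
        # process_model_test_task is a hypothetical name for illustration
        return [self.tester.process_model_test_task(**task) for task in test_tasks]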
def __init__(
    self,
    config,
    db_engine,
    project_path=None,
    matrix_storage_class=CSVMatrixStore,
    replace=True,
    cleanup=False,
    cleanup_timeout=None,
    materialize_subquery_fromobjs=True,
    features_ignore_cohort=False,
    additional_bigtrain_classnames=None,
    profile=False,
    save_predictions=True,
    skip_validation=False,
    partial_run=False,
):
    # For a partial run, skip validation and avoid cleaning up;
    # we'll also skip filling default config values below
    if partial_run:
        cleanup = False
        skip_validation = True

    experiment_kwargs = bind_kwargs(
        self.__class__,
        **{
            key: value
            for (key, value) in locals().items()
            if key not in {"db_engine", "config", "self"}
        },
    )

    self._check_config_version(config)
    self.config = config

    if self.config.get("cohort_config") is not None:
        self.config["cohort_config"] = load_query_if_needed(
            self.config["cohort_config"]
        )
    if self.config.get("label_config") is not None:
        self.config["label_config"] = load_query_if_needed(
            self.config["label_config"]
        )

    self.project_storage = ProjectStorage(project_path)
    self.model_storage_engine = ModelStorageEngine(self.project_storage)
    self.matrix_storage_engine = MatrixStorageEngine(
        self.project_storage, matrix_storage_class
    )
    self.project_path = project_path
    logger.verbose(
        f"Matrices and trained models will be saved in {self.project_path}"
    )

    self.replace = replace
    if self.replace:
        logger.notice(
            "Replace flag is set to true. Matrices, models, "
            "evaluations and predictions (if they exist) will be replaced"
        )

    self.save_predictions = save_predictions
    if not self.save_predictions:
        logger.notice(
            "Save predictions flag is set to false. "
            "Individual predictions won't be stored in the predictions "
            "table. This will decrease both the running time "
            "of an experiment and the space needed in the db"
        )

    self.skip_validation = skip_validation
    if self.skip_validation:
        logger.notice(
            "Warning: Skip validation flag is set to true. "
            "The experiment config file specified won't be validated. "
            "This will reduce (a little) the running time of the experiment, "
            "but has some potential risks, e.g. the experiment could fail "
            "after some time due to some misconfiguration. Proceed with care."
        )

    self.db_engine = db_engine
    results_schema.upgrade_if_clean(dburl=self.db_engine.url)
    self.features_schema_name = "features"

    self.materialize_subquery_fromobjs = materialize_subquery_fromobjs
    if not self.materialize_subquery_fromobjs:
        logger.notice(
            "Materialize from_objs is set to false. "
            "The from_objs will be calculated on the fly every time."
        )

    self.features_ignore_cohort = features_ignore_cohort
    if self.features_ignore_cohort:
        logger.notice(
            "Features will be calculated for all the entities "
            "(i.e. ignoring cohort). This setting will have the effect "
            "that more db space will be used, but potentially could save "
            "time if you are running several similar experiments with "
            "different cohorts."
        )

    self.additional_bigtrain_classnames = additional_bigtrain_classnames

    # only fill default values for full runs
    if not partial_run:
        ## Defaults to sane values
        self.config["temporal_config"] = fill_timechop_config_missing(
            self.config, self.db_engine
        )
        ## Defaults to all the entities found in the feature_aggregation's from_obj
        self.config["cohort_config"] = fill_cohort_config_missing(self.config)
        ## Defaults to all the feature_aggregation's prefixes
        self.config["feature_group_definition"] = fill_feature_group_definition(
            self.config
        )
        grid_config = fill_model_grid_presets(self.config)
        self.config.pop("model_grid_preset", None)
        if grid_config is not None:
            self.config["grid_config"] = grid_config

    if not self.config.get("random_seed", None):
        logger.notice(
            "Random seed not specified. A random seed will be provided. "
            "This could have interesting side effects, "
            "e.g. new models per model group are trained, "
            "tested and evaluated every time that you run this experiment configuration"
        )
    # random.randint requires integer bounds, so avoid the float literal 1e7
    self.random_seed = self.config.pop("random_seed", random.randint(1, 10_000_000))
    logger.verbose(
        f"Using random seed [{self.random_seed}] for running the experiment"
    )
    random.seed(self.random_seed)

    ###################### RUBICON ######################

    self.experiment_hash = save_experiment_and_get_hash(self.config, self.db_engine)
    logger.debug(f"Experiment hash [{self.experiment_hash}] assigned")
    self.run_id = initialize_tracking_and_get_run_id(
        self.experiment_hash,
        experiment_class_path=classpath(self.__class__),
        random_seed=self.random_seed,
        experiment_kwargs=experiment_kwargs,
        db_engine=self.db_engine,
    )
    logger.debug(f"Experiment run id [{self.run_id}] assigned")

    self.initialize_components()

    self.cleanup = cleanup
    if self.cleanup:
        logger.notice(
            "Cleanup is set to true, so intermediate tables (labels and cohort) "
            "will be removed after matrix creation and subset tables will be "
            "removed after model training and testing"
        )
    self.cleanup_timeout = (
        self.cleanup_timeout if cleanup_timeout is None else cleanup_timeout
    )

    self.profile = profile
    if self.profile:
        logger.spam("Profiling will be stored using cProfile")
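# A hedged construction sketch for this __init__ (config contents, engine URL,
# and the concrete subclass name are placeholders, not values from the source):
#
#   from sqlalchemy import create_engine
#   experiment = SingleThreadedExperiment(
#       config=yaml.safe_load(open("experiment.yaml")),
#       db_engine=create_engine("postgresql://user:pass@host/dbname"),
#       project_path="s3://my-triage-bucket/project",
#       save_predictions=False,  # skip writing row-level predictions
#       partial_run=True,        # implies cleanup=False, skip_validation=True
#   )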
def get_matrix_storage_engine():
    with TemporaryDirectory() as temp_dir:
        yield ProjectStorage(temp_dir).matrix_storage_engine()
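# Presumably consumed as a fixture; two hedged usage sketches (the decorators
# and the matrix UUID are assumptions, not shown in the source):
#
#   @pytest.fixture
#   def matrix_storage_engine():
#       yield from get_matrix_storage_engine()
#
#   from contextlib import contextmanager
#   with contextmanager(get_matrix_storage_engine)() as engine:
#       store = engine.get_store("some-matrix-uuid")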
def __init__(
    self,
    config,
    db_engine,
    project_path=None,
    matrix_storage_class=CSVMatrixStore,
    replace=True,
    cleanup=False,
    cleanup_timeout=None,
    materialize_subquery_fromobjs=True,
    features_ignore_cohort=False,
    profile=False,
    save_predictions=True,
    skip_validation=False,
    partial_run=False,
):
    # For a partial run, skip validation and avoid cleaning up;
    # we'll also skip filling default config values below
    if partial_run:
        cleanup = False
        skip_validation = True

    experiment_kwargs = bind_kwargs(
        self.__class__,
        **{
            key: value
            for (key, value) in locals().items()
            if key not in {'db_engine', 'config', 'self'}
        },
    )

    self._check_config_version(config)
    self.config = config

    # random.randint requires integer bounds, so avoid the float literal 1e7
    self.config['random_seed'] = self.config.get(
        'random_seed', random.randint(1, 10_000_000)
    )
    random.seed(self.config['random_seed'])

    self.project_storage = ProjectStorage(project_path)
    self.model_storage_engine = ModelStorageEngine(self.project_storage)
    self.matrix_storage_engine = MatrixStorageEngine(
        self.project_storage, matrix_storage_class
    )
    self.project_path = project_path
    self.replace = replace
    self.save_predictions = save_predictions
    self.skip_validation = skip_validation
    self.db_engine = db_engine
    results_schema.upgrade_if_clean(dburl=self.db_engine.url)

    self.features_schema_name = "features"
    self.materialize_subquery_fromobjs = materialize_subquery_fromobjs
    self.features_ignore_cohort = features_ignore_cohort

    # only fill default values for full runs
    if not partial_run:
        ## Defaults to sane values
        self.config['temporal_config'] = fill_timechop_config_missing(
            self.config, self.db_engine
        )
        ## Defaults to all the entities found in the feature_aggregation's from_obj
        self.config['cohort_config'] = fill_cohort_config_missing(self.config)
        ## Defaults to all the feature_aggregation's prefixes
        self.config['feature_group_definition'] = fill_feature_group_definition(
            self.config
        )
        grid_config = fill_model_grid_presets(self.config)
        self.config.pop('model_grid_preset', None)
        if grid_config is not None:
            self.config['grid_config'] = grid_config

    ###################### RUBICON ######################

    self.experiment_hash = save_experiment_and_get_hash(self.config, self.db_engine)
    self.run_id = initialize_tracking_and_get_run_id(
        self.experiment_hash,
        experiment_class_path=classpath(self.__class__),
        experiment_kwargs=experiment_kwargs,
        db_engine=self.db_engine,
    )

    self.initialize_components()

    self.cleanup = cleanup
    if self.cleanup:
        logging.info(
            "cleanup is set to True, so intermediate tables (labels and cohort) "
            "will be removed after matrix creation and subset tables will be "
            "removed after model training and testing"
        )
    else:
        logging.info(
            "cleanup is set to False, so intermediate tables (labels, cohort, and subsets) "
            "will not be removed"
        )
    self.cleanup_timeout = (
        self.cleanup_timeout if cleanup_timeout is None else cleanup_timeout
    )

    self.profile = profile
    logging.info("Generate profiling stats? (profile option): %s", self.profile)
def add_predictions(db_engine,
                    model_groups,
                    project_path,
                    experiment_hashes=None,
                    train_end_times_range=None,
                    rank_order='worst',
                    replace=True):
    """For a set of model groups, generate test predictions and write them to the DB

    Args:
        db_engine: Sqlalchemy engine
        model_groups (list): The list of model group ids we are interested in
            (ideally, chosen through audition)
        project_path (str): Path where the created matrices and trained model
            objects are stored for the experiment
        experiment_hashes (List[str]): Optional. Hash(es) of the experiments we
            are interested in. Can be used to narrow down the model_ids in the
            model groups specified
        train_end_times_range (Dict): Optional. If provided, only the models
            with train_end_times that fall in the range are scored. This, too,
            helps narrow down model_ids in the model groups specified. A
            dictionary with two possible keys, 'range_start_date' and
            'range_end_date'. Either or both could be set
        rank_order (str): How to deal with ties in the scores
        replace (bool): Whether to overwrite the predictions for a model_id,
            if already found in the DB

    Returns: None
        This directly writes to the test_results.predictions table
    """
    model_matrix_info = _fetch_relevant_model_matrix_info(
        db_engine=db_engine,
        model_groups=model_groups,
        experiment_hashes=experiment_hashes,
    )

    # If we are only generating predictions for a specific time range
    if train_end_times_range is not None:
        if 'range_start_date' in train_end_times_range:
            range_start = train_end_times_range['range_start_date']
            msk = (model_matrix_info['train_end_time'] >= range_start)
            logging.info(
                'Filtering out models with a train_end_time before {}'.format(range_start)
            )
            model_matrix_info = model_matrix_info[msk]
        if 'range_end_date' in train_end_times_range:
            range_end = train_end_times_range['range_end_date']
            msk = (model_matrix_info['train_end_time'] <= range_end)
            logging.info(
                'Filtering out models with a train_end_time after {}'.format(range_end)
            )
            model_matrix_info = model_matrix_info[msk]

    if len(model_matrix_info) == 0:
        raise ValueError('Config is not valid. No models were found!')

    # All the model groups specified in the config file should be valid
    # (even if the experiment_hashes and train_end_times are specified)
    not_fetched_model_grps = [
        x for x in model_groups
        if x not in model_matrix_info['model_group_id'].unique()
    ]
    if len(not_fetched_model_grps) > 0:
        raise ValueError(
            'The config is not valid. No models were found for the model group(s) {}. '
            'All specified model groups should be present'.format(not_fetched_model_grps)
        )

    logging.info('Scoring {} model ids'.format(len(model_matrix_info)))

    # summary of the models that we are scoring, to check any special things worth noting
    _summary_of_models(model_matrix_info)

    logging.info('Instantiating storage engines and the predictor')

    # Storage objects to handle already stored models and matrices
    project_storage = ProjectStorage(project_path)
    model_storage_engine = project_storage.model_storage_engine()
    matrix_storage_engine = project_storage.matrix_storage_engine()

    # Prediction generation is handled by the Predictor class in catwalk
    predictor = Predictor(
        model_storage_engine=model_storage_engine,
        db_engine=db_engine,
        rank_order=rank_order,
        replace=replace,
        save_predictions=True,
    )

    # Organizing the prediction run over unique (train_mat, test_mat) pairs
    # to reduce the number of times the matrices get loaded into memory
    groupby_obj = model_matrix_info.groupby(['train_matrix_uuid', 'test_matrix_uuid'])

    for group, _ in groupby_obj:
        train_uuid = group[0]
        test_uuid = group[1]
        df_grp = groupby_obj.get_group(group)

        logging.info(
            'Processing {} model_ids for train matrix {} and test matrix {}'.format(
                len(df_grp), train_uuid, test_uuid
            )
        )

        train_matrix_store = matrix_storage_engine.get_store(matrix_uuid=train_uuid)
        # To ensure that the column order we use for predictions matches the
        # order we used in model training
        train_matrix_columns = list(train_matrix_store.design_matrix.columns)

        test_matrix_store = matrix_storage_engine.get_store(matrix_uuid=test_uuid)

        for model_id in df_grp['model_id'].tolist():
            logging.info('Writing predictions for model_id {}'.format(model_id))
            predictor.predict(
                model_id=model_id,
                matrix_store=test_matrix_store,
                train_matrix_columns=train_matrix_columns,
                misc_db_parameters={},
            )

    logging.info(
        'Successfully generated predictions for {} models!'.format(len(model_matrix_info))
    )
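# A hedged call sketch (engine URL, model group ids, path, and dates are
# placeholders, not values from the source):
#
#   add_predictions(
#       db_engine=create_engine("postgresql://user:pass@host/dbname"),
#       model_groups=[4, 7],
#       project_path="/mnt/triage-project",
#       train_end_times_range={
#           "range_start_date": "2016-01-01",
#           "range_end_date": "2018-01-01",
#       },
#       rank_order="worst",
#       replace=False,
#   )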
def basic_integration_test(state_filters,
                           feature_group_create_rules,
                           feature_group_mix_rules,
                           expected_matrix_multiplier,
                           expected_group_lists):
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        Base.metadata.create_all(db_engine)
        populate_source_data(db_engine)

        with TemporaryDirectory() as temp_dir:
            chopper = Timechop(
                feature_start_time=datetime(2010, 1, 1),
                feature_end_time=datetime(2014, 1, 1),
                label_start_time=datetime(2011, 1, 1),
                label_end_time=datetime(2014, 1, 1),
                model_update_frequency='1year',
                training_label_timespans=['6months'],
                test_label_timespans=['6months'],
                training_as_of_date_frequencies='1day',
                test_as_of_date_frequencies='3months',
                max_training_histories=['1months'],
                test_durations=['1months'],
            )

            state_table_generator = StateTableGeneratorFromDense(
                db_engine=db_engine,
                experiment_hash='abcd',
                dense_state_table='states',
            )

            label_generator = LabelGenerator(
                db_engine=db_engine,
                query=sample_config()['label_config']['query'],
            )

            feature_generator = FeatureGenerator(
                db_engine=db_engine,
                features_schema_name='features',
                replace=True,
            )

            feature_dictionary_creator = FeatureDictionaryCreator(
                db_engine=db_engine, features_schema_name='features'
            )

            feature_group_creator = FeatureGroupCreator(feature_group_create_rules)

            feature_group_mixer = FeatureGroupMixer(feature_group_mix_rules)

            project_storage = ProjectStorage(temp_dir)
            planner = Planner(
                feature_start_time=datetime(2010, 1, 1),
                label_names=['outcome'],
                label_types=['binary'],
                states=state_filters,
                user_metadata={},
            )

            builder = MatrixBuilder(
                engine=db_engine,
                db_config={
                    'features_schema_name': 'features',
                    'labels_schema_name': 'public',
                    'labels_table_name': 'labels',
                    'sparse_state_table_name': 'tmp_sparse_states_abcd',
                },
                matrix_storage_engine=project_storage.matrix_storage_engine(),
                replace=True,
            )

            # chop time
            split_definitions = chopper.chop_time()
            num_split_matrices = sum(
                1 + len(split['test_matrices']) for split in split_definitions
            )

            # generate as_of_times for feature/label/state generation
            all_as_of_times = []
            for split in split_definitions:
                all_as_of_times.extend(split['train_matrix']['as_of_times'])
                for test_matrix in split['test_matrices']:
                    all_as_of_times.extend(test_matrix['as_of_times'])
            all_as_of_times = list(set(all_as_of_times))

            # generate sparse state table
            state_table_generator.generate_sparse_table(as_of_dates=all_as_of_times)

            # create labels table
            label_generator.generate_all_labels(
                labels_table='labels',
                as_of_dates=all_as_of_times,
                label_timespans=['6months'],
            )

            # create feature table tasks
            # we would use FeatureGenerator#create_all_tables but want to use
            # the tasks dict directly to create a feature dict
            aggregations = feature_generator.aggregations(
                feature_aggregation_config=[
                    {
                        'prefix': 'cat',
                        'from_obj': 'cat_complaints',
                        'knowledge_date_column': 'as_of_date',
                        'aggregates': [{
                            'quantity': 'cat_sightings',
                            'metrics': ['count', 'avg'],
                            'imputation': {'all': {'type': 'mean'}},
                        }],
                        'intervals': ['1y'],
                        'groups': ['entity_id'],
                    },
                    {
                        'prefix': 'dog',
                        'from_obj': 'dog_complaints',
                        'knowledge_date_column': 'as_of_date',
                        'aggregates_imputation': {
                            'count': {'type': 'constant', 'value': 7},
                            'sum': {'type': 'mean'},
                            'avg': {'type': 'zero'},
                        },
                        'aggregates': [{
                            'quantity': 'dog_sightings',
                            'metrics': ['count', 'avg'],
                        }],
                        'intervals': ['1y'],
                        'groups': ['entity_id'],
                    },
                ],
                feature_dates=all_as_of_times,
                state_table=state_table_generator.sparse_table_name,
            )
            feature_table_agg_tasks = feature_generator.generate_all_table_tasks(
                aggregations, task_type='aggregation'
            )

            # create feature aggregation tables
            feature_generator.process_table_tasks(feature_table_agg_tasks)

            feature_table_imp_tasks = feature_generator.generate_all_table_tasks(
                aggregations, task_type='imputation'
            )

            # create feature imputation tables
            feature_generator.process_table_tasks(feature_table_imp_tasks)

            # build feature dictionaries from feature tables and subsetting config
            master_feature_dict = feature_dictionary_creator.feature_dictionary(
                feature_table_names=feature_table_imp_tasks.keys(),
                index_column_lookup=feature_generator.index_column_lookup(aggregations),
            )

            feature_dicts = feature_group_mixer.generate(
                feature_group_creator.subsets(master_feature_dict)
            )

            # figure out what matrices need to be built
            _, matrix_build_tasks = planner.generate_plans(
                split_definitions, feature_dicts
            )

            # go and build the matrices
            builder.build_all_matrices(matrix_build_tasks)

            # super basic assertion: did the matrices we expect get created?
            matrices_records = list(
                db_engine.execute(
                    '''select matrix_uuid, num_observations, matrix_type
                    from model_metadata.matrices'''
                )
            )
            matrix_directory = os.path.join(temp_dir, 'matrices')
            matrices = [path for path in os.listdir(matrix_directory) if '.csv' in path]
            metadatas = [path for path in os.listdir(matrix_directory) if '.yaml' in path]
            assert len(matrices) == num_split_matrices * expected_matrix_multiplier
            assert len(metadatas) == num_split_matrices * expected_matrix_multiplier
            assert len(matrices) == len(matrices_records)

            feature_group_name_lists = []
            for metadata_path in metadatas:
                with open(os.path.join(matrix_directory, metadata_path)) as f:
                    metadata = yaml.full_load(f)
                    feature_group_name_lists.append(metadata['feature_groups'])

            for matrix_uuid, num_observations, matrix_type in matrices_records:
                assert matrix_uuid in matrix_build_tasks  # the hashes of the matrices
                assert type(num_observations) is int
                assert matrix_type == matrix_build_tasks[matrix_uuid]['matrix_type']

            def deep_unique_tuple(l):
                return set([tuple(i) for i in l])

            assert deep_unique_tuple(feature_group_name_lists) == deep_unique_tuple(
                expected_group_lists
            )
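# A hedged invocation sketch for the integration test above; the rule and
# group-list values are illustrative only, and the group-name strings are an
# assumption about how FeatureGroupCreator names its subsets:
#
#   basic_integration_test(
#       state_filters=["state_one AND state_two"],
#       feature_group_create_rules={"prefix": ["cat", "dog"]},
#       feature_group_mix_rules=["leave-one-out", "all"],
#       expected_matrix_multiplier=3,  # two leave-one-out groups plus one "all" group
#       expected_group_lists=[
#           ["prefix: cat"],
#           ["prefix: dog"],
#           ["prefix: cat", "prefix: dog"],
#       ],
#   )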
def predict_forward_with_existed_model(db_engine, project_path, model_id, as_of_date):
    """Predict forward given model_id and as_of_date and store the prediction in the database

    Args:
        db_engine (sqlalchemy.db.engine)
        project_path (string) Path where the created matrices and trained
            model objects are stored
        model_id (int) The id of a given model in the database
        as_of_date (string) a date string like "YYYY-MM-DD"
    """
    logger.spam("In PREDICT LIST................")
    upgrade_db(db_engine=db_engine)
    project_storage = ProjectStorage(project_path)
    matrix_storage_engine = project_storage.matrix_storage_engine()

    # 1. Get feature and cohort config from database
    (train_matrix_uuid, matrix_metadata) = train_matrix_info_from_model_id(
        db_engine, model_id
    )
    experiment_config = experiment_config_from_model_id(db_engine, model_id)

    # 2. Generate cohort
    cohort_table_name = (
        f"triage_production.cohort_{experiment_config['cohort_config']['name']}"
    )
    cohort_table_generator = EntityDateTableGenerator(
        db_engine=db_engine,
        query=experiment_config['cohort_config']['query'],
        entity_date_table_name=cohort_table_name,
    )
    cohort_table_generator.generate_entity_date_table(
        as_of_dates=[dt_from_str(as_of_date)]
    )

    # 3. Generate feature aggregations
    feature_generator = FeatureGenerator(
        db_engine=db_engine,
        features_schema_name="triage_production",
        feature_start_time=experiment_config['temporal_config']['feature_start_time'],
    )
    collate_aggregations = feature_generator.aggregations(
        feature_aggregation_config=experiment_config['feature_aggregations'],
        feature_dates=[as_of_date],
        state_table=cohort_table_name,
    )
    feature_generator.process_table_tasks(
        feature_generator.generate_all_table_tasks(
            collate_aggregations, task_type='aggregation'
        )
    )

    # 4. Reconstruct feature dictionary from feature_names and generate imputation
    reconstructed_feature_dict = FeatureGroup()
    imputation_table_tasks = OrderedDict()
    for aggregation in collate_aggregations:
        feature_group, feature_names = get_feature_names(aggregation, matrix_metadata)
        reconstructed_feature_dict[feature_group] = feature_names

        # Make sure that the features imputed in training are also imputed in production
        features_imputed_in_train = get_feature_needs_imputation_in_train(
            aggregation, feature_names
        )
        features_imputed_in_production = get_feature_needs_imputation_in_production(
            aggregation, db_engine
        )

        total_impute_cols = set(features_imputed_in_production) | set(
            features_imputed_in_train
        )
        total_nonimpute_cols = (
            set(f for f in set(feature_names) if '_imp' not in f) - total_impute_cols
        )

        task_generator = feature_generator._generate_imp_table_tasks_for
        imputation_table_tasks.update(
            task_generator(
                aggregation,
                impute_cols=list(total_impute_cols),
                nonimpute_cols=list(total_nonimpute_cols),
            )
        )
    feature_generator.process_table_tasks(imputation_table_tasks)

    # 5. Build matrix
    db_config = {
        "features_schema_name": "triage_production",
        "labels_schema_name": "public",
        "cohort_table_name": cohort_table_name,
    }
    matrix_builder = MatrixBuilder(
        db_config=db_config,
        matrix_storage_engine=matrix_storage_engine,
        engine=db_engine,
        experiment_hash=None,
        replace=True,
    )
    feature_start_time = experiment_config['temporal_config']['feature_start_time']
    label_name = experiment_config['label_config']['name']
    label_type = 'binary'
    cohort_name = experiment_config['cohort_config']['name']
    user_metadata = experiment_config['user_metadata']

    # Use timechop to get the time definition for production
    temporal_config = experiment_config["temporal_config"]
    temporal_config.update(temporal_params_from_matrix_metadata(db_engine, model_id))
    timechopper = Timechop(**temporal_config)
    prod_definitions = timechopper.define_test_matrices(
        train_test_split_time=dt_from_str(as_of_date),
        test_duration=temporal_config['test_durations'][0],
        test_label_timespan=temporal_config['test_label_timespans'][0],
    )

    matrix_metadata = Planner.make_metadata(
        prod_definitions[-1],
        reconstructed_feature_dict,
        label_name,
        label_type,
        cohort_name,
        'production',
        feature_start_time,
        user_metadata,
    )
    matrix_metadata['matrix_id'] = str(as_of_date) + f'_model_id_{model_id}' + '_risklist'
    matrix_uuid = filename_friendly_hash(matrix_metadata)

    matrix_builder.build_matrix(
        as_of_times=[as_of_date],
        label_name=label_name,
        label_type=label_type,
        feature_dictionary=reconstructed_feature_dict,
        matrix_metadata=matrix_metadata,
        matrix_uuid=matrix_uuid,
        matrix_type="production",
    )

    # 6. Predict the risk score for production
    predictor = Predictor(
        model_storage_engine=project_storage.model_storage_engine(),
        db_engine=db_engine,
        rank_order='best',
    )
    predictor.predict(
        model_id=model_id,
        matrix_store=matrix_storage_engine.get_store(matrix_uuid),
        misc_db_parameters={},
        train_matrix_columns=matrix_storage_engine.get_store(train_matrix_uuid).columns(),
    )
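# A hedged usage sketch (engine URL, path, model id, and date are placeholders):
#
#   predict_forward_with_existed_model(
#       db_engine=create_engine("postgresql://user:pass@host/dbname"),
#       project_path="/mnt/triage-project",
#       model_id=42,
#       as_of_date="2021-06-01",
#   )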
class Retrainer:
    """Given a model_group_id and prediction_date, retrain a model using all
    the data up to prediction_date

    Args:
        db_engine (sqlalchemy.engine)
        project_path (string)
        model_group_id (string)
    """

    def __init__(self, db_engine, project_path, model_group_id):
        self.retrain_hash = None
        self.db_engine = db_engine
        upgrade_db(db_engine=self.db_engine)
        self.project_storage = ProjectStorage(project_path)
        self.model_group_id = model_group_id
        self.model_group_info = get_model_group_info(self.db_engine, self.model_group_id)
        self.matrix_storage_engine = self.project_storage.matrix_storage_engine()
        self.triage_run_id, self.experiment_config = experiment_config_from_model_group_id(
            self.db_engine, self.model_group_id
        )

        # This feels like it needs some refactoring, since in some edge cases at
        # least the test matrix temporal parameters might differ across models in
        # the model group (the training ones shouldn't), but this should probably
        # work for the vast majority of use cases...
        self.experiment_config['temporal_config'].update(
            temporal_params_from_matrix_metadata(
                self.db_engine, self.model_group_info['model_id_last_split']
            )
        )

        # Since "testing" here is predicting forward to a single new date, the
        # test_duration should always be '0day' (regardless of what it may have
        # been before)
        self.experiment_config['temporal_config']['test_durations'] = ['0day']

        # These lists should now only contain one item (the value actually used
        # for the last model in this group)
        self.training_label_timespan = (
            self.experiment_config['temporal_config']['training_label_timespans'][0]
        )
        self.test_label_timespan = (
            self.experiment_config['temporal_config']['test_label_timespans'][0]
        )
        self.test_duration = self.experiment_config['temporal_config']['test_durations'][0]
        self.feature_start_time = (
            self.experiment_config['temporal_config']['feature_start_time']
        )

        self.label_name = self.experiment_config['label_config']['name']
        self.cohort_name = self.experiment_config['cohort_config']['name']
        self.user_metadata = self.experiment_config['user_metadata']

        self.feature_dictionary_creator = FeatureDictionaryCreator(
            features_schema_name='triage_production', db_engine=self.db_engine
        )
        self.label_generator = LabelGenerator(
            label_name=self.experiment_config['label_config'].get("name", None),
            query=self.experiment_config['label_config']["query"],
            replace=True,
            db_engine=self.db_engine,
        )
        self.labels_table_name = "labels_{}_{}_production".format(
            self.experiment_config['label_config'].get('name', 'default'),
            filename_friendly_hash(self.experiment_config['label_config']['query']),
        )
        self.feature_generator = FeatureGenerator(
            db_engine=self.db_engine,
            features_schema_name="triage_production",
            feature_start_time=self.feature_start_time,
        )
        self.model_trainer = ModelTrainer(
            experiment_hash=None,
            model_storage_engine=ModelStorageEngine(self.project_storage),
            db_engine=self.db_engine,
            replace=True,
            run_id=self.triage_run_id,
        )

    def get_temporal_config_for_retrain(self, prediction_date):
        temporal_config = self.experiment_config['temporal_config'].copy()
        temporal_config['feature_end_time'] = datetime.strftime(
            prediction_date, "%Y-%m-%d"
        )
        temporal_config['label_end_time'] = datetime.strftime(
            prediction_date + convert_str_to_relativedelta(self.test_label_timespan),
            "%Y-%m-%d",
        )
        # just needs to be bigger than the gap between the label start and end times
        # to ensure we only get one time split for the retraining
        temporal_config['model_update_frequency'] = '%syears' % (
            dt_from_str(temporal_config['label_end_time']).year
            - dt_from_str(temporal_config['label_start_time']).year
            + 10
        )
        return temporal_config

    def generate_all_labels(self, as_of_date):
        self.label_generator.generate_all_labels(
            labels_table=self.labels_table_name,
            as_of_dates=[as_of_date],
            label_timespans=[self.training_label_timespan],
        )

    def generate_entity_date_table(self, as_of_date, entity_date_table_name):
        cohort_table_generator = EntityDateTableGenerator(
            db_engine=self.db_engine,
            query=self.experiment_config['cohort_config']['query'],
            entity_date_table_name=entity_date_table_name,
        )
        cohort_table_generator.generate_entity_date_table(
            as_of_dates=[dt_from_str(as_of_date)]
        )

    def get_collate_aggregations(self, as_of_date, state_table):
        collate_aggregations = self.feature_generator.aggregations(
            feature_aggregation_config=self.experiment_config['feature_aggregations'],
            feature_dates=[as_of_date],
            state_table=state_table,
        )
        return collate_aggregations

    def get_feature_dict_and_imputation_task(self, collate_aggregations, model_id):
        (train_matrix_uuid, matrix_metadata) = train_matrix_info_from_model_id(
            self.db_engine, model_id
        )
        reconstructed_feature_dict = FeatureGroup()
        imputation_table_tasks = OrderedDict()
        for aggregation in collate_aggregations:
            feature_group, feature_names = get_feature_names(aggregation, matrix_metadata)
            reconstructed_feature_dict[feature_group] = feature_names

            # Make sure that the features imputed in training are also imputed in production
            features_imputed_in_train = get_feature_needs_imputation_in_train(
                aggregation, feature_names
            )
            features_imputed_in_production = get_feature_needs_imputation_in_production(
                aggregation, self.db_engine
            )

            total_impute_cols = set(features_imputed_in_production) | set(
                features_imputed_in_train
            )
            total_nonimpute_cols = (
                set(f for f in set(feature_names) if '_imp' not in f) - total_impute_cols
            )

            task_generator = self.feature_generator._generate_imp_table_tasks_for
            imputation_table_tasks.update(
                task_generator(
                    aggregation,
                    impute_cols=list(total_impute_cols),
                    nonimpute_cols=list(total_nonimpute_cols),
                )
            )
        return reconstructed_feature_dict, imputation_table_tasks

    def retrain(self, prediction_date):
        """Retrain a model by going back one split from prediction_date, so the
        as_of_date for training would be (prediction_date - training_label_timespan)

        Args:
            prediction_date (str)
        """
        # Retrain config and hash
        retrain_config = {
            "model_group_id": self.model_group_id,
            "prediction_date": prediction_date,
            "test_label_timespan": self.test_label_timespan,
            "test_duration": self.test_duration,
        }
        self.retrain_hash = save_retrain_and_get_hash(retrain_config, self.db_engine)

        with get_for_update(self.db_engine, Retrain, self.retrain_hash) as retrain:
            retrain.prediction_date = prediction_date

        # Timechop
        prediction_date = dt_from_str(prediction_date)
        temporal_config = self.get_temporal_config_for_retrain(prediction_date)
        timechopper = Timechop(**temporal_config)
        chops = timechopper.chop_time()
        assert len(chops) == 1
        chops_train_matrix = chops[0]['train_matrix']
        as_of_date = datetime.strftime(chops_train_matrix['last_as_of_time'], "%Y-%m-%d")
        retrain_definition = {
            'first_as_of_time': chops_train_matrix['first_as_of_time'],
            'last_as_of_time': chops_train_matrix['last_as_of_time'],
            'matrix_info_end_time': chops_train_matrix['matrix_info_end_time'],
            'as_of_times': [as_of_date],
            'training_label_timespan': chops_train_matrix['training_label_timespan'],
            'max_training_history': chops_train_matrix['max_training_history'],
            'training_as_of_date_frequency': chops_train_matrix['training_as_of_date_frequency'],
        }

        # Set up the TriageRun row for this retrain run
        run = TriageRun(
            start_time=datetime.now(),
            git_hash=infer_git_hash(),
            triage_version=infer_triage_version(),
            python_version=infer_python_version(),
            run_type="retrain",
            run_hash=self.retrain_hash,
            last_updated_time=datetime.now(),
            current_status=TriageRunStatus.started,
            installed_libraries=infer_installed_libraries(),
            platform=platform.platform(),
            os_user=getpass.getuser(),
            working_directory=os.getcwd(),
            ec2_instance_type=infer_ec2_instance_type(),
            log_location=infer_log_location(),
            experiment_class_path=classpath(self.__class__),
            random_seed=retrieve_experiment_seed_from_run_id(
                self.db_engine, self.triage_run_id
            ),
        )
        run_id = None
        with scoped_session(self.db_engine) as session:
            session.add(run)
            session.commit()
            run_id = run.run_id
        if not run_id:
            raise ValueError("Failed to retrieve run_id from saved row")

        # set ModelTrainer's run_id and experiment_hash for the Retrain run
        self.model_trainer.run_id = run_id
        self.model_trainer.experiment_hash = self.retrain_hash

        # 1. Generate all labels
        self.generate_all_labels(as_of_date)
        record_labels_table_name(run_id, self.db_engine, self.labels_table_name)

        # 2. Generate cohort
        cohort_table_name = (
            f"triage_production.cohort_{self.experiment_config['cohort_config']['name']}_retrain"
        )
        self.generate_entity_date_table(as_of_date, cohort_table_name)
        record_cohort_table_name(run_id, self.db_engine, cohort_table_name)

        # 3. Generate feature aggregations
        collate_aggregations = self.get_collate_aggregations(as_of_date, cohort_table_name)
        feature_aggregation_table_tasks = self.feature_generator.generate_all_table_tasks(
            collate_aggregations, task_type='aggregation'
        )
        self.feature_generator.process_table_tasks(feature_aggregation_table_tasks)

        # 4. Reconstruct feature dictionary from feature_names and generate imputation
        reconstructed_feature_dict, imputation_table_tasks = self.get_feature_dict_and_imputation_task(
            collate_aggregations,
            self.model_group_info['model_id_last_split'],
        )
        feature_group_creator = FeatureGroupCreator(
            self.experiment_config['feature_group_definition']
        )
        feature_group_mixer = FeatureGroupMixer(["all"])
        feature_group_dict = feature_group_mixer.generate(
            feature_group_creator.subsets(reconstructed_feature_dict)
        )[0]
        self.feature_generator.process_table_tasks(imputation_table_tasks)

        # 5. Build new matrix
        db_config = {
            "features_schema_name": "triage_production",
            "labels_schema_name": "public",
            "cohort_table_name": cohort_table_name,
            "labels_table_name": self.labels_table_name,
        }
        record_matrix_building_started(run_id, self.db_engine)
        matrix_builder = MatrixBuilder(
            db_config=db_config,
            matrix_storage_engine=self.matrix_storage_engine,
            engine=self.db_engine,
            experiment_hash=None,
            replace=True,
        )
        new_matrix_metadata = Planner.make_metadata(
            matrix_definition=retrain_definition,
            feature_dictionary=feature_group_dict,
            label_name=self.label_name,
            label_type='binary',
            cohort_name=self.cohort_name,
            matrix_type='train',
            feature_start_time=dt_from_str(self.feature_start_time),
            user_metadata=self.user_metadata,
        )
        new_matrix_metadata['matrix_id'] = "_".join([
            self.label_name,
            'binary',
            str(as_of_date),
            'retrain',
        ])
        matrix_uuid = filename_friendly_hash(new_matrix_metadata)
        matrix_builder.build_matrix(
            as_of_times=[as_of_date],
            label_name=self.label_name,
            label_type='binary',
            feature_dictionary=feature_group_dict,
            matrix_metadata=new_matrix_metadata,
            matrix_uuid=matrix_uuid,
            matrix_type="train",
        )

        retrain_model_comment = 'retrain_' + str(datetime.now())
        misc_db_parameters = {
            'train_end_time': dt_from_str(as_of_date),
            'test': False,
            'train_matrix_uuid': matrix_uuid,
            'training_label_timespan': self.training_label_timespan,
            'model_comment': retrain_model_comment,
        }

        # get the random seed from the last split
        last_split_train_matrix_uuid, last_split_matrix_metadata = train_matrix_info_from_model_id(
            self.db_engine,
            model_id=self.model_group_info['model_id_last_split'],
        )
        random_seed = self.model_trainer.get_or_generate_random_seed(
            model_group_id=self.model_group_id,
            matrix_metadata=last_split_matrix_metadata,
            train_matrix_uuid=last_split_train_matrix_uuid,
        )

        # create retrain model hash
        retrain_model_hash = self.model_trainer._model_hash(
            self.matrix_storage_engine.get_store(matrix_uuid).metadata,
            class_path=self.model_group_info['model_type'],
            parameters=self.model_group_info['hyperparameters'],
            random_seed=random_seed,
        )

        associate_models_with_retrain(
            self.retrain_hash, (retrain_model_hash,), self.db_engine
        )

        record_model_building_started(run_id, self.db_engine)
        retrain_model_id = self.model_trainer.process_train_task(
            matrix_store=self.matrix_storage_engine.get_store(matrix_uuid),
            class_path=self.model_group_info['model_type'],
            parameters=self.model_group_info['hyperparameters'],
            model_hash=retrain_model_hash,
            misc_db_parameters=misc_db_parameters,
            random_seed=random_seed,
            retrain=True,
            model_group_id=self.model_group_id,
        )

        self.retrain_model_hash = retrieve_model_hash_from_id(
            self.db_engine, retrain_model_id
        )
        self.retrain_matrix_uuid = matrix_uuid
        self.retrain_model_id = retrain_model_id
        return {
            'retrain_model_comment': retrain_model_comment,
            'retrain_model_id': retrain_model_id,
        }

    def predict(self, prediction_date):
        """Predict forward by creating a matrix using as_of_date = prediction_date
        and applying the retrain model on it

        Args:
            prediction_date (str)
        """
        cohort_table_name = (
            f"triage_production.cohort_{self.experiment_config['cohort_config']['name']}_predict"
        )

        # 1. Generate cohort
        self.generate_entity_date_table(prediction_date, cohort_table_name)

        # 2. Generate feature aggregations
        collate_aggregations = self.get_collate_aggregations(
            prediction_date, cohort_table_name
        )
        self.feature_generator.process_table_tasks(
            self.feature_generator.generate_all_table_tasks(
                collate_aggregations, task_type='aggregation'
            )
        )

        # 3.
Reconstruct feature disctionary from feature_names and generate imputation reconstructed_feature_dict, imputation_table_tasks = self.get_feature_dict_and_imputation_task( collate_aggregations, self.retrain_model_id) self.feature_generator.process_table_tasks(imputation_table_tasks) # 4. Build matrix db_config = { "features_schema_name": "triage_production", "labels_schema_name": "public", "cohort_table_name": cohort_table_name, } matrix_builder = MatrixBuilder( db_config=db_config, matrix_storage_engine=self.matrix_storage_engine, engine=self.db_engine, experiment_hash=None, replace=True, ) # Use timechop to get the time definition for production temporal_config = self.get_temporal_config_for_retrain( dt_from_str(prediction_date)) timechopper = Timechop(**temporal_config) retrain_config = get_retrain_config_from_model_id( self.db_engine, self.retrain_model_id) prod_definitions = timechopper.define_test_matrices( train_test_split_time=dt_from_str(prediction_date), test_duration=retrain_config['test_duration'], test_label_timespan=retrain_config['test_label_timespan']) last_split_definition = prod_definitions[-1] matrix_metadata = Planner.make_metadata( matrix_definition=last_split_definition, feature_dictionary=reconstructed_feature_dict, label_name=self.label_name, label_type='binary', cohort_name=self.cohort_name, matrix_type='production', feature_start_time=self.feature_start_time, user_metadata=self.user_metadata, ) matrix_metadata['matrix_id'] = str( prediction_date ) + f'_model_id_{self.retrain_model_id}' + '_risklist' matrix_uuid = filename_friendly_hash(matrix_metadata) matrix_builder.build_matrix( as_of_times=[prediction_date], label_name=self.label_name, label_type='binary', feature_dictionary=reconstructed_feature_dict, matrix_metadata=matrix_metadata, matrix_uuid=matrix_uuid, matrix_type="production", ) # 5. Predict the risk score for production predictor = Predictor( model_storage_engine=self.project_storage.model_storage_engine(), db_engine=self.db_engine, rank_order='best') predictor.predict( model_id=self.retrain_model_id, matrix_store=self.matrix_storage_engine.get_store(matrix_uuid), misc_db_parameters={}, train_matrix_columns=self.matrix_storage_engine.get_store( self.retrain_matrix_uuid).columns(), ) self.predict_matrix_uuid = matrix_uuid
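# Illustrative only: a minimal sketch of driving the Retrainer above end to end.
# It assumes an existing triage database with at least one trained model group;
# the engine URL, project path, model_group_id, and date below are hypothetical
# placeholders, not values from this codebase.
def _example_retrain_and_predict():  # pragma: no cover
    from sqlalchemy import create_engine

    db_engine = create_engine("postgresql://user:pass@localhost/triage_db")  # hypothetical DSN
    retrainer = Retrainer(
        db_engine=db_engine,
        project_path="/tmp/triage_project",  # hypothetical project path
        model_group_id=1,                    # hypothetical model group
    )
    # Retrain on all data up to the prediction date, then predict forward from it
    retrain_info = retrainer.retrain("2021-06-01")
    retrainer.predict("2021-06-01")
    return retrain_info["retrain_model_id"], retrainer.predict_matrix_uuid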
def basic_integration_test(
    cohort_names,
    feature_group_create_rules,
    feature_group_mix_rules,
    expected_matrix_multiplier,
    expected_group_lists,
):
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        Base.metadata.create_all(db_engine)
        populate_source_data(db_engine)

        with TemporaryDirectory() as temp_dir:
            chopper = Timechop(
                feature_start_time=datetime(2010, 1, 1),
                feature_end_time=datetime(2014, 1, 1),
                label_start_time=datetime(2011, 1, 1),
                label_end_time=datetime(2014, 1, 1),
                model_update_frequency="1year",
                training_label_timespans=["6months"],
                test_label_timespans=["6months"],
                training_as_of_date_frequencies="1day",
                test_as_of_date_frequencies="3months",
                max_training_histories=["1months"],
                test_durations=["1months"],
            )
            entity_date_table_generator = EntityDateTableGenerator(
                db_engine=db_engine,
                entity_date_table_name="cohort_abcd",
                query="select distinct(entity_id) from events")
            label_generator = LabelGenerator(
                db_engine=db_engine,
                query=sample_config()["label_config"]["query"])
            feature_generator = FeatureGenerator(
                db_engine=db_engine,
                features_schema_name="features",
                replace=True)
            feature_dictionary_creator = FeatureDictionaryCreator(
                db_engine=db_engine, features_schema_name="features")
            feature_group_creator = FeatureGroupCreator(feature_group_create_rules)
            feature_group_mixer = FeatureGroupMixer(feature_group_mix_rules)
            project_storage = ProjectStorage(temp_dir)
            planner = Planner(
                feature_start_time=datetime(2010, 1, 1),
                label_names=["outcome"],
                label_types=["binary"],
                cohort_names=cohort_names,
                user_metadata={},
            )
            builder = MatrixBuilder(
                engine=db_engine,
                db_config={
                    "features_schema_name": "features",
                    "labels_schema_name": "public",
                    "labels_table_name": "labels",
                    "cohort_table_name": "cohort_abcd",
                },
                experiment_hash=None,
                matrix_storage_engine=project_storage.matrix_storage_engine(),
                replace=True,
            )

            # chop time
            split_definitions = chopper.chop_time()
            num_split_matrices = sum(
                1 + len(split["test_matrices"]) for split in split_definitions)

            # generate as_of_times for feature/label/state generation
            all_as_of_times = []
            for split in split_definitions:
                all_as_of_times.extend(split["train_matrix"]["as_of_times"])
                for test_matrix in split["test_matrices"]:
                    all_as_of_times.extend(test_matrix["as_of_times"])
            all_as_of_times = list(set(all_as_of_times))

            # generate entity_date state table
            entity_date_table_generator.generate_entity_date_table(
                as_of_dates=all_as_of_times)

            # create labels table
            label_generator.generate_all_labels(
                labels_table="labels",
                as_of_dates=all_as_of_times,
                label_timespans=["6months"],
            )

            # create feature table tasks
            # we would use FeatureGenerator#create_all_tables but want to use
            # the tasks dict directly to create a feature dict
            aggregations = feature_generator.aggregations(
                feature_aggregation_config=[
                    {
                        "prefix": "cat",
                        "from_obj": "cat_complaints",
                        "knowledge_date_column": "as_of_date",
                        "aggregates": [{
                            "quantity": "cat_sightings",
                            "metrics": ["count", "avg"],
                            "imputation": {"all": {"type": "mean"}},
                        }],
                        "intervals": ["1y"],
                        "groups": ["entity_id"],
                    },
                    {
                        "prefix": "dog",
                        "from_obj": "dog_complaints",
                        "knowledge_date_column": "as_of_date",
                        "aggregates_imputation": {
                            "count": {"type": "constant", "value": 7},
                            "sum": {"type": "mean"},
                            "avg": {"type": "zero"},
                        },
                        "aggregates": [{
                            "quantity": "dog_sightings",
                            "metrics": ["count", "avg"],
                        }],
                        "intervals": ["1y"],
                        "groups": ["entity_id"],
                    },
                ],
                feature_dates=all_as_of_times,
                state_table=entity_date_table_generator.entity_date_table_name,
            )
            feature_table_agg_tasks = feature_generator.generate_all_table_tasks(
                aggregations, task_type="aggregation")

            # create feature aggregation tables
            feature_generator.process_table_tasks(feature_table_agg_tasks)
            feature_table_imp_tasks = feature_generator.generate_all_table_tasks(
                aggregations, task_type="imputation")

            # create feature imputation tables
            feature_generator.process_table_tasks(feature_table_imp_tasks)

            # build feature dictionaries from feature tables and subsetting config
            master_feature_dict = feature_dictionary_creator.feature_dictionary(
                feature_table_names=feature_table_imp_tasks.keys(),
                index_column_lookup=feature_generator.index_column_lookup(
                    aggregations),
            )
            feature_dicts = feature_group_mixer.generate(
                feature_group_creator.subsets(master_feature_dict))

            # figure out what matrices need to be built
            _, matrix_build_tasks = planner.generate_plans(
                split_definitions, feature_dicts)

            # go and build the matrices
            builder.build_all_matrices(matrix_build_tasks)

            # super basic assertion: did matrices we expect get created?
            matrices_records = list(
                db_engine.execute(
                    """select matrix_uuid, num_observations, matrix_type
                    from triage_metadata.matrices
                    """))
            matrix_directory = os.path.join(temp_dir, "matrices")
            matrices = [
                path for path in os.listdir(matrix_directory) if ".csv" in path
            ]
            metadatas = [
                path for path in os.listdir(matrix_directory) if ".yaml" in path
            ]
            assert len(matrices) == num_split_matrices * expected_matrix_multiplier
            assert len(metadatas) == num_split_matrices * expected_matrix_multiplier
            assert len(matrices) == len(matrices_records)

            feature_group_name_lists = []
            for metadata_path in metadatas:
                with open(os.path.join(matrix_directory, metadata_path)) as f:
                    metadata = yaml.full_load(f)
                    feature_group_name_lists.append(metadata["feature_groups"])

            for matrix_uuid, num_observations, matrix_type in matrices_records:
                assert matrix_uuid in matrix_build_tasks  # the hashes of the matrices
                assert type(num_observations) is int
                assert matrix_type == matrix_build_tasks[matrix_uuid]["matrix_type"]

            def deep_unique_tuple(l):
                return set([tuple(i) for i in l])

            assert deep_unique_tuple(feature_group_name_lists) == deep_unique_tuple(
                expected_group_lists)
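# Illustrative only: one way the basic_integration_test helper above might be
# invoked. The rules and expected values below are hypothetical, not taken from
# this codebase: with the two prefixes defined in the helper ('cat' and 'dog'),
# a 'leave-one-out' strategy plus 'all' would yield three feature dicts, so
# each split matrix would be built three times (multiplier of 3), and the
# expected group lists would be the three resulting prefix combinations.
def _example_basic_integration_run():  # pragma: no cover
    basic_integration_test(
        cohort_names=["default"],  # hypothetical cohort name
        feature_group_create_rules={"prefix": ["cat", "dog"]},
        feature_group_mix_rules=["leave-one-out", "all"],
        expected_matrix_multiplier=3,
        expected_group_lists=[
            ["prefix: cat"],
            ["prefix: cat", "prefix: dog"],
            ["prefix: dog"],
        ],
    )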
def __init__(
    self,
    config,
    db_engine,
    project_path=None,
    matrix_storage_class=CSVMatrixStore,
    replace=True,
    cleanup=False,
    cleanup_timeout=None,
    materialize_subquery_fromobjs=True,
    features_ignore_cohort=False,
    profile=False,
    save_predictions=True,
    skip_validation=False,
):
    experiment_kwargs = bind_kwargs(
        self.__class__,
        **{
            key: value
            for (key, value) in locals().items()
            if key not in {'db_engine', 'config', 'self'}
        })
    self._check_config_version(config)
    self.config = config
    random.seed(config['random_seed'])
    self.project_storage = ProjectStorage(project_path)
    self.model_storage_engine = ModelStorageEngine(self.project_storage)
    self.matrix_storage_engine = MatrixStorageEngine(
        self.project_storage, matrix_storage_class)
    self.project_path = project_path
    self.replace = replace
    self.save_predictions = save_predictions
    self.skip_validation = skip_validation
    self.db_engine = db_engine
    results_schema.upgrade_if_clean(dburl=self.db_engine.url)
    self.features_schema_name = "features"
    self.materialize_subquery_fromobjs = materialize_subquery_fromobjs
    self.features_ignore_cohort = features_ignore_cohort
    self.experiment_hash = save_experiment_and_get_hash(self.config, self.db_engine)
    self.run_id = initialize_tracking_and_get_run_id(
        self.experiment_hash,
        experiment_class_path=classpath(self.__class__),
        experiment_kwargs=experiment_kwargs,
        db_engine=self.db_engine)
    self.initialize_components()
    self.cleanup = cleanup
    if self.cleanup:
        logging.info(
            "cleanup is set to True, so intermediate tables (labels and cohort) "
            "will be removed after matrix creation and subset tables will be "
            "removed after model training and testing")
    else:
        logging.info(
            "cleanup is set to False, so intermediate tables (labels, cohort, and subsets) "
            "will not be removed")
    self.cleanup_timeout = (self.cleanup_timeout
                            if cleanup_timeout is None else cleanup_timeout)
    self.profile = profile
    logging.info("Generate profiling stats? (profile option): %s", self.profile)
class ExperimentBase(ABC):
    """The base class for all Experiments.

    Subclasses must implement the following four methods:
        process_query_tasks
        process_matrix_build_tasks
        process_subset_tasks
        process_train_test_batches

    Look at singlethreaded.py for a reference implementation of each.

    Args:
        config (dict)
        db_engine (triage.util.db.SerializableDbEngine or sqlalchemy.engine.Engine)
        project_path (string)
        replace (bool)
        cleanup_timeout (int)
        materialize_subquery_fromobjs (bool, default True) Whether or not to create
            and index tables for feature "from objects" that are subqueries.
            Can speed up performance when building features for many as-of-dates.
        profile (bool)
    """

    cleanup_timeout = 60  # seconds

    def __init__(
        self,
        config,
        db_engine,
        project_path=None,
        matrix_storage_class=CSVMatrixStore,
        replace=True,
        cleanup=False,
        cleanup_timeout=None,
        materialize_subquery_fromobjs=True,
        features_ignore_cohort=False,
        profile=False,
        save_predictions=True,
        skip_validation=False,
        partial_run=False,
    ):
        # For a partial run, skip validation and avoid cleaning up;
        # we'll also skip filling default config values below
        if partial_run:
            cleanup = False
            skip_validation = True
        experiment_kwargs = bind_kwargs(
            self.__class__,
            **{
                key: value
                for (key, value) in locals().items()
                if key not in {'db_engine', 'config', 'self'}
            })
        self._check_config_version(config)
        self.config = config
        # random.randint requires integer bounds, so cast the float literal
        self.config['random_seed'] = self.config.get(
            'random_seed', random.randint(1, int(1e7)))
        random.seed(self.config['random_seed'])
        self.project_storage = ProjectStorage(project_path)
        self.model_storage_engine = ModelStorageEngine(self.project_storage)
        self.matrix_storage_engine = MatrixStorageEngine(
            self.project_storage, matrix_storage_class)
        self.project_path = project_path
        self.replace = replace
        self.save_predictions = save_predictions
        self.skip_validation = skip_validation
        self.db_engine = db_engine
        results_schema.upgrade_if_clean(dburl=self.db_engine.url)
        self.features_schema_name = "features"
        self.materialize_subquery_fromobjs = materialize_subquery_fromobjs
        self.features_ignore_cohort = features_ignore_cohort

        # only fill default values for full runs
        if not partial_run:
            ## Defaults to sane values
            self.config['temporal_config'] = fill_timechop_config_missing(
                self.config, self.db_engine)
            ## Defaults to all the entities found in the features_aggregation's from_obj
            self.config['cohort_config'] = fill_cohort_config_missing(self.config)
            ## Defaults to all the feature_aggregation's prefixes
            self.config['feature_group_definition'] = fill_feature_group_definition(
                self.config)
            grid_config = fill_model_grid_presets(self.config)
            self.config.pop('model_grid_preset', None)
            if grid_config is not None:
                self.config['grid_config'] = grid_config

        ###################### RUBICON ######################

        self.experiment_hash = save_experiment_and_get_hash(
            self.config, self.db_engine)
        self.run_id = initialize_tracking_and_get_run_id(
            self.experiment_hash,
            experiment_class_path=classpath(self.__class__),
            experiment_kwargs=experiment_kwargs,
            db_engine=self.db_engine)
        self.initialize_components()

        self.cleanup = cleanup
        if self.cleanup:
            logging.info(
                "cleanup is set to True, so intermediate tables (labels and cohort) "
                "will be removed after matrix creation and subset tables will be "
                "removed after model training and testing")
        else:
            logging.info(
                "cleanup is set to False, so intermediate tables (labels, cohort, and subsets) "
                "will not be removed")
        self.cleanup_timeout = (self.cleanup_timeout
                                if cleanup_timeout is None else cleanup_timeout)
        self.profile = profile
        logging.info("Generate profiling stats? (profile option): %s", self.profile)

    def _check_config_version(self, config):
        if "config_version" in config:
            config_version = config["config_version"]
        else:
            logging.warning(
                "config_version key not found in experiment config. "
                "Assuming v1, which may not be correct")
            config_version = "v1"
        if config_version != CONFIG_VERSION:
            raise ValueError(
                "Experiment config '{}' "
                "does not match current version '{}'. "
                "Will not run experiment.".format(config_version, CONFIG_VERSION))

    @cachedproperty
    def cohort_hash(self):
        if "query" in self.config.get("cohort_config", {}):
            return filename_friendly_hash(self.config["cohort_config"]["query"])
        else:
            return None

    def initialize_components(self):
        split_config = self.config["temporal_config"]
        self.chopper = Timechop(**split_config)

        cohort_config = self.config.get("cohort_config", {})
        if "query" in cohort_config:
            self.cohort_table_name = "cohort_{}_{}".format(
                cohort_config.get('name', 'default'), self.cohort_hash)
            self.cohort_table_generator = EntityDateTableGenerator(
                entity_date_table_name=self.cohort_table_name,
                db_engine=self.db_engine,
                query=cohort_config["query"],
                replace=self.replace)
        else:
            logging.warning(
                "cohort_config missing or unrecognized. Without a cohort, "
                "you will not be able to make matrices, perform feature imputation, "
                "or save time by only computing features for that cohort.")
            self.features_ignore_cohort = True
            self.cohort_table_name = "cohort_{}".format(self.experiment_hash)
            self.cohort_table_generator = EntityDateTableGeneratorNoOp()

        self.subsets = [None] + self.config.get("scoring", {}).get("subsets", [])

        if "label_config" in self.config:
            label_config = self.config["label_config"]
            self.labels_table_name = "labels_{}_{}".format(
                label_config.get('name', 'default'),
                filename_friendly_hash(label_config['query']))
            self.label_generator = LabelGenerator(
                label_name=label_config.get("name", None),
                query=label_config["query"],
                replace=self.replace,
                db_engine=self.db_engine,
            )
        else:
            self.labels_table_name = "labels_{}".format(self.experiment_hash)
            self.label_generator = LabelGeneratorNoOp()
            logging.warning(
                "label_config missing or unrecognized. Without labels, "
                "you will not be able to make matrices.")

        if "bias_audit_config" in self.config:
            bias_config = self.config["bias_audit_config"]
            self.bias_hash = filename_friendly_hash(bias_config)
            self.protected_groups_table_name = f"protected_groups_{self.bias_hash}"
            self.protected_groups_generator = ProtectedGroupsGenerator(
                db_engine=self.db_engine,
                from_obj=parse_from_obj(bias_config, 'bias_from_obj'),
                attribute_columns=bias_config.get("attribute_columns", None),
                entity_id_column=bias_config.get("entity_id_column", None),
                knowledge_date_column=bias_config.get("knowledge_date_column", None),
                protected_groups_table_name=self.protected_groups_table_name,
                replace=self.replace)
        else:
            self.protected_groups_generator = ProtectedGroupsGeneratorNoOp()
            logging.warning(
                "bias_audit_config missing or unrecognized. Without protected groups, "
                "you will not be able to audit your models for bias and fairness.")

        self.feature_dictionary_creator = FeatureDictionaryCreator(
            features_schema_name=self.features_schema_name,
            db_engine=self.db_engine)
        self.feature_generator = FeatureGenerator(
            features_schema_name=self.features_schema_name,
            replace=self.replace,
            db_engine=self.db_engine,
            feature_start_time=split_config["feature_start_time"],
            materialize_subquery_fromobjs=self.materialize_subquery_fromobjs,
            features_ignore_cohort=self.features_ignore_cohort)
        self.feature_group_creator = FeatureGroupCreator(
            self.config.get("feature_group_definition", {"all": [True]}))
        self.feature_group_mixer = FeatureGroupMixer(
            self.config.get("feature_group_strategies", ["all"]))
        self.planner = Planner(
            feature_start_time=dt_from_str(split_config["feature_start_time"]),
            label_names=[
                self.config.get("label_config", {}).get("name", DEFAULT_LABEL_NAME)
            ],
            label_types=["binary"],
            cohort_names=[
                self.config.get("cohort_config", {}).get("name", None)
            ],
            user_metadata=self.config.get("user_metadata", {}),
        )
        self.matrix_builder = MatrixBuilder(
            db_config={
                "features_schema_name": self.features_schema_name,
                "labels_schema_name": "public",
                "labels_table_name": self.labels_table_name,
                "cohort_table_name": self.cohort_table_name,
            },
            matrix_storage_engine=self.matrix_storage_engine,
            experiment_hash=self.experiment_hash,
            include_missing_labels_in_train_as=self.config.get(
                "label_config", {}).get("include_missing_labels_in_train_as", None),
            engine=self.db_engine,
            replace=self.replace,
            run_id=self.run_id,
        )
        self.subsetter = Subsetter(
            db_engine=self.db_engine,
            replace=self.replace,
            as_of_times=self.all_as_of_times)
        self.trainer = ModelTrainer(
            experiment_hash=self.experiment_hash,
            model_storage_engine=self.model_storage_engine,
            model_grouper=ModelGrouper(self.config.get("model_group_keys", [])),
            db_engine=self.db_engine,
            replace=self.replace,
            run_id=self.run_id,
        )
        self.predictor = Predictor(
            db_engine=self.db_engine,
            model_storage_engine=self.model_storage_engine,
            save_predictions=self.save_predictions,
            replace=self.replace,
            rank_order=self.config.get("prediction", {}).get("rank_tiebreaker",
                                                             "worst"),
        )
        self.individual_importance_calculator = IndividualImportanceCalculator(
            db_engine=self.db_engine,
            n_ranks=self.config.get("individual_importance", {}).get("n_ranks", 5),
            methods=self.config.get("individual_importance", {}).get(
                "methods", ["uniform"]),
            replace=self.replace,
        )
        self.evaluator = ModelEvaluator(
            db_engine=self.db_engine,
            testing_metric_groups=self.config.get("scoring", {}).get(
                "testing_metric_groups", []),
            training_metric_groups=self.config.get("scoring", {}).get(
                "training_metric_groups", []),
            bias_config=self.config.get("bias_audit_config", {}))
        self.model_train_tester = ModelTrainTester(
            matrix_storage_engine=self.matrix_storage_engine,
            model_evaluator=self.evaluator,
            model_trainer=self.trainer,
            individual_importance_calculator=self.individual_importance_calculator,
            predictor=self.predictor,
            subsets=self.subsets,
            protected_groups_generator=self.protected_groups_generator,
            cohort_hash=self.cohort_hash)

    def get_for_update(self):
        return get_for_update(self.db_engine, results_schema.Experiment,
                              self.experiment_hash)

    @cachedproperty
    def split_definitions(self):
        """Temporal splits based on the experiment's configuration

        Returns: (dict) temporal splits

        Example:
        ```
        {
            'feature_start_time': {datetime},
            'feature_end_time': {datetime},
            'label_start_time': {datetime},
            'label_end_time': {datetime},
            'train_matrix': {
                'first_as_of_time': {datetime},
                'last_as_of_time': {datetime},
                'matrix_info_end_time': {datetime},
                'training_label_timespan': {str},
                'training_as_of_date_frequency': {str},
                'max_training_history': {str},
                'as_of_times': [list of {datetime}s]
            },
            'test_matrices': [list of matrix defs similar to train_matrix]
        }
        ```

        (When updating/setting split definitions, matrices should have UUIDs.)
        """
        split_definitions = self.chopper.chop_time()
        logging.info("Computed and stored split definitions: %s", split_definitions)
        logging.info("\n----TIME SPLIT SUMMARY----\n")
        logging.info("Number of time splits: {}".format(len(split_definitions)))
        for split_index, split in enumerate(split_definitions):
            train_times = split["train_matrix"]["as_of_times"]
            test_times = [
                as_of_time for test_matrix in split["test_matrices"]
                for as_of_time in test_matrix["as_of_times"]
            ]
            logging.info("""Split index {}:
            Training as_of_time_range: {} to {} ({} total)
            Testing as_of_time range: {} to {} ({} total)\n\n""".format(
                split_index,
                min(train_times),
                max(train_times),
                len(train_times),
                min(test_times),
                max(test_times),
                len(test_times),
            ))
        with self.get_for_update() as experiment:
            experiment.time_splits = len(split_definitions)
        return split_definitions

    @cachedproperty
    def all_as_of_times(self):
        """All 'as of times' in experiment config

        Used for label and feature generation.

        Returns: (list) of datetimes
        """
        all_as_of_times = []
        for split in self.split_definitions:
            all_as_of_times.extend(split["train_matrix"]["as_of_times"])
            logging.debug(
                "Adding as_of_times from train matrix: %s",
                split["train_matrix"]["as_of_times"],
            )
            for test_matrix in split["test_matrices"]:
                logging.debug(
                    "Adding as_of_times from test matrix: %s",
                    test_matrix["as_of_times"],
                )
                all_as_of_times.extend(test_matrix["as_of_times"])

        logging.info(
            "Computed %s total as_of_times for label and feature generation",
            len(all_as_of_times),
        )
        distinct_as_of_times = list(set(all_as_of_times))
        logging.info(
            "Computed %s distinct as_of_times for label and feature generation",
            len(distinct_as_of_times),
        )
        logging.info(
            "You can view all as_of_times by inspecting `.all_as_of_times` on this Experiment"
        )
        with self.get_for_update() as experiment:
            experiment.as_of_times = len(distinct_as_of_times)
        return distinct_as_of_times

    @cachedproperty
    def collate_aggregations(self):
        """Collation of ``Aggregation`` objects used by this experiment.

        Returns: (list) of ``collate.Aggregation`` objects
        """
        logging.info("Creating collate aggregations")
        if "feature_aggregations" not in self.config:
            logging.warning("No feature_aggregation config is available")
            return []
        aggregations = self.feature_generator.aggregations(
            feature_aggregation_config=self.config["feature_aggregations"],
            feature_dates=self.all_as_of_times,
            state_table=self.cohort_table_name,
        )
        with self.get_for_update() as experiment:
            experiment.feature_blocks = len(aggregations)
        return aggregations

    @cachedproperty
    def feature_aggregation_table_tasks(self):
        """All feature table query tasks specified by this ``Experiment``.

        Returns: (dict) keys are group table names, values are themselves dicts,
            each with keys for different stages of table creation
            (prepare, inserts, finalize) and with values being lists of SQL commands
        """
        logging.info("Calculating feature tasks for %s as_of_times",
                     len(self.all_as_of_times))
        return self.feature_generator.generate_all_table_tasks(
            self.collate_aggregations, task_type="aggregation")

    @cachedproperty
    def feature_imputation_table_tasks(self):
        """All feature imputation query tasks specified by this ``Experiment``.

        Returns: (dict) keys are group table names, values are themselves dicts,
            each with keys for different stages of table creation
            (prepare, inserts, finalize) and with values being lists of SQL commands
        """
        logging.info("Calculating feature tasks for %s as_of_times",
                     len(self.all_as_of_times))
        return self.feature_generator.generate_all_table_tasks(
            self.collate_aggregations, task_type="imputation")

    @cachedproperty
    def master_feature_dictionary(self):
        """All possible features found in the database. Not all features
        will necessarily end up in matrices

        Returns: (list) of dicts, keys being feature table names and values
            being lists of feature names
        """
        result = self.feature_dictionary_creator.feature_dictionary(
            feature_table_names=self.feature_imputation_table_tasks.keys(),
            index_column_lookup=self.feature_generator.index_column_lookup(
                self.collate_aggregations),
        )
        logging.info("Computed master feature dictionary: %s", result)
        with self.get_for_update() as experiment:
            experiment.total_features = sum(
                1 for _feature in itertools.chain.from_iterable(result.values()))
        return result

    @cachedproperty
    def feature_dicts(self):
        """Feature dictionaries, representing the feature tables and columns
        configured in this experiment after computing feature groups.

        Returns: (list) of dicts, keys being feature table names and values
            being lists of feature names
        """
        combinations = self.feature_group_mixer.generate(
            self.feature_group_creator.subsets(self.master_feature_dictionary))
        with self.get_for_update() as experiment:
            experiment.feature_group_combinations = len(combinations)
        return combinations

    @cachedproperty
    def matrix_build_tasks(self):
        """Tasks for all matrices that need to be built as a part of this Experiment.

        Each task contains arguments understood by ``Architect.build_matrix``.

        Returns: (dict) of build tasks, keyed on matrix uuid
        """
        if not table_has_data(self.cohort_table_name, self.db_engine):
            logging.warning(
                "cohort table is not populated, cannot build any matrices")
            return {}
        if not table_has_data(self.labels_table_name, self.db_engine):
            logging.warning(
                "labels table is not populated, cannot build any matrices")
            return {}
        (updated_split_definitions,
         matrix_build_tasks) = self.planner.generate_plans(
             self.split_definitions, self.feature_dicts)
        self.full_matrix_definitions = updated_split_definitions
        return matrix_build_tasks

    @cachedproperty
    def full_matrix_definitions(self):
        """Full matrix definitions

        Returns: (list) temporal and feature information for each matrix
        """
        (updated_split_definitions,
         matrix_build_tasks) = self.planner.generate_plans(
             self.split_definitions, self.feature_dicts)
        self.matrix_build_tasks = matrix_build_tasks
        return updated_split_definitions

    @property
    def all_label_timespans(self):
        """All train and test label timespans

        Returns: (list) label timespans, in string form as they appeared in the
            experiment config
        """
        return list(
            set(self.config["temporal_config"]["training_label_timespans"] +
                self.config["temporal_config"]["test_label_timespans"]))

    @cachedproperty
    def subset_tasks(self):
        return self.subsetter.generate_tasks(self.subsets)

    @experiment_entrypoint
    def generate_labels(self):
        """Generate labels based on experiment configuration

        Results are stored in the database, not returned
        """
        self.label_generator.generate_all_labels(
            self.labels_table_name, self.all_as_of_times, self.all_label_timespans)

    @experiment_entrypoint
    def generate_cohort(self):
        self.cohort_table_generator.generate_entity_date_table(
            as_of_dates=self.all_as_of_times)

    @experiment_entrypoint
    def generate_protected_groups(self):
        """Generate protected groups table based on experiment configuration

        Results are stored in the database, not returned
        """
        self.protected_groups_generator.generate_all_dates(
            self.all_as_of_times, self.cohort_table_name, self.cohort_hash)

    def generate_subset(self, subset_hash):
        # look up the subset by the passed-in hash rather than a string literal
        self.subsets[subset_hash].subset_table_generator.generate_entity_date_table(
            as_of_dates=self.all_as_of_times)

    def log_split(self, split_num, split):
        logging.info(
            "Starting train/test for %s out of %s: train range: %s to %s",
            split_num + 1,
            len(self.full_matrix_definitions),
            split["train_matrix"]["first_as_of_time"],
            split["train_matrix"]["matrix_info_end_time"],
        )

    @abstractmethod
    def process_subset_tasks(self, subset_tasks):
        pass

    @abstractmethod
    def process_train_test_batches(self, train_test_batches):
        pass

    @abstractmethod
    def process_query_tasks(self, query_tasks):
        pass

    @abstractmethod
    def process_matrix_build_tasks(self, matrix_build_tasks):
        pass

    @experiment_entrypoint
    def generate_preimputation_features(self):
        self.process_query_tasks(self.feature_aggregation_table_tasks)
        logging.info(
            "Finished running preimputation feature queries. The final results are in tables: %s",
            ",".join(agg.get_table_name() for agg in self.collate_aggregations),
        )

    @experiment_entrypoint
    def impute_missing_features(self):
        self.process_query_tasks(self.feature_imputation_table_tasks)
        logging.info(
            "Finished running postimputation feature queries. The final results are in tables: %s",
            ",".join(
                agg.get_table_name(imputed=True)
                for agg in self.collate_aggregations),
        )

    def build_matrices(self):
        associate_matrices_with_experiment(
            self.experiment_hash, self.matrix_build_tasks.keys(), self.db_engine)
        with self.get_for_update() as experiment:
            experiment.matrices_needed = len(self.matrix_build_tasks.keys())
        record_matrix_building_started(self.run_id, self.db_engine)
        self.process_matrix_build_tasks(self.matrix_build_tasks)

    @experiment_entrypoint
    def generate_matrices(self):
        logging.info("Creating cohort")
        self.generate_cohort()
        logging.info("Creating labels")
        self.generate_labels()
        logging.info("Creating feature aggregation tables")
        self.generate_preimputation_features()
        logging.info("Creating feature imputation tables")
        self.impute_missing_features()
        logging.info("Building all matrices")
        self.build_matrices()

    @experiment_entrypoint
    def generate_subsets(self):
        if self.subsets:
            logging.info("Beginning subset generation")
            self.process_subset_tasks(self.subset_tasks)
        else:
            logging.info(
                "No subsets found. Proceeding to training and testing models")

    def _all_train_test_batches(self):
        if "grid_config" not in self.config:
            logging.warning(
                "No grid_config was passed in the experiment config. No models will be trained"
            )
            return
        return self.model_train_tester.generate_task_batches(
            splits=self.full_matrix_definitions,
            grid_config=self.config.get('grid_config'),
            model_comment=self.config.get('model_comment', None))

    @experiment_entrypoint
    def train_and_test_models(self):
        self.generate_subsets()
        logging.info("Creating protected groups table")
        self.generate_protected_groups()
        batches = self._all_train_test_batches()
        if not batches:
            logging.warning("No train/test tasks found, so no training to do")
            return
        with self.get_for_update() as experiment:
            experiment.grid_size = sum(
                1 for _param in self.trainer.flattened_grid_config(
                    self.config.get('grid_config')))
        logging.info("%s train/test batches found. Beginning training.",
                     len(batches))
        model_hashes = set(task['train_kwargs']['model_hash']
                           for batch in batches for task in batch.tasks)
        associate_models_with_experiment(self.experiment_hash, model_hashes,
                                         self.db_engine)
        with self.get_for_update() as experiment:
            experiment.models_needed = len(model_hashes)
        record_model_building_started(self.run_id, self.db_engine)
        self.process_train_test_batches(batches)

    def validate(self, strict=True):
        ExperimentValidator(self.db_engine, strict=strict).run(self.config)

    def _run(self):
        if not self.skip_validation:
            self.validate()
        logging.info("Generating matrices")
        try:
            self.generate_matrices()
            self.train_and_test_models()
        finally:
            if self.cleanup:
                self.clean_up_matrix_building_tables()
                self.clean_up_subset_tables()
        logging.info("Experiment complete")
        self._log_end_of_run_report()

    def _log_end_of_run_report(self):
        missing_models = missing_model_hashes(self.experiment_hash, self.db_engine)
        if len(missing_models) > 0:
            logging.info(
                "Found %s missing model hashes. "
                "This means that they were supposed to either be trained or reused "
                "by this experiment but are not present in the models table. "
                "Inspect the logs for any training errors. Full list: %s",
                len(missing_models), missing_models)
        else:
            logging.info(
                "All models that were supposed to be trained were trained. Awesome!"
            )

        missing_matrices = missing_matrix_uuids(self.experiment_hash, self.db_engine)
        if len(missing_matrices) > 0:
            logging.info(
                "Found %s missing matrix uuids. "
                "This means that they were supposed to either be built or reused "
                "by this experiment but are not present in the matrices table. "
                "Inspect the logs for any matrix building errors. Full list: %s",
                len(missing_matrices), missing_matrices)
        else:
            logging.info(
                "All matrices that were supposed to be built were built. Awesome!"
            )

    def clean_up_matrix_building_tables(self):
        logging.info("Cleaning up cohort and labels tables")
        with timeout(self.cleanup_timeout):
            self.cohort_table_generator.clean_up()
            self.label_generator.clean_up(self.labels_table_name)

    def clean_up_subset_tables(self):
        logging.info("Cleaning up subset tables")
        with timeout(self.cleanup_timeout):
            for subset_task in self.subset_tasks:
                subset_task["subset_table_generator"].clean_up()

    def _run_profile(self):
        cp = cProfile.Profile()
        cp.runcall(self._run)
        store = self.project_storage.get_store(
            ["profiling_stats"], f"{int(time.time())}.profile")
        with store.open('wb') as fd:
            cp.create_stats()
            marshal.dump(cp.stats, fd)
        logging.info(
            "Profiling stats of this Triage run calculated and written to %s "
            "in cProfile format.", store)

    @experiment_entrypoint
    def run(self):
        try:
            if self.profile:
                self._run_profile()
            else:
                self._run()
        except Exception:
            logging.exception("Run interrupted by uncaught exception")
            raise

    __call__ = run
def project_storage(project_path):
    """Set up a temporary project storage engine on the filesystem

    Yields (catwalk.storage.ProjectStorage)
    """
    yield ProjectStorage(project_path)
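# Illustrative only: the generator above is shaped like a pytest fixture. A
# sketch of consuming the yielded ProjectStorage in a test; `storage` stands in
# for the fixture value, and get_store/open are used as seen elsewhere in this
# file. The subdirectory and filename are hypothetical.
def _example_project_storage_roundtrip(storage):  # pragma: no cover
    store = storage.get_store(["matrices"], "example.txt")
    with store.open("wb") as fd:
        fd.write(b"hello")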