def _run(self, label_config):
    """Validate the ``label_config`` section of an experiment config.

    Checks that the section exists, that it contains exactly one of
    'query' or 'filepath', that any 'name' is table-safe, and delegates
    query / missing-label validation to the dedicated helpers.

    Args:
        label_config (dict): the label_config section of the experiment config

    Raises:
        ValueError: if any of the checks above fails
    """
    logger.spam("Validating label configuration")
    if not label_config:
        raise ValueError(
            dedent(
                """
        Section: label_config -
        Section not found. You must define a label config."""
            )
        )
    # Exactly one of 'query' / 'filepath' must be present (not both, not neither)
    if len(set(label_config.keys()).intersection({"query", "filepath"})) != 1:
        # Bug fix: this message referenced {label_config.keys()} inside a plain
        # string, so the placeholder appeared verbatim in the error; it must be
        # an f-string to interpolate the actual keys.
        raise ValueError(
            dedent(
                f"""
        Section: label_config -
        keys ({label_config.keys()}) do not contain exactly one of 'filepath' or 'query'.
        You must pass a filepath to a label query or include one in the config."""
            )
        )
    # If a filepath was given, read the file and replace it with a 'query' key
    label_config = load_query_if_needed(label_config)
    if "name" in label_config and not string_is_tablesafe(label_config["name"]):
        raise ValueError(
            "Section: label_config - "
            "name should only contain lowercase letters, numbers, and underscores"
        )
    self._validate_query(label_config["query"])
    self._validate_include_missing_labels_in_train_as(
        label_config.get("include_missing_labels_in_train_as", None)
    )
    logger.debug("Validation of label configuration was successful")
def test_update_in_place_if_filepath(self):
    """A 'filepath' cohort config should load into the equivalent 'query' config."""
    patched_open = unittest.mock.patch(
        "triage.util.conf.open", side_effect=open_side_effect
    )
    with patched_open as mock_file:
        expected = sample_cohort_config("query")
        result = load_query_if_needed(sample_cohort_config("filepath"))
        assert result == expected
def _run(self, cohort_config):
    """Validate the ``cohort_config`` section of an experiment config.

    Checks that exactly one of 'query'/'filepath' is present, that the
    query contains an ``{as_of_date}`` placeholder, that any 'name' is
    table-safe, and that the query runs (via SQL EXPLAIN) with a sample
    as_of_date substituted in.

    Args:
        cohort_config (dict): the cohort_config section of the experiment config

    Raises:
        ValueError: if any of the checks above fails

    Note: assumes ``self.db_engine`` is a live SQLAlchemy engine.
    """
    logger.spam("Validating of cohort configuration")
    # Exactly one of 'query' / 'filepath' must be present (not both, not neither)
    if len(set(cohort_config.keys()).intersection({"query", "filepath"})) != 1:
        # Bug fix: this message referenced {cohort_config.keys()} inside a plain
        # string, so the placeholder appeared verbatim in the error; it must be
        # an f-string to interpolate the actual keys.
        raise ValueError(
            dedent(
                f"""
        Section: cohort_config -
        keys ({cohort_config.keys()}) do not contain exactly one of 'filepath' or 'query'.
        You must pass a filepath to a cohort query or include one in the config."""
            )
        )
    # If a filepath was given, read the file and replace it with a 'query' key
    cohort_config = load_query_if_needed(cohort_config)
    query = cohort_config["query"]
    # {as_of_date} here is a literal template placeholder, NOT an f-string field
    if "{as_of_date}" not in query:
        raise ValueError(
            dedent(
                """
        Section: cohort_config -
        If 'query' is used as cohort_config,
        {as_of_date} must be present"""
            )
        )
    if "name" in cohort_config and not string_is_tablesafe(cohort_config["name"]):
        raise ValueError(
            "Section: cohort_config - "
            "name should only contain lowercase letters, numbers, and underscores"
        )
    # Substitute a sample date and dry-run the query via EXPLAIN so we fail
    # fast on malformed SQL without actually executing it
    dated_query = query.replace("{as_of_date}", "2016-01-01")
    logger.spam("Validating cohort query via SQL EXPLAIN")
    try:
        self.db_engine.execute(f"explain {dated_query}")
        logger.debug("Validation of cohort query was successful")
    except Exception as e:
        raise ValueError(
            dedent(
                f"""
        Section: cohort_config -
        given query can not be run with a sample as_of_date .
        query: "{query}"
        Full error: {e}"""
            )
        )
    logger.debug("Validation of cohort configuration was successful")
def __call__(self, args):
    """Run a test feature build for one as-of date.

    Loads the feature (and optional cohort) config from the file given in
    ``args``, materializes the cohort into features_test.test_cohort when
    configured, and generates pre-imputation features for that date.
    """
    self.root.setup()  # Loading configuration (if exists)
    engine = create_engine(self.root.db_url)
    config = yaml.full_load(args.feature_config_file)
    feature_config = config["feature_aggregations"]
    cohort = load_query_if_needed(config.get("cohort_config", None))
    if cohort:
        cohort_generator = EntityDateTableGenerator(
            entity_date_table_name="features_test.test_cohort",
            db_engine=engine,
            query=cohort["query"],
            replace=True,
        )
        cohort_generator.generate_entity_date_table(as_of_dates=[args.as_of_date])
    generator = FeatureGenerator(engine, "features_test")
    generator.create_features_before_imputation(
        feature_aggregation_config=feature_config,
        feature_dates=[args.as_of_date],
        state_table="features_test.test_cohort",
    )
    logger.success(
        f"Features created for feature_config {feature_config} and date {args.as_of_date}"
    )
def __init__(
    self,
    config,
    db_engine,
    project_path=None,
    matrix_storage_class=CSVMatrixStore,
    replace=True,
    cleanup=False,
    cleanup_timeout=None,
    materialize_subquery_fromobjs=True,
    features_ignore_cohort=False,
    additional_bigtrain_classnames=None,
    profile=False,
    save_predictions=True,
    skip_validation=False,
    partial_run=False,
):
    """Set up an experiment run.

    Normalizes the experiment ``config`` (loading filepath-based cohort/label
    queries, filling defaults on full runs), wires up storage engines under
    ``project_path``, upgrades the results schema, seeds the RNG, records the
    experiment + run in the tracking tables, and initializes components.

    Args:
        config (dict): experiment configuration
        db_engine: SQLAlchemy engine for the project database
        project_path: where matrices and trained models are saved
        matrix_storage_class: storage backend for matrices
        replace (bool): overwrite existing matrices/models/evaluations/predictions
        cleanup (bool): drop intermediate label/cohort/subset tables when done
        cleanup_timeout: seconds to wait for cleanup; None keeps the default
        materialize_subquery_fromobjs (bool): precompute feature from_objs
        features_ignore_cohort (bool): build features for all entities
        additional_bigtrain_classnames: extra classnames treated as big-train
        profile (bool): store cProfile profiling output
        save_predictions (bool): store individual predictions in the db
        skip_validation (bool): skip experiment config validation
        partial_run (bool): forces cleanup=False and skip_validation=True and
            skips filling default config values
    """
    # For a partial run, skip validation and avoid cleaning up
    # we'll also skip filling default config values below
    if partial_run:
        cleanup = False
        skip_validation = True
    # Capture the effective constructor arguments (post partial_run overrides)
    # for the tracking tables; db_engine/config/self are excluded
    experiment_kwargs = bind_kwargs(
        self.__class__,
        **{
            key: value
            for (key, value) in locals().items()
            if key not in {"db_engine", "config", "self"}
        },
    )
    self._check_config_version(config)
    self.config = config
    # Replace filepath-based cohort/label sections with inline 'query' text
    if self.config.get("cohort_config") is not None:
        self.config["cohort_config"] = load_query_if_needed(
            self.config["cohort_config"]
        )
    if self.config.get("label_config") is not None:
        self.config["label_config"] = load_query_if_needed(
            self.config["label_config"]
        )
    self.project_storage = ProjectStorage(project_path)
    self.model_storage_engine = ModelStorageEngine(self.project_storage)
    self.matrix_storage_engine = MatrixStorageEngine(
        self.project_storage, matrix_storage_class
    )
    self.project_path = project_path
    logger.verbose(
        f"Matrices and trained models will be saved in {self.project_path}"
    )
    self.replace = replace
    if self.replace:
        logger.notice(
            f"Replace flag is set to true. Matrices, models, "
            "evaluations and predictions (if they exist) will be replaced"
        )
    self.save_predictions = save_predictions
    if not self.save_predictions:
        logger.notice(
            f"Save predictions flag is set to false. "
            "Individual predictions won't be stored in the predictions "
            "table. This will decrease both the running time "
            "of an experiment and also decrease the space needed in the db"
        )
    self.skip_validation = skip_validation
    if self.skip_validation:
        logger.notice(
            f"Warning: Skip validation flag is set to true. "
            "The experiment config file specified won't be validated. "
            "This will reduce (a little) the running time of the experiment, "
            "but has some potential risks, e.g. the experiment could fail"
            "after some time due to some misconfiguration. Proceed with care."
        )
    self.db_engine = db_engine
    # Bring the results schema up to date before anything writes to it
    results_schema.upgrade_if_clean(dburl=self.db_engine.url)
    self.features_schema_name = "features"
    self.materialize_subquery_fromobjs = materialize_subquery_fromobjs
    if not self.materialize_subquery_fromobjs:
        logger.notice(
            "Materialize from_objs is set to false. "
            "The from_objs will be calculated on the fly every time."
        )
    self.features_ignore_cohort = features_ignore_cohort
    if self.features_ignore_cohort:
        logger.notice(
            "Features will be calculated for all the entities "
            "(i.e. ignoring cohort) this setting will have the effect "
            "that more db space will be used, but potentially could save "
            "time if you are running several similar experiments with "
            "different cohorts."
        )
    self.additional_bigtrain_classnames = additional_bigtrain_classnames
    # only fill default values for full runs
    if not partial_run:
        ## Defaults to sane values
        self.config["temporal_config"] = fill_timechop_config_missing(
            self.config, self.db_engine
        )
        ## Defaults to all the entities found in the features_aggregation's from_obj
        self.config["cohort_config"] = fill_cohort_config_missing(self.config)
        ## Defaults to all the feature_aggregation's prefixes
        self.config["feature_group_definition"] = fill_feature_group_definition(
            self.config
        )
    # Expand any model grid preset into an explicit grid_config, then drop
    # the preset key so it doesn't linger in the stored config
    grid_config = fill_model_grid_presets(self.config)
    self.config.pop("model_grid_preset", None)
    if grid_config is not None:
        self.config["grid_config"] = grid_config
    if not self.config.get("random_seed", None):
        logger.notice(
            "Random seed not specified. A random seed will be provided. "
            "This could have interesting side effects, "
            "e.g. new models per model group are trained, "
            "tested and evaluated everytime that you run this experiment configuration"
        )
    # Pop the seed out of config so the stored config doesn't duplicate it;
    # fall back to a fresh random seed when none was given
    self.random_seed = self.config.pop("random_seed", random.randint(1, 1e7))
    logger.verbose(
        f"Using random seed [{self.random_seed}] for running the experiment"
    )
    random.seed(self.random_seed)
    ###################### RUBICON ######################
    # Persist the (now fully-defaulted) config and register this run
    self.experiment_hash = save_experiment_and_get_hash(self.config, self.db_engine)
    logger.debug(f"Experiment hash [{self.experiment_hash}] assigned")
    self.run_id = initialize_tracking_and_get_run_id(
        self.experiment_hash,
        experiment_class_path=classpath(self.__class__),
        random_seed=self.random_seed,
        experiment_kwargs=experiment_kwargs,
        db_engine=self.db_engine,
    )
    logger.debug(f"Experiment run id [{self.run_id}] assigned")
    self.initialize_components()
    self.cleanup = cleanup
    if self.cleanup:
        logger.notice(
            "Cleanup is set to true, so intermediate tables (labels and cohort) "
            "will be removed after matrix creation and subset tables will be "
            "removed after model training and testing"
        )
    # NOTE(review): reads self.cleanup_timeout before assigning it — this only
    # works if a class-level `cleanup_timeout` default exists on this class or
    # a base class (not visible in this chunk). TODO confirm.
    self.cleanup_timeout = (
        self.cleanup_timeout if cleanup_timeout is None else cleanup_timeout
    )
    self.profile = profile
    if self.profile:
        logger.spam("Profiling will be stored using cProfile")