예제 #1
0
    def _run(self, label_config):
        """Validate the label_config section of an experiment config.

        Checks that the section exists, contains exactly one of
        'query'/'filepath', has a table-safe 'name' (if given), and that the
        query and include_missing_labels_in_train_as values pass their own
        validators.

        Raises:
            ValueError: on any of the violations above.
        """
        logger.spam("Validating label configuration")
        if not label_config:
            raise ValueError(
                dedent(
                    """
            Section: label_config -
            Section not found. You must define a label config."""
                )
            )

        if len(set(label_config.keys()).intersection({"query", "filepath"})) != 1:
            # BUG FIX: this message lacked the f prefix, so {label_config.keys()}
            # was shown literally instead of listing the offending keys.
            raise ValueError(
                dedent(
                    f"""
            Section: label_config -
            keys ({label_config.keys()}) do not contain exactly one of 'filepath'
            or 'query'. You must pass a filepath to a label query or include one
            in the config."""
                )
            )
        # Resolve a 'filepath' entry into an inline 'query' entry before the
        # query-based checks below.
        label_config = load_query_if_needed(label_config)
        if "name" in label_config and not string_is_tablesafe(label_config["name"]):
            raise ValueError(
                "Section: label_config - "
                "name should only contain lowercase letters, numbers, and underscores"
            )
        self._validate_query(label_config["query"])
        self._validate_include_missing_labels_in_train_as(
            label_config.get("include_missing_labels_in_train_as", None)
        )
        logger.debug("Validation of label configuration was successful")
예제 #2
0
 def test_update_in_place_if_filepath(self):
     """A 'filepath' cohort config loads into the equivalent 'query' config."""
     # The patch object itself is never inspected, so the unused
     # `as mock_file` binding was dropped.
     with unittest.mock.patch("triage.util.conf.open",
                              side_effect=open_side_effect):
         file_cohort_config = sample_cohort_config("filepath")
         query_cohort_config = sample_cohort_config("query")
         loaded_config = load_query_if_needed(file_cohort_config)
         assert loaded_config == query_cohort_config
예제 #3
0
 def _run(self, cohort_config):
     """Validate the cohort_config section of an experiment config.

     Checks that exactly one of 'query'/'filepath' is present, that the query
     contains the {as_of_date} placeholder, that 'name' (if given) is
     table-safe, and that the query passes a SQL EXPLAIN with a sample date.

     Raises:
         ValueError: on any of the violations above.
     """
     logger.spam("Validating of cohort configuration")
     if len(set(cohort_config.keys()).intersection({"query", "filepath"})) != 1:
         # BUG FIX: this message lacked the f prefix, so {cohort_config.keys()}
         # was shown literally instead of listing the offending keys.
         raise ValueError(
             dedent(
                 f"""
         Section: cohort_config -
         keys ({cohort_config.keys()}) do not contain exactly one of 'filepath'
         or 'query'. You must pass a filepath to a cohort query or include one
         in the config."""
             )
         )
     # Resolve a 'filepath' entry into an inline 'query' entry.
     cohort_config = load_query_if_needed(cohort_config)
     query = cohort_config["query"]
     # Deliberately NOT an f-string: {as_of_date} is the literal placeholder
     # the user must include in their query.
     if "{as_of_date}" not in query:
         raise ValueError(
             dedent(
                 """
         Section: cohort_config -
         If 'query' is used as cohort_config,
         {as_of_date} must be present"""
             )
         )
     if "name" in cohort_config and not string_is_tablesafe(cohort_config["name"]):
         raise ValueError(
             "Section: cohort_config - "
             "name should only contain lowercase letters, numbers, and underscores"
         )
     # Substitute a sample date so the query can be dry-run via EXPLAIN.
     dated_query = query.replace("{as_of_date}", "2016-01-01")
     logger.spam("Validating cohort query via SQL EXPLAIN")
     try:
         self.db_engine.execute(f"explain {dated_query}")
         logger.debug("Validation of cohort query was successful")
     except Exception as e:
         # Chain the original DB error so the root cause stays visible
         # in the traceback.
         raise ValueError(
             dedent(
                 f"""
             Section: cohort_config -
             given query can not be run with a sample as_of_date .
             query: "{query}"
             Full error: {e}"""
             )
         ) from e
     logger.debug("Validation of cohort configuration was successful")
예제 #4
0
    def __call__(self, args):
        """Build pre-imputation test features for a single as-of date.

        Reads the feature (and optional cohort) configuration from
        args.feature_config_file, optionally materializes the cohort table,
        then generates features into the features_test schema.
        """
        self.root.setup()  # Loading configuration (if exists)
        db_engine = create_engine(self.root.db_url)

        full_config = yaml.full_load(args.feature_config_file)
        feature_config = full_config["feature_aggregations"]
        # NOTE(review): a missing cohort_config key passes None into
        # load_query_if_needed — presumably it tolerates None; verify.
        cohort_config = load_query_if_needed(full_config.get("cohort_config", None))

        if cohort_config:
            table_generator = EntityDateTableGenerator(
                entity_date_table_name="features_test.test_cohort",
                db_engine=db_engine,
                query=cohort_config["query"],
                replace=True,
            )
            table_generator.generate_entity_date_table(as_of_dates=[args.as_of_date])

        feature_generator = FeatureGenerator(db_engine, "features_test")
        feature_generator.create_features_before_imputation(
            feature_aggregation_config=feature_config,
            feature_dates=[args.as_of_date],
            state_table="features_test.test_cohort",
        )
        logger.success(
            f"Features created for feature_config {feature_config} and date {args.as_of_date}"
        )
예제 #5
0
    def __init__(
        self,
        config,
        db_engine,
        project_path=None,
        matrix_storage_class=CSVMatrixStore,
        replace=True,
        cleanup=False,
        cleanup_timeout=None,
        materialize_subquery_fromobjs=True,
        features_ignore_cohort=False,
        additional_bigtrain_classnames=None,
        profile=False,
        save_predictions=True,
        skip_validation=False,
        partial_run=False,
    ):
        """Set up an experiment run.

        Resolves filepath-based queries, fills config defaults (unless
        partial_run), initializes storage engines, seeds the RNG, records the
        experiment in the results schema, and initializes components.
        """
        # For a partial run, skip validation and avoid cleaning up
        # we'll also skip filling default config values below
        if partial_run:
            cleanup = False
            skip_validation = True

        # NOTE: bind_kwargs captures locals() at this point, so no new local
        # variables may be introduced above this call and none may be renamed,
        # or the recorded experiment kwargs would change.
        experiment_kwargs = bind_kwargs(
            self.__class__,
            **{
                key: value
                for (key, value) in locals().items()
                if key not in {"db_engine", "config", "self"}
            },
        )

        self._check_config_version(config)
        self.config = config

        # Resolve 'filepath' entries into inline 'query' text up front.
        if self.config.get("cohort_config") is not None:
            self.config["cohort_config"] = load_query_if_needed(
                self.config["cohort_config"]
            )
        if self.config.get("label_config") is not None:
            self.config["label_config"] = load_query_if_needed(
                self.config["label_config"]
            )

        self.project_storage = ProjectStorage(project_path)
        self.model_storage_engine = ModelStorageEngine(self.project_storage)
        self.matrix_storage_engine = MatrixStorageEngine(
            self.project_storage, matrix_storage_class
        )
        self.project_path = project_path
        logger.verbose(
            f"Matrices and trained models will be saved in {self.project_path}"
        )
        self.replace = replace
        if self.replace:
            logger.notice(
                "Replace flag is set to true. Matrices, models, "
                "evaluations and predictions (if they exist) will be replaced"
            )

        self.save_predictions = save_predictions
        if not self.save_predictions:
            logger.notice(
                "Save predictions flag is set to false. "
                "Individual predictions won't be stored in the predictions "
                "table. This will decrease both the running time "
                "of an experiment and also decrease the space needed in the db"
            )

        self.skip_validation = skip_validation
        if self.skip_validation:
            logger.notice(
                "Warning: Skip validation flag is set to true. "
                "The experiment config file specified won't be validated. "
                "This will reduce (a little) the running time of the experiment, "
                # BUG FIX: a missing trailing space made the message read
                # "could failafter some time".
                "but has some potential risks, e.g. the experiment could fail "
                "after some time due to some misconfiguration. Proceed with care."
            )

        self.db_engine = db_engine
        results_schema.upgrade_if_clean(dburl=self.db_engine.url)

        self.features_schema_name = "features"

        self.materialize_subquery_fromobjs = materialize_subquery_fromobjs
        if not self.materialize_subquery_fromobjs:
            logger.notice(
                "Materialize from_objs is set to false. "
                "The from_objs will be calculated on the fly every time."
            )

        self.features_ignore_cohort = features_ignore_cohort
        if self.features_ignore_cohort:
            logger.notice(
                "Features will be calculated for all the entities "
                "(i.e. ignoring cohort) this setting will have the effect "
                "that more db space will be used, but potentially could save "
                "time if you are running several similar experiments with "
                "different cohorts."
            )

        self.additional_bigtrain_classnames = additional_bigtrain_classnames
        # only fill default values for full runs
        if not partial_run:
            ## Defaults to sane values
            self.config["temporal_config"] = fill_timechop_config_missing(
                self.config, self.db_engine
            )
            ## Defaults to all the entities found in the features_aggregation's from_obj
            self.config["cohort_config"] = fill_cohort_config_missing(self.config)
            ## Defaults to all the feature_aggregation's prefixes
            self.config["feature_group_definition"] = fill_feature_group_definition(
                self.config
            )

        grid_config = fill_model_grid_presets(self.config)
        self.config.pop("model_grid_preset", None)
        if grid_config is not None:
            self.config["grid_config"] = grid_config

        if not self.config.get("random_seed", None):
            logger.notice(
                "Random seed not specified. A random seed will be provided. "
                "This could have interesting side effects, "
                "e.g. new models per model group are trained, "
                "tested and evaluated everytime that you run this experiment configuration"
            )

        # BUG FIX: random.randint requires integer bounds; the float 1e7
        # raises TypeError on Python >= 3.12.
        self.random_seed = self.config.pop("random_seed", random.randint(1, 10**7))

        logger.verbose(
            f"Using random seed [{self.random_seed}] for running the experiment"
        )
        random.seed(self.random_seed)

        ###################### RUBICON ######################

        self.experiment_hash = save_experiment_and_get_hash(self.config, self.db_engine)
        logger.debug(f"Experiment hash [{self.experiment_hash}] assigned")
        self.run_id = initialize_tracking_and_get_run_id(
            self.experiment_hash,
            experiment_class_path=classpath(self.__class__),
            random_seed=self.random_seed,
            experiment_kwargs=experiment_kwargs,
            db_engine=self.db_engine,
        )
        logger.debug(f"Experiment run id [{self.run_id}] assigned")

        self.initialize_components()

        self.cleanup = cleanup
        if self.cleanup:
            logger.notice(
                "Cleanup is set to true, so intermediate tables (labels and cohort) "
                "will be removed after matrix creation and subset tables will be "
                "removed after model training and testing"
            )

        # Fall back to the class-level default when no timeout is given.
        self.cleanup_timeout = (
            self.cleanup_timeout if cleanup_timeout is None else cleanup_timeout
        )

        self.profile = profile
        if self.profile:
            logger.spam("Profiling will be stored using cProfile")