Exemplo n.º 1
0
def test_save_experiment_and_get_hash():
    """Saving the same config twice returns the same string hash."""
    # the config contents are irrelevant to this test, so a tiny dict is enough
    config = {"one": "two"}
    with testing.postgresql.Postgresql() as pg_server:
        db = create_engine(pg_server.url())
        ensure_db(db)

        first_hash = save_experiment_and_get_hash(config, db)
        assert isinstance(first_hash, str)

        # a second save of the identical config must not yield a new hash
        second_hash = save_experiment_and_get_hash(config, db)
        assert second_hash == first_hash
Exemplo n.º 2
0
def test_custom_groups(grid_config, db_engine_with_results_schema, project_storage):
    """Train with a grouper keyed on class_path only and expect one model group."""
    db = db_engine_with_results_schema
    storage_engine = project_storage.model_storage_engine()

    # register an experiment and a tracked run for the trainer to attach to
    exp_hash = save_experiment_and_get_hash(config={'foo': 'bar'}, db_engine=db)
    tracked_run = initialize_tracking_and_get_run_id(
        exp_hash,
        experiment_class_path="",
        random_seed=5,
        experiment_kwargs={},
        db_engine=db,
    )

    trainer = ModelTrainer(
        experiment_hash=exp_hash,
        model_storage_engine=storage_engine,
        model_grouper=ModelGrouper(["class_path"]),
        db_engine=db,
        run_id=tracked_run,
    )

    # train the whole grid against the sample matrix
    trained_ids = trainer.train_models(
        grid_config=grid_config,
        misc_db_parameters=dict(),
        matrix_store=get_matrix_store(project_storage),
    )

    # collect every distinct model group recorded for the trained models
    group_ids = []
    for row in db.execute("select distinct model_group_id from triage_metadata.models"):
        group_ids.append(row[0])

    # only one model group is expected with this grouper
    assert len(group_ids) == 1
    assert group_ids[0] == trained_ids[0]
Exemplo n.º 3
0
    def __init__(
        self,
        config,
        db_engine,
        model_storage_class=FSModelStorageEngine,
        project_path=None,
        replace=True,
        cleanup=False,
        cleanup_timeout=None,
    ):
        """Prepare the experiment: database, storage, hash and components.

        Args:
            config: experiment configuration; its version is checked before
                it is stored on the instance.
            db_engine: database engine. A raw SQLAlchemy ``Engine`` is swapped
                for a new engine created from its URL alone (a warning is
                logged); anything else is used as given.
            model_storage_class: called with ``project_path`` to build
                ``self.model_storage_engine``; when falsy that attribute is
                not set here.
            project_path: when truthy, a ``matrices`` directory is created
                beneath it.
            replace: stored as ``self.replace``.
            cleanup: stored as ``self.cleanup``; its effect is only logged here.
            cleanup_timeout: when not None, replaces the pre-existing
                ``self.cleanup_timeout`` (presumably a class-level default --
                TODO confirm).
        """
        self._check_config_version(config)
        self.config = config

        if isinstance(db_engine, Engine):
            logging.warning(
                'Raw, unserializable SQLAlchemy engine passed. URL will be used, '
                'other options may be lost in multi-process environments'
            )
            self.db_engine = create_engine(db_engine.url)
        else:
            self.db_engine = db_engine

        if model_storage_class:
            self.model_storage_engine = model_storage_class(
                project_path=project_path)
        self.matrix_store_class = CSVMatrixStore  # can't be configurable until Architect obeys
        self.project_path = project_path
        self.replace = replace
        ensure_db(self.db_engine)

        self.features_schema_name = 'features'
        if project_path:
            self.matrices_directory = os.path.join(self.project_path,
                                                   'matrices')
            # exist_ok avoids the check-then-create race when several
            # processes start against the same project path
            os.makedirs(self.matrices_directory, exist_ok=True)

        self.experiment_hash = save_experiment_and_get_hash(
            self.config, self.db_engine)
        # the labels table name is derived from the experiment hash
        self.labels_table_name = 'labels_{}'.format(self.experiment_hash)
        self.initialize_components()

        self.cleanup = cleanup
        if self.cleanup:
            logging.info(
                'cleanup is set to True, so intermediate tables (labels and states) '
                'will be removed after matrix creation'
            )
        else:
            logging.info(
                'cleanup is set to False, so intermediate tables (labels and states) '
                'will not be removed after matrix creation'
            )
        # keep the existing timeout unless the caller supplied one
        self.cleanup_timeout = (self.cleanup_timeout if cleanup_timeout is None
                                else cleanup_timeout)
Exemplo n.º 4
0
    def __init__(
        self,
        config,
        db_engine,
        project_path=None,
        matrix_storage_class=CSVMatrixStore,
        replace=True,
        cleanup=False,
        cleanup_timeout=None,
        materialize_subquery_fromobjs=True,
        features_ignore_cohort=False,
        profile=False,
        save_predictions=True,
    ):
        """Build storage engines, upgrade the database and register the experiment.

        Args:
            config: experiment configuration; version-checked, then stored.
            db_engine: database engine, stored and passed to ``upgrade_db``.
            project_path: passed to ``ProjectStorage`` and stored.
            matrix_storage_class: passed to ``MatrixStorageEngine``.
            replace, save_predictions, materialize_subquery_fromobjs,
                features_ignore_cohort, profile: stored on the instance under
                the same names.
            cleanup: stored as ``self.cleanup``; its effect is only logged here.
            cleanup_timeout: when not None, replaces the pre-existing
                ``self.cleanup_timeout`` (presumably a class-level default --
                TODO confirm).
        """
        self._check_config_version(config)
        self.config = config

        # storage engines all hang off a single ProjectStorage
        storage = ProjectStorage(project_path)
        self.project_storage = storage
        self.model_storage_engine = ModelStorageEngine(storage)
        self.matrix_storage_engine = MatrixStorageEngine(
            storage, matrix_storage_class)

        self.project_path = project_path
        self.replace = replace
        self.save_predictions = save_predictions

        self.db_engine = db_engine
        upgrade_db(db_engine=self.db_engine)

        self.features_schema_name = "features"
        self.materialize_subquery_fromobjs = materialize_subquery_fromobjs
        self.features_ignore_cohort = features_ignore_cohort

        # register the experiment, then build the components
        self.experiment_hash = save_experiment_and_get_hash(
            self.config, self.db_engine)
        self.initialize_components()

        self.cleanup = cleanup
        if self.cleanup:
            cleanup_note = (
                "cleanup is set to True, so intermediate tables (labels and cohort) "
                "will be removed after matrix creation and subset tables will be "
                "removed after model training and testing"
            )
        else:
            cleanup_note = (
                "cleanup is set to False, so intermediate tables (labels, cohort, and subsets) "
                "will not be removed"
            )
        logging.info(cleanup_note)

        # an explicit timeout wins; otherwise the existing value is kept
        if cleanup_timeout is not None:
            self.cleanup_timeout = cleanup_timeout
        else:
            self.cleanup_timeout = self.cleanup_timeout

        self.profile = profile
        logging.info("Generate profiling stats? (profile option): %s",
                     self.profile)
Exemplo n.º 5
0
    def __init__(
        self,
        config,
        db_engine,
        project_path=None,
        matrix_storage_class=CSVMatrixStore,
        replace=True,
        cleanup=False,
        cleanup_timeout=None,
    ):
        """Build storage engines, upgrade the database and register the experiment.

        Args:
            config: experiment configuration; version-checked, then stored.
            db_engine: database engine. A raw SQLAlchemy ``Engine`` is swapped
                for a new engine created from its URL alone (a warning is
                logged); anything else is used as given.
            project_path: passed to ``ProjectStorage`` and stored.
            matrix_storage_class: passed to ``MatrixStorageEngine``.
            replace: stored as ``self.replace``.
            cleanup: stored as ``self.cleanup``; its effect is only logged here.
            cleanup_timeout: when not None, replaces the pre-existing
                ``self.cleanup_timeout`` (presumably a class-level default --
                TODO confirm).
        """
        self._check_config_version(config)
        self.config = config

        # a raw Engine is replaced by a fresh one built from its URL
        got_raw_engine = isinstance(db_engine, Engine)
        if got_raw_engine:
            logging.warning(
                "Raw, unserializable SQLAlchemy engine passed. "
                "URL will be used, other options may be lost in multi-process environments"
            )
        self.db_engine = create_engine(db_engine.url) if got_raw_engine else db_engine

        # storage engines all hang off a single ProjectStorage
        storage = ProjectStorage(project_path)
        self.project_storage = storage
        self.model_storage_engine = ModelStorageEngine(storage)
        self.matrix_storage_engine = MatrixStorageEngine(
            storage, matrix_storage_class)
        self.project_path = project_path
        self.replace = replace
        upgrade_db(db_engine=self.db_engine)

        self.features_schema_name = "features"
        self.experiment_hash = save_experiment_and_get_hash(
            self.config, self.db_engine)
        # the labels table name is derived from the experiment hash
        self.labels_table_name = "labels_{}".format(self.experiment_hash)
        self.initialize_components()

        self.cleanup = cleanup
        if self.cleanup:
            cleanup_note = (
                "cleanup is set to True, so intermediate tables (labels and states) "
                "will be removed after matrix creation"
            )
        else:
            cleanup_note = (
                "cleanup is set to False, so intermediate tables (labels and states) "
                "will not be removed after matrix creation"
            )
        logging.info(cleanup_note)

        # an explicit timeout wins; otherwise the existing value is kept
        if cleanup_timeout is not None:
            self.cleanup_timeout = cleanup_timeout
        else:
            self.cleanup_timeout = self.cleanup_timeout
Exemplo n.º 6
0
def test_ModelTrainTester_generate_tasks(db_engine_with_results_schema,
                                         project_storage,
                                         sample_timechop_splits,
                                         sample_grid_config):
    """Check the batches and tasks produced by ModelTrainTester.generate_task_batches."""
    engine = db_engine_with_results_schema
    models_engine = ModelStorageEngine(project_storage)
    matrices_engine = MatrixStorageEngine(project_storage)
    stub_store = get_matrix_store(project_storage)

    exp_hash = save_experiment_and_get_hash({}, engine)
    tracked_run = initialize_tracking_and_get_run_id(
        exp_hash,
        experiment_class_path="",
        random_seed=5,
        experiment_kwargs={},
        db_engine=db_engine_with_results_schema,
    )

    # wire up the pipeline objects; only the trainer is a real collaborator
    trainer = ModelTrainer(
        experiment_hash=exp_hash,
        model_storage_engine=models_engine,
        db_engine=engine,
        run_id=tracked_run,
    )
    train_tester = ModelTrainTester(
        matrix_storage_engine=matrices_engine,
        model_trainer=trainer,
        model_evaluator=None,
        individual_importance_calculator=None,
        predictor=None,
        subsets=None,
        protected_groups_generator=None,
    )

    # every matrix lookup is answered with the same sample store
    with patch.object(matrices_engine, 'get_store', return_value=stub_store):
        batches = train_tester.generate_task_batches(
            splits=sample_timechop_splits, grid_config=sample_grid_config)
        assert len(batches) == 3

        # one task is expected per (split, classifier config) combination
        all_tasks = [task for batch in batches for task in batch.tasks]
        n_classifier_configs = len(list(flatten_grid_config(sample_grid_config)))
        assert len(all_tasks) == len(sample_timechop_splits) * n_classifier_configs

        # each task must also be callable as process_task(**task)
        with patch.object(train_tester, 'process_task', autospec=True):
            for task in all_tasks:
                train_tester.process_task(**task)
Exemplo n.º 7
0
def test_missing_matrix_uuids():
    """Associated matrix uuids count as missing until a matrix row exists."""
    with testing.postgresql.Postgresql() as pg_server:
        engine = create_engine(pg_server.url())
        ensure_db(engine)

        exp_hash = save_experiment_and_get_hash({}, 1234, engine)
        uuids = ['abcd', 'bcde', 'cdef']

        # associated with the experiment but never built: all are missing
        associate_matrices_with_experiment(exp_hash, uuids, engine)
        assert missing_matrix_uuids(exp_hash, engine) == uuids

        # once a matrix row is inserted for the first uuid, only the rest are missing
        first_uuid, *remaining = uuids
        insert_sql = f"insert into {Matrix.__table__.fullname} (matrix_uuid) values (%s)"
        engine.execute(insert_sql, first_uuid)
        assert missing_matrix_uuids(exp_hash, engine) == remaining
Exemplo n.º 8
0
def test_missing_model_hashes():
    """Associated model hashes count as missing until a model row exists."""
    with testing.postgresql.Postgresql() as pg_server:
        engine = create_engine(pg_server.url())
        ensure_db(engine)

        exp_hash = save_experiment_and_get_hash({}, 1234, engine)
        hashes = ['abcd', 'bcde', 'cdef']

        # associated with the experiment but never trained: all are missing
        associate_models_with_experiment(exp_hash, hashes, engine)
        assert missing_model_hashes(exp_hash, engine) == hashes

        # once a model row is inserted for the first hash, only the rest are missing
        first_hash, *remaining = hashes
        insert_sql = f"insert into {Model.__table__.fullname} (model_hash) values (%s)"
        engine.execute(insert_sql, first_hash)
        assert missing_model_hashes(exp_hash, engine) == remaining
Exemplo n.º 9
0
def default_model_trainer(db_engine_with_results_schema, project_storage):
    """Yield a ModelTrainer bound to a freshly saved experiment and tracked run."""
    db = db_engine_with_results_schema
    storage_engine = project_storage.model_storage_engine()

    # register an experiment and a tracked run for the trainer to attach to
    exp_hash = save_experiment_and_get_hash(config={'foo': 'bar'}, db_engine=db)
    tracked_run = initialize_tracking_and_get_run_id(
        exp_hash,
        experiment_class_path="",
        random_seed=5,
        experiment_kwargs={},
        db_engine=db,
    )

    yield ModelTrainer(
        experiment_hash=exp_hash,
        model_storage_engine=storage_engine,
        db_engine=db,
        model_grouper=ModelGrouper(),
        run_id=tracked_run,
    )
Exemplo n.º 10
0
    def __init__(
        self,
        config,
        db_engine,
        model_storage_class=None,
        project_path=None,
        replace=True,
        cleanup_timeout=None,
    ):
        """Prepare the experiment: database, storage, hash, factories and components.

        Args:
            config: experiment configuration; version-checked, then stored.
            db_engine: database engine, stored and passed to ``ensure_db``.
            model_storage_class: called with ``project_path`` to build
                ``self.model_storage_engine``; when falsy (the default) that
                attribute is not set here.
            project_path: when truthy, a ``matrices`` directory is created
                beneath it.
            replace: stored as ``self.replace``.
            cleanup_timeout: when not None, replaces the pre-existing
                ``self.cleanup_timeout`` (presumably a class-level default --
                TODO confirm).
        """
        self._check_config_version(config)
        self.config = config

        self.db_engine = db_engine
        if model_storage_class:
            self.model_storage_engine = model_storage_class(
                project_path=project_path)
        self.matrix_store_class = CSVMatrixStore  # can't be configurable until Architect obeys
        self.project_path = project_path
        self.replace = replace
        ensure_db(self.db_engine)

        self.labels_table_name = 'labels'
        self.features_schema_name = 'features'
        if project_path:
            self.matrices_directory = os.path.join(self.project_path,
                                                   'matrices')
            # exist_ok avoids the check-then-create race when several
            # processes start against the same project path
            os.makedirs(self.matrices_directory, exist_ok=True)

        self.experiment_hash = save_experiment_and_get_hash(
            self.config, self.db_engine)
        self.initialize_factories()
        self.initialize_components()

        # keep the existing timeout unless the caller supplied one
        self.cleanup_timeout = (self.cleanup_timeout if cleanup_timeout is None
                                else cleanup_timeout)
Exemplo n.º 11
0
    def __init__(
        self,
        config,
        db_engine,
        project_path=None,
        matrix_storage_class=CSVMatrixStore,
        replace=True,
        cleanup=False,
        cleanup_timeout=None,
        materialize_subquery_fromobjs=True,
        features_ignore_cohort=False,
        profile=False,
        save_predictions=True,
        skip_validation=False,
        partial_run=False,
    ):
        """Configure the experiment, fill config defaults and start run tracking.

        Args:
            config: experiment configuration. It is mutated in place: a
                ``random_seed`` is added when absent, defaults are filled for
                full runs, and ``model_grid_preset`` is removed.
            db_engine: database engine; its ``url`` is used to upgrade the
                results schema.
            project_path: passed to ``ProjectStorage`` and stored.
            matrix_storage_class: passed to ``MatrixStorageEngine``.
            replace, save_predictions, materialize_subquery_fromobjs,
                features_ignore_cohort, profile: stored on the instance under
                the same names.
            cleanup: stored as ``self.cleanup``; forced to False for partial runs.
            cleanup_timeout: when not None, replaces the pre-existing
                ``self.cleanup_timeout`` (presumably a class-level default --
                TODO confirm).
            skip_validation: stored as ``self.skip_validation``; forced to True
                for partial runs.
            partial_run: when True, default config values are not filled in.
        """
        # For a partial run, skip validation and avoid cleaning up
        # we'll also skip filling default config values below
        if partial_run:
            cleanup = False
            skip_validation = True

        # locals() is read here, before any other local name is bound, so the
        # recorded kwargs are the constructor arguments (minus the excluded ones)
        experiment_kwargs = bind_kwargs(
            self.__class__, **{
                key: value
                for (key, value) in locals().items()
                if key not in {'db_engine', 'config', 'self'}
            })

        self._check_config_version(config)
        self.config = config

        # Draw a seed only when none is configured. The bound is an int:
        # random.randint rejects float bounds such as 1e7 on Python 3.12+.
        if 'random_seed' not in self.config:
            self.config['random_seed'] = random.randint(1, 10 ** 7)

        random.seed(self.config['random_seed'])

        self.project_storage = ProjectStorage(project_path)
        self.model_storage_engine = ModelStorageEngine(self.project_storage)
        self.matrix_storage_engine = MatrixStorageEngine(
            self.project_storage, matrix_storage_class)
        self.project_path = project_path
        self.replace = replace
        self.save_predictions = save_predictions
        self.skip_validation = skip_validation
        self.db_engine = db_engine
        results_schema.upgrade_if_clean(dburl=self.db_engine.url)

        self.features_schema_name = "features"
        self.materialize_subquery_fromobjs = materialize_subquery_fromobjs
        self.features_ignore_cohort = features_ignore_cohort

        # only fill default values for full runs
        if not partial_run:
            ## Defaults to sane values
            self.config['temporal_config'] = fill_timechop_config_missing(
                self.config, self.db_engine)
            ## Defaults to all the entities found in the features_aggregation's from_obj
            self.config['cohort_config'] = fill_cohort_config_missing(
                self.config)
            ## Defaults to all the feature_aggregation's prefixes
            self.config[
                'feature_group_definition'] = fill_feature_group_definition(
                    self.config)

        # expand a model grid preset (if any) into an explicit grid_config
        grid_config = fill_model_grid_presets(self.config)
        self.config.pop('model_grid_preset', None)
        if grid_config is not None:
            self.config['grid_config'] = grid_config

        ###################### RUBICON ######################
        # the config is hashed and saved below; config edits belong above this line

        self.experiment_hash = save_experiment_and_get_hash(
            self.config, self.db_engine)
        self.run_id = initialize_tracking_and_get_run_id(
            self.experiment_hash,
            experiment_class_path=classpath(self.__class__),
            experiment_kwargs=experiment_kwargs,
            db_engine=self.db_engine)
        self.initialize_components()

        self.cleanup = cleanup
        if self.cleanup:
            logging.info(
                "cleanup is set to True, so intermediate tables (labels and cohort) "
                "will be removed after matrix creation and subset tables will be "
                "removed after model training and testing")
        else:
            logging.info(
                "cleanup is set to False, so intermediate tables (labels, cohort, and subsets) "
                "will not be removed")
        # keep the existing timeout unless the caller supplied one
        self.cleanup_timeout = (self.cleanup_timeout if cleanup_timeout is None
                                else cleanup_timeout)
        self.profile = profile
        logging.info("Generate profiling stats? (profile option): %s",
                     self.profile)
Exemplo n.º 12
0
def test_integration():
    """Train, predict and evaluate end to end against Postgres and mocked S3.

    Four logistic-regression models are trained from a two-row matrix, each is
    scored against two test matrices, and the predictions and evaluations
    tables are checked for the expected rows.
    """
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        init_engine(db_engine)

        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')
            project_path = 'econ-dev/inspections'

            # create train and test matrices
            train_matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': [7, 8]
            }).set_index('entity_id')

            # Creates a matrix entry with uuid "1234", the same uuid the test
            # stores below carry in their metadata
            MatrixFactory(matrix_uuid="1234")
            session.commit()

            train_store = InMemoryMatrixStore(train_matrix, sample_metadata())

            as_of_dates = [
                datetime.date(2016, 12, 21),
                datetime.date(2017, 1, 21)
            ]

            # one single-entity test matrix per as-of date
            test_stores = [
                InMemoryMatrixStore(
                    pandas.DataFrame.from_dict({
                        'entity_id': [3],
                        'feature_one': [8],
                        'feature_two': [5],
                        'label': [5]
                    }), {
                        'label_name': 'label',
                        'label_timespan': '1y',
                        'end_time': as_of_date,
                        'metta-uuid': '1234',
                        'indices': ['entity_id'],
                        'matrix_type': 'test',
                        'as_of_date_frequency': '1month'
                    }) for as_of_date in as_of_dates
            ]

            model_storage_engine = S3ModelStorageEngine(project_path)

            experiment_hash = save_experiment_and_get_hash({}, db_engine)
            # instantiate pipeline objects
            trainer = ModelTrainer(
                project_path=project_path,
                experiment_hash=experiment_hash,
                model_storage_engine=model_storage_engine,
                db_engine=db_engine,
            )
            predictor = Predictor(project_path, model_storage_engine,
                                  db_engine)
            model_evaluator = ModelEvaluator([{
                'metrics': ['precision@'],
                'thresholds': {
                    'top_n': [5]
                }
            }], [{}], db_engine)

            # run the pipeline
            grid_config = {
                'sklearn.linear_model.LogisticRegression': {
                    'C': [0.00001, 0.0001],
                    'penalty': ['l1', 'l2'],
                    'random_state': [2193]
                }
            }
            model_ids = trainer.train_models(grid_config=grid_config,
                                             misc_db_parameters=dict(),
                                             matrix_store=train_store)

            for model_id in model_ids:
                for as_of_date, test_store in zip(as_of_dates, test_stores):
                    predictions_proba = predictor.predict(
                        model_id,
                        test_store,
                        misc_db_parameters=dict(),
                        train_matrix_columns=['feature_one', 'feature_two'])

                    model_evaluator.evaluate(
                        predictions_proba,
                        test_store,
                        model_id,
                    )

            # assert
            # 1. that the predictions table entries are present and
            # can be linked to the original models
            records = list(db_engine.execute(
                '''select entity_id, model_id, as_of_date
                from test_results.test_predictions
                join model_metadata.models using (model_id)
                order by 3, 2'''))
            assert records == [
                (3, 1, datetime.datetime(2016, 12, 21)),
                (3, 2, datetime.datetime(2016, 12, 21)),
                (3, 3, datetime.datetime(2016, 12, 21)),
                (3, 4, datetime.datetime(2016, 12, 21)),
                (3, 1, datetime.datetime(2017, 1, 21)),
                (3, 2, datetime.datetime(2017, 1, 21)),
                (3, 3, datetime.datetime(2017, 1, 21)),
                (3, 4, datetime.datetime(2017, 1, 21)),
            ]

            # 2. that evaluations are there
            records = list(db_engine.execute('''
                    select model_id, evaluation_start_time, metric, parameter
                    from test_results.test_evaluations order by 2, 1'''))
            assert records == [
                (1, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (2, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (3, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (4, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (1, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
                (2, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
                (3, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
                (4, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
            ]
Exemplo n.º 13
0
def test_integration():
    """Train, predict and evaluate end to end, then check the result tables."""
    with rig_engines() as (db_engine, project_storage):
        train_store = get_matrix_store(
            project_storage, matrix_creator(),
            matrix_metadata_creator(matrix_type='train'))
        as_of_dates = [datetime.date(2016, 12, 21), datetime.date(2017, 1, 21)]

        # one single-entity test matrix per as-of date
        test_stores = [
            get_matrix_store(
                project_storage,
                pandas.DataFrame.from_dict({
                    'entity_id': [3],
                    'feature_one': [8],
                    'feature_two': [5],
                    'label': [0]
                }).set_index('entity_id'),
                matrix_metadata_creator(end_time=end_date,
                                        indices=['entity_id']))
            for end_date in as_of_dates
        ]

        storage_engine = ModelStorageEngine(project_storage)
        exp_hash = save_experiment_and_get_hash({}, db_engine)

        # pipeline objects
        trainer = ModelTrainer(
            experiment_hash=exp_hash,
            model_storage_engine=storage_engine,
            db_engine=db_engine,
        )
        predictor = Predictor(storage_engine, db_engine)
        evaluator = ModelEvaluator(
            [{'metrics': ['precision@'], 'thresholds': {'top_n': [5]}}],
            [{}],
            db_engine,
        )

        # train a 2x2 logistic regression grid
        logit_grid = {
            'C': [0.00001, 0.0001],
            'penalty': ['l1', 'l2'],
            'random_state': [2193],
        }
        model_ids = trainer.train_models(
            grid_config={'sklearn.linear_model.LogisticRegression': logit_grid},
            misc_db_parameters=dict(),
            matrix_store=train_store,
        )

        # predict and evaluate every model on every test matrix
        for model_id in model_ids:
            for as_of_date, test_store in zip(as_of_dates, test_stores):
                scores = predictor.predict(
                    model_id,
                    test_store,
                    misc_db_parameters=dict(),
                    train_matrix_columns=['feature_one', 'feature_two'])
                evaluator.evaluate(scores, test_store, model_id)

        expected_times = [
            datetime.datetime(2016, 12, 21),
            datetime.datetime(2017, 1, 21),
        ]
        expected_model_ids = (1, 2, 3, 4)

        # 1. predictions exist and join back to the models table
        prediction_rows = list(db_engine.execute(
            '''select entity_id, model_id, as_of_date
            from test_results.predictions
            join model_metadata.models using (model_id)
            order by 3, 2'''))
        assert prediction_rows == [
            (3, model_id, when)
            for when in expected_times
            for model_id in expected_model_ids
        ]

        # 2. evaluations exist for each model and date
        evaluation_rows = list(db_engine.execute('''
                select model_id, evaluation_start_time, metric, parameter
                from test_results.evaluations order by 2, 1'''))
        assert evaluation_rows == [
            (model_id, when, 'precision@', '5_abs')
            for when in expected_times
            for model_id in expected_model_ids
        ]
Exemplo n.º 14
0
    def __init__(
        self,
        config,
        db_engine,
        project_path=None,
        matrix_storage_class=CSVMatrixStore,
        replace=True,
        cleanup=False,
        cleanup_timeout=None,
        materialize_subquery_fromobjs=True,
        features_ignore_cohort=False,
        additional_bigtrain_classnames=None,
        profile=False,
        save_predictions=True,
        skip_validation=False,
        partial_run=False,
    ):
        """Configure the experiment, fill config defaults and start run tracking.

        Args:
            config: experiment configuration. It is mutated in place: cohort
                and label configs go through ``load_query_if_needed``,
                defaults are filled for full runs, and ``model_grid_preset``
                and ``random_seed`` are removed.
            db_engine: database engine; its ``url`` is used to upgrade the
                results schema.
            project_path: passed to ``ProjectStorage`` and stored.
            matrix_storage_class: passed to ``MatrixStorageEngine``.
            replace, save_predictions, materialize_subquery_fromobjs,
                features_ignore_cohort, additional_bigtrain_classnames,
                profile: stored on the instance under the same names.
            cleanup: stored as ``self.cleanup``; forced to False for partial runs.
            cleanup_timeout: when not None, replaces the pre-existing
                ``self.cleanup_timeout`` (presumably a class-level default --
                TODO confirm).
            skip_validation: stored as ``self.skip_validation``; forced to True
                for partial runs.
            partial_run: when True, default config values are not filled in.
        """
        # For a partial run, skip validation and avoid cleaning up
        # we'll also skip filling default config values below
        if partial_run:
            cleanup = False
            skip_validation = True

        # locals() is read here, before any other local name is bound, so the
        # recorded kwargs are the constructor arguments (minus the excluded ones)
        experiment_kwargs = bind_kwargs(
            self.__class__,
            **{
                key: value
                for (key, value) in locals().items()
                if key not in {"db_engine", "config", "self"}
            },
        )

        self._check_config_version(config)
        self.config = config

        if self.config.get("cohort_config") is not None:
            self.config["cohort_config"] = load_query_if_needed(
                self.config["cohort_config"]
            )
        if self.config.get("label_config") is not None:
            self.config["label_config"] = load_query_if_needed(
                self.config["label_config"]
            )

        self.project_storage = ProjectStorage(project_path)
        self.model_storage_engine = ModelStorageEngine(self.project_storage)
        self.matrix_storage_engine = MatrixStorageEngine(
            self.project_storage, matrix_storage_class
        )
        self.project_path = project_path
        logger.verbose(
            f"Matrices and trained models will be saved in {self.project_path}"
        )
        self.replace = replace
        if self.replace:
            logger.notice(
                "Replace flag is set to true. Matrices, models, "
                "evaluations and predictions (if they exist) will be replaced"
            )

        self.save_predictions = save_predictions
        if not self.save_predictions:
            logger.notice(
                "Save predictions flag is set to false. "
                "Individual predictions won't be stored in the predictions "
                "table. This will decrease both the running time "
                "of an experiment and also decrease the space needed in the db"
            )

        self.skip_validation = skip_validation
        if self.skip_validation:
            logger.notice(
                "Warning: Skip validation flag is set to true. "
                "The experiment config file specified won't be validated. "
                "This will reduce (a little) the running time of the experiment, "
                "but has some potential risks, e.g. the experiment could fail "
                "after some time due to some misconfiguration. Proceed with care."
            )

        self.db_engine = db_engine
        results_schema.upgrade_if_clean(dburl=self.db_engine.url)

        self.features_schema_name = "features"

        self.materialize_subquery_fromobjs = materialize_subquery_fromobjs
        if not self.materialize_subquery_fromobjs:
            logger.notice(
                "Materialize from_objs is set to false. "
                "The from_objs will be calculated on the fly every time."
            )

        self.features_ignore_cohort = features_ignore_cohort
        if self.features_ignore_cohort:
            logger.notice(
                "Features will be calculated for all the entities "
                "(i.e. ignoring cohort) this setting will have the effect "
                "that more db space will be used, but potentially could save "
                "time if you are running several similar experiments with "
                "different cohorts."
            )

        self.additional_bigtrain_classnames = additional_bigtrain_classnames
        # only fill default values for full runs
        if not partial_run:
            ## Defaults to sane values
            self.config["temporal_config"] = fill_timechop_config_missing(
                self.config, self.db_engine
            )
            ## Defaults to all the entities found in the features_aggregation's from_obj
            self.config["cohort_config"] = fill_cohort_config_missing(self.config)
            ## Defaults to all the feature_aggregation's prefixes
            self.config["feature_group_definition"] = fill_feature_group_definition(
                self.config
            )

        # expand a model grid preset (if any) into an explicit grid_config
        grid_config = fill_model_grid_presets(self.config)
        self.config.pop("model_grid_preset", None)
        if grid_config is not None:
            self.config["grid_config"] = grid_config

        # The seed is removed from the config before hashing. A missing or
        # explicit-None seed is replaced by a generated one, so the notice
        # below is always true and the tracked run records a usable seed.
        seed = self.config.pop("random_seed", None)
        if seed is None:
            logger.notice(
                "Random seed not specified. A random seed will be provided. "
                "This could have interesting side effects, "
                "e.g. new models per model group are trained, "
                "tested and evaluated everytime that you run this experiment configuration"
            )
            # int bound: random.randint rejects float bounds such as 1e7 on
            # Python 3.12+
            seed = random.randint(1, 10 ** 7)
        self.random_seed = seed

        logger.verbose(
            f"Using random seed [{self.random_seed}] for running the experiment"
        )
        random.seed(self.random_seed)

        ###################### RUBICON ######################
        # the config is hashed and saved below; config edits belong above this line

        self.experiment_hash = save_experiment_and_get_hash(self.config, self.db_engine)
        logger.debug(f"Experiment hash [{self.experiment_hash}] assigned")
        self.run_id = initialize_tracking_and_get_run_id(
            self.experiment_hash,
            experiment_class_path=classpath(self.__class__),
            random_seed=self.random_seed,
            experiment_kwargs=experiment_kwargs,
            db_engine=self.db_engine,
        )
        logger.debug(f"Experiment run id [{self.run_id}] assigned")

        self.initialize_components()

        self.cleanup = cleanup
        if self.cleanup:
            logger.notice(
                "Cleanup is set to true, so intermediate tables (labels and cohort) "
                "will be removed after matrix creation and subset tables will be "
                "removed after model training and testing"
            )

        # keep the existing timeout unless the caller supplied one
        self.cleanup_timeout = (
            self.cleanup_timeout if cleanup_timeout is None else cleanup_timeout
        )

        self.profile = profile
        if self.profile:
            logger.spam("Profiling will be stored using cProfile")
Exemplo n.º 15
0
def test_reuse_model_random_seeds(grid_config, default_model_trainer):
    """Check that model random seeds are reused only under the same experiment seed.

    Three training runs are performed:

    1. the fixture grid with the default trainer, seeded with 5;
    2. an extended grid under a new experiment hash but the same seed (5):
       every model id from run 1 must appear again among the results;
    3. the extended grid under yet another experiment hash with a different
       seed (42): none of the model ids from run 2 may appear.
    """
    # Local import: only this test needs it, and it keeps the block
    # self-contained regardless of the module's import header.
    import copy

    trainer = default_model_trainer
    db_engine = trainer.db_engine
    project_storage = trainer.model_storage_engine.project_storage
    model_storage_engine = trainer.model_storage_engine

    # re-using the random seeds requires the association between experiments and models
    # to exist, which we're not getting in these tests since we aren't using the experiment
    # architecture, so back-fill these associations after each train_models() run
    def update_experiment_models(db_engine):
        sql = """
            INSERT INTO triage_metadata.experiment_models(experiment_hash,model_hash) 
            SELECT er.run_hash, m.model_hash
            FROM triage_metadata.models m
            LEFT JOIN triage_metadata.triage_runs er
                ON m.built_in_triage_run = er.id
            LEFT JOIN triage_metadata.experiment_models em 
                ON m.model_hash = em.model_hash
                AND er.run_hash = em.experiment_hash
            WHERE em.experiment_hash IS NULL
            """
        db_engine.execute(sql)
        db_engine.execute('COMMIT;')

    def trainer_for_new_experiment(config, experiment_seed):
        # Register a fresh experiment and run, then build a trainer bound to them.
        experiment_hash = save_experiment_and_get_hash(
            config=config,
            db_engine=db_engine,
        )
        run_id = initialize_tracking_and_get_run_id(
            experiment_hash,
            experiment_class_path="",
            random_seed=experiment_seed,
            experiment_kwargs={},
            db_engine=db_engine,
        )
        return ModelTrainer(
            experiment_hash=experiment_hash,
            model_storage_engine=model_storage_engine,
            db_engine=db_engine,
            model_grouper=ModelGrouper(),
            run_id=run_id,
        )

    def train_and_link(active_trainer, grid, seed):
        # Seed immediately before training, then back-fill experiment_models.
        random.seed(seed)
        trained_ids = active_trainer.train_models(
            grid_config=grid,
            misc_db_parameters=dict(),
            matrix_store=get_matrix_store(project_storage),
        )
        update_experiment_models(db_engine)
        return trained_ids

    model_ids = train_and_link(trainer, grid_config, 5)

    # simulate running a new experiment where the experiment hash has changed
    # (e.g. because the model grid is different), but experiment seed is the
    # same, so previously-trained models should not get new seeds
    trainer = trainer_for_new_experiment({'baz': 'qux'}, 5)
    # Deep copy: dict.copy() is shallow, so editing the nested classifier
    # parameters through it would also rewrite the grid_config fixture.
    new_grid = copy.deepcopy(grid_config)
    new_grid['sklearn.tree.DecisionTreeClassifier']['min_samples_split'] = [3, 10, 100]
    new_model_ids = train_and_link(trainer, new_grid, 5)

    # should have received 6 models
    assert len(new_model_ids) == 6

    # all the original model ids should be in the new set
    assert len(set(new_model_ids) & set(model_ids)) == len(model_ids)

    # however, we should NOT re-use the random seeds (and so get new model_ids)
    # if the experiment-level seed is different
    trainer = trainer_for_new_experiment({'lorem': 'ipsum'}, 42)
    newer_model_ids = train_and_link(trainer, new_grid, 42)  # seed differs from above

    # should get entirely new models now (different IDs)
    assert len(newer_model_ids) == 6
    assert len(set(new_model_ids) & set(newer_model_ids)) == 0
Exemplo n.º 16
0
def test_model_trainer(grid_config, default_model_trainer):
    """Walk ModelTrainer through training, retraining, replacement and cache loss.

    Each numbered comment below names the property being verified against the
    database tables and the model storage engine.
    """
    trainer = default_model_trainer
    db_engine = trainer.db_engine
    model_storage_engine = trainer.model_storage_engine
    project_storage = model_storage_engine.project_storage

    def reset_seed():
        random.seed(5)

    def fetch_rows(query):
        # materialise every row returned by a raw SQL statement
        return list(db_engine.execute(query))

    def training_kwargs():
        # rebuilt for every run so each call gets its own matrix store
        return {
            "grid_config": grid_config,
            "misc_db_parameters": {},
            "matrix_store": get_matrix_store(project_storage),
        }

    reset_seed()
    model_ids = trainer.train_models(**training_kwargs())

    # 1. models and feature importances were written
    importance_rows = fetch_rows("select * from train_results.feature_importances")
    assert len(importance_rows) == 4 * 2  # maybe exclude entity_id? yes

    hash_rows = fetch_rows("select model_hash from triage_metadata.models")
    assert len(hash_rows) == 4
    hashes = [hash_row[0] for hash_row in hash_rows]

    # 2. every model landed in its own model group
    group_rows = fetch_rows(
        "select distinct model_group_id from triage_metadata.models"
    )
    assert len(group_rows) == 4

    # 3. every model received its own random seed
    seed_rows = fetch_rows("select distinct random_seed from triage_metadata.models")
    assert len(seed_rows) == 4

    # 4. model sizes are recorded and each is below 1 (kB, per the original test note)
    size_rows = fetch_rows("select model_size from triage_metadata.models")
    assert len(size_rows) == 4
    for size_row in size_rows:
        assert size_row[0] < 1

    # 5. all four models can be loaded back from storage
    model_pickles = [model_storage_engine.load(stored_hash) for stored_hash in hashes]
    assert len(model_pickles) == 4
    assert sum(1 for loaded in model_pickles if loaded is not None) == 4

    # 6. the loaded models can produce predictions
    test_frame = pd.DataFrame.from_dict(
        {"entity_id": [3, 4], "feature_one": [4, 4], "feature_two": [6, 5]}
    )
    test_matrix = test_frame.set_index("entity_id")

    for model_pickle in model_pickles:
        assert len(model_pickle.predict(test_matrix)) == 2

    # 7. re-running with the same starting seed returns the same models
    reset_seed()
    new_model_ids = trainer.train_models(**training_kwargs())
    assert len(fetch_rows("select model_hash from triage_metadata.models")) == 4
    assert model_ids == new_model_ids

    # 8. if replace is set, update non-unique attributes and feature importances
    max_batch_run_time = fetch_rows(
        "select max(batch_run_time) from triage_metadata.models"
    )[0][0]
    experiment_hash = save_experiment_and_get_hash(
        config={'foo': 'bar'},
        db_engine=db_engine,
    )
    run_id = initialize_tracking_and_get_run_id(
        experiment_hash,
        experiment_class_path="",
        random_seed=5,
        experiment_kwargs={},
        db_engine=db_engine,
    )
    replacing_grouper = ModelGrouper(model_group_keys=["label_name", "label_timespan"])
    trainer = ModelTrainer(
        experiment_hash=experiment_hash,
        model_storage_engine=model_storage_engine,
        model_grouper=replacing_grouper,
        db_engine=db_engine,
        replace=True,
        run_id=run_id,
    )
    reset_seed()
    new_model_ids = trainer.train_models(**training_kwargs())
    assert model_ids == new_model_ids

    ordered_ids = [
        id_row["model_id"]
        for id_row in db_engine.execute(
            "select model_id from triage_metadata.models order by 1 asc"
        )
    ]
    assert ordered_ids == model_ids

    new_max_batch_run_time = fetch_rows(
        "select max(batch_run_time) from triage_metadata.models"
    )[0][0]
    assert new_max_batch_run_time > max_batch_run_time

    importance_rows = fetch_rows("select * from train_results.feature_importances")
    assert len(importance_rows) == 4 * 2  # maybe exclude entity_id? yes

    # 9. if the cache is missing but the metadata is still there, reuse the metadata
    reset_seed()
    for hash_row in db_engine.execute("select model_hash from triage_metadata.models"):
        model_storage_engine.delete(hash_row[0])
    new_model_ids = trainer.train_models(**training_kwargs())
    assert model_ids == sorted(new_model_ids)

    # 10. the generator interface yields the same model ids
    reset_seed()
    generated_ids = trainer.generate_trained_models(**training_kwargs())
    assert model_ids == sorted(generated_ids)
Exemplo n.º 17
0
def test_integration():
    """Train, predict and evaluate with the real pipeline objects, then check the stored rows.

    A grid of four logistic regressions is trained on one matrix, each model is
    scored against two single-row test matrices, and the predictions and
    evaluations tables are compared with the expected model/date combinations.
    """
    with rig_engines() as (db_engine, project_storage):
        train_store = get_matrix_store(
            project_storage,
            matrix_creator(),
            matrix_metadata_creator(matrix_type="train"),
        )
        as_of_dates = [datetime.date(2016, 12, 21), datetime.date(2017, 1, 21)]

        def single_row_test_store(end_time):
            # one-entity test matrix whose metadata ends at the given date
            frame = pandas.DataFrame.from_dict({
                "entity_id": [3],
                "feature_one": [8],
                "feature_two": [5],
                "label": [0],
            }).set_index("entity_id")
            metadata = matrix_metadata_creator(end_time=end_time,
                                               indices=["entity_id"])
            return get_matrix_store(project_storage, frame, metadata)

        test_stores = [single_row_test_store(as_of_date) for as_of_date in as_of_dates]

        model_storage_engine = ModelStorageEngine(project_storage)

        experiment_hash = save_experiment_and_get_hash({}, db_engine)
        # build the pipeline objects
        trainer = ModelTrainer(
            experiment_hash=experiment_hash,
            model_storage_engine=model_storage_engine,
            db_engine=db_engine,
        )
        predictor = Predictor(model_storage_engine, db_engine)
        metric_groups = [{"metrics": ["precision@"], "thresholds": {"top_n": [5]}}]
        model_evaluator = ModelEvaluator(metric_groups, [{}], db_engine)

        # train the grid
        logistic_grid = {
            "C": [0.00001, 0.0001],
            "penalty": ["l1", "l2"],
            "random_state": [2193],
        }
        grid_config = {"sklearn.linear_model.LogisticRegression": logistic_grid}
        model_ids = trainer.train_models(
            grid_config=grid_config,
            misc_db_parameters={},
            matrix_store=train_store,
        )

        # predict and evaluate every model against every test matrix
        for model_id in model_ids:
            for test_store in test_stores:
                predictions_proba = predictor.predict(
                    model_id,
                    test_store,
                    misc_db_parameters={},
                    train_matrix_columns=["feature_one", "feature_two"],
                )
                model_evaluator.evaluate(predictions_proba, test_store, model_id)

        expected_model_ids = (1, 2, 3, 4)
        expected_times = [
            datetime.datetime(2016, 12, 21),
            datetime.datetime(2017, 1, 21),
        ]

        # predictions exist and join back to the models table
        prediction_rows = list(db_engine.execute(
            """select entity_id, model_id, as_of_date
            from test_results.predictions
            join model_metadata.models using (model_id)
            order by 3, 2"""))
        assert prediction_rows == [
            (3, expected_id, expected_time)
            for expected_time in expected_times
            for expected_id in expected_model_ids
        ]

        # evaluations exist for each model and date
        evaluation_rows = list(db_engine.execute("""
                select model_id, evaluation_start_time, metric, parameter
                from test_results.evaluations order by 2, 1"""))
        assert evaluation_rows == [
            (expected_id, expected_time, "precision@", "5_abs")
            for expected_time in expected_times
            for expected_id in expected_model_ids
        ]
Exemplo n.º 18
0
    def __init__(
        self,
        config,
        db_engine,
        project_path=None,
        matrix_storage_class=CSVMatrixStore,
        replace=True,
        cleanup=False,
        cleanup_timeout=None,
        materialize_subquery_fromobjs=True,
        features_ignore_cohort=False,
        profile=False,
        save_predictions=True,
        skip_validation=False,
    ):
        """Set up storage, database tracking and components for an experiment.

        Args:
            config: Experiment configuration. It is passed to
                ``_check_config_version``, stored as ``self.config``, and its
                ``'random_seed'`` entry seeds the ``random`` module.
            db_engine: Database engine. Its ``url`` is handed to
                ``results_schema.upgrade_if_clean`` and the engine is used to
                save the experiment and initialize run tracking.
            project_path: Passed to ``ProjectStorage``; also stored on the
                instance.
            matrix_storage_class: Passed to ``MatrixStorageEngine`` together
                with the project storage.
            replace: Stored as ``self.replace``.
            cleanup: Stored as ``self.cleanup``. The messages logged here state
                that, when true, intermediate tables (labels and cohort) are
                removed after matrix creation and subset tables after model
                training and testing.
            cleanup_timeout: When not ``None``, replaces the value already
                available as ``self.cleanup_timeout``.
            materialize_subquery_fromobjs: Stored on the instance.
            features_ignore_cohort: Stored on the instance.
            profile: Stored as ``self.profile`` and logged.
            save_predictions: Stored on the instance.
            skip_validation: Stored on the instance.
        """
        # Must stay the first statement: at this point locals() holds only the
        # constructor parameters, so everything except self, config and
        # db_engine is forwarded to bind_kwargs as the experiment's kwargs.
        experiment_kwargs = bind_kwargs(
            self.__class__, **{
                key: value
                for (key, value) in locals().items()
                if key not in {'db_engine', 'config', 'self'}
            })

        self._check_config_version(config)
        self.config = config
        # NOTE(review): a config without a 'random_seed' entry fails on this
        # lookup — confirm that callers always supply it.
        random.seed(config['random_seed'])

        # Storage engines all hang off a single ProjectStorage rooted at project_path.
        self.project_storage = ProjectStorage(project_path)
        self.model_storage_engine = ModelStorageEngine(self.project_storage)
        self.matrix_storage_engine = MatrixStorageEngine(
            self.project_storage, matrix_storage_class)
        self.project_path = project_path
        self.replace = replace
        self.save_predictions = save_predictions
        self.skip_validation = skip_validation
        self.db_engine = db_engine
        results_schema.upgrade_if_clean(dburl=self.db_engine.url)

        self.features_schema_name = "features"
        self.materialize_subquery_fromobjs = materialize_subquery_fromobjs
        self.features_ignore_cohort = features_ignore_cohort
        # Record the experiment and this run in the database before the
        # components are built.
        self.experiment_hash = save_experiment_and_get_hash(
            self.config, self.db_engine)
        self.run_id = initialize_tracking_and_get_run_id(
            self.experiment_hash,
            experiment_class_path=classpath(self.__class__),
            experiment_kwargs=experiment_kwargs,
            db_engine=self.db_engine)
        self.initialize_components()

        self.cleanup = cleanup
        if self.cleanup:
            logging.info(
                "cleanup is set to True, so intermediate tables (labels and cohort) "
                "will be removed after matrix creation and subset tables will be "
                "removed after model training and testing")
        else:
            logging.info(
                "cleanup is set to False, so intermediate tables (labels, cohort, and subsets) "
                "will not be removed")
        # Keep the value already reachable as self.cleanup_timeout (presumably a
        # class-level default — confirm) unless the caller supplied one.
        self.cleanup_timeout = (self.cleanup_timeout if cleanup_timeout is None
                                else cleanup_timeout)
        self.profile = profile
        logging.info("Generate profiling stats? (profile option): %s",
                     self.profile)