Example #1
def test_predictor_save_predictions(matrix_type, predict_setup_args):
    """Test the save_predictions flag being set to False

    We still want to return predict_proba, but not save data to the DB
    """
    (project_storage, db_engine, model_id) = predict_setup_args
    # if save_predictions is sent as False, don't save
    predictor = Predictor(project_storage.model_storage_engine(),
                          db_engine,
                          rank_order='worst',
                          save_predictions=False)

    matrix_store = get_matrix_store(project_storage)
    train_matrix_columns = matrix_store.columns()

    predict_proba = predictor.predict(
        model_id,
        matrix_store,
        misc_db_parameters=dict(),
        train_matrix_columns=train_matrix_columns,
    )

    # assert
    # 1. that the returned predictions are of the desired length
    assert len(predict_proba) == 2

    # 2. that the predictions table entries are present and
    # can be linked to the original models
    assert not table_has_data(f"{matrix_type}_predictions", db_engine)
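The final assertion leans on a table_has_data test helper. Its real implementation lives in the project's test utilities; a minimal sketch of what such a helper could look like, assuming a SQLAlchemy engine (illustrative only):

def table_has_data(table_name, db_engine):
    """Hypothetical helper: True if the given table contains at least one row."""
    results = [row for row in db_engine.execute(
        "select 1 from {} limit 1".format(table_name))]
    return len(results) > 0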
Example #2
def test_uniform_distribution_entity_id_index():
    with rig_engines() as (db_engine, project_storage):
        model = ModelFactory()
        feature_importances = [
            FeatureImportanceFactory(model_rel=model,
                                     feature='feature_{}'.format(i))
            for i in range(0, 10)
        ]
        data_dict = {'entity_id': [1, 2]}
        for imp in feature_importances:
            data_dict[imp.feature] = [0.5, 0.5]
        metadata = matrix_metadata_creator(indices='entity_id')
        test_store = get_matrix_store(
            project_storage,
            pandas.DataFrame.from_dict(data_dict).set_index(
                metadata['indices']), metadata)
        results = uniform_distribution(db_engine,
                                       model_id=model.model_id,
                                       as_of_date='2016-01-01',
                                       test_matrix_store=test_store,
                                       n_ranks=5)

        assert len(results) == 10  # 5 features x 2 entities
        for result in results:
            assert 'entity_id' in result
            assert 'feature_name' in result
            assert 'score' in result
            assert 'feature_value' in result
            assert result['feature_value'] == 0.5
            assert result['score'] >= 0
            assert result['score'] <= 1
            assert isinstance(result['feature_name'], str)
            assert result['entity_id'] in [1, 2]
Example #3
    def test_retry_recovery(self):
        db_engine = None
        trainer = None
        port = None
        with rig_engines() as (db_engine, project_storage):
            port = db_engine.url.port
            trainer = ModelTrainer(
                experiment_hash=None,
                model_storage_engine=project_storage.model_storage_engine(),
                db_engine=db_engine,
                model_grouper=ModelGrouper(),
            )
            matrix_store = get_matrix_store(project_storage)

        # start without a database server
        # then bring it back up after the first sleep
        # use self so it doesn't go out of scope too early and shut down
        self.new_server = None

        def replace_db(arg):
            self.new_server = testing.postgresql.Postgresql(port=port)
            db_engine = create_engine(self.new_server.url())
            ensure_db(db_engine)
            init_engine(db_engine)
            get_matrix_store(project_storage)

        with patch("time.sleep") as time_mock:
            time_mock.side_effect = replace_db
            try:
                trainer.train_models(grid_config(), dict(), matrix_store)
            finally:
                if self.new_server is not None:
                    self.new_server.stop()
            assert len(time_mock.mock_calls) == 1
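The patched time.sleep is the hook: the retry loop under test presumably comes from something like the retrying package, which sleeps between attempts, so the side effect restores postgres on the same port right after the first failure. A rough sketch of that retry pattern, with all names illustrative:

from retrying import retry
from sqlalchemy.exc import OperationalError

def retry_if_db_error(exception):
    return isinstance(exception, OperationalError)

@retry(retry_on_exception=retry_if_db_error,
       wait_fixed=1000, stop_max_attempt_number=10)
def write_with_retries(db_engine):
    # retrying calls time.sleep between attempts; patching it lets the test
    # swap a live database back in before the second attempt runs
    db_engine.execute("select 1")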
Example #4
def test_n_jobs_not_new_model(default_model_trainer):
    grid_config = {
        "sklearn.ensemble.AdaBoostClassifier": {
            "n_estimators": [10, 100, 1000]
        },
        "sklearn.ensemble.RandomForestClassifier": {
            "n_estimators": [10, 100],
            "max_features": ["sqrt", "log2"],
            "max_depth": [5, 10, 15, 20],
            "criterion": ["gini", "entropy"],
            "n_jobs": [12, 24],
        },
    }

    trainer = default_model_trainer
    project_storage = trainer.model_storage_engine.project_storage
    db_engine = trainer.db_engine

    train_tasks = trainer.generate_train_tasks(
        grid_config, dict(), get_matrix_store(project_storage))

    # 32 + 3: it would be (32 * 2) + 3 if tasks differing only in n_jobs
    # were not removed
    assert len(train_tasks) == 35
    assert len([
        task for task in train_tasks if "n_jobs" in task["parameters"]
    ]) == 32

    for train_task in train_tasks:
        trainer.process_train_task(**train_task)

    for row in db_engine.execute(
            "select hyperparameters from model_metadata.model_groups"):
        assert "n_jobs" not in row[0]
Example #5
def test_custom_groups(grid_config, db_engine_with_results_schema, project_storage):
    model_storage_engine = project_storage.model_storage_engine()
    experiment_hash = save_experiment_and_get_hash(
        config={'foo': 'bar'}, 
        db_engine=db_engine_with_results_schema
        )
    run_id = initialize_tracking_and_get_run_id(
        experiment_hash,
        experiment_class_path="",
        random_seed=5,
        experiment_kwargs={},
        db_engine=db_engine_with_results_schema
    )
    trainer = ModelTrainer(
        experiment_hash=experiment_hash,
        model_storage_engine=model_storage_engine,
        model_grouper=ModelGrouper(["class_path"]),
        db_engine=db_engine_with_results_schema,
        run_id=run_id,
    )
    # create training set
    model_ids = trainer.train_models(
        grid_config=grid_config,
        misc_db_parameters=dict(),
        matrix_store=get_matrix_store(project_storage),
    )
    # expect only one model group now
    records = [
        row[0]
        for row in db_engine_with_results_schema.execute(
            "select distinct model_group_id from triage_metadata.models"
        )
    ]
    assert len(records) == 1
    assert records[0] == model_ids[0]
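Only one model group appears because ModelGrouper is keyed on class_path alone, so every hyperparameter combination of the same classifier maps to the same group. A simplified sketch of that grouping idea (the real ModelGrouper does considerably more):

def model_group_key(class_path, parameters, keys=("class_path",)):
    """Hypothetical: the identity used to assign a trained model to a group."""
    attributes = {"class_path": class_path, "parameters": parameters}
    return tuple(attributes[key] for key in keys)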
Example #6
def test_uniform_distribution():
    with rig_engines() as (db_engine, project_storage):
        model = ModelFactory()
        feature_importances = [
            FeatureImportanceFactory(model_rel=model, feature="feature_{}".format(i))
            for i in range(0, 10)
        ]
        data_dict = {"entity_id": [1, 1], "as_of_date": ["2016-01-01", "2017-01-01"], "label": [0, 1]}
        for imp in feature_importances:
            data_dict[imp.feature] = [0.5, 0.5]
        metadata = matrix_metadata_creator()
        test_store = get_matrix_store(
            project_storage,
            pandas.DataFrame.from_dict(data_dict),
            metadata,
        )
        results = uniform_distribution(
            db_engine,
            model_id=model.model_id,
            as_of_date=datetime.date(2016, 1, 1),
            test_matrix_store=test_store,
            n_ranks=5,
        )

        assert len(results) == 5  # 5 features x 1 entity for this as_of_date
        for result in results:
            assert "entity_id" in result
            assert "feature_name" in result
            assert "score" in result
            assert "feature_value" in result
            assert result["feature_value"] == 0.5
            assert result["score"] >= 0
            assert result["score"] <= 1
            assert isinstance(result["feature_name"], str)
            assert result["entity_id"] in [1, 2]
Example #7
def test_predictor_save_predictions(matrix_type, predict_setup_args):
    (project_storage, db_engine, model_id) = predict_setup_args
    # if save_predictions is sent as False, don't save
    predictor = Predictor(project_storage.model_storage_engine(),
                          db_engine,
                          save_predictions=False)

    matrix = matrix_creator(index="entity_id")
    metadata = matrix_metadata_creator(end_time=AS_OF_DATE,
                                       matrix_type=matrix_type,
                                       indices=["entity_id"])

    matrix_store = get_matrix_store(project_storage, matrix, metadata)
    train_matrix_columns = matrix.columns[0:-1].tolist()

    predict_proba = predictor.predict(
        model_id,
        matrix_store,
        misc_db_parameters=dict(),
        train_matrix_columns=train_matrix_columns,
    )

    # assert
    # 1. that the returned predictions are of the desired length
    assert len(predict_proba) == 2

    # 2. that the predictions table entries are present and
    # can be linked to the original models
    assert not table_has_data(f"{matrix_type}_predictions", db_engine)
Example #8
def test_predictor_get_train_columns():
    with prepare() as (project_storage, db_engine, model_id):
        predictor = Predictor(project_storage.model_storage_engine(),
                              db_engine)
        train_store = get_matrix_store(
            project_storage=project_storage,
            matrix=matrix_creator(),
            metadata=matrix_metadata_creator(matrix_type="train"),
        )

        # flip the order of some feature columns in the test matrix
        other_order_matrix = matrix_creator()
        order = other_order_matrix.columns.tolist()
        order[0], order[1] = order[1], order[0]
        other_order_matrix = other_order_matrix[order]
        test_store = get_matrix_store(
            project_storage=project_storage,
            matrix=other_order_matrix,
            metadata=matrix_metadata_creator(matrix_type="test"),
        )

        # Runs the same test for training and testing predictions
        for store, mat_type in zip((train_store, test_store),
                                   ("train", "test")):
            predict_proba = predictor.predict(
                model_id,
                store,
                misc_db_parameters=dict(),
                train_matrix_columns=train_store.columns(),
            )
            # assert
            # 1. that we calculated predictions
            assert len(predict_proba) > 0

            # 2. that the predictions table entries are present and
            # can be linked to the original models
            records = [
                row
                for row in db_engine.execute("""select entity_id, as_of_date
                from {}_results.predictions
                join model_metadata.models using (model_id)""".format(mat_type))
            ]
            assert len(records) > 0
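What this test exercises is column alignment: predictions must not depend on the test matrix's column order. A sketch of the alignment step the Predictor presumably performs internally, assuming pandas matrices:

import pandas as pd

def align_to_train_columns(matrix: pd.DataFrame, train_columns: list) -> pd.DataFrame:
    """Reorder test-matrix features into training order before predicting."""
    missing = set(train_columns) - set(matrix.columns)
    if missing:
        raise ValueError("test matrix is missing training columns: {}".format(missing))
    return matrix[train_columns]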
Example #9
def test_predictor_retrieve():
    with prepare() as (project_storage, db_engine, model_id):
        predictor = Predictor(project_storage.model_storage_engine(),
                              db_engine,
                              replace=False)

        # create prediction set
        matrix = matrix_creator()
        metadata = matrix_metadata_creator()
        matrix_store = get_matrix_store(project_storage, matrix, metadata)

        predict_proba = predictor.predict(
            model_id,
            matrix_store,
            misc_db_parameters=dict(),
            train_matrix_columns=matrix.columns[0:-1].tolist())

        # When run again, the predictions retrieved from the database
        # should match.
        #
        # Some trickiness here. Let's explain:
        #
        # If we are not careful, retrieving predictions from the database and
        # presenting them as a numpy array can result in a bad ordering,
        # since the given matrix may not be 'ordered' by some criteria
        # that can be easily represented by an ORDER BY clause.
        #
        # It will sometimes work, because without ORDER BY you will get
        # it back in the table's physical order, which unless something has
        # happened to the table will be the order you inserted it,
        # which could very well be the order in the matrix.
        # So it's not a bug that would necessarily immediately show itself,
        # but when it does go wrong your scores will be garbage.
        #
        # So we simulate a table order mutation that can happen over time:
        # Remove the first row and put it at the end.
        # If the Predictor doesn't explicitly reorder the results, this will fail
        # Only running on TestPrediction because TrainPrediction behaves the exact same way
        reorder_session = sessionmaker(bind=db_engine)()
        obj = reorder_session.query(TestPrediction).first()
        reorder_session.delete(obj)
        reorder_session.commit()

        make_transient(obj)
        reorder_session = sessionmaker(bind=db_engine)()
        reorder_session.add(obj)
        reorder_session.commit()

        predictor.load_model = Mock()
        new_predict_proba = predictor.predict(
            model_id,
            matrix_store,
            misc_db_parameters=dict(),
            train_matrix_columns=matrix.columns[0:-1].tolist())
        assert_array_equal(new_predict_proba, predict_proba)
        assert not predictor.load_model.called
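The long comment above describes the fix this test guards: retrieve stored predictions keyed by the matrix index rather than trusting the table's physical row order. A sketch of that safe retrieval, assuming a score column and an (entity_id, as_of_date) matrix index:

def retrieve_in_matrix_order(db_engine, model_id, matrix_index):
    """Sketch: key stored scores by (entity_id, as_of_date), then emit them
    in the matrix's own index order."""
    stored = {
        (entity_id, as_of_date): score
        for entity_id, as_of_date, score in db_engine.execute(
            "select entity_id, as_of_date, score from test_results.predictions"
            " where model_id = %(model_id)s", {"model_id": model_id})
    }
    return [stored[index] for index in matrix_index]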
Example #10
def test_calculate_and_save():
    with rig_engines() as (db_engine, project_storage):
        train_store = get_matrix_store(
            project_storage,
            matrix_creator(),
            matrix_metadata_creator(matrix_type='train'),
        )
        test_store = get_matrix_store(
            project_storage,
            matrix_creator(),
            matrix_metadata_creator(matrix_type='test'),
        )
        calculator = IndividualImportanceCalculator(db_engine,
                                                    methods=['sample'],
                                                    replace=False)
        # given a trained model
        # and a test matrix
        _, model_id = fake_trained_model(
            db_engine, train_matrix_uuid=train_store.uuid
        )
        # i expect to be able to call calculate and save
        calculator.calculate_and_save_all_methods_and_dates(
            model_id, test_store)
        # and find individual importances in the results schema afterwards
        records = [
            row for row in db_engine.execute('''select entity_id, as_of_date
            from test_results.individual_importances
            join model_metadata.models using (model_id)''')
        ]
        assert len(records) > 0
        # and that when run again, has the same result
        calculator.calculate_and_save_all_methods_and_dates(
            model_id, test_store)
        new_records = [
            row for row in db_engine.execute('''select entity_id, as_of_date
            from test_results.individual_importances
            join model_metadata.models using (model_id)''')
        ]
        assert len(records) == len(new_records)
        assert records == new_records
Example #11
def test_n_jobs_not_new_model(default_model_trainer):
    grid_config = {
        "sklearn.ensemble.AdaBoostClassifier": {"n_estimators": [10, 100, 1000]},
        "sklearn.ensemble.RandomForestClassifier": {
            "n_estimators": [10, 100],
            "max_features": ["sqrt", "log2"],
            "max_depth": [5, 10, 15, 20],
            "criterion": ["gini", "entropy"],
            "n_jobs": [12],
        },
    }

    trainer = default_model_trainer
    project_storage = trainer.model_storage_engine.project_storage
    db_engine = trainer.db_engine

    # generate train tasks, with a specific random seed so that we can compare
    # apples to apples later
    random.seed(5)
    train_tasks = trainer.generate_train_tasks(
        grid_config, dict(), get_matrix_store(project_storage)
    )

    for train_task in train_tasks:
        trainer.process_train_task(**train_task)

    # since n_jobs is a runtime attribute of the model, it should not make it
    # into the model group
    for row in db_engine.execute(
        "select hyperparameters from triage_metadata.model_groups"
    ):
        assert "n_jobs" not in row[0]

    hashes = set(task['model_hash'] for task in train_tasks)
    # generate the grid again with a different n_jobs (but the same random seed!)
    # and make sure that the hashes are the same as before
    random.seed(5)
    grid_config['sklearn.ensemble.RandomForestClassifier']['n_jobs'] = [24]
    new_train_tasks = trainer.generate_train_tasks(
        grid_config, dict(), get_matrix_store(project_storage)
    )
    assert hashes == set(task['model_hash'] for task in new_train_tasks)
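The final hash comparison only holds if the model hash is computed over group-relevant inputs: runtime knobs like n_jobs excluded, the random seed included. An illustrative (not actual) hash computation along those lines:

import hashlib
import json

def illustrative_model_hash(class_path, parameters, random_seed):
    hashable = {k: v for k, v in parameters.items() if k != "n_jobs"}
    payload = json.dumps([class_path, hashable, random_seed], sort_keys=True)
    return hashlib.md5(payload.encode("utf-8")).hexdigest()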
Example #12
def setup_model_train_tester(project_storage,
                             replace,
                             additional_bigtrain_classnames=None):
    matrix_storage_engine = MatrixStorageEngine(project_storage)
    train_matrix_store = get_matrix_store(
        project_storage,
        metadata=matrix_metadata_creator(matrix_type="train"),
        write_to_db=False)
    test_matrix_store = get_matrix_store(
        project_storage,
        metadata=matrix_metadata_creator(matrix_type="test"),
        write_to_db=False)
    sample_train_kwargs = {
        'matrix_store': train_matrix_store,
        'class_path': None,
        'parameters': {},
        'model_hash': None,
        'misc_db_parameters': {}
    }
    train_test_task = {
        'train_kwargs': sample_train_kwargs,
        'train_store': train_matrix_store,
        'test_store': test_matrix_store
    }

    predictor = MagicMock(spec_set=Predictor)
    trainer = MagicMock(spec_set=ModelTrainer)
    evaluator = MagicMock(spec_set=ModelEvaluator)
    individual_importance_calculator = MagicMock(
        spec_set=IndividualImportanceCalculator)
    protected_groups_generator = MagicMock(spec_set=ProtectedGroupsGenerator)
    train_tester = ModelTrainTester(
        matrix_storage_engine=matrix_storage_engine,
        model_trainer=trainer,
        model_evaluator=evaluator,
        individual_importance_calculator=individual_importance_calculator,
        predictor=predictor,
        subsets=[],
        replace=replace,
        protected_groups_generator=protected_groups_generator,
        additional_bigtrain_classnames=additional_bigtrain_classnames)
    return train_tester, train_test_task
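Every collaborator here is a MagicMock built with spec_set, so the ModelTrainTester can only touch attributes that exist on the real classes, and typos or API drift surface immediately:

from unittest.mock import MagicMock

predictor = MagicMock(spec_set=Predictor)
predictor.predict                 # allowed: Predictor defines predict
try:
    predictor.made_up_method      # not part of the Predictor API
except AttributeError:
    pass                          # spec_set rejects unknown attributes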
Example #13
def test_ModelTrainTester_generate_tasks(db_engine_with_results_schema,
                                         project_storage,
                                         sample_timechop_splits,
                                         sample_grid_config):
    db_engine = db_engine_with_results_schema
    model_storage_engine = ModelStorageEngine(project_storage)
    matrix_storage_engine = MatrixStorageEngine(project_storage)
    sample_matrix_store = get_matrix_store(project_storage)
    experiment_hash = save_experiment_and_get_hash({}, db_engine)
    run_id = initialize_tracking_and_get_run_id(
        experiment_hash,
        experiment_class_path="",
        random_seed=5,
        experiment_kwargs={},
        db_engine=db_engine_with_results_schema)
    # instantiate pipeline objects
    trainer = ModelTrainer(
        experiment_hash=experiment_hash,
        model_storage_engine=model_storage_engine,
        db_engine=db_engine,
        run_id=run_id,
    )
    train_tester = ModelTrainTester(
        matrix_storage_engine=matrix_storage_engine,
        model_trainer=trainer,
        model_evaluator=None,
        individual_importance_calculator=None,
        predictor=None,
        subsets=None,
        protected_groups_generator=None,
    )
    with patch.object(matrix_storage_engine,
                      'get_store',
                      return_value=sample_matrix_store):
        batches = train_tester.generate_task_batches(
            splits=sample_timechop_splits, grid_config=sample_grid_config)
        assert len(batches) == 3
        # we expect to have a task for each combination of split and classifier
        flattened_tasks = list(task for batch in batches
                               for task in batch.tasks)
        assert len(flattened_tasks) == \
            len(sample_timechop_splits) * len(list(flatten_grid_config(sample_grid_config)))
        # we also expect each task to match the call signature of process_task
        with patch.object(train_tester, 'process_task', autospec=True):
            for task in flattened_tasks:
                train_tester.process_task(**task)
Example #14
def test_predictor_composite_index():
    with prepare() as (project_storage, db_engine, model_id):
        predictor = Predictor(project_storage.model_storage_engine(),
                              db_engine)

        dayone = datetime.datetime(2011, 1, 1)
        daytwo = datetime.datetime(2011, 1, 2)
        source_dict = {
            "entity_id": [1, 2, 1, 2],
            "as_of_date": [dayone, dayone, daytwo, daytwo],
            "feature_one": [3, 4, 5, 6],
            "feature_two": [5, 6, 7, 8],
            "label": [7, 8, 8, 7],
        }

        # Runs the same test for training and testing predictions
        for mat_type in ("train", "test"):

            matrix = pandas.DataFrame.from_dict(source_dict).set_index(
                ["entity_id", "as_of_date"])
            metadata = matrix_metadata_creator(matrix_type=mat_type)
            matrix_store = get_matrix_store(project_storage, matrix, metadata)

            predict_proba = predictor.predict(
                model_id,
                matrix_store,
                misc_db_parameters=dict(),
                train_matrix_columns=["feature_one", "feature_two"],
            )

            # assert
            # 1. that the returned predictions are of the desired length
            assert len(predict_proba) == 4

            # 2. that the predictions table entries are present and
            # can be linked to the original models
            records = [
                row
                for row in db_engine.execute("""select entity_id, as_of_date
                from {}_results.predictions
                join model_metadata.models using (model_id)""".format(mat_type))
            ]
            assert len(records) == 4
Example #15
    def test_retry_max(self):
        db_engine = None
        trainer = None
        # set up a basic model training run
        with rig_engines() as (db_engine, project_storage):
            trainer = ModelTrainer(
                experiment_hash=None,
                model_storage_engine=project_storage.model_storage_engine(),
                db_engine=db_engine,
                model_grouper=ModelGrouper())
            matrix_store = get_matrix_store(project_storage)

        # the postgres server goes out of scope here and thus no longer exists
        with patch('time.sleep') as time_mock:
            with self.assertRaises(sqlalchemy.exc.OperationalError):
                trainer.train_models(grid_config(), dict(), matrix_store)
            # we want to make sure that we are using the retrying module sanely
            # as opposed to matching the exact # of calls specified by the code
            assert len(time_mock.mock_calls) > 5
Example #16
def test_baseline_exception_handling():
    grid_config = {
        'triage.component.catwalk.baselines.rankers.PercentileRankOneFeature':
        {
            'feature': ['feature_one', 'feature_three']
        }
    }
    with rig_engines() as (db_engine, project_storage):
        trainer = ModelTrainer(
            experiment_hash=None,
            model_storage_engine=project_storage.model_storage_engine(),
            db_engine=db_engine,
            model_grouper=ModelGrouper())

        train_tasks = trainer.generate_train_tasks(
            grid_config, dict(), get_matrix_store(project_storage))

        model_ids = []
        for train_task in train_tasks:
            model_ids.append(trainer.process_train_task(**train_task))
        assert model_ids == [1, None]
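The expected [1, None] captures the failure contract: the baseline configured with the nonexistent 'feature_three' fails to train, and process_train_task evidently returns None for it instead of raising. A hypothetical sketch of that contract (names illustrative):

def process_train_task_sketch(train_one_model, **train_kwargs):
    """Hypothetical: swallow a per-model training failure so one bad
    baseline doesn't abort the whole grid, signalling it with None."""
    try:
        return train_one_model(**train_kwargs)
    except Exception:
        return None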
Example #17
def test_custom_groups(grid_config):
    with rig_engines() as (db_engine, project_storage):
        # create training set
        model_storage_engine = project_storage.model_storage_engine()
        trainer = ModelTrainer(
            experiment_hash=None,
            model_storage_engine=model_storage_engine,
            model_grouper=ModelGrouper(['class_path']),
            db_engine=db_engine,
        )
        model_ids = trainer.train_models(
            grid_config=grid_config,
            misc_db_parameters=dict(),
            matrix_store=get_matrix_store(project_storage))
        # expect only one model group now
        records = [
            row[0] for row in db_engine.execute(
                'select distinct model_group_id from model_metadata.models')
        ]
        assert len(records) == 1
        assert records[0] == model_ids[0]
Example #18
def test_predictor_needs_predictions(matrix_type, predict_setup_args):
    """Test that the logic that figures out if predictions are needed for a given model/matrix"""
    (project_storage, db_engine, model_id) = predict_setup_args
    # if not all of the predictions for the given model id and matrix are present in the db,
    # needs_predictions should return true. else, false
    predictor = Predictor(project_storage.model_storage_engine(), db_engine,
                          'worst')

    metadata = matrix_metadata_creator(matrix_type=matrix_type)
    matrix_store = get_matrix_store(project_storage, metadata=metadata)
    train_matrix_columns = matrix_store.columns()

    # we haven't done anything yet, this should definitely need predictions
    assert predictor.needs_predictions(matrix_store, model_id)
    predictor.predict(
        model_id,
        matrix_store,
        misc_db_parameters=dict(),
        train_matrix_columns=train_matrix_columns,
    )
    # now that predictions have been made, this should no longer need predictions
    assert not predictor.needs_predictions(matrix_store, model_id)
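A plausible shape for needs_predictions, assuming it compares stored prediction rows against the matrix's row count (illustrative; the real method may check more than counts):

def needs_predictions_sketch(db_engine, model_id, matrix_row_count):
    """Hypothetical: predictions are needed unless every matrix row
    already has a stored prediction for this model."""
    (count,) = db_engine.execute(
        "select count(*) from test_results.predictions"
        " where model_id = %(m)s", {"m": model_id},
    ).first()
    return count < matrix_row_count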
Example #19
def test_n_jobs_not_new_model():
    grid_config = {
        'sklearn.ensemble.AdaBoostClassifier': {
            'n_estimators': [10, 100, 1000]
        },
        'sklearn.ensemble.RandomForestClassifier': {
            'n_estimators': [10, 100],
            'max_features': ['sqrt', 'log2'],
            'max_depth': [5, 10, 15, 20],
            'criterion': ['gini', 'entropy'],
            'n_jobs': [12, 24],
        }
    }

    with rig_engines() as (db_engine, project_storage):
        model_storage_engine = project_storage.model_storage_engine()
        trainer = ModelTrainer(experiment_hash=None,
                               model_storage_engine=model_storage_engine,
                               db_engine=db_engine,
                               model_grouper=ModelGrouper())

        train_tasks = trainer.generate_train_tasks(
            grid_config,
            dict(),
            get_matrix_store(project_storage),
        )

        # 32 + 3: it would be (32 * 2) + 3 if tasks differing only in n_jobs
        # were not removed
        assert len(train_tasks) == 35
        assert len([
            task for task in train_tasks if 'n_jobs' in task['parameters']
        ]) == 32

        for train_task in train_tasks:
            trainer.process_train_task(**train_task)

        for row in db_engine.execute(
                'select hyperparameters from model_metadata.model_groups'):
            assert 'n_jobs' not in row[0]
Example #20
def test_uniform_distribution_entity_id_index():
    with rig_engines() as (db_engine, project_storage):
        model = ModelFactory()
        feature_importances = [
            FeatureImportanceFactory(model_rel=model,
                                     feature="feature_{}".format(i))
            for i in range(0, 10)
        ]
        data_dict = {"entity_id": [1, 2]}
        for imp in feature_importances:
            data_dict[imp.feature] = [0.5, 0.5]
        metadata = matrix_metadata_creator(indices="entity_id")
        test_store = get_matrix_store(
            project_storage,
            pandas.DataFrame.from_dict(data_dict).set_index(
                metadata["indices"]),
            metadata,
        )
        results = uniform_distribution(
            db_engine,
            model_id=model.model_id,
            as_of_date="2016-01-01",
            test_matrix_store=test_store,
            n_ranks=5,
        )

        assert len(results) == 10  # 5 features x 2 entities
        for result in results:
            assert "entity_id" in result
            assert "feature_name" in result
            assert "score" in result
            assert "feature_value" in result
            assert result["feature_value"] == 0.5
            assert result["score"] >= 0
            assert result["score"] <= 1
            assert isinstance(result["feature_name"], str)
            assert result["entity_id"] in [1, 2]
Example #21
def prediction_results(matrix_type, predictor, predict_setup_args):
    (project_storage, db_engine, model_id) = predict_setup_args

    dayone = datetime.datetime(2011, 1, 1)
    daytwo = datetime.datetime(2011, 1, 2)
    source_dict = {
        "entity_id": [1, 2, 3, 1, 2, 3],
        "as_of_date": [dayone, dayone, dayone, daytwo, daytwo, daytwo],
        "feature_one": [3] * 6,
        "feature_two": [5] * 6,
        "label": [True, False] * 3
    }

    matrix = pd.DataFrame.from_dict(source_dict)
    metadata = matrix_metadata_creator(matrix_type=matrix_type)
    matrix_store = get_matrix_store(project_storage, matrix, metadata)

    predict_proba = predictor.predict(
        model_id,
        matrix_store,
        misc_db_parameters=dict(),
        train_matrix_columns=["feature_one", "feature_two"],
    )
    return predict_proba
Example #22
def test_predictor_needs_predictions(matrix_type, predict_setup_args):
    (project_storage, db_engine, model_id) = predict_setup_args
    # if not all of the predictions for the given model id and matrix are present in the db,
    # needs_predictions should return true. else, false
    predictor = Predictor(project_storage.model_storage_engine(), db_engine)

    matrix = matrix_creator(index="entity_id")
    metadata = matrix_metadata_creator(end_time=AS_OF_DATE,
                                       matrix_type=matrix_type,
                                       indices=["entity_id"])

    matrix_store = get_matrix_store(project_storage, matrix, metadata)
    train_matrix_columns = matrix.columns[0:-1].tolist()

    # we haven't done anything yet, this should definitely need predictions
    assert predictor.needs_predictions(matrix_store, model_id)
    predictor.predict(
        model_id,
        matrix_store,
        misc_db_parameters=dict(),
        train_matrix_columns=train_matrix_columns,
    )
    # now that predictions have been made, this should no longer need predictions
    assert not predictor.needs_predictions(matrix_store, model_id)
Example #23
def test_predictor_entity_index():
    with prepare() as (project_storage, db_engine, model_id):
        predictor = Predictor(project_storage.model_storage_engine(),
                              db_engine)

        # Runs the same test for training and testing predictions
        for mat_type in ("train", "test"):
            matrix = matrix_creator(index="entity_id")
            metadata = matrix_metadata_creator(end_time=AS_OF_DATE,
                                               matrix_type=mat_type,
                                               indices=["entity_id"])

            matrix_store = get_matrix_store(project_storage, matrix, metadata)
            train_matrix_columns = matrix.columns[0:-1].tolist()

            predict_proba = predictor.predict(
                model_id,
                matrix_store,
                misc_db_parameters=dict(),
                train_matrix_columns=train_matrix_columns,
            )

            # assert
            # 1. that the returned predictions are of the desired length
            assert len(predict_proba) == 2

            # 2. that the predictions table entries are present and
            # can be linked to the original models
            records = [
                row
                for row in db_engine.execute("""select entity_id, as_of_date
                from {}_results.predictions
                join model_metadata.models using (model_id)""".format(mat_type))
            ]
            assert len(records) == 2

            # 3. that the contained as_of_dates match what we sent in
            for record in records:
                assert record[1].date() == AS_OF_DATE

            # 4. that the entity ids match the given dataset
            assert sorted([record[0] for record in records]) == [1, 2]

        # 5. running with same model_id, different as of date
        # then with same as of date only replaces the records
        # with the same date

        # Runs the same test for training and testing predictions
        for mat_type in ("train", "test"):
            new_matrix = matrix_creator(index="entity_id")
            new_metadata = matrix_metadata_creator(
                end_time=AS_OF_DATE + datetime.timedelta(days=1),
                matrix_type=mat_type,
                indices=["entity_id"],
            )
            new_matrix_store = get_matrix_store(project_storage, new_matrix,
                                                new_metadata)

            predictor.predict(
                model_id,
                new_matrix_store,
                misc_db_parameters=dict(),
                train_matrix_columns=train_matrix_columns,
            )
            predictor.predict(
                model_id,
                matrix_store,
                misc_db_parameters=dict(),
                train_matrix_columns=train_matrix_columns,
            )
            records = [
                row
                for row in db_engine.execute("""select entity_id, as_of_date
                from {}_results.predictions
                join model_metadata.models using (model_id)""".format(mat_type))
            ]
            assert len(records) == 4

        # 6. That we can delete the model when done prediction on it
        predictor.delete_model(model_id)
        assert predictor.load_model(model_id) is None
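Step 5 passes because replacement is scoped by date: writing predictions evidently clears only rows for the same model and the matrix's own as_of_dates before inserting, so the other date's rows survive (4 rows total, 2 per date). A sketch of such a scoped delete, with names illustrative:

def replace_predictions_for_dates(db_engine, model_id, as_of_dates):
    """Sketch: clear only this model's rows for the matrix's own dates,
    leaving predictions for other as_of_dates untouched."""
    db_engine.execute(
        "delete from test_results.predictions"
        " where model_id = %(model_id)s and as_of_date = any(%(dates)s)",
        {"model_id": model_id, "dates": list(as_of_dates)},
    )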
Example #24
def test_model_trainer(grid_config):
    with rig_engines() as (db_engine, project_storage):
        # Creates a matrix entry in the matrices table with uuid from metadata above
        model_storage_engine = project_storage.model_storage_engine()
        trainer = ModelTrainer(
            experiment_hash=None,
            model_storage_engine=model_storage_engine,
            model_grouper=ModelGrouper(),
            db_engine=db_engine,
        )
        model_ids = trainer.train_models(
            grid_config=grid_config,
            misc_db_parameters=dict(),
            matrix_store=get_matrix_store(project_storage),
        )

        # assert
        # 1. that the models and feature importances table entries are present
        records = [
            row for row in db_engine.execute(
                'select * from train_results.feature_importances')
        ]
        assert len(records) == 4 * 2  # 4 models x 2 features (entity_id is the index)

        records = [
            row for row in db_engine.execute(
                'select model_hash from model_metadata.models')
        ]
        assert len(records) == 4
        hashes = [row[0] for row in records]

        # 2. that the model groups are distinct
        records = [
            row for row in db_engine.execute(
                'select distinct model_group_id from model_metadata.models')
        ]
        assert len(records) == 4

        # 3. that the model sizes are saved in the table and all are < 1 kB
        records = [
            row for row in db_engine.execute(
                'select model_size from model_metadata.models')
        ]
        assert len(records) == 4
        for i in records:
            size = i[0]
            assert size < 1

        # 4. that all four models are cached
        model_pickles = [
            model_storage_engine.load(model_hash) for model_hash in hashes
        ]
        assert len(model_pickles) == 4
        assert len([x for x in model_pickles if x is not None]) == 4

        # 5. that their results can have predictions made on it
        test_matrix = pandas.DataFrame.from_dict({
            'entity_id': [3, 4],
            'feature_one': [4, 4],
            'feature_two': [6, 5],
        }).set_index('entity_id')

        for model_pickle in model_pickles:
            predictions = model_pickle.predict(test_matrix)
            assert len(predictions) == 2

        # 6. when run again, same models are returned
        new_model_ids = trainer.train_models(
            grid_config=grid_config,
            misc_db_parameters=dict(),
            matrix_store=get_matrix_store(project_storage))
        assert len([
            row for row in db_engine.execute(
                'select model_hash from model_metadata.models')
        ]) == 4
        assert model_ids == new_model_ids

        # 7. if replace is set, update non-unique attributes and feature importances
        max_batch_run_time = [
            row[0] for row in db_engine.execute(
                'select max(batch_run_time) from model_metadata.models')
        ][0]
        trainer = ModelTrainer(
            experiment_hash=None,
            model_storage_engine=model_storage_engine,
            model_grouper=ModelGrouper(
                model_group_keys=['label_name', 'label_timespan']),
            db_engine=db_engine,
            replace=True)
        new_model_ids = trainer.train_models(
            grid_config=grid_config,
            misc_db_parameters=dict(),
            matrix_store=get_matrix_store(project_storage))
        assert model_ids == new_model_ids
        assert [
            row['model_id'] for row in db_engine.execute(
                'select model_id from model_metadata.models order by 1 asc')
        ] == model_ids
        new_max_batch_run_time = [
            row[0] for row in db_engine.execute(
                'select max(batch_run_time) from model_metadata.models')
        ][0]
        assert new_max_batch_run_time > max_batch_run_time

        records = [
            row for row in db_engine.execute(
                'select * from train_results.feature_importances')
        ]
        assert len(records) == 4 * 2  # 4 models x 2 features (entity_id is the index)

        # 8. if the cache is missing but the metadata is still there, reuse the metadata
        for row in db_engine.execute(
                'select model_hash from model_metadata.models'):
            model_storage_engine.delete(row[0])
        new_model_ids = trainer.train_models(
            grid_config=grid_config,
            misc_db_parameters=dict(),
            matrix_store=get_matrix_store(project_storage))
        assert model_ids == sorted(new_model_ids)

        # 9. that the generator interface works the same way
        new_model_ids = trainer.generate_trained_models(
            grid_config=grid_config,
            misc_db_parameters=dict(),
            matrix_store=get_matrix_store(project_storage))
        assert model_ids == sorted(new_model_ids)
Example #25
def test_reuse_model_random_seeds(grid_config, default_model_trainer):
    trainer = default_model_trainer
    db_engine = trainer.db_engine
    project_storage = trainer.model_storage_engine.project_storage
    model_storage_engine = trainer.model_storage_engine

    # re-using the random seeds requires the association between experiments and models
    # to exist, which we're not getting in these tests since we aren't using the experiment
    # architecture, so back-fill these associations after each train_models() run
    def update_experiment_models(db_engine):
        sql = """
            INSERT INTO triage_metadata.experiment_models(experiment_hash,model_hash) 
            SELECT er.run_hash, m.model_hash
            FROM triage_metadata.models m
            LEFT JOIN triage_metadata.triage_runs er
                ON m.built_in_triage_run = er.id
            LEFT JOIN triage_metadata.experiment_models em 
                ON m.model_hash = em.model_hash
                AND er.run_hash = em.experiment_hash
            WHERE em.experiment_hash IS NULL
            """
        db_engine.execute(sql)
        db_engine.execute('COMMIT;')

    random.seed(5)
    model_ids = trainer.train_models(
        grid_config=grid_config,
        misc_db_parameters=dict(),
        matrix_store=get_matrix_store(project_storage),
    )
    update_experiment_models(db_engine)

    # simulate running a new experiment where the experiment hash has changed
    # (e.g. because the model grid is different), but experiment seed is the
    # same, so previously-trained models should not get new seeds
    experiment_hash = save_experiment_and_get_hash(
        config={'baz': 'qux'}, 
        db_engine=db_engine
        )
    run_id = initialize_tracking_and_get_run_id(
        experiment_hash,
        experiment_class_path="",
        random_seed=5,
        experiment_kwargs={},
        db_engine=db_engine
    )
    trainer = ModelTrainer(
        experiment_hash=experiment_hash,
        model_storage_engine=model_storage_engine,
        db_engine=db_engine,
        model_grouper=ModelGrouper(),
        run_id=run_id,
    )
    new_grid = grid_config.copy()
    new_grid['sklearn.tree.DecisionTreeClassifier']['min_samples_split'] = [3,10,100]
    random.seed(5)
    new_model_ids = trainer.train_models(
        grid_config=new_grid,
        misc_db_parameters=dict(),
        matrix_store=get_matrix_store(project_storage),
    )
    update_experiment_models(db_engine)

    # should now have 6 models in total
    assert len(new_model_ids) == 6

    # all the original model ids should be in the new set
    assert len(set(new_model_ids) & set(model_ids)) == len(model_ids)

    # however, we should NOT re-use the random seeds (and so get new model_ids)
    # if the experiment-level seed is different
    experiment_hash = save_experiment_and_get_hash(
        config={'lorem': 'ipsum'}, 
        db_engine=db_engine
        )
    run_id = initialize_tracking_and_get_run_id(
        experiment_hash,
        experiment_class_path="",
        random_seed=42,
        experiment_kwargs={},
        db_engine=db_engine
    )
    trainer = ModelTrainer(
        experiment_hash=experiment_hash,
        model_storage_engine=model_storage_engine,
        db_engine=db_engine,
        model_grouper=ModelGrouper(),
        run_id=run_id,
    )
    random.seed(42) # different from above
    newer_model_ids = trainer.train_models(
        grid_config=new_grid,
        misc_db_parameters=dict(),
        matrix_store=get_matrix_store(project_storage),
    )
    update_experiment_models(db_engine)

    # should get entirely new models now (different IDs)
    assert len(newer_model_ids) == 6
    assert len(set(new_model_ids) & set(newer_model_ids)) == 0
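The rule this test encodes: a stored random seed is reused only when some earlier run with the same experiment-level seed already trained that model hash; otherwise a fresh seed is drawn and new model rows result. A hypothetical lookup in that spirit, mirroring the joins in update_experiment_models above (schema details assumed):

import random

def seed_for_model_sketch(db_engine, model_hash, experiment_seed):
    row = db_engine.execute(
        """select m.random_seed
           from triage_metadata.models m
           join triage_metadata.experiment_models em using (model_hash)
           join triage_metadata.triage_runs r on r.run_hash = em.experiment_hash
           where m.model_hash = %(h)s and r.random_seed = %(s)s
           limit 1""",
        {"h": model_hash, "s": experiment_seed},
    ).first()
    return row[0] if row else random.randint(1, 1000000000)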