Example #1
    def __init__(self,
                 config,
                 db_engine,
                 model_storage_class=None,
                 project_path=None,
                 replace=True):
        self.config = config
        self.db_engine = db_engine
        if model_storage_class:
            self.model_storage_engine =\
                model_storage_class(project_path=project_path)
        self.project_path = project_path
        self.replace = replace
        ensure_db(self.db_engine)

        self.labels_table_name = 'labels'
        self.features_schema_name = 'features'
        if project_path:
            self.matrices_directory = os.path.join(self.project_path,
                                                   'matrices')
            if not os.path.exists(self.matrices_directory):
                os.makedirs(self.matrices_directory)
        self.experiment_hash = save_experiment_and_get_hash(
            self.config, self.db_engine)
        self._split_definitions = None
        self._matrix_build_tasks = None
        self._feature_table_tasks = None
        self._all_as_of_times = None
        self.initialize_factories()
        self.initialize_components()
Example #2
def test_model_scoring_inspections():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        metric_groups = [{
            'metrics': ['precision@', 'recall@', 'fpr@'],
            'thresholds': {
                'percentiles': [50.0],
                'top_n': [3]
            }
        }]

        model_scorer = ModelScorer(metric_groups, db_engine)

        _, model_id = fake_trained_model(
            'myproject', InMemoryModelStorageEngine('myproject'), db_engine)

        labels = numpy.array([True, False, numpy.nan, True, False])
        prediction_probas = numpy.array([0.56, 0.4, 0.55, 0.5, 0.3])
        evaluation_start = datetime.datetime(2016, 4, 1)
        evaluation_end = datetime.datetime(2016, 7, 1)
        example_frequency = '1d'
        model_scorer.score(prediction_probas, labels, model_id,
                           evaluation_start, evaluation_end, example_frequency)

        for record in db_engine.execute(
                '''select * from results.evaluations
            where model_id = %s and evaluation_start_time = %s order by 1''',
            (model_id, evaluation_start)):
            assert record['num_labeled_examples'] == 4
            assert record['num_positive_labels'] == 2
            if 'pct' in record['parameter']:
                assert record['num_labeled_above_threshold'] == 1
            else:
                assert record['num_labeled_above_threshold'] == 2
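
Why these counts: with labels [True, False, nan, True, False] there are 4 labeled rows and 2 positives; the 50th percentile keeps the top 2 scores (0.56 and 0.55, one of which is unlabeled), while top_n=3 also catches the 0.5 score. A minimal sketch of that counting, assuming nan marks an unlabeled example (labeled_above_threshold is a hypothetical helper, not the ModelScorer's internals):

import numpy

def labeled_above_threshold(scores, labels, top_n):
    # count labeled (non-nan) examples among the top_n highest scores
    order = numpy.argsort(scores)[::-1]
    top_labels = numpy.asarray(labels, dtype=float)[order[:top_n]]
    return int(numpy.count_nonzero(~numpy.isnan(top_labels)))

scores = numpy.array([0.56, 0.4, 0.55, 0.5, 0.3])
labels = numpy.array([True, False, numpy.nan, True, False])
assert labeled_above_threshold(scores, labels, 3) == 2  # the top_n=3 case
assert labeled_above_threshold(scores, labels, 2) == 1  # the 50th-percentile case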
Example #3
def test_n_jobs_not_new_model():
    grid_config = {
        'sklearn.ensemble.AdaBoostClassifier': {
            'n_estimators': [10, 100, 1000]
        },
        'sklearn.ensemble.RandomForestClassifier': {
            'n_estimators': [10, 100],
            'max_features': ['sqrt', 'log2'],
            'max_depth': [5, 10, 15, 20],
            'criterion': ['gini', 'entropy'],
            'n_jobs': [12, 24],
        }
    }

    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        ensure_db(engine)
        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')
            trainer = ModelTrainer(
                project_path='econ-dev/inspections',
                experiment_hash=None,
                model_storage_engine=S3ModelStorageEngine(
                    s3_conn, 'econ-dev/inspections'),
                db_engine=engine,
                model_group_keys=['label_name', 'label_window'])

            matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': ['good', 'bad']
            })
            train_tasks = trainer.generate_train_tasks(
                grid_config, dict(),
                InMemoryMatrixStore(
                    matrix, {
                        'label_window': '1d',
                        'end_time': datetime.datetime.now(),
                        'beginning_of_time': datetime.date(2012, 12, 20),
                        'label_name': 'label',
                        'metta-uuid': '1234',
                        'feature_names': ['ft1', 'ft2']
                    }))
            # 32+3; would be (32*2)+3 if we didn't collapse tasks that
            # differ only in n_jobs
            assert len(train_tasks) == 35
            assert len([
                task for task in train_tasks if 'n_jobs' in task['parameters']
            ]) == 32

            for train_task in train_tasks:
                trainer.process_train_task(**train_task)

            for row in engine.execute(
                    'select model_parameters from results.model_groups'):
                assert 'n_jobs' not in row[0]
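
The final assertion reflects the convention that run-time-only settings such as n_jobs should stay in each training task but not define a model group. A minimal sketch of that filtering (RUNTIME_ONLY_PARAMETERS and model_group_parameters are hypothetical names, not catwalk internals):

RUNTIME_ONLY_PARAMETERS = {'n_jobs'}

def model_group_parameters(parameters):
    # drop settings that change how a model trains, not what it predicts
    return {key: value for key, value in parameters.items()
            if key not in RUNTIME_ONLY_PARAMETERS}

assert model_group_parameters({'n_estimators': 10, 'n_jobs': 12}) == \
    {'n_estimators': 10}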
Example #4
    def test_retry_recovery(self):
        grid_config = {
            'sklearn.ensemble.AdaBoostClassifier': {
                'n_estimators': [10]
            },
        }

        engine = None
        trainer = None
        port = None
        with testing.postgresql.Postgresql() as postgresql:
            port = postgresql.settings['port']
            engine = create_engine(postgresql.url())
            ensure_db(engine)
            trainer = ModelTrainer(
                project_path='econ-dev/inspections',
                experiment_hash=None,
                model_storage_engine=InMemoryModelStorageEngine(
                    project_path=''),
                db_engine=engine,
                model_group_keys=['label_name', 'label_window'])

            matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': ['good', 'bad']
            })
            matrix_store = InMemoryMatrixStore(
                matrix, {
                    'label_window': '1d',
                    'end_time': datetime.datetime.now(),
                    'beginning_of_time': datetime.date(2012, 12, 20),
                    'label_name': 'label',
                    'metta-uuid': '1234',
                    'feature_names': ['ft1', 'ft2']
                })

        # start without a database server
        # then bring it back up after the first sleep
        # use self so it doesn't go out of scope too early and shut down
        self.new_server = None

        def replace_db(arg):
            self.new_server = testing.postgresql.Postgresql(port=port)
            engine = create_engine(self.new_server.url())
            ensure_db(engine)

        with patch('time.sleep') as time_mock:
            time_mock.side_effect = replace_db
            try:
                trainer.train_models(grid_config, dict(), matrix_store)
            finally:
                if self.new_server is not None:
                    self.new_server.stop()
            assert len(time_mock.mock_calls) == 1
Example #5
def test_save_experiment_and_get_hash():
    # no reason to make assertions on the config itself, use a basic dict
    experiment_config = {'one': 'two'}
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        ensure_db(engine)
        exp_hash = save_experiment_and_get_hash(experiment_config, engine)
        assert isinstance(exp_hash, str)
        new_hash = save_experiment_and_get_hash(experiment_config, engine)
        assert new_hash == exp_hash
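
The test pins down just two properties: the hash is a string, and hashing the same config twice gives the same value. A sketch of how such a deterministic hash could be derived (deterministic_config_hash is a hypothetical stand-in, not necessarily what save_experiment_and_get_hash does internally):

import hashlib
import json

def deterministic_config_hash(config):
    # serialize with sorted keys so equal dicts always hash identically
    payload = json.dumps(config, sort_keys=True, default=str)
    return hashlib.md5(payload.encode('utf-8')).hexdigest()

assert deterministic_config_hash({'one': 'two'}) == \
    deterministic_config_hash({'one': 'two'})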
Example #6
    def __init__(self, config, db_engine, model_storage_class, project_path):
        self.config = config
        self.db_engine = db_engine
        self.model_storage_engine =\
            model_storage_class(project_path=project_path)
        self.project_path = project_path
        ensure_db(self.db_engine)

        self.labels_table_name = 'labels'
        self.features_schema_name = 'features'
        self.matrices_directory = os.path.join(self.project_path, 'matrices')
        if not os.path.exists(self.matrices_directory):
            os.makedirs(self.matrices_directory)
        self.initialize_factories()
        self.initialize_components()
Example #7
    def test_retry_max(self):
        grid_config = {
            'sklearn.ensemble.AdaBoostClassifier': {
                'n_estimators': [10]
            },
        }

        engine = None
        trainer = None
        # set up a basic model training run
        # TODO: abstract the setup of a basic model training run where
        # we don't worry about the specific values used? It would make
        # tests like this a bit less noisy to read
        with testing.postgresql.Postgresql() as postgresql:
            engine = create_engine(postgresql.url())
            ensure_db(engine)
            trainer = ModelTrainer(
                project_path='econ-dev/inspections',
                experiment_hash=None,
                model_storage_engine=InMemoryModelStorageEngine(
                    project_path=''),
                db_engine=engine,
                model_group_keys=['label_name', 'label_window'])

            matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': ['good', 'bad']
            })
            matrix_store = InMemoryMatrixStore(
                matrix, {
                    'label_window': '1d',
                    'end_time': datetime.datetime.now(),
                    'beginning_of_time': datetime.date(2012, 12, 20),
                    'label_name': 'label',
                    'metta-uuid': '1234',
                    'feature_names': ['ft1', 'ft2']
                })
        # the postgres server goes out of scope here and thus no longer exists
        with patch('time.sleep') as time_mock:
            with self.assertRaises(sqlalchemy.exc.OperationalError):
                trainer.train_models(grid_config, dict(), matrix_store)
            # we want to make sure that we are using the retrying module sanely
            # as opposed to matching the exact # of calls specified by the code
            assert len(time_mock.mock_calls) > 5
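
The '> 5' bound checks that the trainer keeps backing off and retrying rather than failing on the first error, without coupling the test to an exact retry count. A sketch of the kind of decorator the retrying module provides for this (the wrapped function and the specific backoff numbers are illustrative assumptions, not the trainer's actual configuration):

import sqlalchemy.exc
from retrying import retry

def _is_operational_error(exception):
    # only retry database connectivity failures
    return isinstance(exception, sqlalchemy.exc.OperationalError)

@retry(retry_on_exception=_is_operational_error,
       wait_exponential_multiplier=1000,  # start around 1s, doubling
       wait_exponential_max=60000,        # cap each wait at 60s
       stop_max_attempt_number=10)        # then give up and re-raise
def save_model_metadata(session, model):
    session.add(model)
    session.commit()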
Example #8
def test_model_scoring_inspections():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        metric_groups = [{
            'metrics': ['precision@', 'recall@'],
            'thresholds': {
                'percentiles': [5.0, 10.0],
                'top_n': [5, 10]
            }
        }]

        model_scorer = ModelScorer(metric_groups, db_engine)

        trained_model, model_id = fake_trained_model(
            'myproject', InMemoryModelStorageEngine('myproject'), db_engine)

        labels = fake_labels(5)
        as_of_date = datetime.date(2016, 5, 5)
        evaluation_start = datetime.datetime(2016, 4, 1)
        evaluation_end = datetime.datetime(2016, 7, 1)
        prediction_frequency = '1d'
        model_scorer.score(
            trained_model.predict_proba(labels)[:, 1],
            trained_model.predict(labels), labels, model_id, evaluation_start,
            evaluation_end, prediction_frequency)

        # assert
        # that all of the records are there
        results = db_engine.execute(
            '''select distinct(metric || parameter) from results.evaluations
                where model_id = %s and evaluation_start_time = %s order by 1''',
            (model_id, evaluation_start))
        records = [row[0] for row in results]
        assert records == [
            'precision@10.0_pct',
            'precision@10_abs',
            'precision@5.0_pct',
            'precision@5_abs',
            'recall@10.0_pct',
            'recall@10_abs',
            'recall@5.0_pct',
            'recall@5_abs',
        ]
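
The expected strings are simply metric || parameter: percentile thresholds render as '<value>_pct' and absolute top-n thresholds as '<value>_abs', which is also why the earlier inspections test can branch on 'pct' in record['parameter']. A tiny sketch of that naming scheme (evaluation_key is a hypothetical helper):

def evaluation_key(metric, value, unit):
    # e.g. ('precision@', 5.0, 'pct') -> 'precision@5.0_pct'
    return '{}{}_{}'.format(metric, value, unit)

assert evaluation_key('precision@', 5.0, 'pct') == 'precision@5.0_pct'
assert evaluation_key('recall@', 10, 'abs') == 'recall@10_abs'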
Example #9
def test_predictor_composite_index():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        project_path = 'econ-dev/inspections'
        model_storage_engine = InMemoryModelStorageEngine(project_path)
        _, model_id = \
            fake_trained_model(project_path, model_storage_engine, db_engine)
        predictor = Predictor(project_path, model_storage_engine, db_engine)
        dayone = datetime.datetime(2011, 1, 1)
        daytwo = datetime.datetime(2011, 1, 2)
        # create prediction set
        matrix = pandas.DataFrame.from_dict({
            'entity_id': [1, 2, 1, 2],
            'as_of_date': [dayone, dayone, daytwo, daytwo],
            'feature_one': [3, 4, 5, 6],
            'feature_two': [5, 6, 7, 8],
            'label': [7, 8, 8, 7]
        }).set_index(['entity_id', 'as_of_date'])
        metadata = {
            'label_name': 'label',
            'end_time': AS_OF_DATE,
            'label_window': '3month',
            'metta-uuid': '1234',
        }
        matrix_store = InMemoryMatrixStore(matrix, metadata)
        predict_proba = predictor.predict(model_id, matrix_store, misc_db_parameters=dict())

        # assert
        # 1. that the returned predictions are of the desired length
        assert len(predict_proba) == 4

        # 2. that the predictions table entries are present and
        # can be linked to the original models
        records = [
            row for row in
            db_engine.execute('''select entity_id, as_of_date
            from results.predictions
            join results.models using (model_id)''')
        ]
        assert len(records) == 4
Example #10
def test_model_trainer():
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        ensure_db(engine)

        grid_config = {
            'sklearn.linear_model.LogisticRegression': {
                'C': [0.00001, 0.0001],
                'penalty': ['l1', 'l2'],
                'random_state': [2193]
            }
        }

        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')

            # create training set
            matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': ['good', 'bad']
            })
            metadata = {
                'beginning_of_time': datetime.date(2012, 12, 20),
                'end_time': datetime.date(2016, 12, 20),
                'label_name': 'label',
                'label_window': '1y',
                'metta-uuid': '1234',
                'feature_names': ['ft1', 'ft2']
            }
            project_path = 'econ-dev/inspections'
            model_storage_engine = S3ModelStorageEngine(s3_conn, project_path)
            trainer = ModelTrainer(
                project_path=project_path,
                experiment_hash=None,
                model_storage_engine=model_storage_engine,
                db_engine=engine,
                model_group_keys=['label_name', 'label_window'])
            matrix_store = InMemoryMatrixStore(matrix, metadata)
            model_ids = trainer.train_models(grid_config=grid_config,
                                             misc_db_parameters=dict(),
                                             matrix_store=matrix_store)

            # assert
            # 1. that the models and feature importances table entries are present
            records = [
                row for row in engine.execute(
                    'select * from results.feature_importances')
            ]
            assert len(records) == 4 * 3  # maybe exclude entity_id?

            records = [
                row for row in engine.execute(
                    'select model_hash from results.models')
            ]
            assert len(records) == 4

            cache_keys = [
                model_cache_key(project_path, model_row[0], s3_conn)
                for model_row in records
            ]

            # 2. that the model groups are distinct
            records = [
                row for row in engine.execute(
                    'select distinct model_group_id from results.models')
            ]
            assert len(records) == 4

            # 3. that all four models are cached
            model_pickles = [
                pickle.loads(cache_key.get()['Body'].read())
                for cache_key in cache_keys
            ]
            assert len(model_pickles) == 4
            assert len([x for x in model_pickles if x is not None]) == 4

            # 4. that predictions can be made with the unpickled models
            test_matrix = pandas.DataFrame.from_dict({
                'entity_id': [3, 4],
                'feature_one': [4, 4],
                'feature_two': [6, 5],
            })
            for model_pickle in model_pickles:
                predictions = model_pickle.predict(test_matrix)
                assert len(predictions) == 2

            # 5. when run again, same models are returned
            new_model_ids = trainer.train_models(grid_config=grid_config,
                                                 misc_db_parameters=dict(),
                                                 matrix_store=matrix_store)
            assert len([
                row for row in engine.execute(
                    'select model_hash from results.models')
            ]) == 4
            assert model_ids == new_model_ids

            # 6. if metadata is deleted but the cache is still there,
            # retrains that one and replaces the feature importance records
            engine.execute(
                'delete from results.feature_importances where model_id = 3')
            engine.execute('delete from results.models where model_id = 3')
            new_model_ids = trainer.train_models(grid_config=grid_config,
                                                 misc_db_parameters=dict(),
                                                 matrix_store=matrix_store)
            expected_model_ids = [1, 2, 4, 5]
            assert expected_model_ids == sorted(new_model_ids)
            assert [
                row['model_id'] for row in engine.execute(
                    'select model_id from results.models order by 1 asc')
            ] == expected_model_ids

            records = [
                row for row in engine.execute(
                    'select * from results.feature_importances')
            ]
            assert len(records) == 4 * 3  # maybe exclude entity_id?

            # 7. if the cache is missing but the metadata is still there, reuse the metadata
            for row in engine.execute('select model_hash from results.models'):
                model_storage_engine.get_store(row[0]).delete()
            expected_model_ids = [1, 2, 4, 5]
            new_model_ids = trainer.train_models(grid_config=grid_config,
                                                 misc_db_parameters=dict(),
                                                 matrix_store=matrix_store)
            assert expected_model_ids == sorted(new_model_ids)

            # 8. that the generator interface works the same way
            new_model_ids = trainer.generate_trained_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=matrix_store)
            assert expected_model_ids == sorted(new_model_ids)
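
Steps 6 and 7 pin down a cache policy: deleting the results.models row forces a retrain under a fresh model_id (hence 5), while deleting only the stored pickle rebuilds it but reuses the surviving metadata and ids. A hedged sketch of that decision (every name here is hypothetical, not the ModelTrainer's actual interface):

def reuse_or_train(metadata_exists, store, train_fn):
    # store offers .exists()/.load() for the cached pickle;
    # train_fn retrains and re-records metadata and importances
    if metadata_exists and store.exists():
        return store.load()  # full cache hit: reuse everything
    if metadata_exists:
        return train_fn()    # pickle lost: rebuild it, keep the row
    return train_fn()        # row lost: retrain under a new model_id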
Example #11
def reuse_pipeline_test(pipeline_class):
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        populate_source_data(db_engine)
        temporal_config = {
            'beginning_of_time': '2010-01-01',
            'modeling_start_time': '2011-01-01',
            'modeling_end_time': '2014-01-01',
            'update_window': '1y',
            'train_label_windows': ['6months'],
            'test_label_windows': ['6months'],
            'train_example_frequency': '1day',
            'test_example_frequency': '3months',
            'train_durations': ['6months'],
            'test_durations': ['1months'],
        }
        scoring_config = {
            'metric_groups': [{
                'metrics': ['precision@'],
                'thresholds': {
                    'top_n': [2]
                }
            }],
            'sort_seed': 12345
        }
        grid_config = {
            'sklearn.linear_model.LogisticRegression': {
                'C': [0.00001, 0.0001],
                'penalty': ['l1', 'l2'],
                'random_state': [2193]
            }
        }
        feature_config = [{
            'prefix': 'test_features',
            'from_obj': 'cat_complaints',
            'knowledge_date_column': 'as_of_date',
            'aggregates': [{
                'quantity': 'cat_sightings',
                'metrics': ['count', 'avg'],
            }],
            'intervals': ['1y'],
            'groups': ['entity_id']
        }]
        experiment_config = {
            'events_table': 'events',
            'entity_column_name': 'entity_id',
            'model_comment': 'test2-final-final',
            'model_group_keys': ['label_name', 'label_type'],
            'feature_aggregations': feature_config,
            'temporal_config': temporal_config,
            'grid_config': grid_config,
            'scoring': scoring_config,
        }

        temp_dir = TemporaryDirectory()
        try:
            pipeline = pipeline_class(
                config=experiment_config,
                db_engine=db_engine,
                model_storage_class=FSModelStorageEngine,
                project_path=os.path.join(temp_dir.name, 'inspections'),
            )

            pipeline.run()

            evaluations = num_linked_evaluations(db_engine)
            assert evaluations > 0

            pipeline = pipeline_class(config=experiment_config,
                                      db_engine=db_engine,
                                      model_storage_class=FSModelStorageEngine,
                                      project_path=os.path.join(
                                          temp_dir.name, 'inspections'),
                                      replace=False)
            pipeline.make_entity_date_table = Mock()
            pipeline.run()
            assert not pipeline.make_entity_date_table.called
        finally:
            temp_dir.cleanup()
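
Passing replace=False on the second run tells the pipeline to trust artifacts that already exist instead of rebuilding them, which is why the mocked make_entity_date_table is never invoked. A minimal sketch of such a guard (build_if_needed and its arguments are hypothetical):

import os

def build_if_needed(path, build_fn, replace):
    # rebuild the artifact only when replacing or when it is missing
    if replace or not os.path.exists(path):
        build_fn(path)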
Example #12
def simple_pipeline_test(pipeline_class):
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        populate_source_data(db_engine)
        temporal_config = {
            'beginning_of_time': '2010-01-01',
            'modeling_start_time': '2011-01-01',
            'modeling_end_time': '2014-01-01',
            'update_window': '1y',
            'train_label_windows': ['6months'],
            'test_label_windows': ['6months'],
            'train_example_frequency': '1day',
            'test_example_frequency': '3months',
            'train_durations': ['6months'],
            'test_durations': ['1months'],
        }
        scoring_config = {
            'metric_groups': [{
                'metrics': ['precision@'],
                'thresholds': {
                    'top_n': [2]
                }
            }]
        }
        grid_config = {
            'sklearn.linear_model.LogisticRegression': {
                'C': [0.00001, 0.0001],
                'penalty': ['l1', 'l2'],
                'random_state': [2193]
            }
        }
        feature_config = [{
            'prefix': 'test_features',
            'from_obj': 'cat_complaints',
            'knowledge_date_column': 'as_of_date',
            'aggregates': [{
                'quantity': 'cat_sightings',
                'metrics': ['count', 'avg'],
            }],
            'intervals': ['1y'],
            'groups': ['entity_id']
        }]
        experiment_config = {
            'events_table': 'events',
            'entity_column_name': 'entity_id',
            'model_comment': 'test2-final-final',
            'model_group_keys': ['label_name', 'label_type'],
            'feature_aggregations': feature_config,
            'temporal_config': temporal_config,
            'grid_config': grid_config,
            'scoring': scoring_config,
        }

        with TemporaryDirectory() as temp_dir:
            pipeline_class(config=experiment_config,
                           db_engine=db_engine,
                           model_storage_class=FSModelStorageEngine,
                           project_path=os.path.join(temp_dir,
                                                     'inspections')).run()

        # assert
        # 1. that model groups entries are present
        num_mgs = len([
            row
            for row in db_engine.execute('select * from results.model_groups')
        ])
        assert num_mgs > 0

        # 2. that model entries are present, and linked to model groups
        num_models = len([
            row for row in db_engine.execute('''
                select * from results.model_groups
                join results.models using (model_group_id)
                where model_comment = 'test2-final-final'
            ''')
        ])
        assert num_models > 0

        # 3. predictions, linked to models
        num_predictions = len([
            row for row in db_engine.execute('''
                select * from results.predictions
                join results.models using (model_id)''')
        ])
        assert num_predictions > 0

        # 4. evaluations linked to predictions linked to models
        num_evaluations = len([
            row for row in db_engine.execute('''
                select * from results.evaluations e
                join results.models using (model_id)
                join results.predictions p on (
                    e.model_id = p.model_id and
                    e.evaluation_start_time <= p.as_of_date and
                    e.evaluation_end_time > p.as_of_date)
            ''')
        ])
        assert num_evaluations > 0

        # 5. experiment
        num_experiments = len([
            row
            for row in db_engine.execute('select * from results.experiments')
        ])
        assert num_experiments == 1

        # 6. that models are linked to experiments
        num_models_with_experiment = len([
            row for row in db_engine.execute('''
                select * from results.experiments
                join results.models using (experiment_hash)
            ''')
        ])
        assert num_models == num_models_with_experiment

        # 7. that models have the train end date and label window
        results = [
            (model['train_end_time'], model['train_label_window'])
            for model in db_engine.execute('select * from results.models')
        ]
        assert sorted(set(results)) == [(datetime(2012, 1, 1), timedelta(180)),
                                        (datetime(2013, 1, 1), timedelta(180))]
Example #13
def test_model_scoring_early_warning():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        metric_groups = [{
            'metrics': [
                'precision@', 'recall@', 'true positives@', 'true negatives@',
                'false positives@', 'false negatives@'
            ],
            'thresholds': {
                'percentiles': [5.0, 10.0],
                'top_n': [5, 10]
            }
        }, {
            'metrics': [
                'f1', 'mediocre', 'accuracy', 'roc_auc',
                'average precision score'
            ],
        }, {
            'metrics': ['fbeta@'],
            'parameters': [{
                'beta': 0.75
            }, {
                'beta': 1.25
            }]
        }]

        custom_metrics = {'mediocre': always_half}

        model_scorer = ModelScorer(metric_groups, db_engine, custom_metrics)

        trained_model, model_id = fake_trained_model(
            'myproject', InMemoryModelStorageEngine('myproject'), db_engine)

        labels = fake_labels(5)
        as_of_date = datetime.date(2016, 5, 5)
        model_scorer.score(
            trained_model.predict_proba(labels)[:, 1],
            trained_model.predict(labels), labels, model_id, as_of_date,
            as_of_date, '1y')

        # assert
        # that all of the records are there
        records = [
            row[0] for row in db_engine.execute(
                '''select distinct(metric || parameter) from results.evaluations
                where model_id = %s and evaluation_start_time = %s order by 1''',
                (model_id, as_of_date))
        ]
        assert records == [
            'accuracy', 'average precision score', 'f1',
            'false negatives@10.0_pct', 'false negatives@10_abs',
            'false negatives@5.0_pct', 'false negatives@5_abs',
            'false positives@10.0_pct', 'false positives@10_abs',
            'false positives@5.0_pct', 'false positives@5_abs',
            'fbeta@0.75_beta', 'fbeta@1.25_beta', 'mediocre',
            'precision@10.0_pct', 'precision@10_abs', 'precision@5.0_pct',
            'precision@5_abs', 'recall@10.0_pct', 'recall@10_abs',
            'recall@5.0_pct', 'recall@5_abs', 'roc_auc',
            'true negatives@10.0_pct', 'true negatives@10_abs',
            'true negatives@5.0_pct', 'true negatives@5_abs',
            'true positives@10.0_pct', 'true positives@10_abs',
            'true positives@5.0_pct', 'true positives@5_abs'
        ]
Example #14
def generic_pipeline_test(pipeline_class):
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        populate_source_data(db_engine)
        temporal_config = {
            'beginning_of_time': '2010-01-01',
            'modeling_start_time': '2011-01-01',
            'modeling_end_time': '2014-01-01',
            'update_window': '1y',
            'prediction_window': '6m',
            'look_back_durations': ['6m'],
            'test_durations': ['1m'],
            'prediction_frequency': '1d'
        }
        scoring_config = [
            {'metrics': ['precision@'], 'thresholds': {'top_n': [2]}}
        ]
        grid_config = {
            'sklearn.linear_model.LogisticRegression': {
                'C': [0.00001, 0.0001],
                'penalty': ['l1', 'l2'],
                'random_state': [2193]
            }
        }
        feature_config = [{
            'prefix': 'test_features',
            'from_obj': 'cat_complaints',
            'knowledge_date_column': 'as_of_date',
            'aggregates': [{
                'quantity': 'cat_sightings',
                'metrics': ['count', 'avg'],
            }],
            'intervals': ['1y'],
            'groups': ['entity_id']
        }]
        experiment_config = {
            'events_table': 'events',
            'entity_column_name': 'entity_id',
            'model_comment': 'test2-final-final',
            'feature_aggregations': feature_config,
            'temporal_config': temporal_config,
            'grid_config': grid_config,
            'scoring': scoring_config,
        }

        with TemporaryDirectory() as temp_dir:
            pipeline_class(
                config=experiment_config,
                db_engine=db_engine,
                model_storage_class=InMemoryModelStorageEngine,
                project_path=os.path.join(temp_dir, 'inspections')
            ).run()

        # assert
        # 1. that model groups entries are present
        num_mgs = len([
            row for row in
            db_engine.execute('select * from results.model_groups')
        ])
        assert num_mgs > 0

        # 2. that model entries are present, and linked to model groups
        num_models = len([
            row for row in db_engine.execute('''
                select * from results.model_groups
                join results.models using (model_group_id)
                where model_comment = 'test2-final-final'
            ''')
        ])
        assert num_models > 0

        # 3. predictions, linked to models
        num_predictions = len([
            row for row in db_engine.execute('''
                select * from results.predictions
                join results.models using (model_id)''')
        ])
        assert num_predictions > 0

        # 4. evaluations linked to predictions linked to models
        num_evaluations = len([
            row for row in db_engine.execute('''
                select * from results.evaluations e
                join results.models using (model_id)
                join results.predictions p on (
                    e.model_id = p.model_id and
                    e.evaluation_start_time <= p.as_of_date and
                    e.evaluation_end_time > p.as_of_date)
            ''')
        ])
        assert num_evaluations > 0
Example #15
def test_predictor():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)

        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')
            project_path = 'econ-dev/inspections'
            model_storage_engine = S3ModelStorageEngine(s3_conn, project_path)
            _, model_id = \
                fake_trained_model(project_path, model_storage_engine, db_engine)
            predictor = Predictor(project_path, model_storage_engine, db_engine)
            # create prediction set
            matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': [7, 8]
            }).set_index('entity_id')
            metadata = {
                'label_name': 'label',
                'end_time': AS_OF_DATE,
                'label_window': '3month',
                'metta-uuid': '1234',
            }

            matrix_store = InMemoryMatrixStore(matrix, metadata)
            predict_proba = predictor.predict(model_id, matrix_store, misc_db_parameters=dict())

            # assert
            # 1. that the returned predictions are of the desired length
            assert len(predict_proba) == 2

            # 2. that the predictions table entries are present and
            # can be linked to the original models
            records = [
                row for row in
                db_engine.execute('''select entity_id, as_of_date
                from results.predictions
                join results.models using (model_id)''')
            ]
            assert len(records) == 2

            # 3. that the contained as_of_dates match what we sent in
            for record in records:
                assert record[1].date() == AS_OF_DATE

            # 4. that the entity ids match the given dataset
            assert sorted([record[0] for record in records]) == [1, 2]

            # 5. running again with the same model_id but a different
            # as-of date, then again with the original as-of date,
            # replaces only the records sharing each date
            new_matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': [7, 8]
            }).set_index('entity_id')
            new_metadata = {
                'label_name': 'label',
                'end_time': AS_OF_DATE + datetime.timedelta(days=1),
                'label_window': '3month',
                'metta-uuid': '1234',
            }
            new_matrix_store = InMemoryMatrixStore(new_matrix, new_metadata)
            predictor.predict(model_id, new_matrix_store, misc_db_parameters=dict())
            predictor.predict(model_id, matrix_store, misc_db_parameters=dict())
            records = [
                row for row in
                db_engine.execute('''select entity_id, as_of_date
                from results.predictions
                join results.models using (model_id)''')
            ]
            assert len(records) == 4

            # 6. that we can delete the model when done predicting with it
            predictor.delete_model(model_id)
            assert predictor.load_model(model_id) is None
Example #16
def test_simple_model_trainer():
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        ensure_db(engine)

        model_config = {
            'sklearn.linear_model.LogisticRegression': {
                'C': [0.00001, 0.0001],
                'penalty': ['l1', 'l2'],
                'random_state': [2193]
            }
        }

        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')

            # create training set
            with fake_metta({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': ['good', 'bad']
            }, {'label_name': 'label'}) as (matrix_path, metadata_path):

                trainer = SimpleModelTrainer(
                    training_set_path=matrix_path,
                    training_metadata_path=metadata_path,
                    model_config=model_config,
                    project_path='econ-dev/inspections',
                    s3_conn=s3_conn,
                    db_engine=engine
                )
                cache_keys = trainer.train_models()

                # assert
                # 1. that all four models are cached
                model_pickles = [
                    pickle.loads(cache_key.get()['Body'].read())
                    for cache_key in cache_keys
                ]
                assert len(model_pickles) == 4
                assert len([x for x in model_pickles if x is not None]) == 4

                # 2. that predictions can be made with the unpickled models
                test_matrix = pandas.DataFrame.from_dict({
                    'entity_id': [3, 4],
                    'feature_one': [4, 4],
                    'feature_two': [6, 5],
                })
                for model_pickle in model_pickles:
                    predictions = model_pickle.predict(test_matrix)
                    assert len(predictions) == 2

                # 3. that the models table entries are present
                records = [
                    row for row in
                    engine.execute('select * from results.models')
                ]
                assert len(records) == 4

                records = [
                    row for row in
                    engine.execute('select * from results.feature_importances')
                ]
                assert len(records) == 4 * 3  # maybe exclude entity_id?
Example #17
def test_integration():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)

        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')
            project_path = 'econ-dev/inspections'

            # create train and test matrices
            train_matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': [7, 8]
            }).set_index('entity_id')
            train_metadata = {
                'start_time': datetime.date(2012, 12, 20),
                'end_time': datetime.date(2016, 12, 20),
                'label_name': 'label',
                'prediction_window': '1y',
                'feature_names': ['ft1', 'ft2']
            }

            train_store = InMemoryMatrixStore(train_matrix, train_metadata)

            as_of_dates = [
                datetime.date(2016, 12, 21),
                datetime.date(2017, 1, 21)
            ]

            test_stores = [
                InMemoryMatrixStore(
                    pandas.DataFrame.from_dict({
                        'entity_id': [3],
                        'feature_one': [8],
                        'feature_two': [5],
                        'label': [5]
                    }).set_index('entity_id'),
                    {
                        'label_name': 'label',
                        'end_time': as_of_date
                    }
                )
                for as_of_date in as_of_dates
            ]

            model_storage_engine = S3ModelStorageEngine(s3_conn, project_path)

            # instantiate pipeline objects
            trainer = ModelTrainer(
                project_path=project_path,
                model_storage_engine=model_storage_engine,
                matrix_store=None,
                db_engine=db_engine,
            )
            predictor = Predictor(
                project_path,
                model_storage_engine,
                db_engine
            )
            model_scorer = ModelScorer(
                [{'metrics': ['precision@'], 'thresholds': {'top_n': [5]}}],
                db_engine
            )

            # run the pipeline
            grid_config = {
                'sklearn.linear_model.LogisticRegression': {
                    'C': [0.00001, 0.0001],
                    'penalty': ['l1', 'l2'],
                    'random_state': [2193]
                }
            }
            model_ids = trainer.train_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=train_store
            )

            for model_id in model_ids:
                for as_of_date, test_store in zip(as_of_dates, test_stores):
                    predictions, predictions_proba = predictor.predict(
                        model_id,
                        test_store,
                        misc_db_parameters=dict()
                    )

                    model_scorer.score(
                        predictions_proba,
                        predictions,
                        test_store.labels(),
                        model_id,
                        as_of_date,
                        as_of_date,
                        '6month'
                    )

            # assert
            # 1. that the predictions table entries are present and
            # can be linked to the original models
            records = [
                row for row in
                db_engine.execute('''select entity_id, model_id, as_of_date
                from results.predictions
                join results.models using (model_id)
                order by 3, 2''')
            ]
            assert records == [
                (3, 1, datetime.datetime(2016, 12, 21)),
                (3, 2, datetime.datetime(2016, 12, 21)),
                (3, 3, datetime.datetime(2016, 12, 21)),
                (3, 4, datetime.datetime(2016, 12, 21)),
                (3, 1, datetime.datetime(2017, 1, 21)),
                (3, 2, datetime.datetime(2017, 1, 21)),
                (3, 3, datetime.datetime(2017, 1, 21)),
                (3, 4, datetime.datetime(2017, 1, 21)),
            ]

            # that evaluations are there
            records = [
                row for row in
                db_engine.execute('''
                    select model_id, evaluation_start_time, metric, parameter
                    from results.evaluations order by 2, 1''')
            ]
            assert records == [
                (1, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (2, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (3, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (4, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (1, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
                (2, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
                (3, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
                (4, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
            ]
Example #18
def test_predictor_retrieve():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        project_path = 'econ-dev/inspections'
        model_storage_engine = InMemoryModelStorageEngine(project_path)
        _, model_id = \
            fake_trained_model(project_path, model_storage_engine, db_engine)
        predictor = Predictor(project_path, model_storage_engine, db_engine, replace=False)
        dayone = datetime.date(2011, 1, 1).isoformat()
        daytwo = datetime.date(2011, 1, 2).isoformat()
        # create prediction set
        matrix_data = {
            'entity_id': [1, 2, 1, 2],
            'as_of_date': [dayone, dayone, daytwo, daytwo],
            'feature_one': [3, 4, 5, 6],
            'feature_two': [5, 6, 7, 8],
            'label': [7, 8, 8, 7]
        }
        matrix = pandas.DataFrame.from_dict(matrix_data)\
            .set_index(['entity_id', 'as_of_date'])
        metadata = {
            'label_name': 'label',
            'end_time': AS_OF_DATE,
            'label_window': '3month',
            'metta-uuid': '1234',
        }
        matrix_store = InMemoryMatrixStore(matrix, metadata)
        predict_proba = predictor.predict(model_id, matrix_store, misc_db_parameters=dict())

        # When run again, the predictions retrieved from the database
        # should match.
        #
        # Some trickiness here. Let's explain:
        #
        # If we are not careful, retrieving predictions from the database and
        # presenting them as a numpy array can result in a bad ordering,
        # since the given matrix may not be 'ordered' by some criteria
        # that can be easily represented by an ORDER BY clause.
        #
        # It will sometimes work, because without ORDER BY you will get
        # it back in the table's physical order, which unless something has
        # happened to the table will be the order in which you inserted it,
        # which could very well be the order in the matrix.
        # So it's not a bug that would necessarily immediately show itself,
        # but when it does go wrong your scores will be garbage.
        #
        # So we simulate a table order mutation that can happen over time:
        # Remove the first row and put it at the end.
        # If the Predictor doesn't explicitly reorder the results, this will fail
        session = sessionmaker(bind=db_engine)()
        obj = session.query(Prediction).first()
        session.delete(obj)
        session.commit()

        make_transient(obj)
        session = sessionmaker(bind=db_engine)()
        session.add(obj)
        session.commit()

        predictor.load_model = Mock()
        new_predict_proba = predictor.predict(model_id, matrix_store, misc_db_parameters=dict())
        assert_array_equal(new_predict_proba, predict_proba)
        assert not predictor.load_model.called
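
The long comment above is the heart of this test: reading predictions back without an explicit ordering only works by accident of physical table order. A sketch of an order-preserving retrieval that realigns rows to the matrix's own index (the column names and the reindex approach are assumptions for illustration, not necessarily the Predictor's implementation):

import pandas

def retrieve_ordered_scores(db_engine, model_id, matrix_index):
    # fetch saved scores, then force them into the matrix's
    # (entity_id, as_of_date) order instead of table order
    rows = db_engine.execute(
        '''select entity_id, as_of_date, score
           from results.predictions where model_id = %s''', (model_id,))
    saved = pandas.DataFrame(
        rows, columns=['entity_id', 'as_of_date', 'score'])
    saved = saved.set_index(['entity_id', 'as_of_date'])
    return saved.reindex(matrix_index)['score'].values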