示例#1
0
    def train_test_models(self, train_matrix_uuid, model_ids_generator,
                          model_storage):
        """Predict and evaluate each trained model on every test as-of date.

        For every model id produced by ``model_ids_generator`` the method
        walks ``self.temporal_split['test_as_of_dates']``, loads/stores the
        matching test matrix, asks a ``Predictor`` for predictions and
        forwards them to ``self.evaluations``.  Once all dates have been
        handled for a model, ``delete_model`` is called on the predictor.

        Args:
            train_matrix_uuid: not read anywhere in this method.
            model_ids_generator: iterable yielding trained model ids.
            model_storage: storage engine handed to ``Predictor``.

        Returns:
            None.
        """
        model_predictor = Predictor(project_path=self.project_path,
                                    model_storage_engine=model_storage,
                                    db_engine=self.db_engine)

        for model_id in model_ids_generator:
            # --- prediction phase ---
            log.info('Predict for model_id: {}'.format(model_id))

            for as_of in self.temporal_split['test_as_of_dates']:
                log.info(
                    'Load test matrix for as of date: {}'.format(as_of))

                # Identifier for the matrix built from date, labels, window.
                matrix_key = str([
                    as_of,
                    self.labels,
                    self.temporal_split['prediction_window'],
                ])

                # The same parsed date is passed for both date arguments.
                as_of_dt = datetime.datetime.strptime(as_of, "%Y-%m-%d")
                metadata = self._make_metadata(
                    as_of_dt, as_of_dt, matrix_key, [as_of])

                frame, matrix_uuid = self.load_store_matrix(metadata, [as_of])
                db_params = {'matrix_uuid': matrix_uuid}

                # Index columns must not stay in the frame as plain columns.
                for index_name in metadata['indices']:
                    if index_name not in frame.columns:
                        continue
                    del frame[index_name]

                # Last column is passed separately from the rest.
                feature_part = frame.iloc[:, :-1]
                label_part = frame.iloc[:, -1]
                store = InMemoryMatrixStore(feature_part, metadata, label_part)

                predictions_binary, predictions_proba = \
                    model_predictor.predict(model_id, store, db_params)

                # --- evaluation phase ---
                log.info('Generate Evaluations for model_id: {}'.format(
                    model_id))
                self.evaluations(predictions_proba, predictions_binary,
                                 frame.iloc[:, -1], model_id, as_of)

            # The model is no longer needed once all dates are processed.
            model_predictor.delete_model(model_id)
示例#2
0
def test_predictor_composite_index():
    """Check Predictor on a matrix indexed by (entity_id, as_of_date).

    Two entities over two days yield four rows; the test expects four
    returned predictions and four rows in results.predictions that can
    be joined to results.models.
    """
    with testing.postgresql.Postgresql() as pg:
        engine = create_engine(pg.url())
        ensure_db(engine)
        project_path = 'econ-dev/inspections'
        storage = InMemoryModelStorageEngine(project_path)
        _unused, model_id = fake_trained_model(project_path, storage, engine)
        predictor = Predictor(project_path, storage, engine)

        first_day = datetime.datetime(2011, 1, 1)
        second_day = datetime.datetime(2011, 1, 2)

        # Prediction set: each entity appears once per day.
        frame = pandas.DataFrame.from_dict({
            'entity_id': [1, 2, 1, 2],
            'as_of_date': [first_day, first_day, second_day, second_day],
            'feature_one': [3, 4, 5, 6],
            'feature_two': [5, 6, 7, 8],
            'label': [7, 8, 8, 7]
        })
        indexed = frame.set_index(['entity_id', 'as_of_date'])
        store_metadata = {
            'label_name': 'label',
            'end_time': AS_OF_DATE,
            'label_window': '3month',
            'metta-uuid': '1234',
        }
        matrix_store = InMemoryMatrixStore(indexed, store_metadata)
        predict_proba = predictor.predict(
            model_id, matrix_store, misc_db_parameters={})

        # 1. One prediction is returned per input row.
        assert len(predict_proba) == 4

        # 2. Prediction rows exist and link back to the models table.
        linked_query = '''select entity_id, as_of_date
            from results.predictions
            join results.models using (model_id)'''
        linked_rows = list(engine.execute(linked_query))
        assert len(linked_rows) == 4
示例#3
0
def test_predictor():
    """Exercise Predictor against Postgres with a mocked S3 model store.

    Verified in order:
      1. the returned predictions have one entry per matrix row;
      2. rows are written to results.predictions and join to results.models;
      3. stored as_of_dates equal the matrix's end_time;
      4. stored entity ids equal the matrix's entity ids;
      5. predicting again for a different date adds rows, while predicting
         again for the same date only replaces that date's rows;
      6. a deleted model can no longer be loaded.
    """
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)

        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')
            project_path = 'econ-dev/inspections'
            model_storage_engine = S3ModelStorageEngine(s3_conn, project_path)
            _, model_id = \
                fake_trained_model(project_path, model_storage_engine, db_engine)
            predictor = Predictor(project_path, model_storage_engine, db_engine)
            # create prediction set
            matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': [7, 8]
            }).set_index('entity_id')
            metadata = {
                'label_name': 'label',
                'end_time': AS_OF_DATE,
                'label_window': '3month',
                'metta-uuid': '1234',
            }

            matrix_store = InMemoryMatrixStore(matrix, metadata)
            predict_proba = predictor.predict(
                model_id, matrix_store, misc_db_parameters={})

            # assert
            # 1. that the returned predictions are of the desired length
            assert len(predict_proba) == 2

            # 2. that the predictions table entries are present and
            # can be linked to the original models
            records = list(
                db_engine.execute('''select entity_id, as_of_date
                from results.predictions
                join results.models using (model_id)''')
            )
            assert len(records) == 2

            # 3. that the contained as_of_dates match what we sent in
            for record in records:
                assert record[1].date() == AS_OF_DATE

            # 4. that the entity ids match the given dataset
            assert sorted(record[0] for record in records) == [1, 2]

            # 5. running with same model_id, different as of date
            # then with same as of date only replaces the records
            # with the same date
            new_matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': [7, 8]
            }).set_index('entity_id')
            new_metadata = {
                'label_name': 'label',
                'end_time': AS_OF_DATE + datetime.timedelta(days=1),
                'label_window': '3month',
                'metta-uuid': '1234',
            }
            new_matrix_store = InMemoryMatrixStore(new_matrix, new_metadata)
            predictor.predict(
                model_id, new_matrix_store, misc_db_parameters={})
            predictor.predict(model_id, matrix_store, misc_db_parameters={})
            records = list(
                db_engine.execute('''select entity_id, as_of_date
                from results.predictions
                join results.models using (model_id)''')
            )
            # two rows per date, two distinct dates
            assert len(records) == 4

            # 6. that we can delete the model once prediction is done.
            # Identity comparison: `== None` could be satisfied by any
            # object whose __eq__ happens to accept None.
            predictor.delete_model(model_id)
            assert predictor.load_model(model_id) is None
示例#4
0
def test_predictor_retrieve():
    """With replace=False, a repeated predict() must reuse stored rows.

    The second call is expected to return the same values as the first,
    in the same order, without calling ``load_model``.
    """
    with testing.postgresql.Postgresql() as pg:
        engine = create_engine(pg.url())
        ensure_db(engine)
        project_path = 'econ-dev/inspections'
        storage = InMemoryModelStorageEngine(project_path)
        _unused, model_id = fake_trained_model(project_path, storage, engine)
        predictor = Predictor(project_path, storage, engine, replace=False)

        first_day = datetime.date(2011, 1, 1).isoformat()
        second_day = datetime.date(2011, 1, 2).isoformat()

        # Prediction set: each entity appears once per day.
        frame = pandas.DataFrame.from_dict({
            'entity_id': [1, 2, 1, 2],
            'as_of_date': [first_day, first_day, second_day, second_day],
            'feature_one': [3, 4, 5, 6],
            'feature_two': [5, 6, 7, 8],
            'label': [7, 8, 8, 7]
        })
        indexed = frame.set_index(['entity_id', 'as_of_date'])
        store_metadata = {
            'label_name': 'label',
            'end_time': AS_OF_DATE,
            'label_window': '3month',
            'metta-uuid': '1234',
        }
        matrix_store = InMemoryMatrixStore(indexed, store_metadata)
        first_result = predictor.predict(
            model_id, matrix_store, misc_db_parameters={})

        # A second run should hand back the very same predictions, this
        # time read from the database.
        #
        # Why the extra work below: a SELECT without ORDER BY returns rows
        # in the table's physical order.  Right after insertion that order
        # usually equals the matrix order, so a Predictor that never
        # reorders its results would appear to work -- until the table is
        # rewritten, at which point scores silently become garbage.
        #
        # To provoke that situation we mutate the physical order: take the
        # first stored prediction, delete it, and insert it again so it
        # ends up at the end.  A Predictor that does not explicitly
        # reorder retrieved rows will fail the comparison further down.
        db_session = sessionmaker(bind=engine)()
        moved_row = db_session.query(Prediction).first()
        db_session.delete(moved_row)
        db_session.commit()

        # Detach the deleted object so it can be inserted as a new row.
        make_transient(moved_row)
        db_session = sessionmaker(bind=engine)()
        db_session.add(moved_row)
        db_session.commit()

        # Replace load_model so any attempt to load the model is recorded.
        predictor.load_model = Mock()
        second_result = predictor.predict(
            model_id, matrix_store, misc_db_parameters={})
        assert_array_equal(second_result, first_result)
        assert not predictor.load_model.called
示例#5
0
def test_integration():
    """Run trainer -> predictor -> scorer and inspect the results schema.

    A LogisticRegression grid of four parameter combinations is trained
    on one matrix, each resulting model is used to predict on two test
    matrices (one per as-of date) and scored; the test then checks the
    rows found in results.predictions and results.evaluations.
    """
    with testing.postgresql.Postgresql() as pg:
        engine = create_engine(pg.url())
        ensure_db(engine)

        with mock_s3():
            s3 = boto3.resource('s3')
            s3.create_bucket(Bucket='econ-dev')
            project_path = 'econ-dev/inspections'

            # --- training matrix ---
            train_frame = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': [7, 8]
            })
            train_store = InMemoryMatrixStore(
                train_frame.set_index('entity_id'),
                {
                    'start_time': datetime.date(2012, 12, 20),
                    'end_time': datetime.date(2016, 12, 20),
                    'label_name': 'label',
                    'prediction_window': '1y',
                    'feature_names': ['ft1', 'ft2']
                }
            )

            # --- one single-row test matrix per as-of date ---
            as_of_dates = [
                datetime.date(2016, 12, 21),
                datetime.date(2017, 1, 21)
            ]
            test_stores = []
            for end_time in as_of_dates:
                test_frame = pandas.DataFrame.from_dict({
                    'entity_id': [3],
                    'feature_one': [8],
                    'feature_two': [5],
                    'label': [5]
                })
                test_stores.append(InMemoryMatrixStore(
                    test_frame.set_index('entity_id'),
                    {
                        'label_name': 'label',
                        'end_time': end_time
                    }
                ))

            storage = S3ModelStorageEngine(s3, project_path)

            # --- pipeline components ---
            trainer = ModelTrainer(
                project_path=project_path,
                model_storage_engine=storage,
                matrix_store=None,
                db_engine=engine,
            )
            predictor = Predictor(project_path, storage, engine)
            scorer_config = [
                {'metrics': ['precision@'], 'thresholds': {'top_n': [5]}}
            ]
            model_scorer = ModelScorer(scorer_config, engine)

            # --- run the pipeline: 2 values of C x 2 penalties = 4 combos ---
            grid_config = {
                'sklearn.linear_model.LogisticRegression': {
                    'C': [0.00001, 0.0001],
                    'penalty': ['l1', 'l2'],
                    'random_state': [2193]
                }
            }
            trained_ids = trainer.train_models(
                grid_config=grid_config,
                misc_db_parameters={},
                matrix_store=train_store
            )

            for model_id in trained_ids:
                for as_of_date, test_store in zip(as_of_dates, test_stores):
                    binary, proba = predictor.predict(
                        model_id, test_store, misc_db_parameters={})
                    model_scorer.score(
                        proba,
                        binary,
                        test_store.labels(),
                        model_id,
                        as_of_date,
                        as_of_date,
                        '6month'
                    )

            # Expected rows are ordered by date first, then by model id.
            expected_times = [
                datetime.datetime(2016, 12, 21),
                datetime.datetime(2017, 1, 21),
            ]
            expected_model_ids = range(1, 5)

            # 1. prediction rows exist and link back to the models table
            prediction_rows = list(
                engine.execute('''select entity_id, model_id, as_of_date
                from results.predictions
                join results.models using (model_id)
                order by 3, 2''')
            )
            assert prediction_rows == [
                (3, model_number, moment)
                for moment in expected_times
                for model_number in expected_model_ids
            ]

            # 2. one evaluation row per (model, date)
            evaluation_rows = list(
                engine.execute('''
                    select model_id, evaluation_start_time, metric, parameter
                    from results.evaluations order by 2, 1''')
            )
            assert evaluation_rows == [
                (model_number, moment, 'precision@', '5_abs')
                for moment in expected_times
                for model_number in expected_model_ids
            ]