Example #1
def test_predictor_composite_index():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        init_engine(db_engine)

        project_path = 'econ-dev/inspections'
        model_storage_engine = InMemoryModelStorageEngine(project_path)

        _, model_id = \
            fake_trained_model(project_path, model_storage_engine, db_engine, train_matrix_uuid='1234')

        predictor = Predictor(project_path, model_storage_engine, db_engine)

        dayone = datetime.datetime(2011, 1, 1)
        daytwo = datetime.datetime(2011, 1, 2)

        # create prediction set
        matrix = pandas.DataFrame.from_dict({
            'entity_id': [1, 2, 1, 2],
            'as_of_date': [dayone, dayone, daytwo, daytwo],
            'feature_one': [3, 4, 5, 6],
            'feature_two': [5, 6, 7, 8],
            'label': [7, 8, 8, 7]
        }).set_index(['entity_id', 'as_of_date'])
        metadata = {
            'label_name': 'label',
            'end_time': AS_OF_DATE,
            'label_timespan': '3month',
            'metta-uuid': '1234',
            'indices': ['entity_id', 'as_of_date'],
        }

        # Runs the same test for training and testing predictions
        for mat_type in ("train", "test"):

            # Create the matrix to be tested and store in db
            metadata['matrix_type'] = mat_type
            matrix_store = InMemoryMatrixStore(matrix, metadata)

            # Adding 'label' column back into matrix
            matrix['label'] = [7, 8, 8, 7]

            predict_proba = predictor.predict(
                model_id,
                matrix_store,
                misc_db_parameters=dict(),
                train_matrix_columns=['feature_one', 'feature_two'])

            # assert
            # 1. that the returned predictions are of the desired length
            assert len(predict_proba) == 4

            # 2. that the predictions table entries are present and
            # can be linked to the original models
            records = [
                row
                for row in db_engine.execute('''select entity_id, as_of_date
                from {}_results.{}_predictions
                join model_metadata.models using (model_id)'''.format(
                    mat_type, mat_type))
            ]
            assert len(records) == 4
Example #2
def test_predictor():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        init_engine(db_engine)

        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')
            project_path = 'econ-dev/inspections'
            model_storage_engine = S3ModelStorageEngine(project_path)

            _, model_id = \
                fake_trained_model(project_path, model_storage_engine, db_engine, train_matrix_uuid='1234')

            predictor = Predictor(project_path, model_storage_engine,
                                  db_engine)

            # create prediction set
            matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': [7, 8]
            }).set_index('entity_id')

            metadata = {
                'label_name': 'label',
                'end_time': AS_OF_DATE,
                'label_timespan': '3month',
                'metta-uuid': '1234',
                'indices': ['entity_id'],
            }

            train_matrix_columns = ['feature_one', 'feature_two']

            # Runs the same test for training and testing predictions
            for mat_type in ("train", "test"):
                # Create the matrix to be tested and store in db
                metadata['matrix_type'] = mat_type

                matrix_store = InMemoryMatrixStore(matrix, metadata)

                # Note, the first time 'matrix' is used, the label column is popped.
                # It must be added back in to 'matrix' to create another matrix_store.
                matrix['label'] = [7, 8]

                predict_proba = predictor.predict(
                    model_id,
                    matrix_store,
                    misc_db_parameters=dict(),
                    train_matrix_columns=train_matrix_columns)

                # assert
                # 1. that the returned predictions are of the desired length
                assert len(predict_proba) == 2

                # 2. that the predictions table entries are present and
                # can be linked to the original models
                records = [
                    row for row in db_engine.execute(
                        '''select entity_id, as_of_date
                    from {}_results.{}_predictions
                    join model_metadata.models using (model_id)'''.format(
                            mat_type, mat_type))
                ]
                assert len(records) == 2

                # 3. that the contained as_of_dates match what we sent in
                for record in records:
                    assert record[1].date() == AS_OF_DATE

                # 4. that the entity ids match the given dataset
                assert sorted([record[0] for record in records]) == [1, 2]

            # 5. running with the same model_id, first on a different as-of date
            # and then again on the original as-of date, only replaces the
            # records that share the same date
            new_matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': [7, 8]
            }).set_index('entity_id')
            new_metadata = {
                'label_name': 'label',
                'end_time': AS_OF_DATE + datetime.timedelta(days=1),
                'label_timespan': '3month',
                'metta-uuid': '1234',
                'indices': ['entity_id'],
            }

            # Runs the same test for training and testing predictions
            for mat_type in ("train", "test"):

                # Create the matrix to be tested and store in db
                new_metadata['matrix_type'] = mat_type

                new_matrix_store = InMemoryMatrixStore(new_matrix,
                                                       new_metadata)

                # Adding 'label' column back into new_matrix
                new_matrix['label'] = [7, 8]

                predictor.predict(model_id,
                                  new_matrix_store,
                                  misc_db_parameters=dict(),
                                  train_matrix_columns=train_matrix_columns)
                predictor.predict(model_id,
                                  matrix_store,
                                  misc_db_parameters=dict(),
                                  train_matrix_columns=train_matrix_columns)
                records = [
                    row for row in db_engine.execute(
                        '''select entity_id, as_of_date
                    from {}_results.{}_predictions
                    join model_metadata.models using (model_id)'''.format(
                            mat_type, mat_type))
                ]
                assert len(records) == 4

            # 6. that we can delete the model when we are done predicting with it
            predictor.delete_model(model_id)
            assert predictor.load_model(model_id) is None
Example #3
def test_predictor_entity_index():
    with prepare() as (project_storage, db_engine, model_id):
        predictor = Predictor(project_storage.model_storage_engine(),
                              db_engine)

        # Runs the same test for training and testing predictions
        for mat_type in ("train", "test"):
            matrix = matrix_creator(index="entity_id")
            metadata = matrix_metadata_creator(end_time=AS_OF_DATE,
                                               matrix_type=mat_type,
                                               indices=["entity_id"])

            matrix_store = get_matrix_store(project_storage, matrix, metadata)
            train_matrix_columns = matrix.columns[0:-1].tolist()

            predict_proba = predictor.predict(
                model_id,
                matrix_store,
                misc_db_parameters=dict(),
                train_matrix_columns=train_matrix_columns,
            )

            # assert
            # 1. that the returned predictions are of the desired length
            assert len(predict_proba) == 2

            # 2. that the predictions table entries are present and
            # can be linked to the original models
            records = [
                row
                for row in db_engine.execute("""select entity_id, as_of_date
                from {}_results.predictions
                join model_metadata.models using (model_id)""".format(
                    mat_type))
            ]
            assert len(records) == 2

            # 3. that the contained as_of_dates match what we sent in
            for record in records:
                assert record[1].date() == AS_OF_DATE

            # 4. that the entity ids match the given dataset
            assert sorted([record[0] for record in records]) == [1, 2]

        # 5. running with the same model_id, first on a different as-of date
        # and then again on the original as-of date, only replaces the
        # records that share the same date

        # Runs the same test for training and testing predictions
        for mat_type in ("train", "test"):
            new_matrix = matrix_creator(index="entity_id")
            new_metadata = matrix_metadata_creator(
                end_time=AS_OF_DATE + datetime.timedelta(days=1),
                matrix_type=mat_type,
                indices=["entity_id"],
            )
            new_matrix_store = get_matrix_store(project_storage, new_matrix,
                                                new_metadata)

            predictor.predict(
                model_id,
                new_matrix_store,
                misc_db_parameters=dict(),
                train_matrix_columns=train_matrix_columns,
            )
            predictor.predict(
                model_id,
                matrix_store,
                misc_db_parameters=dict(),
                train_matrix_columns=train_matrix_columns,
            )
            records = [
                row
                for row in db_engine.execute("""select entity_id, as_of_date
                from {}_results.predictions
                join model_metadata.models using (model_id)""".format(
                    mat_type))
            ]
            assert len(records) == 4

        # 6. that we can delete the model when we are done predicting with it
        predictor.delete_model(model_id)
        assert predictor.load_model(model_id) is None
Example #4
def test_integration():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        init_engine(db_engine)

        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')
            project_path = 'econ-dev/inspections'

            # create train and test matrices
            train_matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': [7, 8]
            }).set_index('entity_id')
            train_metadata = {
                'feature_start_time': datetime.date(2012, 12, 20),
                'end_time': datetime.date(2016, 12, 20),
                'label_name': 'label',
                'label_timespan': '1y',
                'feature_names': ['ft1', 'ft2'],
                'metta-uuid': '1234',
                'indices': ['entity_id'],
                'matrix_type': 'train'
            }
            # Creates a matrix entry in the matrices table with uuid from train_metadata
            MatrixFactory(matrix_uuid="1234")
            session.commit()

            train_store = InMemoryMatrixStore(train_matrix, sample_metadata())

            as_of_dates = [
                datetime.date(2016, 12, 21),
                datetime.date(2017, 1, 21)
            ]

            test_stores = [
                InMemoryMatrixStore(
                    pandas.DataFrame.from_dict({
                        'entity_id': [3],
                        'feature_one': [8],
                        'feature_two': [5],
                        'label': [5]
                    }), {
                        'label_name': 'label',
                        'label_timespan': '1y',
                        'end_time': as_of_date,
                        'metta-uuid': '1234',
                        'indices': ['entity_id'],
                        'matrix_type': 'test',
                        'as_of_date_frequency': '1month'
                    }) for as_of_date in as_of_dates
            ]

            model_storage_engine = S3ModelStorageEngine(project_path)

            experiment_hash = save_experiment_and_get_hash({}, db_engine)
            # instantiate pipeline objects
            trainer = ModelTrainer(
                project_path=project_path,
                experiment_hash=experiment_hash,
                model_storage_engine=model_storage_engine,
                db_engine=db_engine,
            )
            predictor = Predictor(project_path, model_storage_engine,
                                  db_engine)
            model_evaluator = ModelEvaluator([{
                'metrics': ['precision@'],
                'thresholds': {
                    'top_n': [5]
                }
            }], [{}], db_engine)

            # run the pipeline
            grid_config = {
                'sklearn.linear_model.LogisticRegression': {
                    'C': [0.00001, 0.0001],
                    'penalty': ['l1', 'l2'],
                    'random_state': [2193]
                }
            }
            model_ids = trainer.train_models(grid_config=grid_config,
                                             misc_db_parameters=dict(),
                                             matrix_store=train_store)

            for model_id in model_ids:
                for as_of_date, test_store in zip(as_of_dates, test_stores):
                    predictions_proba = predictor.predict(
                        model_id,
                        test_store,
                        misc_db_parameters=dict(),
                        train_matrix_columns=['feature_one', 'feature_two'])

                    model_evaluator.evaluate(
                        predictions_proba,
                        test_store,
                        model_id,
                    )

            # assert
            # 1. that the predictions table entries are present and
            # can be linked to the original models
            records = [
                row for row in db_engine.execute(
                    '''select entity_id, model_id, as_of_date
                from test_results.test_predictions
                join model_metadata.models using (model_id)
                order by 3, 2''')
            ]
            assert records == [
                (3, 1, datetime.datetime(2016, 12, 21)),
                (3, 2, datetime.datetime(2016, 12, 21)),
                (3, 3, datetime.datetime(2016, 12, 21)),
                (3, 4, datetime.datetime(2016, 12, 21)),
                (3, 1, datetime.datetime(2017, 1, 21)),
                (3, 2, datetime.datetime(2017, 1, 21)),
                (3, 3, datetime.datetime(2017, 1, 21)),
                (3, 4, datetime.datetime(2017, 1, 21)),
            ]

            # that evaluations are there
            records = [
                row for row in db_engine.execute('''
                    select model_id, evaluation_start_time, metric, parameter
                    from test_results.test_evaluations order by 2, 1''')
            ]
            assert records == [
                (1, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (2, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (3, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (4, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (1, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
                (2, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
                (3, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
                (4, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
            ]
Example #5
def test_predictor_retrieve():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        init_engine(db_engine)

        project_path = 'econ-dev/inspections'
        model_storage_engine = InMemoryModelStorageEngine(project_path)

        _, model_id = \
            fake_trained_model(project_path, model_storage_engine, db_engine, train_matrix_uuid='1234')

        predictor = Predictor(project_path,
                              model_storage_engine,
                              db_engine,
                              replace=False)

        dayone = datetime.date(2011, 1,
                               1).strftime(predictor.expected_matrix_ts_format)
        daytwo = datetime.date(2011, 1,
                               2).strftime(predictor.expected_matrix_ts_format)

        # create prediction set
        matrix_data = {
            'entity_id': [1, 2, 1, 2],
            'as_of_date': [dayone, dayone, daytwo, daytwo],
            'feature_one': [3, 4, 5, 6],
            'feature_two': [5, 6, 7, 8],
            'label': [7, 8, 8, 7]
        }
        matrix = pandas.DataFrame.from_dict(matrix_data)\
            .set_index(['entity_id', 'as_of_date'])
        metadata = {
            'label_name': 'label',
            'end_time': AS_OF_DATE,
            'label_timespan': '3month',
            'metta-uuid': '1234',
            'indices': ['entity_id', 'as_of_date'],
            'matrix_type': 'test'
        }

        matrix_store = InMemoryMatrixStore(matrix, metadata)

        predict_proba = predictor.predict(
            model_id,
            matrix_store,
            misc_db_parameters=dict(),
            train_matrix_columns=['feature_one', 'feature_two'])

        # When run again, the predictions retrieved from the database
        # should match.
        #
        # Some trickiness here. Let's explain:
        #
        # If we are not careful, retrieving predictions from the database and
        # presenting them as a numpy array can result in a bad ordering,
        # since the given matrix may not be 'ordered' by some criteria
        # that can be easily represented by an ORDER BY clause.
        #
        # It will sometimes work, because without ORDER BY you will get
        # it back in the table's physical order, which unless something has
        # happened to the table will be the order you inserted it,
        # which could very well be the order in the matrix.
        # So it's not a bug that would necessarily immediately show itself,
        # but when it does go wrong your scores will be garbage.
        #
        # So we simulate a table order mutation that can happen over time:
        # Remove the first row and put it at the end.
        # If the Predictor doesn't explicitly reorder the results, this will fail.
        # We only run this on TestPrediction because TrainPrediction behaves in exactly the same way.
        reorder_session = sessionmaker(bind=db_engine)()
        obj = reorder_session.query(TestPrediction).first()
        reorder_session.delete(obj)
        reorder_session.commit()

        make_transient(obj)
        reorder_session = sessionmaker(bind=db_engine)()
        reorder_session.add(obj)
        reorder_session.commit()

        predictor.load_model = Mock()
        new_predict_proba = predictor.predict(
            model_id,
            matrix_store,
            misc_db_parameters=dict(),
            train_matrix_columns=['feature_one', 'feature_two'])
        assert_array_equal(new_predict_proba, predict_proba)
        assert not predictor.load_model.called
Example #6
def test_predictor_retrieve():
    with prepare() as (project_storage, db_engine, model_id):
        predictor = Predictor(project_storage.model_storage_engine(),
                              db_engine,
                              replace=False)

        # create prediction set
        matrix = matrix_creator()
        metadata = matrix_metadata_creator()
        matrix_store = get_matrix_store(project_storage, matrix, metadata)

        predict_proba = predictor.predict(
            model_id,
            matrix_store,
            misc_db_parameters=dict(),
            train_matrix_columns=matrix.columns[0:-1].tolist(),
        )

        # When run again, the predictions retrieved from the database
        # should match.
        #
        # Some trickiness here. Let's explain:
        #
        # If we are not careful, retrieving predictions from the database and
        # presenting them as a numpy array can result in a bad ordering,
        # since the given matrix may not be 'ordered' by some criteria
        # that can be easily represented by an ORDER BY clause.
        #
        # It will sometimes work, because without ORDER BY you will get
        # it back in the table's physical order, which unless something has
        # happened to the table will be the order you inserted it,
        # which could very well be the order in the matrix.
        # So it's not a bug that would necessarily immediately show itself,
        # but when it does go wrong your scores will be garbage.
        #
        # So we simulate a table order mutation that can happen over time:
        # Remove the first row and put it at the end.
        # If the Predictor doesn't explicitly reorder the results, this will fail.
        # We only run this on TestPrediction because TrainPrediction behaves in exactly the same way.
        try:
            reorder_session = sessionmaker(bind=db_engine)()
            obj = reorder_session.query(TestPrediction).first()
            reorder_session.delete(obj)
            reorder_session.commit()
        finally:
            reorder_session.close()

        make_transient(obj)
        try:
            reorder_session = sessionmaker(bind=db_engine)()
            reorder_session.add(obj)
            reorder_session.commit()
        finally:
            reorder_session.close()

        predictor.load_model = Mock()
        new_predict_proba = predictor.predict(
            model_id,
            matrix_store,
            misc_db_parameters=dict(),
            train_matrix_columns=matrix.columns[0:-1].tolist(),
        )
        assert_array_equal(new_predict_proba, predict_proba)
        assert not predictor.load_model.called
Example #7
class ModelTester(object):
    def __init__(
        self,
        db_engine,
        model_storage_engine,
        matrix_storage_engine,
        replace,
        evaluator_config,
        individual_importance_config,
    ):
        self.matrix_storage_engine = matrix_storage_engine
        self.predictor = Predictor(
            db_engine=db_engine,
            model_storage_engine=model_storage_engine,
            replace=replace,
        )

        self.individual_importance_calculator = IndividualImportanceCalculator(
            db_engine=db_engine,
            n_ranks=individual_importance_config.get("n_ranks", 5),
            methods=individual_importance_config.get("methods", ["uniform"]),
            replace=replace,
        )

        self.evaluator = ModelEvaluator(
            db_engine=db_engine,
            sort_seed=evaluator_config.get("sort_seed", None),
            testing_metric_groups=evaluator_config.get("testing_metric_groups",
                                                       []),
            training_metric_groups=evaluator_config.get(
                "training_metric_groups", []),
        )

    def generate_model_test_tasks(self, split, train_store, model_ids):
        test_tasks = []
        for test_matrix_def, test_uuid in zip(split["test_matrices"],
                                              split["test_uuids"]):
            test_store = self.matrix_storage_engine.get_store(test_uuid)

            if test_store.empty:
                logging.warning(
                    "Test matrix for uuid %s was empty, no point in generating "
                    "predictions. Not creating test task.",
                    test_uuid,
                )
                continue
            test_tasks.append({
                "test_store": test_store,
                "train_store": train_store,
                "model_ids": [model_id for model_id in model_ids if model_id],
            })
        return test_tasks

    def process_model_test_task(self, test_store, train_store, model_ids):
        as_of_times = test_store.metadata["as_of_times"]
        logging.info(
            "Testing and scoring all model ids with test matrix %s. "
            "as_of_times min: %s max: %s num: %s",
            test_store.uuid,
            min(as_of_times),
            max(as_of_times),
            len(as_of_times),
        )

        for model_id in model_ids:
            logging.info("Testing model id %s", model_id)

            self.individual_importance_calculator.calculate_and_save_all_methods_and_dates(
                model_id, test_store)

            # Generate predictions for the testing data then training data
            for store in (test_store, train_store):
                if self.evaluator.needs_evaluations(store, model_id):
                    logging.info(
                        "The evaluations needed for matrix %s-%s and model %s"
                        "are not all present in db, so predicting and evaluating",
                        store.uuid, store.matrix_type, model_id)
                    predictions_proba = self.predictor.predict(
                        model_id,
                        store,
                        misc_db_parameters=dict(),
                        train_matrix_columns=train_store.columns(),
                    )

                    self.evaluator.evaluate(
                        predictions_proba=predictions_proba,
                        matrix_store=store,
                        model_id=model_id,
                    )
                else:
                    logging.info(
                        "The evaluations needed for matrix %s-%s and model %s are all present"
                        "in db from a previous run (or none needed at all), so skipping!",
                        store.uuid, store.matrix_type, model_id)
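
As a usage note, here is a minimal sketch of how a ModelTester like the one above might be wired up. It assumes db_engine, model_storage_engine and matrix_storage_engine already exist (for instance built from a ProjectStorage, as in the later examples), and the metric-group format simply mirrors the ModelEvaluator configs shown in Examples #4 and #13; split, train_store and model_ids are likewise assumed to come from the surrounding experiment code.

# Hedged sketch only: db_engine, model_storage_engine, matrix_storage_engine,
# split, train_store and model_ids are assumed to be provided by the caller.
evaluator_config = {
    "sort_seed": 12345,
    "testing_metric_groups": [
        {"metrics": ["precision@"], "thresholds": {"top_n": [5]}},
    ],
    "training_metric_groups": [],
}
individual_importance_config = {"n_ranks": 5, "methods": ["uniform"]}

tester = ModelTester(
    db_engine=db_engine,
    model_storage_engine=model_storage_engine,
    matrix_storage_engine=matrix_storage_engine,
    replace=False,
    evaluator_config=evaluator_config,
    individual_importance_config=individual_importance_config,
)

# Each test task produced for a split can then be processed per test matrix.
for task in tester.generate_model_test_tasks(split, train_store, model_ids):
    tester.process_model_test_task(**task)
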
Example #8
class ModelTester(object):
    def __init__(self, db_engine, project_path, model_storage_engine, replace,
                 evaluator_config, individual_importance_config):
        self.predictor = Predictor(db_engine=db_engine,
                                   model_storage_engine=model_storage_engine,
                                   project_path=project_path,
                                   replace=replace)

        self.individual_importance_calculator = IndividualImportanceCalculator(
            db_engine=db_engine,
            n_ranks=individual_importance_config.get('n_ranks', 5),
            methods=individual_importance_config.get('methods', ['uniform']),
            replace=replace)

        self.evaluator = ModelEvaluator(
            db_engine=db_engine,
            sort_seed=evaluator_config.get('sort_seed', None),
            metric_groups=evaluator_config['metric_groups'],
            training_metric_groups=evaluator_config['training_metric_groups'])

    def generate_model_test_tasks(self, split, train_store, model_ids,
                                  matrix_store_creator):
        test_tasks = []
        for test_matrix_def, test_uuid in zip(split['test_matrices'],
                                              split['test_uuids']):
            test_store = matrix_store_creator(test_uuid)

            if test_store.empty:
                logging.warning(
                    'Test matrix for uuid %s was empty, no point in generating '
                    'predictions. Not creating test task.', test_uuid)
                continue
            test_tasks.append({
                'test_store': test_store,
                'train_store': train_store,
                'model_ids': [model_id for model_id in model_ids if model_id]
            })
        return test_tasks

    def process_model_test_task(self, test_store, train_store, model_ids):
        as_of_times = test_store.metadata['as_of_times']
        logging.info(
            'Testing and scoring all model ids with test matrix %s. as_of_times min: %s max: %s num: %s',
            test_store.uuid, min(as_of_times), max(as_of_times),
            len(as_of_times))
        for model_id in model_ids:
            logging.info('Testing model id %s', model_id)

            self.individual_importance_calculator\
                .calculate_and_save_all_methods_and_dates(
                    model_id,
                    test_store
                )

            # Generate predictions for the testing data then training data
            for store in (test_store, train_store):
                predictions_proba = self.predictor.predict(
                    model_id,
                    store,
                    misc_db_parameters=dict(),
                    train_matrix_columns=train_store.columns())

                self.evaluator.evaluate(
                    predictions_proba=predictions_proba,
                    matrix_store=store,
                    model_id=model_id,
                )
Example #9
def predictor(predict_setup_args):
    (project_storage, db_engine, model_id) = predict_setup_args
    return Predictor(project_storage.model_storage_engine(),
                     db_engine,
                     rank_order='worst')
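
This helper reads like a pytest fixture (any decorator is not shown in the snippet): a test would obtain the configured Predictor simply by naming the fixture as an argument. A hypothetical consumer is sketched below; the test name, model_id, matrix_store and column list are illustrative placeholders, not part of the original suite.

# Hypothetical test consuming the fixture above; pytest injects the Predictor
# by matching the argument name. matrix_store, model_id and the column list
# are assumed to come from the same setup helpers used elsewhere in these tests.
def test_predict_with_worst_rank_order(predictor, matrix_store, model_id):
    predict_proba = predictor.predict(
        model_id,
        matrix_store,
        misc_db_parameters=dict(),
        train_matrix_columns=['feature_one', 'feature_two'],
    )
    assert len(predict_proba) > 0
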
Example #10
def add_predictions(db_engine,
                    model_groups,
                    project_path,
                    experiment_hashes=None,
                    train_end_times_range=None,
                    rank_order='worst',
                    replace=True):
    """ For a set of modl_groups generate test predictions and write to DB
        Args:
            db_engine: Sqlalchemy engine
            model_groups (list): The list of model group ids we are interested in (ideally, chosen through audition)
            project_path (str): Path where the created matrices and trained model objects are stored for the experiment
            experiment_hashes (List[str]): Optional. hash(es) of the experiments we are interested in. Can be used to narrow down the model_ids in the model groups specified
            range_train_end_times (Dict): Optional. If provided, only the models with train_end_times that fall in the range are scored. 
                                        This too, helps narrow down model_ids in the model groups specified.
                                        A dictionary with two possible keys 'range_start_date' and 'range_end_date'. Either or both could be set
            rank_order (str) : How to deal with ties in the scores. 
            replace (bool) : Whether to overwrite the preditctions for a model_id, if already found in the DB.

        Returns: None
            This directly writes to the test_results.predictions table
    """

    model_matrix_info = _fetch_relevant_model_matrix_info(
        db_engine=db_engine,
        model_groups=model_groups,
        experiment_hashes=experiment_hashes)

    # If we are only generating predictions for a specific time range
    if train_end_times_range is not None:
        if 'range_start_date' in train_end_times_range:
            range_start = train_end_times_range['range_start_date']
            msk = (model_matrix_info['train_end_time'] >= range_start)
            logging.info(
                'Filtering out models with a train_end_time before {}'.format(
                    range_start))

            model_matrix_info = model_matrix_info[msk]

        if 'range_end_date' in train_end_times_range:
            range_end = train_end_times_range['range_end_date']
            msk = (model_matrix_info['train_end_time'] <= range_end)
            logging.info(
                'Filtering out models with a train_end_time after {}'.format(
                    range_end))

            model_matrix_info = model_matrix_info[msk]

    if len(model_matrix_info) == 0:
        raise ValueError('Config is not valid. No models were found!')

    # All the model groups specified in the config file should be valid (even if experiment_hashes and train_end_times are specified)
    not_fetched_model_grps = [
        x for x in model_groups
        if x not in model_matrix_info['model_group_id'].unique()
    ]

    if len(not_fetched_model_grps) > 0:
        raise ValueError(
            'The config is not valid. No models were found for the model group(s) {}. All specified model groups should be present'
            .format(not_fetched_model_grps))

    logging.info('Scoring {} model ids'.format(len(model_matrix_info)))

    # Summary of the models we are scoring, to check for anything worth noting
    _summary_of_models(model_matrix_info)

    logging.info('Instantiating storage engines and the predictor')

    # Storage objects to handle already stored models and matrices
    project_storage = ProjectStorage(project_path)
    model_storage_engine = project_storage.model_storage_engine()
    matrix_storage_engine = project_storage.matrix_storage_engine()

    # Prediction generation is handled by the Predictor class in catwalk
    predictor = Predictor(model_storage_engine=model_storage_engine,
                          db_engine=db_engine,
                          rank_order=rank_order,
                          replace=replace,
                          save_predictions=True)

    # Organize the prediction run over unique (train_matrix, test_matrix) pairs
    # to reduce the number of times the matrices are loaded into memory
    groupby_obj = model_matrix_info.groupby(
        ['train_matrix_uuid', 'test_matrix_uuid'])

    for group, _ in groupby_obj:
        train_uuid = group[0]
        test_uuid = group[1]

        df_grp = groupby_obj.get_group(group)

        logging.info(
            'Processing {} model_ids for train matrix {} and test matrix {}'.
            format(len(df_grp), train_uuid, test_uuid))

        train_matrix_store = matrix_storage_engine.get_store(
            matrix_uuid=train_uuid)

        # Ensure that the column order used for predictions matches the order used in model training
        train_matrix_columns = list(train_matrix_store.design_matrix.columns)

        test_matrix_store = matrix_storage_engine.get_store(
            matrix_uuid=test_uuid)

        for model_id in df_grp['model_id'].tolist():
            logging.info(
                'Writing predictions for model_id {}'.format(model_id))
            predictor.predict(model_id=model_id,
                              matrix_store=test_matrix_store,
                              train_matrix_columns=train_matrix_columns,
                              misc_db_parameters={})

    logging.info('Successfully generated predictions for {} models!'.format(
        len(model_matrix_info)))
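
A hedged example of invoking add_predictions as defined above; the connection string, model group ids, project path and date range below are placeholders rather than values from a real experiment, and the range keys follow the 'range_start_date' / 'range_end_date' convention documented in the docstring.

# Illustrative call only; every concrete value here is a placeholder.
from sqlalchemy import create_engine

db_engine = create_engine('postgresql://user:password@localhost:5432/triage_db')

add_predictions(
    db_engine=db_engine,
    model_groups=[11, 12],
    project_path='s3://some-bucket/triage-project',
    train_end_times_range={
        'range_start_date': '2016-01-01',
        'range_end_date': '2017-01-01',
    },
    rank_order='worst',
    replace=False,
)
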
Example #11
File: __init__.py, Project: dssg/triage
def predict_forward_with_existed_model(db_engine, project_path, model_id,
                                       as_of_date):
    """Predict forward given model_id and as_of_date and store the prediction in database

    Args:
            db_engine (sqlalchemy.db.engine)
            project_path (string) Path used to build the catwalk.storage.ProjectStorage
            model_id (int) The id of a given model in the database
            as_of_date (string) a date string like "YYYY-MM-DD"
    """
    logger.spam("In PREDICT LIST................")
    upgrade_db(db_engine=db_engine)
    project_storage = ProjectStorage(project_path)
    matrix_storage_engine = project_storage.matrix_storage_engine()
    # 1. Get feature and cohort config from database
    (train_matrix_uuid,
     matrix_metadata) = train_matrix_info_from_model_id(db_engine, model_id)
    experiment_config = experiment_config_from_model_id(db_engine, model_id)

    # 2. Generate cohort
    cohort_table_name = f"triage_production.cohort_{experiment_config['cohort_config']['name']}"
    cohort_table_generator = EntityDateTableGenerator(
        db_engine=db_engine,
        query=experiment_config['cohort_config']['query'],
        entity_date_table_name=cohort_table_name)
    cohort_table_generator.generate_entity_date_table(
        as_of_dates=[dt_from_str(as_of_date)])

    # 3. Generate feature aggregations
    feature_generator = FeatureGenerator(
        db_engine=db_engine,
        features_schema_name="triage_production",
        feature_start_time=experiment_config['temporal_config']
        ['feature_start_time'],
    )
    collate_aggregations = feature_generator.aggregations(
        feature_aggregation_config=experiment_config['feature_aggregations'],
        feature_dates=[as_of_date],
        state_table=cohort_table_name)
    feature_generator.process_table_tasks(
        feature_generator.generate_all_table_tasks(collate_aggregations,
                                                   task_type='aggregation'))

    # 4. Reconstruct the feature dictionary from feature_names and generate imputation

    reconstructed_feature_dict = FeatureGroup()
    imputation_table_tasks = OrderedDict()

    for aggregation in collate_aggregations:
        feature_group, feature_names = get_feature_names(
            aggregation, matrix_metadata)
        reconstructed_feature_dict[feature_group] = feature_names

        # Make sure that the features imputed in training are also imputed in production

        features_imputed_in_train = get_feature_needs_imputation_in_train(
            aggregation, feature_names)

        features_imputed_in_production = get_feature_needs_imputation_in_production(
            aggregation, db_engine)

        total_impute_cols = set(features_imputed_in_production) | set(
            features_imputed_in_train)
        total_nonimpute_cols = set(f for f in set(feature_names)
                                   if '_imp' not in f) - total_impute_cols

        task_generator = feature_generator._generate_imp_table_tasks_for

        imputation_table_tasks.update(
            task_generator(aggregation,
                           impute_cols=list(total_impute_cols),
                           nonimpute_cols=list(total_nonimpute_cols)))
    feature_generator.process_table_tasks(imputation_table_tasks)

    # 5. Build matrix
    db_config = {
        "features_schema_name": "triage_production",
        "labels_schema_name": "public",
        "cohort_table_name": cohort_table_name,
    }

    matrix_builder = MatrixBuilder(
        db_config=db_config,
        matrix_storage_engine=matrix_storage_engine,
        engine=db_engine,
        experiment_hash=None,
        replace=True,
    )

    feature_start_time = experiment_config['temporal_config'][
        'feature_start_time']
    label_name = experiment_config['label_config']['name']
    label_type = 'binary'
    cohort_name = experiment_config['cohort_config']['name']
    user_metadata = experiment_config['user_metadata']

    # Use timechop to get the time definition for production
    temporal_config = experiment_config["temporal_config"]
    temporal_config.update(
        temporal_params_from_matrix_metadata(db_engine, model_id))
    timechopper = Timechop(**temporal_config)
    prod_definitions = timechopper.define_test_matrices(
        train_test_split_time=dt_from_str(as_of_date),
        test_duration=temporal_config['test_durations'][0],
        test_label_timespan=temporal_config['test_label_timespans'][0])

    matrix_metadata = Planner.make_metadata(
        prod_definitions[-1],
        reconstructed_feature_dict,
        label_name,
        label_type,
        cohort_name,
        'production',
        feature_start_time,
        user_metadata,
    )

    matrix_metadata['matrix_id'] = str(
        as_of_date) + f'_model_id_{model_id}' + '_risklist'

    matrix_uuid = filename_friendly_hash(matrix_metadata)

    matrix_builder.build_matrix(
        as_of_times=[as_of_date],
        label_name=label_name,
        label_type=label_type,
        feature_dictionary=reconstructed_feature_dict,
        matrix_metadata=matrix_metadata,
        matrix_uuid=matrix_uuid,
        matrix_type="production",
    )

    # 6. Predict the risk score for production
    predictor = Predictor(
        model_storage_engine=project_storage.model_storage_engine(),
        db_engine=db_engine,
        rank_order='best')

    predictor.predict(
        model_id=model_id,
        matrix_store=matrix_storage_engine.get_store(matrix_uuid),
        misc_db_parameters={},
        train_matrix_columns=matrix_storage_engine.get_store(
            train_matrix_uuid).columns())
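
A minimal, hypothetical invocation of the helper above; the connection string, project path, model id and date are placeholders. The as_of_date is passed as a "YYYY-MM-DD" string, matching the docstring.

# Placeholders throughout; only the call signature mirrors the function above.
from sqlalchemy import create_engine

db_engine = create_engine('postgresql://user:password@localhost:5432/triage_db')

predict_forward_with_existed_model(
    db_engine=db_engine,
    project_path='s3://some-bucket/triage-project',
    model_id=42,
    as_of_date='2021-06-01',
)
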
Example #12
File: __init__.py, Project: dssg/triage
    def predict(self, prediction_date):
        """Predict forward by creating a matrix using as_of_date = prediction_date and applying the retrain model on it

        Args:
            prediction_date(str)
        """
        cohort_table_name = f"triage_production.cohort_{self.experiment_config['cohort_config']['name']}_predict"

        # 1. Generate cohort
        self.generate_entity_date_table(prediction_date, cohort_table_name)

        # 2. Generate feature aggregations
        collate_aggregations = self.get_collate_aggregations(
            prediction_date, cohort_table_name)
        self.feature_generator.process_table_tasks(
            self.feature_generator.generate_all_table_tasks(
                collate_aggregations, task_type='aggregation'))
        # 3. Reconstruct the feature dictionary from feature_names and generate imputation
        reconstructed_feature_dict, imputation_table_tasks = self.get_feature_dict_and_imputation_task(
            collate_aggregations, self.retrain_model_id)
        self.feature_generator.process_table_tasks(imputation_table_tasks)

        # 4. Build matrix
        db_config = {
            "features_schema_name": "triage_production",
            "labels_schema_name": "public",
            "cohort_table_name": cohort_table_name,
        }

        matrix_builder = MatrixBuilder(
            db_config=db_config,
            matrix_storage_engine=self.matrix_storage_engine,
            engine=self.db_engine,
            experiment_hash=None,
            replace=True,
        )
        # Use timechop to get the time definition for production
        temporal_config = self.get_temporal_config_for_retrain(
            dt_from_str(prediction_date))
        timechopper = Timechop(**temporal_config)

        retrain_config = get_retrain_config_from_model_id(
            self.db_engine, self.retrain_model_id)

        prod_definitions = timechopper.define_test_matrices(
            train_test_split_time=dt_from_str(prediction_date),
            test_duration=retrain_config['test_duration'],
            test_label_timespan=retrain_config['test_label_timespan'])
        last_split_definition = prod_definitions[-1]
        matrix_metadata = Planner.make_metadata(
            matrix_definition=last_split_definition,
            feature_dictionary=reconstructed_feature_dict,
            label_name=self.label_name,
            label_type='binary',
            cohort_name=self.cohort_name,
            matrix_type='production',
            feature_start_time=self.feature_start_time,
            user_metadata=self.user_metadata,
        )

        matrix_metadata['matrix_id'] = str(
            prediction_date
        ) + f'_model_id_{self.retrain_model_id}' + '_risklist'

        matrix_uuid = filename_friendly_hash(matrix_metadata)

        matrix_builder.build_matrix(
            as_of_times=[prediction_date],
            label_name=self.label_name,
            label_type='binary',
            feature_dictionary=reconstructed_feature_dict,
            matrix_metadata=matrix_metadata,
            matrix_uuid=matrix_uuid,
            matrix_type="production",
        )

        # 5. Predict the risk score for production
        predictor = Predictor(
            model_storage_engine=self.project_storage.model_storage_engine(),
            db_engine=self.db_engine,
            rank_order='best')

        predictor.predict(
            model_id=self.retrain_model_id,
            matrix_store=self.matrix_storage_engine.get_store(matrix_uuid),
            misc_db_parameters={},
            train_matrix_columns=self.matrix_storage_engine.get_store(
                self.retrain_matrix_uuid).columns(),
        )
        self.predict_matrix_uuid = matrix_uuid
예제 #13
0
def test_integration():
    with rig_engines() as (db_engine, project_storage):
        train_store = get_matrix_store(
            project_storage,
            matrix_creator(),
            matrix_metadata_creator(matrix_type="train"),
        )
        as_of_dates = [datetime.date(2016, 12, 21), datetime.date(2017, 1, 21)]

        test_stores = []
        for as_of_date in as_of_dates:
            matrix_store = get_matrix_store(
                project_storage,
                pandas.DataFrame.from_dict({
                    "entity_id": [3],
                    "feature_one": [8],
                    "feature_two": [5],
                    "label": [0],
                }).set_index("entity_id"),
                matrix_metadata_creator(end_time=as_of_date,
                                        indices=["entity_id"]),
            )
            test_stores.append(matrix_store)

        model_storage_engine = ModelStorageEngine(project_storage)

        experiment_hash = save_experiment_and_get_hash({}, db_engine)
        # instantiate pipeline objects
        trainer = ModelTrainer(
            experiment_hash=experiment_hash,
            model_storage_engine=model_storage_engine,
            db_engine=db_engine,
        )
        predictor = Predictor(model_storage_engine, db_engine)
        model_evaluator = ModelEvaluator([{
            "metrics": ["precision@"],
            "thresholds": {
                "top_n": [5]
            }
        }], [{}], db_engine)

        # run the pipeline
        grid_config = {
            "sklearn.linear_model.LogisticRegression": {
                "C": [0.00001, 0.0001],
                "penalty": ["l1", "l2"],
                "random_state": [2193],
            }
        }
        model_ids = trainer.train_models(grid_config=grid_config,
                                         misc_db_parameters=dict(),
                                         matrix_store=train_store)

        for model_id in model_ids:
            for as_of_date, test_store in zip(as_of_dates, test_stores):
                predictions_proba = predictor.predict(
                    model_id,
                    test_store,
                    misc_db_parameters=dict(),
                    train_matrix_columns=["feature_one", "feature_two"],
                )

                model_evaluator.evaluate(predictions_proba, test_store,
                                         model_id)

        # assert
        # 1. that the predictions table entries are present and
        # can be linked to the original models
        records = [
            row for row in db_engine.execute(
                """select entity_id, model_id, as_of_date
            from test_results.predictions
            join model_metadata.models using (model_id)
            order by 3, 2""")
        ]
        assert records == [
            (3, 1, datetime.datetime(2016, 12, 21)),
            (3, 2, datetime.datetime(2016, 12, 21)),
            (3, 3, datetime.datetime(2016, 12, 21)),
            (3, 4, datetime.datetime(2016, 12, 21)),
            (3, 1, datetime.datetime(2017, 1, 21)),
            (3, 2, datetime.datetime(2017, 1, 21)),
            (3, 3, datetime.datetime(2017, 1, 21)),
            (3, 4, datetime.datetime(2017, 1, 21)),
        ]

        # that evaluations are there
        records = [
            row for row in db_engine.execute("""
                select model_id, evaluation_start_time, metric, parameter
                from test_results.evaluations order by 2, 1""")
        ]
        assert records == [
            (1, datetime.datetime(2016, 12, 21), "precision@", "5_abs"),
            (2, datetime.datetime(2016, 12, 21), "precision@", "5_abs"),
            (3, datetime.datetime(2016, 12, 21), "precision@", "5_abs"),
            (4, datetime.datetime(2016, 12, 21), "precision@", "5_abs"),
            (1, datetime.datetime(2017, 1, 21), "precision@", "5_abs"),
            (2, datetime.datetime(2017, 1, 21), "precision@", "5_abs"),
            (3, datetime.datetime(2017, 1, 21), "precision@", "5_abs"),
            (4, datetime.datetime(2017, 1, 21), "precision@", "5_abs"),
        ]