def test_predictor_composite_index():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        init_engine(db_engine)
        project_path = 'econ-dev/inspections'
        model_storage_engine = InMemoryModelStorageEngine(project_path)
        _, model_id = \
            fake_trained_model(project_path, model_storage_engine, db_engine,
                               train_matrix_uuid='1234')
        predictor = Predictor(project_path, model_storage_engine, db_engine)
        dayone = datetime.datetime(2011, 1, 1)
        daytwo = datetime.datetime(2011, 1, 2)
        # create prediction set
        matrix = pandas.DataFrame.from_dict({
            'entity_id': [1, 2, 1, 2],
            'as_of_date': [dayone, dayone, daytwo, daytwo],
            'feature_one': [3, 4, 5, 6],
            'feature_two': [5, 6, 7, 8],
            'label': [7, 8, 8, 7]
        }).set_index(['entity_id', 'as_of_date'])
        metadata = {
            'label_name': 'label',
            'end_time': AS_OF_DATE,
            'label_timespan': '3month',
            'metta-uuid': '1234',
            'indices': ['entity_id', 'as_of_date'],
        }

        # Runs the same test for training and testing predictions
        for mat_type in ("train", "test"):
            # Create the matrix to be tested and store in db
            metadata['matrix_type'] = mat_type
            matrix_store = InMemoryMatrixStore(matrix, metadata)

            # Adding 'label' column back into matrix
            matrix['label'] = [7, 8, 8, 7]

            predict_proba = predictor.predict(
                model_id,
                matrix_store,
                misc_db_parameters=dict(),
                train_matrix_columns=['feature_one', 'feature_two'])

            # assert
            # 1. that the returned predictions are of the desired length
            assert len(predict_proba) == 4

            # 2. that the predictions table entries are present and
            # can be linked to the original models
            records = [
                row for row in db_engine.execute(
                    '''select entity_id, as_of_date
                    from {}_results.{}_predictions
                    join model_metadata.models using (model_id)'''.format(
                        mat_type, mat_type))
            ]
            assert len(records) == 4
def test_predictor():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        init_engine(db_engine)
        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')
            project_path = 'econ-dev/inspections'
            model_storage_engine = S3ModelStorageEngine(project_path)
            _, model_id = \
                fake_trained_model(project_path, model_storage_engine, db_engine,
                                   train_matrix_uuid='1234')
            predictor = Predictor(project_path, model_storage_engine, db_engine)
            # create prediction set
            matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': [7, 8]
            }).set_index('entity_id')
            metadata = {
                'label_name': 'label',
                'end_time': AS_OF_DATE,
                'label_timespan': '3month',
                'metta-uuid': '1234',
                'indices': ['entity_id'],
            }
            train_matrix_columns = ['feature_one', 'feature_two']

            # Runs the same test for training and testing predictions
            for mat_type in ("train", "test"):
                # Create the matrix to be tested and store in db
                metadata['matrix_type'] = mat_type
                matrix_store = InMemoryMatrixStore(matrix, metadata)

                # Note: the first time 'matrix' is used, the label column is
                # popped. It must be added back into 'matrix' to create
                # another matrix_store.
                matrix['label'] = [7, 8]

                predict_proba = predictor.predict(
                    model_id,
                    matrix_store,
                    misc_db_parameters=dict(),
                    train_matrix_columns=train_matrix_columns)

                # assert
                # 1. that the returned predictions are of the desired length
                assert len(predict_proba) == 2

                # 2. that the predictions table entries are present and
                # can be linked to the original models
                records = [
                    row for row in db_engine.execute(
                        '''select entity_id, as_of_date
                        from {}_results.{}_predictions
                        join model_metadata.models using (model_id)'''.format(
                            mat_type, mat_type))
                ]
                assert len(records) == 2

                # 3. that the contained as_of_dates match what we sent in
                for record in records:
                    assert record[1].date() == AS_OF_DATE

                # 4. that the entity ids match the given dataset
                assert sorted([record[0] for record in records]) == [1, 2]

            # 5. running with same model_id, different as_of_date,
            # then with same as_of_date, only replaces the records
            # with the same date
            new_matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': [7, 8]
            }).set_index('entity_id')
            new_metadata = {
                'label_name': 'label',
                'end_time': AS_OF_DATE + datetime.timedelta(days=1),
                'label_timespan': '3month',
                'metta-uuid': '1234',
                'indices': ['entity_id'],
            }

            # Runs the same test for training and testing predictions
            for mat_type in ("train", "test"):
                # Create the matrix to be tested and store in db
                new_metadata['matrix_type'] = mat_type
                new_matrix_store = InMemoryMatrixStore(new_matrix, new_metadata)

                # Adding 'label' column back into new_matrix
                new_matrix['label'] = [7, 8]

                predictor.predict(
                    model_id,
                    new_matrix_store,
                    misc_db_parameters=dict(),
                    train_matrix_columns=train_matrix_columns)
                predictor.predict(
                    model_id,
                    matrix_store,
                    misc_db_parameters=dict(),
                    train_matrix_columns=train_matrix_columns)
                records = [
                    row for row in db_engine.execute(
                        '''select entity_id, as_of_date
                        from {}_results.{}_predictions
                        join model_metadata.models using (model_id)'''.format(
                            mat_type, mat_type))
                ]
                assert len(records) == 4

            # 6. that we can delete the model when done predicting with it
            predictor.delete_model(model_id)
            assert predictor.load_model(model_id) is None
def test_predictor_entity_index():
    with prepare() as (project_storage, db_engine, model_id):
        predictor = Predictor(project_storage.model_storage_engine(), db_engine)

        # Runs the same test for training and testing predictions
        for mat_type in ("train", "test"):
            matrix = matrix_creator(index="entity_id")
            metadata = matrix_metadata_creator(
                end_time=AS_OF_DATE, matrix_type=mat_type, indices=["entity_id"])
            matrix_store = get_matrix_store(project_storage, matrix, metadata)
            train_matrix_columns = matrix.columns[0:-1].tolist()

            predict_proba = predictor.predict(
                model_id,
                matrix_store,
                misc_db_parameters=dict(),
                train_matrix_columns=train_matrix_columns,
            )

            # assert
            # 1. that the returned predictions are of the desired length
            assert len(predict_proba) == 2

            # 2. that the predictions table entries are present and
            # can be linked to the original models
            records = [
                row for row in db_engine.execute(
                    """select entity_id, as_of_date
                    from {}_results.predictions
                    join model_metadata.models using (model_id)""".format(mat_type))
            ]
            assert len(records) == 2

            # 3. that the contained as_of_dates match what we sent in
            for record in records:
                assert record[1].date() == AS_OF_DATE

            # 4. that the entity ids match the given dataset
            assert sorted([record[0] for record in records]) == [1, 2]

        # 5. running with same model_id, different as_of_date,
        # then with same as_of_date, only replaces the records
        # with the same date

        # Runs the same test for training and testing predictions
        for mat_type in ("train", "test"):
            new_matrix = matrix_creator(index="entity_id")
            new_metadata = matrix_metadata_creator(
                end_time=AS_OF_DATE + datetime.timedelta(days=1),
                matrix_type=mat_type,
                indices=["entity_id"],
            )
            new_matrix_store = get_matrix_store(project_storage, new_matrix,
                                                new_metadata)

            predictor.predict(
                model_id,
                new_matrix_store,
                misc_db_parameters=dict(),
                train_matrix_columns=train_matrix_columns,
            )
            predictor.predict(
                model_id,
                matrix_store,
                misc_db_parameters=dict(),
                train_matrix_columns=train_matrix_columns,
            )
            records = [
                row for row in db_engine.execute(
                    """select entity_id, as_of_date
                    from {}_results.predictions
                    join model_metadata.models using (model_id)""".format(mat_type))
            ]
            assert len(records) == 4

        # 6. that we can delete the model when done predicting with it
        predictor.delete_model(model_id)
        assert predictor.load_model(model_id) is None
def test_integration():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        init_engine(db_engine)
        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')
            project_path = 'econ-dev/inspections'

            # create train and test matrices
            train_matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': [7, 8]
            }).set_index('entity_id')
            train_metadata = {
                'feature_start_time': datetime.date(2012, 12, 20),
                'end_time': datetime.date(2016, 12, 20),
                'label_name': 'label',
                'label_timespan': '1y',
                'feature_names': ['ft1', 'ft2'],
                'metta-uuid': '1234',
                'indices': ['entity_id'],
                'matrix_type': 'train'
            }
            # Creates a matrix entry in the matrices table with uuid from train_metadata
            MatrixFactory(matrix_uuid="1234")
            session.commit()

            train_store = InMemoryMatrixStore(train_matrix, sample_metadata())

            as_of_dates = [
                datetime.date(2016, 12, 21),
                datetime.date(2017, 1, 21)
            ]
            test_stores = [
                InMemoryMatrixStore(
                    pandas.DataFrame.from_dict({
                        'entity_id': [3],
                        'feature_one': [8],
                        'feature_two': [5],
                        'label': [5]
                    }), {
                        'label_name': 'label',
                        'label_timespan': '1y',
                        'end_time': as_of_date,
                        'metta-uuid': '1234',
                        'indices': ['entity_id'],
                        'matrix_type': 'test',
                        'as_of_date_frequency': '1month'
                    })
                for as_of_date in as_of_dates
            ]

            model_storage_engine = S3ModelStorageEngine(project_path)

            experiment_hash = save_experiment_and_get_hash({}, db_engine)

            # instantiate pipeline objects
            trainer = ModelTrainer(
                project_path=project_path,
                experiment_hash=experiment_hash,
                model_storage_engine=model_storage_engine,
                db_engine=db_engine,
            )
            predictor = Predictor(project_path, model_storage_engine, db_engine)
            model_evaluator = ModelEvaluator([{
                'metrics': ['precision@'],
                'thresholds': {'top_n': [5]}
            }], [{}], db_engine)

            # run the pipeline
            grid_config = {
                'sklearn.linear_model.LogisticRegression': {
                    'C': [0.00001, 0.0001],
                    'penalty': ['l1', 'l2'],
                    'random_state': [2193]
                }
            }
            model_ids = trainer.train_models(grid_config=grid_config,
                                             misc_db_parameters=dict(),
                                             matrix_store=train_store)

            for model_id in model_ids:
                for as_of_date, test_store in zip(as_of_dates, test_stores):
                    predictions_proba = predictor.predict(
                        model_id,
                        test_store,
                        misc_db_parameters=dict(),
                        train_matrix_columns=['feature_one', 'feature_two'])

                    model_evaluator.evaluate(
                        predictions_proba,
                        test_store,
                        model_id,
                    )

            # assert
            # 1. that the predictions table entries are present and
            # can be linked to the original models
            records = [
                row for row in db_engine.execute(
                    '''select entity_id, model_id, as_of_date
                    from test_results.test_predictions
                    join model_metadata.models using (model_id)
                    order by 3, 2''')
            ]
            assert records == [
                (3, 1, datetime.datetime(2016, 12, 21)),
                (3, 2, datetime.datetime(2016, 12, 21)),
                (3, 3, datetime.datetime(2016, 12, 21)),
                (3, 4, datetime.datetime(2016, 12, 21)),
                (3, 1, datetime.datetime(2017, 1, 21)),
                (3, 2, datetime.datetime(2017, 1, 21)),
                (3, 3, datetime.datetime(2017, 1, 21)),
                (3, 4, datetime.datetime(2017, 1, 21)),
            ]

            # that evaluations are there
            records = [
                row for row in db_engine.execute('''
                    select model_id, evaluation_start_time, metric, parameter
                    from test_results.test_evaluations order by 2, 1''')
            ]
            assert records == [
                (1, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (2, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (3, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (4, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (1, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
                (2, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
                (3, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
                (4, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
            ]
def test_predictor_retrieve():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        init_engine(db_engine)
        project_path = 'econ-dev/inspections'
        model_storage_engine = InMemoryModelStorageEngine(project_path)
        _, model_id = \
            fake_trained_model(project_path, model_storage_engine, db_engine,
                               train_matrix_uuid='1234')
        predictor = Predictor(project_path, model_storage_engine, db_engine,
                              replace=False)
        dayone = datetime.date(2011, 1, 1).strftime(
            predictor.expected_matrix_ts_format)
        daytwo = datetime.date(2011, 1, 2).strftime(
            predictor.expected_matrix_ts_format)
        # create prediction set
        matrix_data = {
            'entity_id': [1, 2, 1, 2],
            'as_of_date': [dayone, dayone, daytwo, daytwo],
            'feature_one': [3, 4, 5, 6],
            'feature_two': [5, 6, 7, 8],
            'label': [7, 8, 8, 7]
        }
        matrix = pandas.DataFrame.from_dict(matrix_data)\
            .set_index(['entity_id', 'as_of_date'])
        metadata = {
            'label_name': 'label',
            'end_time': AS_OF_DATE,
            'label_timespan': '3month',
            'metta-uuid': '1234',
            'indices': ['entity_id', 'as_of_date'],
            'matrix_type': 'test'
        }
        matrix_store = InMemoryMatrixStore(matrix, metadata)
        predict_proba = predictor.predict(
            model_id,
            matrix_store,
            misc_db_parameters=dict(),
            train_matrix_columns=['feature_one', 'feature_two'])

        # When run again, the predictions retrieved from the database
        # should match.
        #
        # Some trickiness here. Let's explain:
        #
        # If we are not careful, retrieving predictions from the database and
        # presenting them as a numpy array can result in a bad ordering,
        # since the given matrix may not be 'ordered' by some criteria
        # that can be easily represented by an ORDER BY clause.
        #
        # It will sometimes work, because without ORDER BY you will get
        # it back in the table's physical order, which unless something has
        # happened to the table will be the order you inserted it,
        # which could very well be the order in the matrix.
        # So it's not a bug that would necessarily immediately show itself,
        # but when it does go wrong your scores will be garbage.
        #
        # So we simulate a table order mutation that can happen over time:
        # Remove the first row and put it at the end.
        # If the Predictor doesn't explicitly reorder the results, this will fail.
        # Only running on TestPrediction because TrainPrediction behaves
        # the exact same way.
        reorder_session = sessionmaker(bind=db_engine)()
        obj = reorder_session.query(TestPrediction).first()
        reorder_session.delete(obj)
        reorder_session.commit()

        make_transient(obj)

        reorder_session = sessionmaker(bind=db_engine)()
        reorder_session.add(obj)
        reorder_session.commit()

        predictor.load_model = Mock()
        new_predict_proba = predictor.predict(
            model_id,
            matrix_store,
            misc_db_parameters=dict(),
            train_matrix_columns=['feature_one', 'feature_two'])
        assert_array_equal(new_predict_proba, predict_proba)
        assert not predictor.load_model.called
def test_predictor_retrieve():
    with prepare() as (project_storage, db_engine, model_id):
        predictor = Predictor(project_storage.model_storage_engine(), db_engine,
                              replace=False)
        # create prediction set
        matrix = matrix_creator()
        metadata = matrix_metadata_creator()

        matrix_store = get_matrix_store(project_storage, matrix, metadata)

        predict_proba = predictor.predict(
            model_id,
            matrix_store,
            misc_db_parameters=dict(),
            train_matrix_columns=matrix.columns[0:-1].tolist(),
        )

        # When run again, the predictions retrieved from the database
        # should match.
        #
        # Some trickiness here. Let's explain:
        #
        # If we are not careful, retrieving predictions from the database and
        # presenting them as a numpy array can result in a bad ordering,
        # since the given matrix may not be 'ordered' by some criteria
        # that can be easily represented by an ORDER BY clause.
        #
        # It will sometimes work, because without ORDER BY you will get
        # it back in the table's physical order, which unless something has
        # happened to the table will be the order you inserted it,
        # which could very well be the order in the matrix.
        # So it's not a bug that would necessarily immediately show itself,
        # but when it does go wrong your scores will be garbage.
        #
        # So we simulate a table order mutation that can happen over time:
        # Remove the first row and put it at the end.
        # If the Predictor doesn't explicitly reorder the results, this will fail.
        # Only running on TestPrediction because TrainPrediction behaves
        # the exact same way.
        try:
            reorder_session = sessionmaker(bind=db_engine)()
            obj = reorder_session.query(TestPrediction).first()
            reorder_session.delete(obj)
            reorder_session.commit()
        finally:
            reorder_session.close()

        make_transient(obj)

        try:
            reorder_session = sessionmaker(bind=db_engine)()
            reorder_session.add(obj)
            reorder_session.commit()
        finally:
            reorder_session.close()

        predictor.load_model = Mock()
        new_predict_proba = predictor.predict(
            model_id,
            matrix_store,
            misc_db_parameters=dict(),
            train_matrix_columns=matrix.columns[0:-1].tolist(),
        )
        assert_array_equal(new_predict_proba, predict_proba)
        assert not predictor.load_model.called
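# A minimal sketch of the explicit reordering that the comment in the test
# above warns about. This is not the Predictor's actual retrieval code: the
# table name comes from the schemas used in these tests, and the join back
# onto the matrix index assumes the matrix is indexed by entity_id and
# as_of_date.
import pandas as pd

def retrieve_scores_in_matrix_order(db_engine, model_id, matrix_store):
    stored = pd.read_sql(
        "select entity_id, as_of_date, score "
        "from test_results.predictions "
        "where model_id = %(model_id)s",
        db_engine,
        params={"model_id": model_id},
    )
    # Never rely on the table's physical order: merge the stored rows back
    # onto the matrix index so the returned array lines up with the matrix.
    index_frame = matrix_store.design_matrix.index.to_frame(index=False)
    aligned = index_frame.merge(stored, on=["entity_id", "as_of_date"], how="left")
    return aligned["score"].to_numpy()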
class ModelTester(object):
    def __init__(
        self,
        db_engine,
        model_storage_engine,
        matrix_storage_engine,
        replace,
        evaluator_config,
        individual_importance_config,
    ):
        self.matrix_storage_engine = matrix_storage_engine
        self.predictor = Predictor(
            db_engine=db_engine,
            model_storage_engine=model_storage_engine,
            replace=replace,
        )

        self.individual_importance_calculator = IndividualImportanceCalculator(
            db_engine=db_engine,
            n_ranks=individual_importance_config.get("n_ranks", 5),
            methods=individual_importance_config.get("methods", ["uniform"]),
            replace=replace,
        )

        self.evaluator = ModelEvaluator(
            db_engine=db_engine,
            sort_seed=evaluator_config.get("sort_seed", None),
            testing_metric_groups=evaluator_config.get("testing_metric_groups", []),
            training_metric_groups=evaluator_config.get("training_metric_groups", []),
        )

    def generate_model_test_tasks(self, split, train_store, model_ids):
        test_tasks = []
        for test_matrix_def, test_uuid in zip(
            split["test_matrices"], split["test_uuids"]
        ):
            test_store = self.matrix_storage_engine.get_store(test_uuid)

            if test_store.empty:
                logging.warning(
                    "Test matrix for uuid %s was empty, no point in generating "
                    "predictions. Not creating test task.",
                    test_uuid,
                )
                continue
            test_tasks.append({
                "test_store": test_store,
                "train_store": train_store,
                "model_ids": [model_id for model_id in model_ids if model_id],
            })
        return test_tasks

    def process_model_test_task(self, test_store, train_store, model_ids):
        as_of_times = test_store.metadata["as_of_times"]
        logging.info(
            "Testing and scoring all model ids with test matrix %s. "
            "as_of_times min: %s max: %s num: %s",
            test_store.uuid,
            min(as_of_times),
            max(as_of_times),
            len(as_of_times),
        )

        for model_id in model_ids:
            logging.info("Testing model id %s", model_id)
            self.individual_importance_calculator.calculate_and_save_all_methods_and_dates(
                model_id, test_store
            )

            # Generate predictions for the testing data then training data
            for store in (test_store, train_store):
                if self.evaluator.needs_evaluations(store, model_id):
                    logging.info(
                        "The evaluations needed for matrix %s-%s and model %s "
                        "are not all present in db, so predicting and evaluating",
                        store.uuid,
                        store.matrix_type,
                        model_id,
                    )
                    predictions_proba = self.predictor.predict(
                        model_id,
                        store,
                        misc_db_parameters=dict(),
                        train_matrix_columns=train_store.columns(),
                    )
                    self.evaluator.evaluate(
                        predictions_proba=predictions_proba,
                        matrix_store=store,
                        model_id=model_id,
                    )
                else:
                    logging.info(
                        "The evaluations needed for matrix %s-%s and model %s "
                        "are all present in db from a previous run "
                        "(or none needed at all), so skipping!",
                        store.uuid,
                        store.matrix_type,
                        model_id,
                    )
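# Hypothetical wiring for the ModelTester above, assuming a ProjectStorage
# rooted at a local path and an existing triage results database; the config
# values and the `split`, `train_store`, and `model_ids` inputs (normally
# produced by the training stage of an experiment) are placeholders.
from sqlalchemy import create_engine

db_engine = create_engine("postgresql://localhost/triage")
project_storage = ProjectStorage("/path/to/project")

tester = ModelTester(
    db_engine=db_engine,
    model_storage_engine=project_storage.model_storage_engine(),
    matrix_storage_engine=project_storage.matrix_storage_engine(),
    replace=False,
    evaluator_config={"testing_metric_groups": [], "training_metric_groups": []},
    individual_importance_config={},
)
for task in tester.generate_model_test_tasks(split, train_store, model_ids):
    tester.process_model_test_task(**task)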
class ModelTester(object):
    def __init__(self, db_engine, project_path, model_storage_engine, replace,
                 evaluator_config, individual_importance_config):
        self.predictor = Predictor(db_engine=db_engine,
                                   model_storage_engine=model_storage_engine,
                                   project_path=project_path,
                                   replace=replace)

        self.individual_importance_calculator = IndividualImportanceCalculator(
            db_engine=db_engine,
            n_ranks=individual_importance_config.get('n_ranks', 5),
            methods=individual_importance_config.get('methods', ['uniform']),
            replace=replace)

        self.evaluator = ModelEvaluator(
            db_engine=db_engine,
            sort_seed=evaluator_config.get('sort_seed', None),
            metric_groups=evaluator_config['metric_groups'],
            training_metric_groups=evaluator_config['training_metric_groups'])

    def generate_model_test_tasks(self, split, train_store, model_ids,
                                  matrix_store_creator):
        test_tasks = []
        for test_matrix_def, test_uuid in zip(split['test_matrices'],
                                              split['test_uuids']):
            test_store = matrix_store_creator(test_uuid)

            if test_store.empty:
                logging.warning(
                    'Test matrix for uuid %s was empty, no point in generating '
                    'predictions. Not creating test task.',
                    test_uuid)
                continue
            test_tasks.append({
                'test_store': test_store,
                'train_store': train_store,
                'model_ids': [model_id for model_id in model_ids if model_id]
            })
        return test_tasks

    def process_model_test_task(self, test_store, train_store, model_ids):
        as_of_times = test_store.metadata['as_of_times']
        logging.info(
            'Testing and scoring all model ids with test matrix %s. '
            'as_of_times min: %s max: %s num: %s',
            test_store.uuid, min(as_of_times), max(as_of_times), len(as_of_times))

        for model_id in model_ids:
            logging.info('Testing model id %s', model_id)
            self.individual_importance_calculator\
                .calculate_and_save_all_methods_and_dates(model_id, test_store)

            # Generate predictions for the testing data then training data
            for store in (test_store, train_store):
                predictions_proba = self.predictor.predict(
                    model_id,
                    store,
                    misc_db_parameters=dict(),
                    train_matrix_columns=train_store.columns())

                self.evaluator.evaluate(
                    predictions_proba=predictions_proba,
                    matrix_store=store,
                    model_id=model_id,
                )
def predictor(predict_setup_args):
    (project_storage, db_engine, model_id) = predict_setup_args
    return Predictor(project_storage.model_storage_engine(), db_engine,
                     rank_order='worst')
def add_predictions(db_engine,
                    model_groups,
                    project_path,
                    experiment_hashes=None,
                    train_end_times_range=None,
                    rank_order='worst',
                    replace=True):
    """For a set of model groups, generate test predictions and write them to the DB.

    Args:
        db_engine: Sqlalchemy engine
        model_groups (list): The list of model group ids we are interested in
            (ideally, chosen through audition)
        project_path (str): Path where the created matrices and trained model
            objects are stored for the experiment
        experiment_hashes (List[str]): Optional. Hash(es) of the experiments we
            are interested in. Can be used to narrow down the model_ids in the
            model groups specified
        train_end_times_range (Dict): Optional. If provided, only the models
            with train_end_times that fall in the range are scored. This too
            helps narrow down model_ids in the model groups specified. A
            dictionary with two possible keys, 'range_start_date' and
            'range_end_date'. Either or both can be set.
        rank_order (str): How to deal with ties in the scores
        replace (bool): Whether to overwrite the predictions for a model_id,
            if already found in the DB

    Returns: None. This directly writes to the test_results.predictions table.
    """
    model_matrix_info = _fetch_relevant_model_matrix_info(
        db_engine=db_engine,
        model_groups=model_groups,
        experiment_hashes=experiment_hashes)

    # If we are only generating predictions for a specific time range
    if train_end_times_range is not None:
        if 'range_start_date' in train_end_times_range:
            range_start = train_end_times_range['range_start_date']
            msk = (model_matrix_info['train_end_time'] >= range_start)
            logging.info(
                'Filtering out models with a train_end_time before {}'.format(
                    range_start))
            model_matrix_info = model_matrix_info[msk]

        if 'range_end_date' in train_end_times_range:
            range_end = train_end_times_range['range_end_date']
            msk = (model_matrix_info['train_end_time'] <= range_end)
            logging.info(
                'Filtering out models with a train_end_time after {}'.format(
                    range_end))
            model_matrix_info = model_matrix_info[msk]

    if len(model_matrix_info) == 0:
        raise ValueError('Config is not valid. No models were found!')

    # All the model groups specified in the config file should be valid
    # (even if the experiment_hashes and train_end_times are specified)
    not_fetched_model_grps = [
        x for x in model_groups
        if x not in model_matrix_info['model_group_id'].unique()
    ]
    if len(not_fetched_model_grps) > 0:
        raise ValueError(
            'The config is not valid. No models were found for the model '
            'group(s) {}. All specified model groups should be present'.format(
                not_fetched_model_grps))

    logging.info('Scoring {} model ids'.format(len(model_matrix_info)))

    # summary of the models that we are scoring, to check any special things worth noting
    _summary_of_models(model_matrix_info)

    logging.info('Instantiating storage engines and the predictor')

    # Storage objects to handle already stored models and matrices
    project_storage = ProjectStorage(project_path)
    model_storage_engine = project_storage.model_storage_engine()
    matrix_storage_engine = project_storage.matrix_storage_engine()

    # Prediction generation is handled by the Predictor class in catwalk
    predictor = Predictor(model_storage_engine=model_storage_engine,
                          db_engine=db_engine,
                          rank_order=rank_order,
                          replace=replace,
                          save_predictions=True)

    # Organizing the prediction run over unique (train_matrix, test_matrix)
    # pairs, to reduce the number of times the matrices get loaded into memory
    groupby_obj = model_matrix_info.groupby(
        ['train_matrix_uuid', 'test_matrix_uuid'])

    for group, _ in groupby_obj:
        train_uuid = group[0]
        test_uuid = group[1]

        df_grp = groupby_obj.get_group(group)

        logging.info(
            'Processing {} model_ids for train matrix {} and test matrix {}'.format(
                len(df_grp), train_uuid, test_uuid))

        train_matrix_store = matrix_storage_engine.get_store(
            matrix_uuid=train_uuid)
        # To ensure that the column order we use for predictions matches
        # the order we used in model training
        train_matrix_columns = list(train_matrix_store.design_matrix.columns)

        test_matrix_store = matrix_storage_engine.get_store(
            matrix_uuid=test_uuid)

        for model_id in df_grp['model_id'].tolist():
            logging.info(
                'Writing predictions for model_id {}'.format(model_id))
            predictor.predict(model_id=model_id,
                              matrix_store=test_matrix_store,
                              train_matrix_columns=train_matrix_columns,
                              misc_db_parameters={})

    logging.info('Successfully generated predictions for {} models!'.format(
        len(model_matrix_info)))
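# An illustrative call to add_predictions, assuming an experiment has already
# been run against this database and project path; the model group ids and
# date bounds here are placeholders.
from sqlalchemy import create_engine

db_engine = create_engine("postgresql://localhost/triage")
add_predictions(
    db_engine=db_engine,
    model_groups=[4, 5],
    project_path="/path/to/project",
    train_end_times_range={
        "range_start_date": "2015-01-01",
        "range_end_date": "2017-01-01",
    },
    rank_order="worst",
    replace=False,
)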
def predict_forward_with_existed_model(db_engine, project_path, model_id, as_of_date):
    """Predict forward given a model_id and an as_of_date, and store the
    predictions in the database.

    Args:
        db_engine (sqlalchemy.engine) An engine connected to the database
        project_path (str) Path to the project directory
            (see catwalk.storage.ProjectStorage)
        model_id (int) The id of a given model in the database
        as_of_date (string) A date string like "YYYY-MM-DD"
    """
    logger.spam("In PREDICT LIST................")
    upgrade_db(db_engine=db_engine)
    project_storage = ProjectStorage(project_path)
    matrix_storage_engine = project_storage.matrix_storage_engine()

    # 1. Get feature and cohort config from database
    (train_matrix_uuid, matrix_metadata) = train_matrix_info_from_model_id(
        db_engine, model_id)
    experiment_config = experiment_config_from_model_id(db_engine, model_id)

    # 2. Generate cohort
    cohort_table_name = (
        f"triage_production.cohort_{experiment_config['cohort_config']['name']}")
    cohort_table_generator = EntityDateTableGenerator(
        db_engine=db_engine,
        query=experiment_config['cohort_config']['query'],
        entity_date_table_name=cohort_table_name)
    cohort_table_generator.generate_entity_date_table(
        as_of_dates=[dt_from_str(as_of_date)])

    # 3. Generate feature aggregations
    feature_generator = FeatureGenerator(
        db_engine=db_engine,
        features_schema_name="triage_production",
        feature_start_time=experiment_config['temporal_config']['feature_start_time'],
    )
    collate_aggregations = feature_generator.aggregations(
        feature_aggregation_config=experiment_config['feature_aggregations'],
        feature_dates=[as_of_date],
        state_table=cohort_table_name)
    feature_generator.process_table_tasks(
        feature_generator.generate_all_table_tasks(collate_aggregations,
                                                   task_type='aggregation'))

    # 4. Reconstruct the feature dictionary from feature_names and generate imputation
    reconstructed_feature_dict = FeatureGroup()
    imputation_table_tasks = OrderedDict()

    for aggregation in collate_aggregations:
        feature_group, feature_names = get_feature_names(
            aggregation, matrix_metadata)
        reconstructed_feature_dict[feature_group] = feature_names

        # Make sure that the features imputed in training are also imputed in production
        features_imputed_in_train = get_feature_needs_imputation_in_train(
            aggregation, feature_names)
        features_imputed_in_production = get_feature_needs_imputation_in_production(
            aggregation, db_engine)

        total_impute_cols = set(features_imputed_in_production) | set(
            features_imputed_in_train)
        total_nonimpute_cols = set(
            f for f in set(feature_names) if '_imp' not in f) - total_impute_cols

        task_generator = feature_generator._generate_imp_table_tasks_for

        imputation_table_tasks.update(
            task_generator(aggregation,
                           impute_cols=list(total_impute_cols),
                           nonimpute_cols=list(total_nonimpute_cols)))
    feature_generator.process_table_tasks(imputation_table_tasks)

    # 5. Build matrix
    db_config = {
        "features_schema_name": "triage_production",
        "labels_schema_name": "public",
        "cohort_table_name": cohort_table_name,
    }

    matrix_builder = MatrixBuilder(
        db_config=db_config,
        matrix_storage_engine=matrix_storage_engine,
        engine=db_engine,
        experiment_hash=None,
        replace=True,
    )

    feature_start_time = experiment_config['temporal_config']['feature_start_time']
    label_name = experiment_config['label_config']['name']
    label_type = 'binary'
    cohort_name = experiment_config['cohort_config']['name']
    user_metadata = experiment_config['user_metadata']

    # Use timechop to get the time definition for production
    temporal_config = experiment_config["temporal_config"]
    temporal_config.update(
        temporal_params_from_matrix_metadata(db_engine, model_id))
    timechopper = Timechop(**temporal_config)
    prod_definitions = timechopper.define_test_matrices(
        train_test_split_time=dt_from_str(as_of_date),
        test_duration=temporal_config['test_durations'][0],
        test_label_timespan=temporal_config['test_label_timespans'][0])

    matrix_metadata = Planner.make_metadata(
        prod_definitions[-1],
        reconstructed_feature_dict,
        label_name,
        label_type,
        cohort_name,
        'production',
        feature_start_time,
        user_metadata,
    )

    matrix_metadata['matrix_id'] = (
        str(as_of_date) + f'_model_id_{model_id}' + '_risklist')

    matrix_uuid = filename_friendly_hash(matrix_metadata)

    matrix_builder.build_matrix(
        as_of_times=[as_of_date],
        label_name=label_name,
        label_type=label_type,
        feature_dictionary=reconstructed_feature_dict,
        matrix_metadata=matrix_metadata,
        matrix_uuid=matrix_uuid,
        matrix_type="production",
    )

    # 6. Predict the risk score for production
    predictor = Predictor(
        model_storage_engine=project_storage.model_storage_engine(),
        db_engine=db_engine,
        rank_order='best')

    predictor.predict(
        model_id=model_id,
        matrix_store=matrix_storage_engine.get_store(matrix_uuid),
        misc_db_parameters={},
        train_matrix_columns=matrix_storage_engine.get_store(
            train_matrix_uuid).columns())
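# An illustrative call; the model id and date are placeholders, and the
# project path must contain the matrices and model objects saved by the
# original experiment run.
from sqlalchemy import create_engine

db_engine = create_engine("postgresql://localhost/triage")
predict_forward_with_existed_model(
    db_engine=db_engine,
    project_path="/path/to/project",
    model_id=42,
    as_of_date="2021-06-01",
)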
def predict(self, prediction_date):
    """Predict forward by creating a matrix with as_of_date = prediction_date
    and applying the retrained model to it.

    Args:
        prediction_date (str)
    """
    cohort_table_name = (
        f"triage_production.cohort_"
        f"{self.experiment_config['cohort_config']['name']}_predict")

    # 1. Generate cohort
    self.generate_entity_date_table(prediction_date, cohort_table_name)

    # 2. Generate feature aggregations
    collate_aggregations = self.get_collate_aggregations(
        prediction_date, cohort_table_name)
    self.feature_generator.process_table_tasks(
        self.feature_generator.generate_all_table_tasks(
            collate_aggregations, task_type='aggregation'))

    # 3. Reconstruct the feature dictionary from feature_names and generate imputation
    reconstructed_feature_dict, imputation_table_tasks = \
        self.get_feature_dict_and_imputation_task(
            collate_aggregations, self.retrain_model_id)
    self.feature_generator.process_table_tasks(imputation_table_tasks)

    # 4. Build matrix
    db_config = {
        "features_schema_name": "triage_production",
        "labels_schema_name": "public",
        "cohort_table_name": cohort_table_name,
    }

    matrix_builder = MatrixBuilder(
        db_config=db_config,
        matrix_storage_engine=self.matrix_storage_engine,
        engine=self.db_engine,
        experiment_hash=None,
        replace=True,
    )

    # Use timechop to get the time definition for production
    temporal_config = self.get_temporal_config_for_retrain(
        dt_from_str(prediction_date))
    timechopper = Timechop(**temporal_config)

    retrain_config = get_retrain_config_from_model_id(
        self.db_engine, self.retrain_model_id)

    prod_definitions = timechopper.define_test_matrices(
        train_test_split_time=dt_from_str(prediction_date),
        test_duration=retrain_config['test_duration'],
        test_label_timespan=retrain_config['test_label_timespan'])
    last_split_definition = prod_definitions[-1]

    matrix_metadata = Planner.make_metadata(
        matrix_definition=last_split_definition,
        feature_dictionary=reconstructed_feature_dict,
        label_name=self.label_name,
        label_type='binary',
        cohort_name=self.cohort_name,
        matrix_type='production',
        feature_start_time=self.feature_start_time,
        user_metadata=self.user_metadata,
    )
    matrix_metadata['matrix_id'] = (
        str(prediction_date) + f'_model_id_{self.retrain_model_id}' + '_risklist')
    matrix_uuid = filename_friendly_hash(matrix_metadata)

    matrix_builder.build_matrix(
        as_of_times=[prediction_date],
        label_name=self.label_name,
        label_type='binary',
        feature_dictionary=reconstructed_feature_dict,
        matrix_metadata=matrix_metadata,
        matrix_uuid=matrix_uuid,
        matrix_type="production",
    )

    # 5. Predict the risk score for production
    predictor = Predictor(
        model_storage_engine=self.project_storage.model_storage_engine(),
        db_engine=self.db_engine,
        rank_order='best')

    predictor.predict(
        model_id=self.retrain_model_id,
        matrix_store=self.matrix_storage_engine.get_store(matrix_uuid),
        misc_db_parameters={},
        train_matrix_columns=self.matrix_storage_engine.get_store(
            self.retrain_matrix_uuid).columns(),
    )
    self.predict_matrix_uuid = matrix_uuid
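# Hypothetical usage of the predict method above, assuming `retrainer` is an
# already-configured instance of the class this method belongs to, with a
# retrained model on hand; the date is a placeholder.
retrainer.predict("2021-06-01")
# The uuid of the production matrix is kept for downstream lookups:
prod_matrix_store = retrainer.matrix_storage_engine.get_store(
    retrainer.predict_matrix_uuid)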
def test_integration():
    with rig_engines() as (db_engine, project_storage):
        train_store = get_matrix_store(
            project_storage,
            matrix_creator(),
            matrix_metadata_creator(matrix_type="train"),
        )
        as_of_dates = [datetime.date(2016, 12, 21), datetime.date(2017, 1, 21)]

        test_stores = []
        for as_of_date in as_of_dates:
            matrix_store = get_matrix_store(
                project_storage,
                pandas.DataFrame.from_dict({
                    "entity_id": [3],
                    "feature_one": [8],
                    "feature_two": [5],
                    "label": [0],
                }).set_index("entity_id"),
                matrix_metadata_creator(end_time=as_of_date, indices=["entity_id"]),
            )
            test_stores.append(matrix_store)

        model_storage_engine = ModelStorageEngine(project_storage)

        experiment_hash = save_experiment_and_get_hash({}, db_engine)

        # instantiate pipeline objects
        trainer = ModelTrainer(
            experiment_hash=experiment_hash,
            model_storage_engine=model_storage_engine,
            db_engine=db_engine,
        )
        predictor = Predictor(model_storage_engine, db_engine)
        model_evaluator = ModelEvaluator(
            [{"metrics": ["precision@"], "thresholds": {"top_n": [5]}}],
            [{}],
            db_engine,
        )

        # run the pipeline
        grid_config = {
            "sklearn.linear_model.LogisticRegression": {
                "C": [0.00001, 0.0001],
                "penalty": ["l1", "l2"],
                "random_state": [2193],
            }
        }
        model_ids = trainer.train_models(grid_config=grid_config,
                                         misc_db_parameters=dict(),
                                         matrix_store=train_store)

        for model_id in model_ids:
            for as_of_date, test_store in zip(as_of_dates, test_stores):
                predictions_proba = predictor.predict(
                    model_id,
                    test_store,
                    misc_db_parameters=dict(),
                    train_matrix_columns=["feature_one", "feature_two"],
                )
                model_evaluator.evaluate(predictions_proba, test_store, model_id)

        # assert
        # 1. that the predictions table entries are present and
        # can be linked to the original models
        records = [
            row for row in db_engine.execute(
                """select entity_id, model_id, as_of_date
                from test_results.predictions
                join model_metadata.models using (model_id)
                order by 3, 2""")
        ]
        assert records == [
            (3, 1, datetime.datetime(2016, 12, 21)),
            (3, 2, datetime.datetime(2016, 12, 21)),
            (3, 3, datetime.datetime(2016, 12, 21)),
            (3, 4, datetime.datetime(2016, 12, 21)),
            (3, 1, datetime.datetime(2017, 1, 21)),
            (3, 2, datetime.datetime(2017, 1, 21)),
            (3, 3, datetime.datetime(2017, 1, 21)),
            (3, 4, datetime.datetime(2017, 1, 21)),
        ]

        # that evaluations are there
        records = [
            row for row in db_engine.execute("""
                select model_id, evaluation_start_time, metric, parameter
                from test_results.evaluations order by 2, 1""")
        ]
        assert records == [
            (1, datetime.datetime(2016, 12, 21), "precision@", "5_abs"),
            (2, datetime.datetime(2016, 12, 21), "precision@", "5_abs"),
            (3, datetime.datetime(2016, 12, 21), "precision@", "5_abs"),
            (4, datetime.datetime(2016, 12, 21), "precision@", "5_abs"),
            (1, datetime.datetime(2017, 1, 21), "precision@", "5_abs"),
            (2, datetime.datetime(2017, 1, 21), "precision@", "5_abs"),
            (3, datetime.datetime(2017, 1, 21), "precision@", "5_abs"),
            (4, datetime.datetime(2017, 1, 21), "precision@", "5_abs"),
        ]