Python S3ModelStorageEngine примеры использования

Язык программирования: Python

Пространство имен/Пакет: catwalk.storage

Класс/Тип: S3ModelStorageEngine

Примеров на hotexamples.com: 4

Python S3ModelStorageEngine - 4 примера найдено. Это лучшие примеры Python кода для catwalk.storage.S3ModelStorageEngine, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

S3ModelStorageEngine(4)

Основные методы

S3ModelStorageEngine (4)

Пример #1

Показать файл

Файл: test_model_trainers.py Проект: fagan2888/catwalk

def test_n_jobs_not_new_model():
    grid_config = {
        'sklearn.ensemble.AdaBoostClassifier': {
            'n_estimators': [10, 100, 1000]
        },
        'sklearn.ensemble.RandomForestClassifier': {
            'n_estimators': [10, 100],
            'max_features': ['sqrt', 'log2'],
            'max_depth': [5, 10, 15, 20],
            'criterion': ['gini', 'entropy'],
            'n_jobs': [12, 24],
        }
    }

    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        ensure_db(engine)
        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')
            trainer = ModelTrainer(
                project_path='econ-dev/inspections',
                experiment_hash=None,
                model_storage_engine=S3ModelStorageEngine(s3_conn, 'econ-dev/inspections'),
                db_engine=engine,
                model_group_keys=['label_name', 'label_timespan']
            )

            matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': ['good', 'bad']
            })
            train_tasks = trainer.generate_train_tasks(
                grid_config,
                dict(),
                InMemoryMatrixStore(matrix, {
                    'label_timespan': '1d',
                    'end_time': datetime.datetime.now(),
                    'feature_start_time': datetime.date(2012, 12, 20),
                    'label_name': 'label',
                    'metta-uuid': '1234',
                    'feature_names': ['ft1', 'ft2'],
                    'indices': ['entity_id'],
                })
            )
            assert len(train_tasks) == 35 # 32+3, would be (32*2)+3 if we didn't remove
            assert len([
                task for task in train_tasks
                if 'n_jobs' in task['parameters']
            ]) == 32

            for train_task in train_tasks:
                trainer.process_train_task(**train_task)

            for row in engine.execute(
                'select model_parameters from results.model_groups'
            ):
                assert 'n_jobs' not in row[0]

Пример #2

Показать файл

Файл: test_integration.py Проект: shaycrk/catwalk

def test_integration():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)

        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')
            project_path = 'econ-dev/inspections'

            # create train and test matrices
            train_matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': [7, 8]
            }).set_index('entity_id')
            train_metadata = {
                'beginning_of_time': datetime.date(2012, 12, 20),
                'end_time': datetime.date(2016, 12, 20),
                'label_name': 'label',
                'label_window': '1y',
                'feature_names': ['ft1', 'ft2'],
                'metta-uuid': '1234',
            }

            train_store = InMemoryMatrixStore(train_matrix, train_metadata)

            as_of_dates = [
                datetime.date(2016, 12, 21),
                datetime.date(2017, 1, 21)
            ]

            test_stores = [
                InMemoryMatrixStore(
                    pandas.DataFrame.from_dict({
                        'entity_id': [3],
                        'feature_one': [8],
                        'feature_two': [5],
                        'label': [5]
                    }).set_index('entity_id'), {
                        'label_name': 'label',
                        'label_window': '1y',
                        'end_time': as_of_date,
                        'metta-uuid': '1234',
                    }) for as_of_date in as_of_dates
            ]

            model_storage_engine = S3ModelStorageEngine(s3_conn, project_path)

            experiment_hash = save_experiment_and_get_hash({}, db_engine)
            # instantiate pipeline objects
            trainer = ModelTrainer(
                project_path=project_path,
                experiment_hash=experiment_hash,
                model_storage_engine=model_storage_engine,
                db_engine=db_engine,
                model_group_keys=['label_name', 'label_window'])
            predictor = Predictor(project_path, model_storage_engine,
                                  db_engine)
            model_evaluator = ModelEvaluator([{
                'metrics': ['precision@'],
                'thresholds': {
                    'top_n': [5]
                }
            }], db_engine)

            # run the pipeline
            grid_config = {
                'sklearn.linear_model.LogisticRegression': {
                    'C': [0.00001, 0.0001],
                    'penalty': ['l1', 'l2'],
                    'random_state': [2193]
                }
            }
            model_ids = trainer.train_models(grid_config=grid_config,
                                             misc_db_parameters=dict(),
                                             matrix_store=train_store)

            for model_id in model_ids:
                for as_of_date, test_store in zip(as_of_dates, test_stores):
                    predictions_proba = predictor.predict(
                        model_id,
                        test_store,
                        misc_db_parameters=dict(),
                        train_matrix_columns=['feature_one', 'feature_two'])

                    model_evaluator.evaluate(predictions_proba,
                                             test_store.labels(), model_id,
                                             as_of_date, as_of_date, '6month')

            # assert
            # 1. that the predictions table entries are present and
            # can be linked to the original models
            records = [
                row for row in db_engine.execute(
                    '''select entity_id, model_id, as_of_date
                from results.predictions
                join results.models using (model_id)
                order by 3, 2''')
            ]
            assert records == [
                (3, 1, datetime.datetime(2016, 12, 21)),
                (3, 2, datetime.datetime(2016, 12, 21)),
                (3, 3, datetime.datetime(2016, 12, 21)),
                (3, 4, datetime.datetime(2016, 12, 21)),
                (3, 1, datetime.datetime(2017, 1, 21)),
                (3, 2, datetime.datetime(2017, 1, 21)),
                (3, 3, datetime.datetime(2017, 1, 21)),
                (3, 4, datetime.datetime(2017, 1, 21)),
            ]

            # that evaluations are there
            records = [
                row for row in db_engine.execute('''
                    select model_id, evaluation_start_time, metric, parameter
                    from results.evaluations order by 2, 1''')
            ]
            assert records == [
                (1, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (2, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (3, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (4, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (1, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
                (2, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
                (3, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
                (4, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
            ]

Пример #3

Показать файл

def test_predictor():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)

        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')
            project_path = 'econ-dev/inspections'
            model_storage_engine = S3ModelStorageEngine(s3_conn, project_path)
            _, model_id = \
                fake_trained_model(project_path, model_storage_engine, db_engine)
            predictor = Predictor(project_path, model_storage_engine,
                                  db_engine)
            # create prediction set
            matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': [7, 8]
            }).set_index('entity_id')
            metadata = {
                'label_name': 'label',
                'end_time': AS_OF_DATE,
                'label_window': '3month',
                'metta-uuid': '1234',
            }

            matrix_store = InMemoryMatrixStore(matrix, metadata)
            train_matrix_columns = ['feature_one', 'feature_two']
            predict_proba = predictor.predict(
                model_id,
                matrix_store,
                misc_db_parameters=dict(),
                train_matrix_columns=train_matrix_columns)

            # assert
            # 1. that the returned predictions are of the desired length
            assert len(predict_proba) == 2

            # 2. that the predictions table entries are present and
            # can be linked to the original models
            records = [
                row
                for row in db_engine.execute('''select entity_id, as_of_date
                from results.predictions
                join results.models using (model_id)''')
            ]
            assert len(records) == 2

            # 3. that the contained as_of_dates match what we sent in
            for record in records:
                assert record[1].date() == AS_OF_DATE

            # 4. that the entity ids match the given dataset
            assert sorted([record[0] for record in records]) == [1, 2]

            # 5. running with same model_id, different as of date
            # then with same as of date only replaces the records
            # with the same date
            new_matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': [7, 8]
            }).set_index('entity_id')
            new_metadata = {
                'label_name': 'label',
                'end_time': AS_OF_DATE + datetime.timedelta(days=1),
                'label_window': '3month',
                'metta-uuid': '1234',
            }
            new_matrix_store = InMemoryMatrixStore(new_matrix, new_metadata)
            predictor.predict(model_id,
                              new_matrix_store,
                              misc_db_parameters=dict(),
                              train_matrix_columns=train_matrix_columns)
            predictor.predict(model_id,
                              matrix_store,
                              misc_db_parameters=dict(),
                              train_matrix_columns=train_matrix_columns)
            records = [
                row
                for row in db_engine.execute('''select entity_id, as_of_date
                from results.predictions
                join results.models using (model_id)''')
            ]
            assert len(records) == 4

            # 6. That we can delete the model when done prediction on it
            predictor.delete_model(model_id)
            assert predictor.load_model(model_id) == None

Пример #4

Показать файл

Файл: test_model_trainers.py Проект: fagan2888/catwalk

def test_model_trainer():
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        ensure_db(engine)

        grid_config = {
            'sklearn.linear_model.LogisticRegression': {
                'C': [0.00001, 0.0001],
                'penalty': ['l1', 'l2'],
                'random_state': [2193]
            }
        }

        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')

            # create training set
            matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': ['good', 'bad']
            })
            metadata = {
                'feature_start_time': datetime.date(2012, 12, 20),
                'end_time': datetime.date(2016, 12, 20),
                'label_name': 'label',
                'label_timespan': '1y',
                'metta-uuid': '1234',
                'feature_names': ['ft1', 'ft2'],
                'indices': ['entity_id'],
            }
            project_path = 'econ-dev/inspections'
            model_storage_engine = S3ModelStorageEngine(s3_conn, project_path)
            trainer = ModelTrainer(
                project_path=project_path,
                experiment_hash=None,
                model_storage_engine=model_storage_engine,
                db_engine=engine,
                model_group_keys=['label_name', 'label_timespan']
            )
            matrix_store = InMemoryMatrixStore(matrix, metadata)
            model_ids = trainer.train_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=matrix_store
            )

            # assert
            # 1. that the models and feature importances table entries are present
            records = [
                row for row in
                engine.execute('select * from results.feature_importances')
            ]
            assert len(records) == 4 * 2  # maybe exclude entity_id? yes

            records = [
                row for row in
                engine.execute('select model_hash from results.models')
            ]
            assert len(records) == 4

            cache_keys = [
                model_cache_key(project_path, model_row[0], s3_conn)
                for model_row in records
            ]

            # 2. that the model groups are distinct
            records = [
                row for row in
                engine.execute('select distinct model_group_id from results.models')
            ]
            assert len(records) == 4

            # 3. that all four models are cached
            model_pickles = [
                pickle.loads(cache_key.get()['Body'].read())
                for cache_key in cache_keys
            ]
            assert len(model_pickles) == 4
            assert len([x for x in model_pickles if x is not None]) == 4

            # 4. that their results can have predictions made on it
            test_matrix = pandas.DataFrame.from_dict({
                'entity_id': [3, 4],
                'feature_one': [4, 4],
                'feature_two': [6, 5],
            })

            test_matrix = InMemoryMatrixStore(matrix=test_matrix, metadata=metadata).matrix

            for model_pickle in model_pickles:
                predictions = model_pickle.predict(test_matrix)
                assert len(predictions) == 2

            # 5. when run again, same models are returned
            new_model_ids = trainer.train_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=matrix_store
            )
            assert len([
                row for row in
                engine.execute('select model_hash from results.models')
            ]) == 4
            assert model_ids == new_model_ids

            # 6. if replace is set, update non-unique attributes and feature importances
            max_batch_run_time = [
                row[0] for row in
                engine.execute('select max(batch_run_time) from results.models')
            ][0]
            trainer = ModelTrainer(
                project_path=project_path,
                experiment_hash=None,
                model_storage_engine=model_storage_engine,
                db_engine=engine,
                model_group_keys=['label_name', 'label_timespan'],
                replace=True
            )
            new_model_ids = trainer.train_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=matrix_store,
            )
            assert model_ids == new_model_ids
            assert [
                row['model_id'] for row in
                engine.execute('select model_id from results.models order by 1 asc')
            ] == model_ids
            new_max_batch_run_time = [
                row[0] for row in
                engine.execute('select max(batch_run_time) from results.models')
            ][0]
            assert new_max_batch_run_time > max_batch_run_time

            records = [
                row for row in
                engine.execute('select * from results.feature_importances')
            ]
            assert len(records) == 4 * 2  # maybe exclude entity_id? yes

            # 7. if the cache is missing but the metadata is still there, reuse the metadata
            for row in engine.execute('select model_hash from results.models'):
                model_storage_engine.get_store(row[0]).delete()
            new_model_ids = trainer.train_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=matrix_store
            )
            assert model_ids == sorted(new_model_ids)

            # 8. that the generator interface works the same way
            new_model_ids = trainer.generate_trained_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=matrix_store
            )
            assert model_ids == \
                sorted([model_id for model_id in new_model_ids])