示例#1
0
def test_model_trainer(sample_matrix_store, grid_config):
    """Exercise ModelTrainer end to end on a temporary Postgres and mocked S3.

    The grid is trained once, after which the test inspects the rows written
    to the database, loads the stored models, and re-trains under several
    conditions (plain rerun, ``replace=True``, deleted model store, generator
    interface) expecting the same model ids every time.
    """
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        init_engine(db_engine)

        def fetch_rows(sql):
            # Materialize every row returned by the statement.
            return list(db_engine.execute(sql))

        def run_training(model_trainer):
            # One training pass over the shared grid and matrix store.
            return model_trainer.train_models(
                grid_config=grid_config,
                misc_db_parameters={},
                matrix_store=sample_matrix_store,
            )

        with mock_s3():
            boto3.resource('s3').create_bucket(Bucket='econ-dev')

            # Register a matrix row (uuid "1234") before any training happens.
            MatrixFactory(matrix_uuid="1234")
            session.commit()

            project_path = 'econ-dev/inspections'
            model_storage_engine = S3ModelStorageEngine(project_path)
            trainer = ModelTrainer(
                project_path=project_path,
                experiment_hash=None,
                model_storage_engine=model_storage_engine,
                model_grouper=ModelGrouper(),
                db_engine=db_engine,
            )
            model_ids = run_training(trainer)

            # Step 1: feature importances and models were written.
            # 4 * 2 is presumably four models times two feature columns
            # (entity_id excluded, per the original note).
            importance_rows = fetch_rows(
                'select * from train_results.feature_importances')
            assert len(importance_rows) == 4 * 2

            hash_rows = fetch_rows(
                'select model_hash from model_metadata.models')
            assert len(hash_rows) == 4
            hashes = [hash_row[0] for hash_row in hash_rows]

            # Step 2: every model landed in its own model group.
            group_rows = fetch_rows(
                'select distinct model_group_id from model_metadata.models'
            )
            assert len(group_rows) == 4

            # Step 3: a size is recorded for each model and each is below 1 (kB).
            size_rows = fetch_rows(
                'select model_size from model_metadata.models')
            assert len(size_rows) == 4
            for size_row in size_rows:
                assert size_row[0] < 1

            # Step 4: all four models can be loaded back from storage.
            model_pickles = []
            for model_hash in hashes:
                model_pickles.append(
                    model_storage_engine.get_store(model_hash).load())
            assert len(model_pickles) == 4
            assert sum(
                1 for loaded in model_pickles if loaded is not None) == 4

            # Step 5: the loaded models can predict on a fresh two-row matrix.
            raw_test_frame = pandas.DataFrame.from_dict({
                'entity_id': [3, 4],
                'feature_one': [4, 4],
                'feature_two': [6, 5],
            })
            test_store = InMemoryMatrixStore(
                matrix=raw_test_frame, metadata=sample_metadata())
            test_matrix = test_store.matrix

            for loaded in model_pickles:
                assert len(loaded.predict(test_matrix)) == 2

            # Step 6: a second run adds no models and returns the same ids.
            new_model_ids = run_training(trainer)
            rerun_rows = fetch_rows(
                'select model_hash from model_metadata.models')
            assert len(rerun_rows) == 4
            assert model_ids == new_model_ids

            # Step 7: with replace=True the ids stay put, but batch_run_time
            # advances and the feature importances are still complete.
            max_batch_run_time = fetch_rows(
                'select max(batch_run_time) from model_metadata.models')[0][0]
            trainer = ModelTrainer(
                project_path=project_path,
                experiment_hash=None,
                model_storage_engine=model_storage_engine,
                model_grouper=ModelGrouper(
                    model_group_keys=['label_name', 'label_timespan']),
                db_engine=db_engine,
                replace=True,
            )
            new_model_ids = run_training(trainer)
            assert model_ids == new_model_ids

            ordered_ids = [
                id_row['model_id'] for id_row in fetch_rows(
                    'select model_id from model_metadata.models order by 1 asc'
                )
            ]
            assert ordered_ids == model_ids

            new_max_batch_run_time = fetch_rows(
                'select max(batch_run_time) from model_metadata.models')[0][0]
            assert new_max_batch_run_time > max_batch_run_time

            importance_rows = fetch_rows(
                'select * from train_results.feature_importances')
            assert len(importance_rows) == 4 * 2

            # Step 8: delete the stored models but keep the metadata rows;
            # training must hand back the same ids.
            for hash_row in db_engine.execute(
                    'select model_hash from model_metadata.models'):
                model_storage_engine.get_store(hash_row[0]).delete()
            new_model_ids = run_training(trainer)
            assert model_ids == sorted(new_model_ids)

            # Step 9: the generator interface yields the same ids.
            new_model_ids = trainer.generate_trained_models(
                grid_config=grid_config,
                misc_db_parameters={},
                matrix_store=sample_matrix_store,
            )
            assert model_ids == sorted(new_model_ids)
示例#2
0
def test_model_trainer():
    """Train a small logistic-regression grid and verify what gets persisted.

    Runs against a temporary Postgres database and a mocked S3 bucket. After
    the first training pass the test checks the ``results`` tables, loads the
    stored models, and repeats training (plain rerun, ``replace=True``,
    deleted model store, generator interface) expecting identical model ids.
    """
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        ensure_db(engine)

        def fetch_rows(sql):
            # Materialize every row returned by the statement.
            return list(engine.execute(sql))

        # 2 values of C x 2 penalties x 1 random_state = 4 models.
        grid_config = {
            'sklearn.linear_model.LogisticRegression': {
                'C': [0.00001, 0.0001],
                'penalty': ['l1', 'l2'],
                'random_state': [2193]
            }
        }

        with mock_s3():
            boto3.resource('s3').create_bucket(Bucket='econ-dev')

            # Two-row training set with two features and a label column.
            matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': ['good', 'bad']
            })
            metadata = {
                'feature_start_time': datetime.date(2012, 12, 20),
                'end_time': datetime.date(2016, 12, 20),
                'label_name': 'label',
                'label_timespan': '1y',
                'metta-uuid': '1234',
                'feature_names': ['ft1', 'ft2'],
                'indices': ['entity_id'],
            }
            project_path = 'econ-dev/inspections'
            model_storage_engine = S3ModelStorageEngine(project_path)
            trainer = ModelTrainer(
                project_path=project_path,
                experiment_hash=None,
                model_storage_engine=model_storage_engine,
                db_engine=engine,
                model_group_keys=['label_name', 'label_timespan'],
            )
            matrix_store = InMemoryMatrixStore(matrix, metadata)

            def run_training(model_trainer):
                # One training pass over the grid and the in-memory matrix.
                return model_trainer.train_models(
                    grid_config=grid_config,
                    misc_db_parameters={},
                    matrix_store=matrix_store,
                )

            model_ids = run_training(trainer)

            # Step 1: feature importances (4 models x 2 features) and the
            # four model rows were written.
            importance_rows = fetch_rows(
                'select * from results.feature_importances')
            assert len(importance_rows) == 4 * 2

            hash_rows = fetch_rows('select model_hash from results.models')
            assert len(hash_rows) == 4
            hashes = [hash_row[0] for hash_row in hash_rows]

            # Step 2: every model landed in its own model group.
            group_rows = fetch_rows(
                'select distinct model_group_id from results.models')
            assert len(group_rows) == 4

            # Step 3: all four models can be loaded back from storage.
            model_pickles = []
            for model_hash in hashes:
                model_pickles.append(
                    model_storage_engine.get_store(model_hash).load())
            assert len(model_pickles) == 4
            assert sum(
                1 for loaded in model_pickles if loaded is not None) == 4

            # Step 4: the loaded models can predict on a fresh two-row matrix.
            raw_test_frame = pandas.DataFrame.from_dict({
                'entity_id': [3, 4],
                'feature_one': [4, 4],
                'feature_two': [6, 5],
            })
            test_store = InMemoryMatrixStore(
                matrix=raw_test_frame, metadata=metadata)
            test_matrix = test_store.matrix

            for loaded in model_pickles:
                assert len(loaded.predict(test_matrix)) == 2

            # Step 5: a second run adds no models and returns the same ids.
            new_model_ids = run_training(trainer)
            rerun_rows = fetch_rows('select model_hash from results.models')
            assert len(rerun_rows) == 4
            assert model_ids == new_model_ids

            # Step 6: with replace=True the ids stay put, but batch_run_time
            # advances and the feature importances are still complete.
            max_batch_run_time = fetch_rows(
                'select max(batch_run_time) from results.models')[0][0]
            trainer = ModelTrainer(
                project_path=project_path,
                experiment_hash=None,
                model_storage_engine=model_storage_engine,
                db_engine=engine,
                model_group_keys=['label_name', 'label_timespan'],
                replace=True,
            )
            new_model_ids = run_training(trainer)
            assert model_ids == new_model_ids

            ordered_ids = [
                id_row['model_id'] for id_row in fetch_rows(
                    'select model_id from results.models order by 1 asc')
            ]
            assert ordered_ids == model_ids

            new_max_batch_run_time = fetch_rows(
                'select max(batch_run_time) from results.models')[0][0]
            assert new_max_batch_run_time > max_batch_run_time

            importance_rows = fetch_rows(
                'select * from results.feature_importances')
            assert len(importance_rows) == 4 * 2

            # Step 7: delete the stored models but keep the metadata rows;
            # training must hand back the same ids.
            for hash_row in engine.execute(
                    'select model_hash from results.models'):
                model_storage_engine.get_store(hash_row[0]).delete()
            new_model_ids = run_training(trainer)
            assert model_ids == sorted(new_model_ids)

            # Step 8: the generator interface yields the same ids.
            new_model_ids = trainer.generate_trained_models(
                grid_config=grid_config,
                misc_db_parameters={},
                matrix_store=matrix_store,
            )
            assert model_ids == sorted(new_model_ids)
示例#3
0
def test_model_trainer(grid_config, default_model_trainer):
    """Verify seeded training runs of the default trainer fixture.

    Python's ``random`` module is reseeded with the same value before every
    training pass. The test checks the rows written to the database, loads
    the stored models, and repeats training (plain rerun, ``replace=True``,
    deleted model store, generator interface) expecting identical model ids.
    """
    trainer = default_model_trainer
    db_engine = trainer.db_engine
    model_storage_engine = trainer.model_storage_engine
    project_storage = model_storage_engine.project_storage

    seed = 5

    def fetch_rows(sql):
        # Materialize every row returned by the statement.
        return list(db_engine.execute(sql))

    def run_training(model_trainer):
        # One training pass; a fresh matrix store is fetched for each call.
        return model_trainer.train_models(
            grid_config=grid_config,
            misc_db_parameters={},
            matrix_store=get_matrix_store(project_storage),
        )

    random.seed(seed)
    model_ids = run_training(trainer)

    # Step 1: feature importances and models were written.
    # 4 * 2 is presumably four models times two feature columns
    # (entity_id excluded, per the original note).
    importance_rows = fetch_rows(
        "select * from train_results.feature_importances")
    assert len(importance_rows) == 4 * 2

    hash_rows = fetch_rows("select model_hash from triage_metadata.models")
    assert len(hash_rows) == 4
    hashes = [hash_row[0] for hash_row in hash_rows]

    # Step 2a: every model landed in its own model group.
    group_rows = fetch_rows(
        "select distinct model_group_id from triage_metadata.models")
    assert len(group_rows) == 4

    # Step 2b: every model was given its own random seed.
    seed_rows = fetch_rows(
        "select distinct random_seed from triage_metadata.models")
    assert len(seed_rows) == 4

    # Step 3: a size is recorded for each model and each is below 1 (kB).
    size_rows = fetch_rows("select model_size from triage_metadata.models")
    assert len(size_rows) == 4
    for size_row in size_rows:
        assert size_row[0] < 1

    # Step 4: all four models can be loaded back from storage.
    model_pickles = []
    for model_hash in hashes:
        model_pickles.append(model_storage_engine.load(model_hash))
    assert len(model_pickles) == 4
    assert sum(1 for loaded in model_pickles if loaded is not None) == 4

    # Step 5: the loaded models can predict on a fresh two-row matrix.
    raw_test_frame = pd.DataFrame.from_dict({
        "entity_id": [3, 4],
        "feature_one": [4, 4],
        "feature_two": [6, 5]
    })
    test_matrix = raw_test_frame.set_index("entity_id")

    for loaded in model_pickles:
        assert len(loaded.predict(test_matrix)) == 2

    # Step 6: rerunning from the same seed adds no models and returns the
    # same ids.
    random.seed(seed)
    new_model_ids = run_training(trainer)
    rerun_rows = fetch_rows("select model_hash from triage_metadata.models")
    assert len(rerun_rows) == 4
    assert model_ids == new_model_ids

    # Step 7: with replace=True the ids stay put, but batch_run_time advances
    # and the feature importances are still complete.
    max_batch_run_time = fetch_rows(
        "select max(batch_run_time) from triage_metadata.models")[0][0]
    trainer = ModelTrainer(
        experiment_hash=None,
        model_storage_engine=model_storage_engine,
        model_grouper=ModelGrouper(
            model_group_keys=["label_name", "label_timespan"]),
        db_engine=db_engine,
        replace=True,
    )
    random.seed(seed)
    new_model_ids = run_training(trainer)
    assert model_ids == new_model_ids

    ordered_ids = [
        id_row["model_id"] for id_row in fetch_rows(
            "select model_id from triage_metadata.models order by 1 asc")
    ]
    assert ordered_ids == model_ids

    new_max_batch_run_time = fetch_rows(
        "select max(batch_run_time) from triage_metadata.models")[0][0]
    assert new_max_batch_run_time > max_batch_run_time

    importance_rows = fetch_rows(
        "select * from train_results.feature_importances")
    assert len(importance_rows) == 4 * 2

    # Step 8: delete the stored models but keep the metadata rows; training
    # must hand back the same ids. Reseeding happens before the deletions,
    # as in the original ordering.
    random.seed(seed)
    for hash_row in db_engine.execute(
            "select model_hash from triage_metadata.models"):
        model_storage_engine.delete(hash_row[0])
    new_model_ids = run_training(trainer)
    assert model_ids == sorted(new_model_ids)

    # Step 9: the generator interface yields the same ids.
    random.seed(seed)
    new_model_ids = trainer.generate_trained_models(
        grid_config=grid_config,
        misc_db_parameters={},
        matrix_store=get_matrix_store(project_storage),
    )
    assert model_ids == sorted(new_model_ids)
示例#4
0
def test_model_trainer(grid_config):
    """Exercise ModelTrainer inside the ``rig_engines`` test rig.

    The grid is trained once, after which the test inspects the rows written
    to the database, loads the stored models, and re-trains under several
    conditions (plain rerun, ``replace=True``, deleted model store, generator
    interface) expecting the same model ids every time.
    """
    with rig_engines() as (db_engine, project_storage):

        def fetch_rows(sql):
            # Materialize every row returned by the statement.
            return list(db_engine.execute(sql))

        def run_training(model_trainer):
            # One training pass; a fresh matrix store is fetched per call.
            return model_trainer.train_models(
                grid_config=grid_config,
                misc_db_parameters={},
                matrix_store=get_matrix_store(project_storage),
            )

        model_storage_engine = project_storage.model_storage_engine()
        trainer = ModelTrainer(
            experiment_hash=None,
            model_storage_engine=model_storage_engine,
            model_grouper=ModelGrouper(),
            db_engine=db_engine,
        )
        model_ids = run_training(trainer)

        # Step 1: feature importances and models were written.
        # 4 * 2 is presumably four models times two feature columns
        # (entity_id excluded, per the original note).
        importance_rows = fetch_rows(
            'select * from train_results.feature_importances')
        assert len(importance_rows) == 4 * 2

        hash_rows = fetch_rows('select model_hash from model_metadata.models')
        assert len(hash_rows) == 4
        hashes = [hash_row[0] for hash_row in hash_rows]

        # Step 2: every model landed in its own model group.
        group_rows = fetch_rows(
            'select distinct model_group_id from model_metadata.models')
        assert len(group_rows) == 4

        # Step 3: a size is recorded for each model and each is below 1 (kB).
        size_rows = fetch_rows('select model_size from model_metadata.models')
        assert len(size_rows) == 4
        for size_row in size_rows:
            assert size_row[0] < 1

        # Step 4: all four models can be loaded back from storage.
        model_pickles = []
        for model_hash in hashes:
            model_pickles.append(model_storage_engine.load(model_hash))
        assert len(model_pickles) == 4
        assert sum(1 for loaded in model_pickles if loaded is not None) == 4

        # Step 5: the loaded models can predict on a fresh two-row matrix.
        raw_test_frame = pandas.DataFrame.from_dict({
            'entity_id': [3, 4],
            'feature_one': [4, 4],
            'feature_two': [6, 5],
        })
        test_matrix = raw_test_frame.set_index('entity_id')

        for loaded in model_pickles:
            assert len(loaded.predict(test_matrix)) == 2

        # Step 6: a second run adds no models and returns the same ids.
        new_model_ids = run_training(trainer)
        rerun_rows = fetch_rows('select model_hash from model_metadata.models')
        assert len(rerun_rows) == 4
        assert model_ids == new_model_ids

        # Step 7: with replace=True the ids stay put, but batch_run_time
        # advances and the feature importances are still complete.
        max_batch_run_time = fetch_rows(
            'select max(batch_run_time) from model_metadata.models')[0][0]
        trainer = ModelTrainer(
            experiment_hash=None,
            model_storage_engine=model_storage_engine,
            model_grouper=ModelGrouper(
                model_group_keys=['label_name', 'label_timespan']),
            db_engine=db_engine,
            replace=True,
        )
        new_model_ids = run_training(trainer)
        assert model_ids == new_model_ids

        ordered_ids = [
            id_row['model_id'] for id_row in fetch_rows(
                'select model_id from model_metadata.models order by 1 asc')
        ]
        assert ordered_ids == model_ids

        new_max_batch_run_time = fetch_rows(
            'select max(batch_run_time) from model_metadata.models')[0][0]
        assert new_max_batch_run_time > max_batch_run_time

        importance_rows = fetch_rows(
            'select * from train_results.feature_importances')
        assert len(importance_rows) == 4 * 2

        # Step 8: delete the stored models but keep the metadata rows;
        # training must hand back the same ids.
        for hash_row in db_engine.execute(
                'select model_hash from model_metadata.models'):
            model_storage_engine.delete(hash_row[0])
        new_model_ids = run_training(trainer)
        assert model_ids == sorted(new_model_ids)

        # Step 9: the generator interface yields the same ids.
        new_model_ids = trainer.generate_trained_models(
            grid_config=grid_config,
            misc_db_parameters={},
            matrix_store=get_matrix_store(project_storage),
        )
        assert model_ids == sorted(new_model_ids)