def test_n_jobs_not_new_model():
    grid_config = {
        'sklearn.ensemble.AdaBoostClassifier': {
            'n_estimators': [10, 100, 1000]
        },
        'sklearn.ensemble.RandomForestClassifier': {
            'n_estimators': [10, 100],
            'max_features': ['sqrt', 'log2'],
            'max_depth': [5, 10, 15, 20],
            'criterion': ['gini', 'entropy'],
            'n_jobs': [12, 24],
        }
    }

    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        ensure_db(engine)
        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')
            trainer = ModelTrainer(
                project_path='econ-dev/inspections',
                experiment_hash=None,
                model_storage_engine=S3ModelStorageEngine(s3_conn, 'econ-dev/inspections'),
                db_engine=engine,
                model_group_keys=['label_name', 'label_timespan']
            )

            matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': ['good', 'bad']
            })
            train_tasks = trainer.generate_train_tasks(
                grid_config,
                dict(),
                InMemoryMatrixStore(matrix, {
                    'label_timespan': '1d',
                    'end_time': datetime.datetime.now(),
                    'feature_start_time': datetime.date(2012, 12, 20),
                    'label_name': 'label',
                    'metta-uuid': '1234',
                    'feature_names': ['ft1', 'ft2'],
                    'indices': ['entity_id'],
                })
            )

            assert len(train_tasks) == 35  # 32+3, would be (32*2)+3 if we didn't remove
            assert len([
                task for task in train_tasks
                if 'n_jobs' in task['parameters']
            ]) == 32

            for train_task in train_tasks:
                trainer.process_train_task(**train_task)

            for row in engine.execute(
                'select model_parameters from results.model_groups'
            ):
                assert 'n_jobs' not in row[0]
def test_integration():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)

        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')
            project_path = 'econ-dev/inspections'

            # create train and test matrices
            train_matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': [7, 8]
            }).set_index('entity_id')
            train_metadata = {
                'beginning_of_time': datetime.date(2012, 12, 20),
                'end_time': datetime.date(2016, 12, 20),
                'label_name': 'label',
                'label_window': '1y',
                'feature_names': ['ft1', 'ft2'],
                'metta-uuid': '1234',
            }
            train_store = InMemoryMatrixStore(train_matrix, train_metadata)

            as_of_dates = [
                datetime.date(2016, 12, 21),
                datetime.date(2017, 1, 21)
            ]

            test_stores = [
                InMemoryMatrixStore(
                    pandas.DataFrame.from_dict({
                        'entity_id': [3],
                        'feature_one': [8],
                        'feature_two': [5],
                        'label': [5]
                    }).set_index('entity_id'),
                    {
                        'label_name': 'label',
                        'label_window': '1y',
                        'end_time': as_of_date,
                        'metta-uuid': '1234',
                    }
                )
                for as_of_date in as_of_dates
            ]

            model_storage_engine = S3ModelStorageEngine(s3_conn, project_path)

            experiment_hash = save_experiment_and_get_hash({}, db_engine)

            # instantiate pipeline objects
            trainer = ModelTrainer(
                project_path=project_path,
                experiment_hash=experiment_hash,
                model_storage_engine=model_storage_engine,
                db_engine=db_engine,
                model_group_keys=['label_name', 'label_window']
            )
            predictor = Predictor(project_path, model_storage_engine, db_engine)
            model_evaluator = ModelEvaluator(
                [{'metrics': ['precision@'], 'thresholds': {'top_n': [5]}}],
                db_engine
            )

            # run the pipeline
            grid_config = {
                'sklearn.linear_model.LogisticRegression': {
                    'C': [0.00001, 0.0001],
                    'penalty': ['l1', 'l2'],
                    'random_state': [2193]
                }
            }
            model_ids = trainer.train_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=train_store
            )

            for model_id in model_ids:
                for as_of_date, test_store in zip(as_of_dates, test_stores):
                    predictions_proba = predictor.predict(
                        model_id,
                        test_store,
                        misc_db_parameters=dict(),
                        train_matrix_columns=['feature_one', 'feature_two']
                    )

                    model_evaluator.evaluate(
                        predictions_proba,
                        test_store.labels(),
                        model_id,
                        as_of_date,
                        as_of_date,
                        '6month'
                    )

            # assert
            # 1. that the predictions table entries are present and
            # can be linked to the original models
            records = [
                row for row in db_engine.execute(
                    '''select entity_id, model_id, as_of_date
                    from results.predictions
                    join results.models using (model_id)
                    order by 3, 2'''
                )
            ]
            assert records == [
                (3, 1, datetime.datetime(2016, 12, 21)),
                (3, 2, datetime.datetime(2016, 12, 21)),
                (3, 3, datetime.datetime(2016, 12, 21)),
                (3, 4, datetime.datetime(2016, 12, 21)),
                (3, 1, datetime.datetime(2017, 1, 21)),
                (3, 2, datetime.datetime(2017, 1, 21)),
                (3, 3, datetime.datetime(2017, 1, 21)),
                (3, 4, datetime.datetime(2017, 1, 21)),
            ]

            # 2. that the evaluations are there
            records = [
                row for row in db_engine.execute(
                    '''select model_id, evaluation_start_time, metric, parameter
                    from results.evaluations
                    order by 2, 1'''
                )
            ]
            assert records == [
                (1, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (2, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (3, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (4, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (1, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
                (2, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
                (3, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
                (4, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
            ]
def test_predictor():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)

        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')
            project_path = 'econ-dev/inspections'
            model_storage_engine = S3ModelStorageEngine(s3_conn, project_path)
            _, model_id = \
                fake_trained_model(project_path, model_storage_engine, db_engine)
            predictor = Predictor(project_path, model_storage_engine, db_engine)

            # create prediction set
            matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': [7, 8]
            }).set_index('entity_id')
            metadata = {
                'label_name': 'label',
                'end_time': AS_OF_DATE,
                'label_window': '3month',
                'metta-uuid': '1234',
            }

            matrix_store = InMemoryMatrixStore(matrix, metadata)
            train_matrix_columns = ['feature_one', 'feature_two']
            predict_proba = predictor.predict(
                model_id,
                matrix_store,
                misc_db_parameters=dict(),
                train_matrix_columns=train_matrix_columns
            )

            # assert
            # 1. that the returned predictions are of the desired length
            assert len(predict_proba) == 2

            # 2. that the predictions table entries are present and
            # can be linked to the original models
            records = [
                row for row in db_engine.execute(
                    '''select entity_id, as_of_date
                    from results.predictions
                    join results.models using (model_id)'''
                )
            ]
            assert len(records) == 2

            # 3. that the contained as_of_dates match what we sent in
            for record in records:
                assert record[1].date() == AS_OF_DATE

            # 4. that the entity ids match the given dataset
            assert sorted([record[0] for record in records]) == [1, 2]

            # 5. running with the same model_id, first with a different as_of_date
            # and then with the same as_of_date, only replaces the records
            # with the same date
            new_matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': [7, 8]
            }).set_index('entity_id')
            new_metadata = {
                'label_name': 'label',
                'end_time': AS_OF_DATE + datetime.timedelta(days=1),
                'label_window': '3month',
                'metta-uuid': '1234',
            }
            new_matrix_store = InMemoryMatrixStore(new_matrix, new_metadata)
            predictor.predict(
                model_id,
                new_matrix_store,
                misc_db_parameters=dict(),
                train_matrix_columns=train_matrix_columns
            )
            predictor.predict(
                model_id,
                matrix_store,
                misc_db_parameters=dict(),
                train_matrix_columns=train_matrix_columns
            )
            records = [
                row for row in db_engine.execute(
                    '''select entity_id, as_of_date
                    from results.predictions
                    join results.models using (model_id)'''
                )
            ]
            assert len(records) == 4

            # 6. that we can delete the model once we are done predicting with it
            predictor.delete_model(model_id)
            assert predictor.load_model(model_id) is None
def test_model_trainer():
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        ensure_db(engine)

        grid_config = {
            'sklearn.linear_model.LogisticRegression': {
                'C': [0.00001, 0.0001],
                'penalty': ['l1', 'l2'],
                'random_state': [2193]
            }
        }

        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')

            # create training set
            matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': ['good', 'bad']
            })
            metadata = {
                'feature_start_time': datetime.date(2012, 12, 20),
                'end_time': datetime.date(2016, 12, 20),
                'label_name': 'label',
                'label_timespan': '1y',
                'metta-uuid': '1234',
                'feature_names': ['ft1', 'ft2'],
                'indices': ['entity_id'],
            }
            project_path = 'econ-dev/inspections'
            model_storage_engine = S3ModelStorageEngine(s3_conn, project_path)
            trainer = ModelTrainer(
                project_path=project_path,
                experiment_hash=None,
                model_storage_engine=model_storage_engine,
                db_engine=engine,
                model_group_keys=['label_name', 'label_timespan']
            )
            matrix_store = InMemoryMatrixStore(matrix, metadata)
            model_ids = trainer.train_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=matrix_store
            )

            # assert
            # 1. that the models and feature importances table entries are present
            records = [
                row for row in
                engine.execute('select * from results.feature_importances')
            ]
            assert len(records) == 4 * 2  # maybe exclude entity_id? yes

            records = [
                row for row in
                engine.execute('select model_hash from results.models')
            ]
            assert len(records) == 4

            cache_keys = [
                model_cache_key(project_path, model_row[0], s3_conn)
                for model_row in records
            ]

            # 2. that the model groups are distinct
            records = [
                row for row in
                engine.execute('select distinct model_group_id from results.models')
            ]
            assert len(records) == 4

            # 3. that all four models are cached
            model_pickles = [
                pickle.loads(cache_key.get()['Body'].read())
                for cache_key in cache_keys
            ]
            assert len(model_pickles) == 4
            assert len([x for x in model_pickles if x is not None]) == 4

            # 4. that the trained models can make predictions on a test matrix
            test_matrix = pandas.DataFrame.from_dict({
                'entity_id': [3, 4],
                'feature_one': [4, 4],
                'feature_two': [6, 5],
            })
            test_matrix = InMemoryMatrixStore(matrix=test_matrix, metadata=metadata).matrix
            for model_pickle in model_pickles:
                predictions = model_pickle.predict(test_matrix)
                assert len(predictions) == 2

            # 5. when run again, same models are returned
            new_model_ids = trainer.train_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=matrix_store
            )
            assert len([
                row for row in
                engine.execute('select model_hash from results.models')
            ]) == 4
            assert model_ids == new_model_ids

            # 6. if replace is set, update non-unique attributes and feature importances
            max_batch_run_time = [
                row[0] for row in
                engine.execute('select max(batch_run_time) from results.models')
            ][0]
            trainer = ModelTrainer(
                project_path=project_path,
                experiment_hash=None,
                model_storage_engine=model_storage_engine,
                db_engine=engine,
                model_group_keys=['label_name', 'label_timespan'],
                replace=True
            )
            new_model_ids = trainer.train_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=matrix_store,
            )
            assert model_ids == new_model_ids
            assert [
                row['model_id'] for row in
                engine.execute('select model_id from results.models order by 1 asc')
            ] == model_ids
            new_max_batch_run_time = [
                row[0] for row in
                engine.execute('select max(batch_run_time) from results.models')
            ][0]
            assert new_max_batch_run_time > max_batch_run_time

            records = [
                row for row in
                engine.execute('select * from results.feature_importances')
            ]
            assert len(records) == 4 * 2  # maybe exclude entity_id? yes

            # 7. if the cache is missing but the metadata is still there, reuse the metadata
            for row in engine.execute('select model_hash from results.models'):
                model_storage_engine.get_store(row[0]).delete()
            new_model_ids = trainer.train_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=matrix_store
            )
            assert model_ids == sorted(new_model_ids)

            # 8. that the generator interface works the same way
            new_model_ids = trainer.generate_trained_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=matrix_store
            )
            assert model_ids == \
                sorted([model_id for model_id in new_model_ids])