def test_predictor_save_predictions(matrix_type, predict_setup_args):
    """Verify that ``save_predictions=False`` keeps predictions out of the database.

    ``predict_setup_args`` is unpacked into ``(project_storage, db_engine,
    model_id)``. ``matrix_type`` selects both the matrix metadata type and the
    ``{matrix_type}_predictions`` table that must stay empty.
    """
    project_storage, db_engine, model_id = predict_setup_args

    # Saving is switched off, so predict() should only hand back its result.
    non_saving_predictor = Predictor(
        project_storage.model_storage_engine(),
        db_engine,
        save_predictions=False,
    )

    frame = matrix_creator(index="entity_id")
    store = get_matrix_store(
        project_storage,
        frame,
        matrix_metadata_creator(
            end_time=AS_OF_DATE,
            matrix_type=matrix_type,
            indices=["entity_id"],
        ),
    )

    # Every column except the last one is passed as a training column.
    feature_columns = frame.columns[:-1].tolist()

    scores = non_saving_predictor.predict(
        model_id,
        store,
        misc_db_parameters={},
        train_matrix_columns=feature_columns,
    )

    # 1. the returned predictions have the expected length
    assert len(scores) == 2

    # 2. nothing was written to the predictions table
    assert not table_has_data(f"{matrix_type}_predictions", db_engine)
def test_predictor_get_train_columns():
    """Predict on matrices whose column order differs from the training matrix.

    A train store and a test store are built; the test matrix has its first
    two columns swapped. Both are predicted using the train store's column
    list, and each must yield predictions and rows in its own
    ``{mat_type}_results.predictions`` table.
    """
    with prepare() as (project_storage, db_engine, model_id):
        predictor = Predictor(project_storage.model_storage_engine(), db_engine)

        train_store = get_matrix_store(
            project_storage=project_storage,
            matrix=matrix_creator(),
            metadata=matrix_metadata_creator(matrix_type="train"),
        )

        # flip the order of some feature columns in the test matrix
        other_order_matrix = matrix_creator()
        order = other_order_matrix.columns.tolist()
        order[0], order[1] = order[1], order[0]
        other_order_matrix = other_order_matrix[order]
        test_store = get_matrix_store(
            project_storage=project_storage,
            matrix=other_order_matrix,
            metadata=matrix_metadata_creator(matrix_type="test"),
        )

        # Runs the same test for training and testing predictions
        for store, mat_type in zip((train_store, test_store), ("train", "test")):
            predict_proba = predictor.predict(
                model_id,
                store,
                misc_db_parameters=dict(),
                train_matrix_columns=train_store.columns(),
            )

            # 1. that we calculated predictions
            assert len(predict_proba) > 0

            # 2. that the predictions table entries are present and
            # can be linked to the original models
            records = list(
                db_engine.execute(
                    f"select entity_id, as_of_date from {mat_type}_results.predictions "
                    "join model_metadata.models using (model_id)"
                )
            )
            assert len(records) > 0
def test_predictor_retrieve():
    """With ``replace=False``, a second predict() must match the first one.

    The second call is also expected to leave ``load_model`` uncalled, which
    is checked by swapping that method for a ``Mock``.
    """
    with prepare() as (project_storage, db_engine, model_id):
        predictor = Predictor(
            project_storage.model_storage_engine(),
            db_engine,
            replace=False,
        )

        # Build the prediction set and predict once.
        frame = matrix_creator()
        store = get_matrix_store(project_storage, frame, matrix_metadata_creator())
        feature_columns = frame.columns[0:-1].tolist()

        first_scores = predictor.predict(
            model_id,
            store,
            misc_db_parameters={},
            train_matrix_columns=feature_columns,
        )

        # A second run should return predictions that match the first ones.
        #
        # The subtle part: predictions read back from the database and
        # presented as an array can come back in a bad order, because the
        # matrix is not necessarily sorted by anything an ORDER BY clause
        # can express.
        #
        # Without ORDER BY, rows come back in the table's physical order.
        # Unless something has happened to the table, that is the insertion
        # order, which may well be the matrix order. So a missing reorder
        # would not necessarily show up immediately, but when it does go
        # wrong the scores are garbage.
        #
        # To provoke it, simulate a table order mutation that can happen over
        # time: remove the first row and put it back at the end. If the
        # Predictor does not explicitly reorder its results, the comparison
        # below fails.
        #
        # Only TestPrediction is exercised here because TrainPrediction
        # behaves the exact same way.
        session_factory = sessionmaker(bind=db_engine)

        session = session_factory()
        first_row = session.query(TestPrediction).first()
        session.delete(first_row)
        session.commit()

        # Detach the deleted row so that a fresh session can insert it again.
        make_transient(first_row)
        session = session_factory()
        session.add(first_row)
        session.commit()

        # Replace load_model with a Mock so that any call to it is recorded.
        predictor.load_model = Mock()
        second_scores = predictor.predict(
            model_id,
            store,
            misc_db_parameters={},
            train_matrix_columns=feature_columns,
        )

        assert_array_equal(second_scores, first_scores)
        assert not predictor.load_model.called
def test_calculate_and_save():
    """Individual importances are saved, and a repeat run leaves them unchanged.

    The calculator is built with ``replace=False`` and invoked twice for the
    same model and test matrix; the rows read back after each run must be
    equal.
    """
    importances_sql = (
        "select entity_id, as_of_date "
        "from test_results.individual_importances "
        "join model_metadata.models using (model_id)"
    )

    with rig_engines() as (db_engine, project_storage):

        def saved_importances():
            # Read back every importance row that joins to a model.
            return list(db_engine.execute(importances_sql))

        train_store = get_matrix_store(
            project_storage,
            matrix_creator(),
            matrix_metadata_creator(matrix_type="train"),
        )
        test_store = get_matrix_store(
            project_storage,
            matrix_creator(),
            matrix_metadata_creator(matrix_type="test"),
        )

        calculator = IndividualImportanceCalculator(
            db_engine,
            methods=["sample"],
            replace=False,
        )

        # given a trained model and a test matrix
        _, model_id = fake_trained_model(
            db_engine,
            train_matrix_uuid=train_store.uuid,
        )

        # calculating and saving should work ...
        calculator.calculate_and_save_all_methods_and_dates(model_id, test_store)

        # ... and leave individual importances in the results schema
        first_run = saved_importances()
        assert len(first_run) > 0

        # a second run must produce the same result
        calculator.calculate_and_save_all_methods_and_dates(model_id, test_store)
        second_run = saved_importances()

        assert len(first_run) == len(second_run)
        assert first_run == second_run
def test_predictor_needs_predictions(matrix_type, predict_setup_args):
    """``needs_predictions`` is true before predicting and false afterwards.

    ``predict_setup_args`` is unpacked into ``(project_storage, db_engine,
    model_id)``; ``matrix_type`` is forwarded to the matrix metadata.
    """
    project_storage, db_engine, model_id = predict_setup_args

    # needs_predictions should be true while predictions for this model id
    # and matrix are missing from the db, and false once they are all there.
    predictor = Predictor(project_storage.model_storage_engine(), db_engine)

    frame = matrix_creator(index="entity_id")
    store = get_matrix_store(
        project_storage,
        frame,
        matrix_metadata_creator(
            end_time=AS_OF_DATE,
            matrix_type=matrix_type,
            indices=["entity_id"],
        ),
    )
    feature_columns = frame.columns[:-1].tolist()

    # Nothing has been predicted yet, so predictions are definitely needed.
    assert predictor.needs_predictions(store, model_id)

    predictor.predict(
        model_id,
        store,
        misc_db_parameters={},
        train_matrix_columns=feature_columns,
    )

    # Predictions now exist, so they should no longer be needed.
    assert not predictor.needs_predictions(store, model_id)
def test_integration():
    """Run trainer, predictor and evaluator together and check the stored rows.

    Models are trained from a LogisticRegression grid, then every model is
    predicted and evaluated on one single-row test matrix per as-of date. The
    test then checks the contents of ``test_results.predictions`` and
    ``test_results.evaluations``.

    NOTE(review): a later function in this file is also named
    ``test_integration``; that later definition rebinds the name, so this one
    is presumably never collected by pytest. Confirm which should remain.
    """
    with rig_engines() as (db_engine, project_storage):
        train_store = get_matrix_store(
            project_storage,
            matrix_creator(),
            matrix_metadata_creator(matrix_type="train"),
        )

        as_of_dates = [datetime.date(2016, 12, 21), datetime.date(2017, 1, 21)]

        # One single-row test matrix per as-of date.
        test_stores = [
            get_matrix_store(
                project_storage,
                pandas.DataFrame.from_dict({
                    "entity_id": [3],
                    "feature_one": [8],
                    "feature_two": [5],
                    "label": [0],
                }).set_index("entity_id"),
                matrix_metadata_creator(end_time=as_of_date, indices=["entity_id"]),
            )
            for as_of_date in as_of_dates
        ]

        model_storage_engine = ModelStorageEngine(project_storage)
        experiment_hash = save_experiment_and_get_hash({}, db_engine)

        # instantiate pipeline objects
        trainer = ModelTrainer(
            experiment_hash=experiment_hash,
            model_storage_engine=model_storage_engine,
            db_engine=db_engine,
        )
        predictor = Predictor(model_storage_engine, db_engine)
        model_evaluator = ModelEvaluator(
            [{"metrics": ["precision@"], "thresholds": {"top_n": [5]}}],
            [{}],
            db_engine,
        )

        # run the pipeline
        grid_config = {
            "sklearn.linear_model.LogisticRegression": {
                "C": [0.00001, 0.0001],
                "penalty": ["l1", "l2"],
                "random_state": [2193],
            }
        }
        model_ids = trainer.train_models(
            grid_config=grid_config,
            misc_db_parameters={},
            matrix_store=train_store,
        )

        for model_id in model_ids:
            for test_store in test_stores:
                predictions_proba = predictor.predict(
                    model_id,
                    test_store,
                    misc_db_parameters={},
                    train_matrix_columns=["feature_one", "feature_two"],
                )
                model_evaluator.evaluate(predictions_proba, test_store, model_id)

        # The expected rows below pair each as-of date (as a midnight
        # datetime) with model ids 1 through 4.
        expected_times = [
            datetime.datetime(day.year, day.month, day.day) for day in as_of_dates
        ]
        expected_model_ids = (1, 2, 3, 4)

        # 1. the predictions table entries are present and can be linked to
        # the original models
        prediction_rows = list(
            db_engine.execute(
                "select entity_id, model_id, as_of_date "
                "from test_results.predictions "
                "join model_metadata.models using (model_id) "
                "order by 3, 2"
            )
        )
        assert prediction_rows == [
            (3, expected_id, expected_time)
            for expected_time in expected_times
            for expected_id in expected_model_ids
        ]

        # 2. the evaluations are there
        evaluation_rows = list(
            db_engine.execute(
                " select model_id, evaluation_start_time, metric, parameter "
                "from test_results.evaluations "
                "order by 2, 1"
            )
        )
        assert evaluation_rows == [
            (expected_id, expected_time, "precision@", "5_abs")
            for expected_time in expected_times
            for expected_id in expected_model_ids
        ]
def test_predictor_entity_index():
    """Exercise prediction on entity-indexed matrices for train and test types.

    For each matrix type the test checks the number of returned predictions,
    the rows written to ``{mat_type}_results.predictions``, their as-of dates
    and their entity ids. It then predicts a second matrix dated one day
    later, re-predicts the original matrix of the same type, and expects four
    rows in total. Finally the model is deleted and must no longer load.
    """
    with prepare() as (project_storage, db_engine, model_id):
        predictor = Predictor(project_storage.model_storage_engine(), db_engine)

        # Remember each type's store and columns so the second pass re-predicts
        # the matching matrix instead of whatever the first loop left bound.
        first_pass = {}

        # Runs the same test for training and testing predictions
        for mat_type in ("train", "test"):
            matrix = matrix_creator(index="entity_id")
            metadata = matrix_metadata_creator(
                end_time=AS_OF_DATE,
                matrix_type=mat_type,
                indices=["entity_id"],
            )
            matrix_store = get_matrix_store(project_storage, matrix, metadata)
            train_matrix_columns = matrix.columns[0:-1].tolist()
            first_pass[mat_type] = (matrix_store, train_matrix_columns)

            predict_proba = predictor.predict(
                model_id,
                matrix_store,
                misc_db_parameters=dict(),
                train_matrix_columns=train_matrix_columns,
            )

            # 1. that the returned predictions are of the desired length
            assert len(predict_proba) == 2

            # 2. that the predictions table entries are present and
            # can be linked to the original models
            records = list(
                db_engine.execute(
                    f"select entity_id, as_of_date from {mat_type}_results.predictions "
                    "join model_metadata.models using (model_id)"
                )
            )
            assert len(records) == 2

            # 3. that the contained as_of_dates match what we sent in
            for record in records:
                assert record[1].date() == AS_OF_DATE

            # 4. that the entity ids match the given dataset
            assert sorted(record[0] for record in records) == [1, 2]

        # 5. running with same model_id, different as of date
        # then with same as of date only replaces the records
        # with the same date

        # Runs the same test for training and testing predictions
        for mat_type in ("train", "test"):
            matrix_store, train_matrix_columns = first_pass[mat_type]

            new_matrix = matrix_creator(index="entity_id")
            new_metadata = matrix_metadata_creator(
                end_time=AS_OF_DATE + datetime.timedelta(days=1),
                matrix_type=mat_type,
                indices=["entity_id"],
            )
            new_matrix_store = get_matrix_store(project_storage, new_matrix, new_metadata)

            predictor.predict(
                model_id,
                new_matrix_store,
                misc_db_parameters=dict(),
                train_matrix_columns=train_matrix_columns,
            )
            predictor.predict(
                model_id,
                matrix_store,
                misc_db_parameters=dict(),
                train_matrix_columns=train_matrix_columns,
            )

            records = list(
                db_engine.execute(
                    f"select entity_id, as_of_date from {mat_type}_results.predictions "
                    "join model_metadata.models using (model_id)"
                )
            )
            # Two rows for the original date plus two for the following day.
            assert len(records) == 4

        # 6. That we can delete the model when done prediction on it
        predictor.delete_model(model_id)
        assert predictor.load_model(model_id) is None
def test_integration():
    """End-to-end check of training, prediction and evaluation storage.

    A LogisticRegression grid is trained on one train matrix. Each resulting
    model is then predicted and evaluated against two single-row test
    matrices (one per as-of date), and the rows stored in
    ``test_results.predictions`` and ``test_results.evaluations`` are compared
    with the expected tuples.

    NOTE(review): an earlier function in this file has the same name; this
    definition replaces it when the module is imported. Confirm whether the
    earlier copy should be removed or renamed.
    """
    with rig_engines() as (db_engine, project_storage):

        def fetch_rows(sql):
            # Materialize a query result so it can be compared with a list.
            return list(db_engine.execute(sql))

        train_store = get_matrix_store(
            project_storage,
            matrix_creator(),
            matrix_metadata_creator(matrix_type="train"),
        )

        as_of_dates = [datetime.date(2016, 12, 21), datetime.date(2017, 1, 21)]

        test_stores = []
        for as_of_date in as_of_dates:
            single_row = pandas.DataFrame.from_dict({
                "entity_id": [3],
                "feature_one": [8],
                "feature_two": [5],
                "label": [0],
            })
            indexed = single_row.set_index("entity_id")
            metadata = matrix_metadata_creator(
                end_time=as_of_date,
                indices=["entity_id"],
            )
            test_stores.append(get_matrix_store(project_storage, indexed, metadata))

        model_storage_engine = ModelStorageEngine(project_storage)
        experiment_hash = save_experiment_and_get_hash({}, db_engine)

        # instantiate pipeline objects
        trainer = ModelTrainer(
            experiment_hash=experiment_hash,
            model_storage_engine=model_storage_engine,
            db_engine=db_engine,
        )
        predictor = Predictor(model_storage_engine, db_engine)
        evaluator_metric_groups = [
            {"metrics": ["precision@"], "thresholds": {"top_n": [5]}},
        ]
        model_evaluator = ModelEvaluator(evaluator_metric_groups, [{}], db_engine)

        # run the pipeline
        logistic_grid = {
            "C": [0.00001, 0.0001],
            "penalty": ["l1", "l2"],
            "random_state": [2193],
        }
        grid_config = {"sklearn.linear_model.LogisticRegression": logistic_grid}
        model_ids = trainer.train_models(
            grid_config=grid_config,
            misc_db_parameters={},
            matrix_store=train_store,
        )

        feature_names = ["feature_one", "feature_two"]
        for model_id in model_ids:
            for test_store in test_stores:
                predictions_proba = predictor.predict(
                    model_id,
                    test_store,
                    misc_db_parameters={},
                    train_matrix_columns=feature_names,
                )
                model_evaluator.evaluate(predictions_proba, test_store, model_id)

        # Expected rows pair each as-of date, taken at midnight, with model
        # ids 1 through 4.
        midnights = [
            datetime.datetime.combine(day, datetime.time()) for day in as_of_dates
        ]

        # 1. that the predictions table entries are present and
        # can be linked to the original models
        records = fetch_rows(
            "select entity_id, model_id, as_of_date "
            "from test_results.predictions "
            "join model_metadata.models using (model_id) "
            "order by 3, 2"
        )
        expected_predictions = []
        for midnight in midnights:
            for expected_id in range(1, 5):
                expected_predictions.append((3, expected_id, midnight))
        assert records == expected_predictions

        # 2. that evaluations are there
        records = fetch_rows(
            " select model_id, evaluation_start_time, metric, parameter "
            "from test_results.evaluations "
            "order by 2, 1"
        )
        expected_evaluations = []
        for midnight in midnights:
            for expected_id in range(1, 5):
                expected_evaluations.append(
                    (expected_id, midnight, "precision@", "5_abs")
                )
        assert records == expected_evaluations