def test_predictor_composite_index():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        init_engine(db_engine)
        project_path = 'econ-dev/inspections'
        model_storage_engine = InMemoryModelStorageEngine(project_path)
        _, model_id = \
            fake_trained_model(project_path, model_storage_engine, db_engine,
                               train_matrix_uuid='1234')
        predictor = Predictor(project_path, model_storage_engine, db_engine)
        dayone = datetime.datetime(2011, 1, 1)
        daytwo = datetime.datetime(2011, 1, 2)
        # create prediction set
        matrix = pandas.DataFrame.from_dict({
            'entity_id': [1, 2, 1, 2],
            'as_of_date': [dayone, dayone, daytwo, daytwo],
            'feature_one': [3, 4, 5, 6],
            'feature_two': [5, 6, 7, 8],
            'label': [7, 8, 8, 7]
        }).set_index(['entity_id', 'as_of_date'])
        metadata = {
            'label_name': 'label',
            'end_time': AS_OF_DATE,
            'label_timespan': '3month',
            'metta-uuid': '1234',
            'indices': ['entity_id', 'as_of_date'],
        }

        # Runs the same test for training and testing predictions
        for mat_type in ("train", "test"):
            # Create the matrix to be tested and store in db
            metadata['matrix_type'] = mat_type
            matrix_store = InMemoryMatrixStore(matrix, metadata)

            # Adding 'label' column back into matrix
            matrix['label'] = [7, 8, 8, 7]

            predict_proba = predictor.predict(
                model_id,
                matrix_store,
                misc_db_parameters=dict(),
                train_matrix_columns=['feature_one', 'feature_two'])

            # assert
            # 1. that the returned predictions are of the desired length
            assert len(predict_proba) == 4

            # 2. that the predictions table entries are present and
            # can be linked to the original models
            records = [
                row for row in db_engine.execute(
                    '''select entity_id, as_of_date
                    from {}_results.{}_predictions
                    join model_metadata.models using (model_id)'''.format(
                        mat_type, mat_type))
            ]
            assert len(records) == 4
def test_predictor():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        init_engine(db_engine)
        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')
            project_path = 'econ-dev/inspections'
            model_storage_engine = S3ModelStorageEngine(project_path)
            _, model_id = \
                fake_trained_model(project_path, model_storage_engine, db_engine,
                                   train_matrix_uuid='1234')
            predictor = Predictor(project_path, model_storage_engine, db_engine)
            # create prediction set
            matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': [7, 8]
            }).set_index('entity_id')
            metadata = {
                'label_name': 'label',
                'end_time': AS_OF_DATE,
                'label_timespan': '3month',
                'metta-uuid': '1234',
                'indices': ['entity_id'],
            }
            train_matrix_columns = ['feature_one', 'feature_two']

            # Runs the same test for training and testing predictions
            for mat_type in ("train", "test"):
                # Create the matrix to be tested and store in db
                metadata['matrix_type'] = mat_type
                matrix_store = InMemoryMatrixStore(matrix, metadata)

                # Note: the first time 'matrix' is used, the label column is
                # popped. It must be added back into 'matrix' to create
                # another matrix_store.
                matrix['label'] = [7, 8]

                predict_proba = predictor.predict(
                    model_id,
                    matrix_store,
                    misc_db_parameters=dict(),
                    train_matrix_columns=train_matrix_columns)

                # assert
                # 1. that the returned predictions are of the desired length
                assert len(predict_proba) == 2

                # 2. that the predictions table entries are present and
                # can be linked to the original models
                records = [
                    row for row in db_engine.execute(
                        '''select entity_id, as_of_date
                        from {}_results.{}_predictions
                        join model_metadata.models using (model_id)'''.format(
                            mat_type, mat_type))
                ]
                assert len(records) == 2

                # 3. that the contained as_of_dates match what we sent in
                for record in records:
                    assert record[1].date() == AS_OF_DATE

                # 4. that the entity ids match the given dataset
                assert sorted([record[0] for record in records]) == [1, 2]

            # 5. running with same model_id, different as_of_date,
            # then with same as_of_date, only replaces the records
            # with the same date
            new_matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': [7, 8]
            }).set_index('entity_id')
            new_metadata = {
                'label_name': 'label',
                'end_time': AS_OF_DATE + datetime.timedelta(days=1),
                'label_timespan': '3month',
                'metta-uuid': '1234',
                'indices': ['entity_id'],
            }

            # Runs the same test for training and testing predictions
            for mat_type in ("train", "test"):
                # Create the matrix to be tested and store in db
                new_metadata['matrix_type'] = mat_type
                new_matrix_store = InMemoryMatrixStore(new_matrix, new_metadata)

                # Adding 'label' column back into new_matrix
                new_matrix['label'] = [7, 8]

                predictor.predict(
                    model_id,
                    new_matrix_store,
                    misc_db_parameters=dict(),
                    train_matrix_columns=train_matrix_columns)
                predictor.predict(
                    model_id,
                    matrix_store,
                    misc_db_parameters=dict(),
                    train_matrix_columns=train_matrix_columns)
                records = [
                    row for row in db_engine.execute(
                        '''select entity_id, as_of_date
                        from {}_results.{}_predictions
                        join model_metadata.models using (model_id)'''.format(
                            mat_type, mat_type))
                ]
                assert len(records) == 4

            # 6. that we can delete the model when done predicting with it
            predictor.delete_model(model_id)
            assert predictor.load_model(model_id) is None
def test_predictor_entity_index():
    with prepare() as (project_storage, db_engine, model_id):
        predictor = Predictor(project_storage.model_storage_engine(), db_engine)

        # Runs the same test for training and testing predictions
        for mat_type in ("train", "test"):
            matrix = matrix_creator(index="entity_id")
            metadata = matrix_metadata_creator(
                end_time=AS_OF_DATE, matrix_type=mat_type, indices=["entity_id"])
            matrix_store = get_matrix_store(project_storage, matrix, metadata)
            train_matrix_columns = matrix.columns[0:-1].tolist()

            predict_proba = predictor.predict(
                model_id,
                matrix_store,
                misc_db_parameters=dict(),
                train_matrix_columns=train_matrix_columns,
            )

            # assert
            # 1. that the returned predictions are of the desired length
            assert len(predict_proba) == 2

            # 2. that the predictions table entries are present and
            # can be linked to the original models
            records = [
                row for row in db_engine.execute(
                    """select entity_id, as_of_date
                    from {}_results.predictions
                    join model_metadata.models using (model_id)""".format(mat_type))
            ]
            assert len(records) == 2

            # 3. that the contained as_of_dates match what we sent in
            for record in records:
                assert record[1].date() == AS_OF_DATE

            # 4. that the entity ids match the given dataset
            assert sorted([record[0] for record in records]) == [1, 2]

        # 5. running with same model_id, different as_of_date,
        # then with same as_of_date, only replaces the records
        # with the same date

        # Runs the same test for training and testing predictions
        for mat_type in ("train", "test"):
            new_matrix = matrix_creator(index="entity_id")
            new_metadata = matrix_metadata_creator(
                end_time=AS_OF_DATE + datetime.timedelta(days=1),
                matrix_type=mat_type,
                indices=["entity_id"],
            )
            new_matrix_store = get_matrix_store(project_storage, new_matrix,
                                                new_metadata)

            predictor.predict(
                model_id,
                new_matrix_store,
                misc_db_parameters=dict(),
                train_matrix_columns=train_matrix_columns,
            )
            predictor.predict(
                model_id,
                matrix_store,
                misc_db_parameters=dict(),
                train_matrix_columns=train_matrix_columns,
            )
            records = [
                row for row in db_engine.execute(
                    """select entity_id, as_of_date
                    from {}_results.predictions
                    join model_metadata.models using (model_id)""".format(mat_type))
            ]
            assert len(records) == 4

        # 6. that we can delete the model when done predicting with it
        predictor.delete_model(model_id)
        assert predictor.load_model(model_id) is None
def test_integration():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        init_engine(db_engine)
        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')
            project_path = 'econ-dev/inspections'

            # create train and test matrices
            train_matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': [7, 8]
            }).set_index('entity_id')
            train_metadata = {
                'feature_start_time': datetime.date(2012, 12, 20),
                'end_time': datetime.date(2016, 12, 20),
                'label_name': 'label',
                'label_timespan': '1y',
                'feature_names': ['ft1', 'ft2'],
                'metta-uuid': '1234',
                'indices': ['entity_id'],
                'matrix_type': 'train'
            }
            # Creates a matrix entry in the matrices table with uuid from train_metadata
            MatrixFactory(matrix_uuid="1234")
            session.commit()

            train_store = InMemoryMatrixStore(train_matrix, sample_metadata())

            as_of_dates = [
                datetime.date(2016, 12, 21),
                datetime.date(2017, 1, 21)
            ]
            test_stores = [
                InMemoryMatrixStore(
                    pandas.DataFrame.from_dict({
                        'entity_id': [3],
                        'feature_one': [8],
                        'feature_two': [5],
                        'label': [5]
                    }), {
                        'label_name': 'label',
                        'label_timespan': '1y',
                        'end_time': as_of_date,
                        'metta-uuid': '1234',
                        'indices': ['entity_id'],
                        'matrix_type': 'test',
                        'as_of_date_frequency': '1month'
                    })
                for as_of_date in as_of_dates
            ]

            model_storage_engine = S3ModelStorageEngine(project_path)

            experiment_hash = save_experiment_and_get_hash({}, db_engine)

            # instantiate pipeline objects
            trainer = ModelTrainer(
                project_path=project_path,
                experiment_hash=experiment_hash,
                model_storage_engine=model_storage_engine,
                db_engine=db_engine,
            )
            predictor = Predictor(project_path, model_storage_engine, db_engine)
            model_evaluator = ModelEvaluator([{
                'metrics': ['precision@'],
                'thresholds': {'top_n': [5]}
            }], [{}], db_engine)

            # run the pipeline
            grid_config = {
                'sklearn.linear_model.LogisticRegression': {
                    'C': [0.00001, 0.0001],
                    'penalty': ['l1', 'l2'],
                    'random_state': [2193]
                }
            }
            model_ids = trainer.train_models(grid_config=grid_config,
                                             misc_db_parameters=dict(),
                                             matrix_store=train_store)

            for model_id in model_ids:
                for as_of_date, test_store in zip(as_of_dates, test_stores):
                    predictions_proba = predictor.predict(
                        model_id,
                        test_store,
                        misc_db_parameters=dict(),
                        train_matrix_columns=['feature_one', 'feature_two'])

                    model_evaluator.evaluate(
                        predictions_proba,
                        test_store,
                        model_id,
                    )

            # assert
            # 1. that the predictions table entries are present and
            # can be linked to the original models
            records = [
                row for row in db_engine.execute(
                    '''select entity_id, model_id, as_of_date
                    from test_results.test_predictions
                    join model_metadata.models using (model_id)
                    order by 3, 2''')
            ]
            assert records == [
                (3, 1, datetime.datetime(2016, 12, 21)),
                (3, 2, datetime.datetime(2016, 12, 21)),
                (3, 3, datetime.datetime(2016, 12, 21)),
                (3, 4, datetime.datetime(2016, 12, 21)),
                (3, 1, datetime.datetime(2017, 1, 21)),
                (3, 2, datetime.datetime(2017, 1, 21)),
                (3, 3, datetime.datetime(2017, 1, 21)),
                (3, 4, datetime.datetime(2017, 1, 21)),
            ]

            # that evaluations are there
            records = [
                row for row in db_engine.execute('''
                    select model_id, evaluation_start_time, metric, parameter
                    from test_results.test_evaluations order by 2, 1''')
            ]
            assert records == [
                (1, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (2, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (3, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (4, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (1, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
                (2, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
                (3, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
                (4, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
            ]
def test_predictor_retrieve():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        init_engine(db_engine)
        project_path = 'econ-dev/inspections'
        model_storage_engine = InMemoryModelStorageEngine(project_path)
        _, model_id = \
            fake_trained_model(project_path, model_storage_engine, db_engine,
                               train_matrix_uuid='1234')
        predictor = Predictor(project_path, model_storage_engine, db_engine,
                              replace=False)
        dayone = datetime.date(2011, 1, 1).strftime(
            predictor.expected_matrix_ts_format)
        daytwo = datetime.date(2011, 1, 2).strftime(
            predictor.expected_matrix_ts_format)
        # create prediction set
        matrix_data = {
            'entity_id': [1, 2, 1, 2],
            'as_of_date': [dayone, dayone, daytwo, daytwo],
            'feature_one': [3, 4, 5, 6],
            'feature_two': [5, 6, 7, 8],
            'label': [7, 8, 8, 7]
        }
        matrix = pandas.DataFrame.from_dict(matrix_data)\
            .set_index(['entity_id', 'as_of_date'])
        metadata = {
            'label_name': 'label',
            'end_time': AS_OF_DATE,
            'label_timespan': '3month',
            'metta-uuid': '1234',
            'indices': ['entity_id', 'as_of_date'],
            'matrix_type': 'test'
        }
        matrix_store = InMemoryMatrixStore(matrix, metadata)
        predict_proba = predictor.predict(
            model_id,
            matrix_store,
            misc_db_parameters=dict(),
            train_matrix_columns=['feature_one', 'feature_two'])

        # When run again, the predictions retrieved from the database
        # should match.
        #
        # Some trickiness here. Let's explain:
        #
        # If we are not careful, retrieving predictions from the database and
        # presenting them as a numpy array can result in a bad ordering,
        # since the given matrix may not be 'ordered' by some criteria
        # that can be easily represented by an ORDER BY clause.
        #
        # It will sometimes work, because without ORDER BY you will get
        # it back in the table's physical order, which unless something has
        # happened to the table will be the order you inserted it,
        # which could very well be the order in the matrix.
        # So it's not a bug that would necessarily immediately show itself,
        # but when it does go wrong your scores will be garbage.
        #
        # So we simulate a table order mutation that can happen over time:
        # Remove the first row and put it at the end.
        # If the Predictor doesn't explicitly reorder the results, this will fail.
        # Only running on TestPrediction because TrainPrediction behaves
        # the exact same way.
        reorder_session = sessionmaker(bind=db_engine)()
        obj = reorder_session.query(TestPrediction).first()
        reorder_session.delete(obj)
        reorder_session.commit()

        make_transient(obj)

        reorder_session = sessionmaker(bind=db_engine)()
        reorder_session.add(obj)
        reorder_session.commit()

        predictor.load_model = Mock()
        new_predict_proba = predictor.predict(
            model_id,
            matrix_store,
            misc_db_parameters=dict(),
            train_matrix_columns=['feature_one', 'feature_two'])
        assert_array_equal(new_predict_proba, predict_proba)
        assert not predictor.load_model.called
def test_predictor_retrieve():
    with prepare() as (project_storage, db_engine, model_id):
        predictor = Predictor(project_storage.model_storage_engine(), db_engine,
                              replace=False)
        # create prediction set
        matrix = matrix_creator()
        metadata = matrix_metadata_creator()

        matrix_store = get_matrix_store(project_storage, matrix, metadata)

        predict_proba = predictor.predict(
            model_id,
            matrix_store,
            misc_db_parameters=dict(),
            train_matrix_columns=matrix.columns[0:-1].tolist(),
        )

        # When run again, the predictions retrieved from the database
        # should match.
        #
        # Some trickiness here. Let's explain:
        #
        # If we are not careful, retrieving predictions from the database and
        # presenting them as a numpy array can result in a bad ordering,
        # since the given matrix may not be 'ordered' by some criteria
        # that can be easily represented by an ORDER BY clause.
        #
        # It will sometimes work, because without ORDER BY you will get
        # it back in the table's physical order, which unless something has
        # happened to the table will be the order you inserted it,
        # which could very well be the order in the matrix.
        # So it's not a bug that would necessarily immediately show itself,
        # but when it does go wrong your scores will be garbage.
        #
        # So we simulate a table order mutation that can happen over time:
        # Remove the first row and put it at the end.
        # If the Predictor doesn't explicitly reorder the results, this will fail.
        # Only running on TestPrediction because TrainPrediction behaves
        # the exact same way.
        try:
            reorder_session = sessionmaker(bind=db_engine)()
            obj = reorder_session.query(TestPrediction).first()
            reorder_session.delete(obj)
            reorder_session.commit()
        finally:
            reorder_session.close()

        make_transient(obj)

        try:
            reorder_session = sessionmaker(bind=db_engine)()
            reorder_session.add(obj)
            reorder_session.commit()
        finally:
            reorder_session.close()

        predictor.load_model = Mock()
        new_predict_proba = predictor.predict(
            model_id,
            matrix_store,
            misc_db_parameters=dict(),
            train_matrix_columns=matrix.columns[0:-1].tolist(),
        )
        assert_array_equal(new_predict_proba, predict_proba)
        assert not predictor.load_model.called
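# A minimal sketch of the explicit reordering that the comment in the test
# above warns about. This is not the Predictor's actual retrieval code: the
# table name comes from the schemas used in these tests, and the join back
# onto the matrix index assumes the matrix is indexed by entity_id and
# as_of_date.
import pandas as pd

def retrieve_scores_in_matrix_order(db_engine, model_id, matrix_store):
    stored = pd.read_sql(
        "select entity_id, as_of_date, score "
        "from test_results.predictions "
        "where model_id = %(model_id)s",
        db_engine,
        params={"model_id": model_id},
    )
    # Never rely on the table's physical order: merge the stored rows back
    # onto the matrix index so the returned array lines up with the matrix.
    index_frame = matrix_store.design_matrix.index.to_frame(index=False)
    aligned = index_frame.merge(stored, on=["entity_id", "as_of_date"], how="left")
    return aligned["score"].to_numpy()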
class ModelTester(object):
    def __init__(
        self,
        db_engine,
        model_storage_engine,
        matrix_storage_engine,
        replace,
        evaluator_config,
        individual_importance_config,
    ):
        self.matrix_storage_engine = matrix_storage_engine
        self.predictor = Predictor(
            db_engine=db_engine,
            model_storage_engine=model_storage_engine,
            replace=replace,
        )

        self.individual_importance_calculator = IndividualImportanceCalculator(
            db_engine=db_engine,
            n_ranks=individual_importance_config.get("n_ranks", 5),
            methods=individual_importance_config.get("methods", ["uniform"]),
            replace=replace,
        )

        self.evaluator = ModelEvaluator(
            db_engine=db_engine,
            sort_seed=evaluator_config.get("sort_seed", None),
            testing_metric_groups=evaluator_config.get("testing_metric_groups", []),
            training_metric_groups=evaluator_config.get("training_metric_groups", []),
        )

    def generate_model_test_tasks(self, split, train_store, model_ids):
        test_tasks = []
        for test_matrix_def, test_uuid in zip(
            split["test_matrices"], split["test_uuids"]
        ):
            test_store = self.matrix_storage_engine.get_store(test_uuid)

            if test_store.empty:
                logging.warning(
                    "Test matrix for uuid %s was empty, no point in generating "
                    "predictions. Not creating test task.",
                    test_uuid,
                )
                continue
            test_tasks.append({
                "test_store": test_store,
                "train_store": train_store,
                "model_ids": [model_id for model_id in model_ids if model_id],
            })
        return test_tasks

    def process_model_test_task(self, test_store, train_store, model_ids):
        as_of_times = test_store.metadata["as_of_times"]
        logging.info(
            "Testing and scoring all model ids with test matrix %s. "
            "as_of_times min: %s max: %s num: %s",
            test_store.uuid,
            min(as_of_times),
            max(as_of_times),
            len(as_of_times),
        )

        for model_id in model_ids:
            logging.info("Testing model id %s", model_id)
            self.individual_importance_calculator.calculate_and_save_all_methods_and_dates(
                model_id, test_store
            )

            # Generate predictions for the testing data then training data
            for store in (test_store, train_store):
                if self.evaluator.needs_evaluations(store, model_id):
                    logging.info(
                        "The evaluations needed for matrix %s-%s and model %s "
                        "are not all present in db, so predicting and evaluating",
                        store.uuid,
                        store.matrix_type,
                        model_id,
                    )
                    predictions_proba = self.predictor.predict(
                        model_id,
                        store,
                        misc_db_parameters=dict(),
                        train_matrix_columns=train_store.columns(),
                    )
                    self.evaluator.evaluate(
                        predictions_proba=predictions_proba,
                        matrix_store=store,
                        model_id=model_id,
                    )
                else:
                    logging.info(
                        "The evaluations needed for matrix %s-%s and model %s "
                        "are all present in db from a previous run "
                        "(or none needed at all), so skipping!",
                        store.uuid,
                        store.matrix_type,
                        model_id,
                    )
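# Hypothetical wiring for the ModelTester above, assuming a ProjectStorage
# rooted at a local path and an existing triage results database; the config
# values and the `split`, `train_store`, and `model_ids` inputs (normally
# produced by the training stage of an experiment) are placeholders.
from sqlalchemy import create_engine

db_engine = create_engine("postgresql://localhost/triage")
project_storage = ProjectStorage("/path/to/project")

tester = ModelTester(
    db_engine=db_engine,
    model_storage_engine=project_storage.model_storage_engine(),
    matrix_storage_engine=project_storage.matrix_storage_engine(),
    replace=False,
    evaluator_config={"testing_metric_groups": [], "training_metric_groups": []},
    individual_importance_config={},
)
for task in tester.generate_model_test_tasks(split, train_store, model_ids):
    tester.process_model_test_task(**task)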
class ModelTester(object):
    def __init__(self, db_engine, project_path, model_storage_engine, replace,
                 evaluator_config, individual_importance_config):
        self.predictor = Predictor(db_engine=db_engine,
                                   model_storage_engine=model_storage_engine,
                                   project_path=project_path,
                                   replace=replace)

        self.individual_importance_calculator = IndividualImportanceCalculator(
            db_engine=db_engine,
            n_ranks=individual_importance_config.get('n_ranks', 5),
            methods=individual_importance_config.get('methods', ['uniform']),
            replace=replace)

        self.evaluator = ModelEvaluator(
            db_engine=db_engine,
            sort_seed=evaluator_config.get('sort_seed', None),
            metric_groups=evaluator_config['metric_groups'],
            training_metric_groups=evaluator_config['training_metric_groups'])

    def generate_model_test_tasks(self, split, train_store, model_ids,
                                  matrix_store_creator):
        test_tasks = []
        for test_matrix_def, test_uuid in zip(split['test_matrices'],
                                              split['test_uuids']):
            test_store = matrix_store_creator(test_uuid)

            if test_store.empty:
                logging.warning(
                    'Test matrix for uuid %s was empty, no point in generating '
                    'predictions. Not creating test task.',
                    test_uuid)
                continue
            test_tasks.append({
                'test_store': test_store,
                'train_store': train_store,
                'model_ids': [model_id for model_id in model_ids if model_id]
            })
        return test_tasks

    def process_model_test_task(self, test_store, train_store, model_ids):
        as_of_times = test_store.metadata['as_of_times']
        logging.info(
            'Testing and scoring all model ids with test matrix %s. '
            'as_of_times min: %s max: %s num: %s',
            test_store.uuid, min(as_of_times), max(as_of_times), len(as_of_times))

        for model_id in model_ids:
            logging.info('Testing model id %s', model_id)
            self.individual_importance_calculator\
                .calculate_and_save_all_methods_and_dates(model_id, test_store)

            # Generate predictions for the testing data then training data
            for store in (test_store, train_store):
                predictions_proba = self.predictor.predict(
                    model_id,
                    store,
                    misc_db_parameters=dict(),
                    train_matrix_columns=train_store.columns())

                self.evaluator.evaluate(
                    predictions_proba=predictions_proba,
                    matrix_store=store,
                    model_id=model_id,
                )
def predictor(predict_setup_args):
    (project_storage, db_engine, model_id) = predict_setup_args
    return Predictor(project_storage.model_storage_engine(), db_engine,
                     rank_order='worst')
def add_predictions(db_engine,
                    model_groups,
                    project_path,
                    experiment_hashes=None,
                    train_end_times_range=None,
                    rank_order='worst',
                    replace=True):
    """For a set of model groups, generate test predictions and write them to the DB.

    Args:
        db_engine: Sqlalchemy engine
        model_groups (list): The list of model group ids we are interested in
            (ideally, chosen through audition)
        project_path (str): Path where the created matrices and trained model
            objects are stored for the experiment
        experiment_hashes (List[str]): Optional. Hash(es) of the experiments we
            are interested in. Can be used to narrow down the model_ids in the
            model groups specified
        train_end_times_range (Dict): Optional. If provided, only the models
            with train_end_times that fall in the range are scored. This too
            helps narrow down model_ids in the model groups specified. A
            dictionary with two possible keys, 'range_start_date' and
            'range_end_date'. Either or both can be set.
        rank_order (str): How to deal with ties in the scores
        replace (bool): Whether to overwrite the predictions for a model_id,
            if already found in the DB

    Returns: None. This directly writes to the test_results.predictions table.
    """
    model_matrix_info = _fetch_relevant_model_matrix_info(
        db_engine=db_engine,
        model_groups=model_groups,
        experiment_hashes=experiment_hashes)

    # If we are only generating predictions for a specific time range
    if train_end_times_range is not None:
        if 'range_start_date' in train_end_times_range:
            range_start = train_end_times_range['range_start_date']
            msk = (model_matrix_info['train_end_time'] >= range_start)
            logging.info(
                'Filtering out models with a train_end_time before {}'.format(
                    range_start))
            model_matrix_info = model_matrix_info[msk]

        if 'range_end_date' in train_end_times_range:
            range_end = train_end_times_range['range_end_date']
            msk = (model_matrix_info['train_end_time'] <= range_end)
            logging.info(
                'Filtering out models with a train_end_time after {}'.format(
                    range_end))
            model_matrix_info = model_matrix_info[msk]

    if len(model_matrix_info) == 0:
        raise ValueError('Config is not valid. No models were found!')

    # All the model groups specified in the config file should be valid
    # (even if the experiment_hashes and train_end_times are specified)
    not_fetched_model_grps = [
        x for x in model_groups
        if x not in model_matrix_info['model_group_id'].unique()
    ]
    if len(not_fetched_model_grps) > 0:
        raise ValueError(
            'The config is not valid. No models were found for the model '
            'group(s) {}. All specified model groups should be present'.format(
                not_fetched_model_grps))

    logging.info('Scoring {} model ids'.format(len(model_matrix_info)))

    # summary of the models that we are scoring, to check any special things worth noting
    _summary_of_models(model_matrix_info)

    logging.info('Instantiating storage engines and the predictor')

    # Storage objects to handle already stored models and matrices
    project_storage = ProjectStorage(project_path)
    model_storage_engine = project_storage.model_storage_engine()
    matrix_storage_engine = project_storage.matrix_storage_engine()

    # Prediction generation is handled by the Predictor class in catwalk
    predictor = Predictor(model_storage_engine=model_storage_engine,
                          db_engine=db_engine,
                          rank_order=rank_order,
                          replace=replace,
                          save_predictions=True)

    # Organizing the prediction run over unique (train_matrix, test_matrix)
    # pairs, to reduce the number of times the matrices get loaded into memory
    groupby_obj = model_matrix_info.groupby(
        ['train_matrix_uuid', 'test_matrix_uuid'])

    for group, _ in groupby_obj:
        train_uuid = group[0]
        test_uuid = group[1]

        df_grp = groupby_obj.get_group(group)

        logging.info(
            'Processing {} model_ids for train matrix {} and test matrix {}'.format(
                len(df_grp), train_uuid, test_uuid))

        train_matrix_store = matrix_storage_engine.get_store(
            matrix_uuid=train_uuid)
        # To ensure that the column order we use for predictions matches
        # the order we used in model training
        train_matrix_columns = list(train_matrix_store.design_matrix.columns)

        test_matrix_store = matrix_storage_engine.get_store(
            matrix_uuid=test_uuid)

        for model_id in df_grp['model_id'].tolist():
            logging.info(
                'Writing predictions for model_id {}'.format(model_id))
            predictor.predict(model_id=model_id,
                              matrix_store=test_matrix_store,
                              train_matrix_columns=train_matrix_columns,
                              misc_db_parameters={})

    logging.info('Successfully generated predictions for {} models!'.format(
        len(model_matrix_info)))
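# An illustrative call to add_predictions, assuming an experiment has already
# been run against this database and project path; the model group ids and
# date bounds here are placeholders.
from sqlalchemy import create_engine

db_engine = create_engine("postgresql://localhost/triage")
add_predictions(
    db_engine=db_engine,
    model_groups=[4, 5],
    project_path="/path/to/project",
    train_end_times_range={
        "range_start_date": "2015-01-01",
        "range_end_date": "2017-01-01",
    },
    rank_order="worst",
    replace=False,
)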
def predict_forward_with_existed_model(db_engine, project_path, model_id, as_of_date):
    """Predict forward given a model_id and an as_of_date, and store the
    predictions in the database.

    Args:
        db_engine (sqlalchemy.engine) An engine connected to the database
        project_path (str) Path to the project directory
            (see catwalk.storage.ProjectStorage)
        model_id (int) The id of a given model in the database
        as_of_date (string) A date string like "YYYY-MM-DD"
    """
    logger.spam("In PREDICT LIST................")
    upgrade_db(db_engine=db_engine)
    project_storage = ProjectStorage(project_path)
    matrix_storage_engine = project_storage.matrix_storage_engine()

    # 1. Get feature and cohort config from database
    (train_matrix_uuid, matrix_metadata) = train_matrix_info_from_model_id(
        db_engine, model_id)
    experiment_config = experiment_config_from_model_id(db_engine, model_id)

    # 2. Generate cohort
    cohort_table_name = (
        f"triage_production.cohort_{experiment_config['cohort_config']['name']}")
    cohort_table_generator = EntityDateTableGenerator(
        db_engine=db_engine,
        query=experiment_config['cohort_config']['query'],
        entity_date_table_name=cohort_table_name)
    cohort_table_generator.generate_entity_date_table(
        as_of_dates=[dt_from_str(as_of_date)])

    # 3. Generate feature aggregations
    feature_generator = FeatureGenerator(
        db_engine=db_engine,
        features_schema_name="triage_production",
        feature_start_time=experiment_config['temporal_config']['feature_start_time'],
    )
    collate_aggregations = feature_generator.aggregations(
        feature_aggregation_config=experiment_config['feature_aggregations'],
        feature_dates=[as_of_date],
        state_table=cohort_table_name)
    feature_generator.process_table_tasks(
        feature_generator.generate_all_table_tasks(collate_aggregations,
                                                   task_type='aggregation'))

    # 4. Reconstruct the feature dictionary from feature_names and generate imputation
    reconstructed_feature_dict = FeatureGroup()
    imputation_table_tasks = OrderedDict()

    for aggregation in collate_aggregations:
        feature_group, feature_names = get_feature_names(
            aggregation, matrix_metadata)
        reconstructed_feature_dict[feature_group] = feature_names

        # Make sure that the features imputed in training are also imputed in production
        features_imputed_in_train = get_feature_needs_imputation_in_train(
            aggregation, feature_names)
        features_imputed_in_production = get_feature_needs_imputation_in_production(
            aggregation, db_engine)

        total_impute_cols = set(features_imputed_in_production) | set(
            features_imputed_in_train)
        total_nonimpute_cols = set(
            f for f in set(feature_names) if '_imp' not in f) - total_impute_cols

        task_generator = feature_generator._generate_imp_table_tasks_for

        imputation_table_tasks.update(
            task_generator(aggregation,
                           impute_cols=list(total_impute_cols),
                           nonimpute_cols=list(total_nonimpute_cols)))
    feature_generator.process_table_tasks(imputation_table_tasks)

    # 5. Build matrix
    db_config = {
        "features_schema_name": "triage_production",
        "labels_schema_name": "public",
        "cohort_table_name": cohort_table_name,
    }

    matrix_builder = MatrixBuilder(
        db_config=db_config,
        matrix_storage_engine=matrix_storage_engine,
        engine=db_engine,
        experiment_hash=None,
        replace=True,
    )

    feature_start_time = experiment_config['temporal_config']['feature_start_time']
    label_name = experiment_config['label_config']['name']
    label_type = 'binary'
    cohort_name = experiment_config['cohort_config']['name']
    user_metadata = experiment_config['user_metadata']

    # Use timechop to get the time definition for production
    temporal_config = experiment_config["temporal_config"]
    temporal_config.update(
        temporal_params_from_matrix_metadata(db_engine, model_id))
    timechopper = Timechop(**temporal_config)
    prod_definitions = timechopper.define_test_matrices(
        train_test_split_time=dt_from_str(as_of_date),
        test_duration=temporal_config['test_durations'][0],
        test_label_timespan=temporal_config['test_label_timespans'][0])

    matrix_metadata = Planner.make_metadata(
        prod_definitions[-1],
        reconstructed_feature_dict,
        label_name,
        label_type,
        cohort_name,
        'production',
        feature_start_time,
        user_metadata,
    )

    matrix_metadata['matrix_id'] = (
        str(as_of_date) + f'_model_id_{model_id}' + '_risklist')

    matrix_uuid = filename_friendly_hash(matrix_metadata)

    matrix_builder.build_matrix(
        as_of_times=[as_of_date],
        label_name=label_name,
        label_type=label_type,
        feature_dictionary=reconstructed_feature_dict,
        matrix_metadata=matrix_metadata,
        matrix_uuid=matrix_uuid,
        matrix_type="production",
    )

    # 6. Predict the risk score for production
    predictor = Predictor(
        model_storage_engine=project_storage.model_storage_engine(),
        db_engine=db_engine,
        rank_order='best')

    predictor.predict(
        model_id=model_id,
        matrix_store=matrix_storage_engine.get_store(matrix_uuid),
        misc_db_parameters={},
        train_matrix_columns=matrix_storage_engine.get_store(
            train_matrix_uuid).columns())
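# An illustrative call; the model id and date are placeholders, and the
# project path must contain the matrices and model objects saved by the
# original experiment run.
from sqlalchemy import create_engine

db_engine = create_engine("postgresql://localhost/triage")
predict_forward_with_existed_model(
    db_engine=db_engine,
    project_path="/path/to/project",
    model_id=42,
    as_of_date="2021-06-01",
)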
def predict(self, prediction_date):
    """Predict forward by creating a matrix with as_of_date = prediction_date
    and applying the retrained model to it.

    Args:
        prediction_date (str)
    """
    cohort_table_name = (
        f"triage_production.cohort_"
        f"{self.experiment_config['cohort_config']['name']}_predict")

    # 1. Generate cohort
    self.generate_entity_date_table(prediction_date, cohort_table_name)

    # 2. Generate feature aggregations
    collate_aggregations = self.get_collate_aggregations(
        prediction_date, cohort_table_name)
    self.feature_generator.process_table_tasks(
        self.feature_generator.generate_all_table_tasks(
            collate_aggregations, task_type='aggregation'))

    # 3. Reconstruct the feature dictionary from feature_names and generate imputation
    reconstructed_feature_dict, imputation_table_tasks = \
        self.get_feature_dict_and_imputation_task(
            collate_aggregations, self.retrain_model_id)
    self.feature_generator.process_table_tasks(imputation_table_tasks)

    # 4. Build matrix
    db_config = {
        "features_schema_name": "triage_production",
        "labels_schema_name": "public",
        "cohort_table_name": cohort_table_name,
    }

    matrix_builder = MatrixBuilder(
        db_config=db_config,
        matrix_storage_engine=self.matrix_storage_engine,
        engine=self.db_engine,
        experiment_hash=None,
        replace=True,
    )

    # Use timechop to get the time definition for production
    temporal_config = self.get_temporal_config_for_retrain(
        dt_from_str(prediction_date))
    timechopper = Timechop(**temporal_config)

    retrain_config = get_retrain_config_from_model_id(
        self.db_engine, self.retrain_model_id)

    prod_definitions = timechopper.define_test_matrices(
        train_test_split_time=dt_from_str(prediction_date),
        test_duration=retrain_config['test_duration'],
        test_label_timespan=retrain_config['test_label_timespan'])
    last_split_definition = prod_definitions[-1]

    matrix_metadata = Planner.make_metadata(
        matrix_definition=last_split_definition,
        feature_dictionary=reconstructed_feature_dict,
        label_name=self.label_name,
        label_type='binary',
        cohort_name=self.cohort_name,
        matrix_type='production',
        feature_start_time=self.feature_start_time,
        user_metadata=self.user_metadata,
    )
    matrix_metadata['matrix_id'] = (
        str(prediction_date) + f'_model_id_{self.retrain_model_id}' + '_risklist')
    matrix_uuid = filename_friendly_hash(matrix_metadata)

    matrix_builder.build_matrix(
        as_of_times=[prediction_date],
        label_name=self.label_name,
        label_type='binary',
        feature_dictionary=reconstructed_feature_dict,
        matrix_metadata=matrix_metadata,
        matrix_uuid=matrix_uuid,
        matrix_type="production",
    )

    # 5. Predict the risk score for production
    predictor = Predictor(
        model_storage_engine=self.project_storage.model_storage_engine(),
        db_engine=self.db_engine,
        rank_order='best')

    predictor.predict(
        model_id=self.retrain_model_id,
        matrix_store=self.matrix_storage_engine.get_store(matrix_uuid),
        misc_db_parameters={},
        train_matrix_columns=self.matrix_storage_engine.get_store(
            self.retrain_matrix_uuid).columns(),
    )
    self.predict_matrix_uuid = matrix_uuid
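# Hypothetical usage of the predict method above, assuming `retrainer` is an
# already-configured instance of the class this method belongs to, with a
# retrained model on hand; the date is a placeholder.
retrainer.predict("2021-06-01")
# The uuid of the production matrix is kept for downstream lookups:
prod_matrix_store = retrainer.matrix_storage_engine.get_store(
    retrainer.predict_matrix_uuid)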
def test_integration():
    with rig_engines() as (db_engine, project_storage):
        train_store = get_matrix_store(
            project_storage,
            matrix_creator(),
            matrix_metadata_creator(matrix_type="train"),
        )
        as_of_dates = [datetime.date(2016, 12, 21), datetime.date(2017, 1, 21)]

        test_stores = []
        for as_of_date in as_of_dates:
            matrix_store = get_matrix_store(
                project_storage,
                pandas.DataFrame.from_dict({
                    "entity_id": [3],
                    "feature_one": [8],
                    "feature_two": [5],
                    "label": [0],
                }).set_index("entity_id"),
                matrix_metadata_creator(end_time=as_of_date, indices=["entity_id"]),
            )
            test_stores.append(matrix_store)

        model_storage_engine = ModelStorageEngine(project_storage)

        experiment_hash = save_experiment_and_get_hash({}, db_engine)

        # instantiate pipeline objects
        trainer = ModelTrainer(
            experiment_hash=experiment_hash,
            model_storage_engine=model_storage_engine,
            db_engine=db_engine,
        )
        predictor = Predictor(model_storage_engine, db_engine)
        model_evaluator = ModelEvaluator(
            [{"metrics": ["precision@"], "thresholds": {"top_n": [5]}}],
            [{}],
            db_engine,
        )

        # run the pipeline
        grid_config = {
            "sklearn.linear_model.LogisticRegression": {
                "C": [0.00001, 0.0001],
                "penalty": ["l1", "l2"],
                "random_state": [2193],
            }
        }
        model_ids = trainer.train_models(grid_config=grid_config,
                                         misc_db_parameters=dict(),
                                         matrix_store=train_store)

        for model_id in model_ids:
            for as_of_date, test_store in zip(as_of_dates, test_stores):
                predictions_proba = predictor.predict(
                    model_id,
                    test_store,
                    misc_db_parameters=dict(),
                    train_matrix_columns=["feature_one", "feature_two"],
                )
                model_evaluator.evaluate(predictions_proba, test_store, model_id)

        # assert
        # 1. that the predictions table entries are present and
        # can be linked to the original models
        records = [
            row for row in db_engine.execute(
                """select entity_id, model_id, as_of_date
                from test_results.predictions
                join model_metadata.models using (model_id)
                order by 3, 2""")
        ]
        assert records == [
            (3, 1, datetime.datetime(2016, 12, 21)),
            (3, 2, datetime.datetime(2016, 12, 21)),
            (3, 3, datetime.datetime(2016, 12, 21)),
            (3, 4, datetime.datetime(2016, 12, 21)),
            (3, 1, datetime.datetime(2017, 1, 21)),
            (3, 2, datetime.datetime(2017, 1, 21)),
            (3, 3, datetime.datetime(2017, 1, 21)),
            (3, 4, datetime.datetime(2017, 1, 21)),
        ]

        # that evaluations are there
        records = [
            row for row in db_engine.execute("""
                select model_id, evaluation_start_time, metric, parameter
                from test_results.evaluations order by 2, 1""")
        ]
        assert records == [
            (1, datetime.datetime(2016, 12, 21), "precision@", "5_abs"),
            (2, datetime.datetime(2016, 12, 21), "precision@", "5_abs"),
            (3, datetime.datetime(2016, 12, 21), "precision@", "5_abs"),
            (4, datetime.datetime(2016, 12, 21), "precision@", "5_abs"),
            (1, datetime.datetime(2017, 1, 21), "precision@", "5_abs"),
            (2, datetime.datetime(2017, 1, 21), "precision@", "5_abs"),
            (3, datetime.datetime(2017, 1, 21), "precision@", "5_abs"),
            (4, datetime.datetime(2017, 1, 21), "precision@", "5_abs"),
        ]