def __init__(
    self,
    config,
    db_engine,
    model_storage_class=FSModelStorageEngine,
    project_path=None,
    replace=True,
    cleanup=False,
    cleanup_timeout=None,
):
    """Validate config, wire up storage engines, migrate the results DB,
    record this experiment's hash, and build components.

    Args:
        config (dict): experiment configuration; version-checked first
        db_engine (sqlalchemy engine or serializable equivalent)
        model_storage_class: factory for the model storage engine
        project_path (str): root directory for matrices/models on disk
        replace (bool): whether to overwrite existing artifacts
        cleanup (bool): drop intermediate tables after matrix creation
        cleanup_timeout (int or None): None keeps the pre-existing
            ``self.cleanup_timeout`` value (presumably a class-level
            default — TODO confirm on the enclosing class)
    """
    self._check_config_version(config)
    self.config = config
    if isinstance(db_engine, Engine):
        # A live Engine can't be pickled across processes, so keep only
        # its URL and rebuild a fresh engine from it.
        logging.warning(
            'Raw, unserializable SQLAlchemy engine passed. URL will be used, other options may be lost in multi-process environments'
        )
        self.db_engine = create_engine(db_engine.url)
    else:
        self.db_engine = db_engine
    # NOTE(review): if model_storage_class is falsy, self.model_storage_engine
    # is never assigned — later attribute access would raise. Confirm callers
    # always pass a storage class.
    if model_storage_class:
        self.model_storage_engine = model_storage_class(
            project_path=project_path)
    self.matrix_store_class = CSVMatrixStore  # can't be configurable until Architect obeys
    self.project_path = project_path
    self.replace = replace
    # Run schema migrations before anything writes to the results DB.
    upgrade_db(db_engine=self.db_engine)
    self.features_schema_name = 'features'
    if project_path:
        self.matrices_directory = os.path.join(self.project_path, 'matrices')
        if not os.path.exists(self.matrices_directory):
            os.makedirs(self.matrices_directory)
    # Persist the config and derive a hash that namespaces per-experiment tables.
    self.experiment_hash = save_experiment_and_get_hash(
        self.config, self.db_engine)
    self.labels_table_name = 'labels_{}'.format(self.experiment_hash)
    self.initialize_components()
    self.cleanup = cleanup
    if self.cleanup:
        logging.info(
            'cleanup is set to True, so intermediate tables (labels and states) will be removed after matrix creation'
        )
    else:
        logging.info(
            'cleanup is set to False, so intermediate tables (labels and states) will not be removed after matrix creation'
        )
    # None means "keep whatever self.cleanup_timeout already is" (reads the
    # class attribute the first time through).
    self.cleanup_timeout = (self.cleanup_timeout if cleanup_timeout is None
                            else cleanup_timeout)
def __init__(
    self,
    config,
    db_engine,
    project_path=None,
    matrix_storage_class=CSVMatrixStore,
    replace=True,
    cleanup=False,
    cleanup_timeout=None,
    materialize_subquery_fromobjs=True,
    features_ignore_cohort=False,
    profile=False,
    save_predictions=True,
):
    """Validate config, wire up project/model/matrix storage, migrate the
    results database, record this experiment's hash, and build components.

    Args:
        config (dict): experiment configuration; version-checked first
        db_engine: SQLAlchemy engine for the results database
        project_path (str): root directory for on-disk artifacts
        matrix_storage_class: storage backend for matrices
        replace (bool): overwrite existing artifacts
        cleanup (bool): drop intermediate tables when done
        cleanup_timeout (int or None): None keeps the pre-existing
            ``self.cleanup_timeout`` value
        materialize_subquery_fromobjs (bool): feature-generation option
        features_ignore_cohort (bool): feature-generation option
        profile (bool): emit profiling stats
        save_predictions (bool): persist predictions to the database
    """
    self._check_config_version(config)
    self.config = config

    # One ProjectStorage instance backs both the model and matrix engines.
    self.project_storage = ProjectStorage(project_path)
    self.model_storage_engine = ModelStorageEngine(self.project_storage)
    self.matrix_storage_engine = MatrixStorageEngine(
        self.project_storage, matrix_storage_class)
    self.project_path = project_path

    # Plain flags copied straight off the arguments.
    self.replace = replace
    self.save_predictions = save_predictions
    self.materialize_subquery_fromobjs = materialize_subquery_fromobjs
    self.features_ignore_cohort = features_ignore_cohort

    # Migrate the results schema before anything writes to it.
    self.db_engine = db_engine
    upgrade_db(db_engine=self.db_engine)
    self.features_schema_name = "features"

    # Persist the config and derive the hash that namespaces this experiment.
    self.experiment_hash = save_experiment_and_get_hash(
        self.config, self.db_engine)
    self.initialize_components()

    self.cleanup = cleanup
    if self.cleanup:
        cleanup_message = (
            "cleanup is set to True, so intermediate tables (labels and cohort) "
            "will be removed after matrix creation and subset tables will be "
            "removed after model training and testing")
    else:
        cleanup_message = (
            "cleanup is set to False, so intermediate tables (labels, cohort, and subsets) "
            "will not be removed")
    logging.info(cleanup_message)

    if cleanup_timeout is None:
        # Keep whatever self.cleanup_timeout already resolves to
        # (presumably a class-level default).
        self.cleanup_timeout = self.cleanup_timeout
    else:
        self.cleanup_timeout = cleanup_timeout

    self.profile = profile
    logging.info("Generate profiling stats? (profile option): %s",
                 self.profile)
def __init__(
    self,
    config,
    db_engine,
    project_path=None,
    matrix_storage_class=CSVMatrixStore,
    replace=True,
    cleanup=False,
    cleanup_timeout=None,
):
    """Validate config, wire up storage engines, migrate the results DB,
    record this experiment's hash, and build components.

    Args:
        config (dict): experiment configuration; version-checked first
        db_engine: SQLAlchemy engine (a raw Engine is rebuilt from its URL)
        project_path (str): root directory for on-disk artifacts
        matrix_storage_class: storage backend for matrices
        replace (bool): overwrite existing artifacts
        cleanup (bool): drop intermediate tables after matrix creation
        cleanup_timeout (int or None): None keeps the pre-existing
            ``self.cleanup_timeout`` value (presumably a class-level
            default — TODO confirm on the enclosing class)
    """
    self._check_config_version(config)
    self.config = config
    if isinstance(db_engine, Engine):
        # A live Engine can't be pickled across processes, so keep only
        # its URL and rebuild a fresh engine from it.
        logging.warning(
            "Raw, unserializable SQLAlchemy engine passed. "
            "URL will be used, other options may be lost in multi-process environments"
        )
        self.db_engine = create_engine(db_engine.url)
    else:
        self.db_engine = db_engine
    # One ProjectStorage instance backs both the model and matrix engines.
    self.project_storage = ProjectStorage(project_path)
    self.model_storage_engine = ModelStorageEngine(self.project_storage)
    self.matrix_storage_engine = MatrixStorageEngine(
        self.project_storage, matrix_storage_class)
    self.project_path = project_path
    self.replace = replace
    # Run schema migrations before anything writes to the results DB.
    upgrade_db(db_engine=self.db_engine)
    self.features_schema_name = "features"
    # Persist the config and derive a hash that namespaces per-experiment tables.
    self.experiment_hash = save_experiment_and_get_hash(
        self.config, self.db_engine)
    self.labels_table_name = "labels_{}".format(self.experiment_hash)
    self.initialize_components()
    self.cleanup = cleanup
    if self.cleanup:
        logging.info(
            "cleanup is set to True, so intermediate tables (labels and states) "
            "will be removed after matrix creation")
    else:
        logging.info(
            "cleanup is set to False, so intermediate tables (labels and states) "
            "will not be removed after matrix creation")
    # None means "keep whatever self.cleanup_timeout already is".
    self.cleanup_timeout = (self.cleanup_timeout if cleanup_timeout is None
                            else cleanup_timeout)
def upgrade(self, args):
    """Upgrade triage results database.

    Thin CLI wrapper: delegates entirely to ``upgrade_db``.
    """
    # NOTE(review): args.dbfile is passed positionally; the sibling CLI
    # variant calls upgrade_db(revision=..., dburl=...) — confirm that
    # upgrade_db's first positional parameter accepts a db file here.
    upgrade_db(args.dbfile)
def upgrade(self, args):
    """Upgrade triage results database.

    Thin CLI wrapper: runs migrations up to ``args.revision`` against the
    database URL held by the root command object.
    """
    upgrade_db(revision=args.revision, dburl=self.root.db_url)
def predict_forward_with_existed_model(db_engine, project_path, model_id, as_of_date):
    """Predict forward given model_id and as_of_date and store the prediction in database

    Rebuilds the cohort, features, and a "production" matrix for the single
    date ``as_of_date`` using the stored experiment config of ``model_id``,
    then scores that matrix with the existing model. Results are written to
    the database; nothing is returned.

    Args:
            db_engine (sqlalchemy.db.engine)
            project_storage (catwalk.storage.ProjectStorage)
            model_id (int) The id of a given model in the database
            as_of_date (string) a date string like "YYYY-MM-DD"
    """
    logger.spam("In PREDICT LIST................")
    # Make sure the results schema is migrated before writing anything.
    upgrade_db(db_engine=db_engine)
    project_storage = ProjectStorage(project_path)
    matrix_storage_engine = project_storage.matrix_storage_engine()

    # 1. Get feature and cohort config from database
    # matrix_metadata here is the TRAIN matrix's metadata; it is rebound to
    # the production matrix's metadata in step 5 below.
    (train_matrix_uuid, matrix_metadata) = train_matrix_info_from_model_id(
        db_engine, model_id)
    experiment_config = experiment_config_from_model_id(db_engine, model_id)

    # 2. Generate cohort — a one-date entity/date table in triage_production.
    cohort_table_name = f"triage_production.cohort_{experiment_config['cohort_config']['name']}"
    cohort_table_generator = EntityDateTableGenerator(
        db_engine=db_engine,
        query=experiment_config['cohort_config']['query'],
        entity_date_table_name=cohort_table_name)
    cohort_table_generator.generate_entity_date_table(
        as_of_dates=[dt_from_str(as_of_date)])

    # 3. Generate feature aggregations for the single as_of_date.
    feature_generator = FeatureGenerator(
        db_engine=db_engine,
        features_schema_name="triage_production",
        feature_start_time=experiment_config['temporal_config']
        ['feature_start_time'],
    )
    collate_aggregations = feature_generator.aggregations(
        feature_aggregation_config=experiment_config['feature_aggregations'],
        feature_dates=[as_of_date],
        state_table=cohort_table_name)
    feature_generator.process_table_tasks(
        feature_generator.generate_all_table_tasks(collate_aggregations,
                                                   task_type='aggregation'))

    # 4. Reconstruct feature dictionary from feature_names and generate imputation
    reconstructed_feature_dict = FeatureGroup()
    imputation_table_tasks = OrderedDict()
    for aggregation in collate_aggregations:
        feature_group, feature_names = get_feature_names(
            aggregation, matrix_metadata)
        reconstructed_feature_dict[feature_group] = feature_names
        # Make sure that the features imputed in training should also be imputed in production
        features_imputed_in_train = get_feature_needs_imputation_in_train(
            aggregation, feature_names)
        features_imputed_in_production = get_feature_needs_imputation_in_production(
            aggregation, db_engine)
        total_impute_cols = set(features_imputed_in_production) | set(
            features_imputed_in_train)
        # Columns whose names lack '_imp' and that don't need imputation.
        total_nonimpute_cols = set(f for f in set(feature_names)
                                   if '_imp' not in f) - total_impute_cols
        # NOTE(review): reaches into a private FeatureGenerator method.
        task_generator = feature_generator._generate_imp_table_tasks_for
        imputation_table_tasks.update(
            task_generator(aggregation,
                           impute_cols=list(total_impute_cols),
                           nonimpute_cols=list(total_nonimpute_cols)))
    feature_generator.process_table_tasks(imputation_table_tasks)

    # 5. Build matrix
    db_config = {
        "features_schema_name": "triage_production",
        "labels_schema_name": "public",
        "cohort_table_name": cohort_table_name,
    }
    matrix_builder = MatrixBuilder(
        db_config=db_config,
        matrix_storage_engine=matrix_storage_engine,
        engine=db_engine,
        experiment_hash=None,
        replace=True,
    )
    feature_start_time = experiment_config['temporal_config'][
        'feature_start_time']
    label_name = experiment_config['label_config']['name']
    label_type = 'binary'
    cohort_name = experiment_config['cohort_config']['name']
    user_metadata = experiment_config['user_metadata']

    # Use timechop to get the time definition for production
    temporal_config = experiment_config["temporal_config"]
    # Override with the temporal parameters actually used by this model.
    temporal_config.update(
        temporal_params_from_matrix_metadata(db_engine, model_id))
    timechopper = Timechop(**temporal_config)
    prod_definitions = timechopper.define_test_matrices(
        train_test_split_time=dt_from_str(as_of_date),
        test_duration=temporal_config['test_durations'][0],
        test_label_timespan=temporal_config['test_label_timespans'][0])
    # Rebinds matrix_metadata: from here on it describes the NEW production
    # matrix, not the training matrix fetched in step 1.
    matrix_metadata = Planner.make_metadata(
        prod_definitions[-1],
        reconstructed_feature_dict,
        label_name,
        label_type,
        cohort_name,
        'production',
        feature_start_time,
        user_metadata,
    )
    matrix_metadata['matrix_id'] = str(
        as_of_date) + f'_model_id_{model_id}' + '_risklist'
    matrix_uuid = filename_friendly_hash(matrix_metadata)
    matrix_builder.build_matrix(
        as_of_times=[as_of_date],
        label_name=label_name,
        label_type=label_type,
        feature_dictionary=reconstructed_feature_dict,
        matrix_metadata=matrix_metadata,
        matrix_uuid=matrix_uuid,
        matrix_type="production",
    )

    # 6. Predict the risk score for production
    predictor = Predictor(
        model_storage_engine=project_storage.model_storage_engine(),
        db_engine=db_engine,
        rank_order='best')
    predictor.predict(
        model_id=model_id,
        matrix_store=matrix_storage_engine.get_store(matrix_uuid),
        misc_db_parameters={},
        # Column order must match the training matrix for the model to score.
        train_matrix_columns=matrix_storage_engine.get_store(
            train_matrix_uuid).columns())
def __init__(self, db_engine, project_path, model_group_id):
    """Load a model group's stored experiment config and build the
    components needed to retrain/predict forward for that group.

    Args:
        db_engine: SQLAlchemy engine for the results database
        project_path (str): root directory for on-disk artifacts
        model_group_id (int): the model group to operate on
    """
    self.retrain_hash = None
    self.db_engine = db_engine
    # Make sure the results schema is migrated before reading/writing.
    upgrade_db(db_engine=self.db_engine)
    self.project_storage = ProjectStorage(project_path)
    self.model_group_id = model_group_id
    self.model_group_info = get_model_group_info(self.db_engine,
                                                 self.model_group_id)
    self.matrix_storage_engine = self.project_storage.matrix_storage_engine(
    )
    self.triage_run_id, self.experiment_config = experiment_config_from_model_group_id(
        self.db_engine, self.model_group_id)

    # This feels like it needs some refactoring since in some edge cases at least the test matrix temporal parameters
    # might differ across models in the model group (the training ones shouldn't), but this should probably work for
    # the vast majority of use cases...
    self.experiment_config['temporal_config'].update(
        temporal_params_from_matrix_metadata(
            self.db_engine, self.model_group_info['model_id_last_split']))

    # Since "testing" here is predicting forward to a single new date, the test_duration should always be '0day'
    # (regardless of what it may have been before)
    self.experiment_config['temporal_config']['test_durations'] = ['0day']

    # These lists should now only contain one item (the value actually used for the last model in this group)
    self.training_label_timespan = self.experiment_config[
        'temporal_config']['training_label_timespans'][0]
    self.test_label_timespan = self.experiment_config['temporal_config'][
        'test_label_timespans'][0]
    self.test_duration = self.experiment_config['temporal_config'][
        'test_durations'][0]
    self.feature_start_time = self.experiment_config['temporal_config'][
        'feature_start_time']

    self.label_name = self.experiment_config['label_config']['name']
    self.cohort_name = self.experiment_config['cohort_config']['name']
    self.user_metadata = self.experiment_config['user_metadata']

    # Components below all write into the triage_production schema.
    self.feature_dictionary_creator = FeatureDictionaryCreator(
        features_schema_name='triage_production', db_engine=self.db_engine)
    self.label_generator = LabelGenerator(
        label_name=self.experiment_config['label_config'].get(
            "name", None),
        query=self.experiment_config['label_config']["query"],
        replace=True,
        db_engine=self.db_engine,
    )
    # Labels table is namespaced by label name plus a hash of the label query.
    self.labels_table_name = "labels_{}_{}_production".format(
        self.experiment_config['label_config'].get('name', 'default'),
        filename_friendly_hash(
            self.experiment_config['label_config']['query']))
    self.feature_generator = FeatureGenerator(
        db_engine=self.db_engine,
        features_schema_name="triage_production",
        feature_start_time=self.feature_start_time,
    )
    self.model_trainer = ModelTrainer(
        experiment_hash=None,
        model_storage_engine=ModelStorageEngine(self.project_storage),
        db_engine=self.db_engine,
        replace=True,
        run_id=self.triage_run_id,
    )