def setUpClass(cls):
    """Build the regression and classification trainers shared by the tests.

    Loads the diabetes sample data, drops the ``PatientID`` column, runs the
    data through the full preparation pipeline once per model type, and
    stores one ``AdvancedSupervisedModelTrainer`` per model type on the class
    (``cls.regression_trainer`` and ``cls.classification_trainer``).
    """
    cls.df = hcai_datasets.load_diabetes()

    # Drop columns that won't help machine learning
    columns_to_remove = ['PatientID']
    cls.df.drop(columns_to_remove, axis=1, inplace=True)

    # Seed numpy's global RNG before any data preparation happens.
    np.random.seed(42)

    clean_regression_df = pipelines.full_pipeline(
        REGRESSION,
        REGRESION_PREDICTED_COLUMN,
        GRAIN_COLUMN_NAME,
        impute=True).fit_transform(cls.df)

    clean_classification_df = pipelines.full_pipeline(
        CLASSIFICATION,
        CLASSIFICATION_PREDICTED_COLUMN,
        GRAIN_COLUMN_NAME,
        impute=True).fit_transform(cls.df)

    cls.regression_trainer = AdvancedSupervisedModelTrainer(
        pipelines.full_pipeline(
            REGRESSION,
            REGRESION_PREDICTED_COLUMN,
            GRAIN_COLUMN_NAME,
            impute=True),
        clean_regression_df,
        REGRESSION,
        REGRESION_PREDICTED_COLUMN)

    # The first argument is the pipeline object itself, exactly as for the
    # regression trainer above. Previously the *output* of fit_transform
    # (a prepared dataframe) was passed in the pipeline position.
    cls.classification_trainer = AdvancedSupervisedModelTrainer(
        pipelines.full_pipeline(
            CLASSIFICATION,
            CLASSIFICATION_PREDICTED_COLUMN,
            GRAIN_COLUMN_NAME,
            impute=True),
        clean_classification_df,
        CLASSIFICATION,
        CLASSIFICATION_PREDICTED_COLUMN)
def __init__(self, dataframe, predicted_column, model_type, impute=True, grain_column=None, verbose=False):
    """
    Create a SupervisedModelTrainer: prepare the data and wrap an advanced trainer.

    Args:
        dataframe (pandas.core.frame.DataFrame): The training data in a pandas dataframe
        predicted_column (str): The name of the prediction column
        model_type (str): The trainer type - 'classification' or 'regression'
        impute (bool): True to impute data (mean of numeric columns and mode of categorical ones).
            False to drop rows that contain any null values.
        grain_column (str): The name of the grain column
        verbose (bool): Set to True for verbose output. Defaults to False.
    """
    self.predicted_column = predicted_column
    self.grain_column = grain_column

    # Two pipelines are built from the same positional settings:
    #   * the training pipeline honours the caller's `impute` choice;
    #   * the prediction pipeline always imputes, because dropping rows at
    #     prediction time would leave some rows without a prediction.
    pipeline_settings = (model_type, predicted_column, grain_column)
    training_pipeline = hcai_pipelines.full_pipeline(*pipeline_settings, impute=impute)
    prediction_pipeline = hcai_pipelines.full_pipeline(*pipeline_settings, impute=True)

    # Push the raw data through both pipelines. Only the training output is
    # kept; the prediction pipeline is fitted here and its output discarded.
    prepared_dataframe = training_pipeline.fit_transform(dataframe)
    prediction_pipeline.fit_transform(dataframe)

    # Build the advanced trainer that this class delegates to
    trainer = self._advanced_trainer = AdvancedSupervisedModelTrainer(
        dataframe=prepared_dataframe,
        model_type=model_type,
        predicted_column=predicted_column,
        grain_column=grain_column,
        original_column_names=dataframe.columns.values,
        verbose=verbose)

    # Hand the fitted prediction pipeline to the advanced trainer
    trainer.pipeline = prediction_pipeline

    # Split into train and test sets, then record the categorical levels of
    # the raw data (grain and predicted columns excluded)
    trainer.train_test_split()
    trainer.categorical_column_info = get_categorical_levels(
        dataframe=dataframe,
        columns_to_ignore=[grain_column, predicted_column])
def setUp(self):
    """Create a classification trainer on the diabetes data with a seeded split."""
    diabetes_df = hcai_datasets.load_diabetes()

    # Drop uninformative columns
    diabetes_df.drop(['PatientID'], axis=1, inplace=True)

    np.random.seed(42)

    pipeline_settings = (CLASSIFICATION, CLASSIFICATION_PREDICTED_COLUMN, GRAIN_COLUMN_NAME)

    # One pipeline prepares the data; a second, unfitted one goes to the trainer.
    prepared_df = pipelines.full_pipeline(*pipeline_settings, impute=True).fit_transform(diabetes_df)
    trainer_pipeline = pipelines.full_pipeline(*pipeline_settings, impute=True)

    self.trainer = AdvancedSupervisedModelTrainer(
        trainer_pipeline,
        prepared_df,
        CLASSIFICATION,
        CLASSIFICATION_PREDICTED_COLUMN)
    self.trainer.train_test_split(random_seed=0)
def test_random_foarest_tuning_2_column_raises_error(self):
    """A randomized-search random forest on two feature columns raises HealthcareAIError."""
    # Keep only the target and two feature columns
    selected_columns = ['ThirtyDayReadmitFLG', 'SystolicBPNBR', 'LDLNBR']
    narrow_df = hcai_datasets.load_diabetes()[selected_columns]

    np.random.seed(42)

    pipeline_settings = (CLASSIFICATION, CLASSIFICATION_PREDICTED_COLUMN, GRAIN_COLUMN_NAME)
    prepared_df = pipelines.full_pipeline(*pipeline_settings, impute=True).fit_transform(narrow_df)
    trainer_pipeline = pipelines.full_pipeline(*pipeline_settings, impute=True)

    trainer = AdvancedSupervisedModelTrainer(
        trainer_pipeline,
        prepared_df,
        CLASSIFICATION,
        CLASSIFICATION_PREDICTED_COLUMN)
    trainer.train_test_split()

    with self.assertRaises(HealthcareAIError):
        trainer.random_forest_classifier(trees=200, randomized_search=True)
def setUpClass(cls):
    """Build the regression and classification trainers shared by the tests.

    Loads the diabetes sample data, drops the ``PatientID`` column, runs the
    data through the full preparation pipeline once per model type, and
    stores one ``AdvancedSupervisedModelTrainer`` per model type on the class
    (``cls.regression_trainer`` and ``cls.classification_trainer``).
    """
    cls.df = hcai_datasets.load_diabetes()

    # Drop columns that won't help machine learning
    columns_to_remove = ['PatientID']
    cls.df.drop(columns_to_remove, axis=1, inplace=True)

    # Seed numpy's global RNG before any data preparation happens.
    np.random.seed(42)

    clean_regression_df = pipelines.full_pipeline(
        REGRESSION,
        REGRESION_PREDICTED_COLUMN,
        GRAIN_COLUMN_NAME,
        impute=True).fit_transform(cls.df)

    clean_classification_df = pipelines.full_pipeline(
        CLASSIFICATION,
        CLASSIFICATION_PREDICTED_COLUMN,
        GRAIN_COLUMN_NAME,
        impute=True).fit_transform(cls.df)

    cls.regression_trainer = AdvancedSupervisedModelTrainer(
        pipelines.full_pipeline(
            REGRESSION,
            REGRESION_PREDICTED_COLUMN,
            GRAIN_COLUMN_NAME,
            impute=True),
        clean_regression_df,
        REGRESSION,
        REGRESION_PREDICTED_COLUMN)

    # The first argument is the pipeline object itself, exactly as for the
    # regression trainer above. Previously the *output* of fit_transform
    # (a prepared dataframe) was passed in the pipeline position.
    cls.classification_trainer = AdvancedSupervisedModelTrainer(
        pipelines.full_pipeline(
            CLASSIFICATION,
            CLASSIFICATION_PREDICTED_COLUMN,
            GRAIN_COLUMN_NAME,
            impute=True),
        clean_classification_df,
        CLASSIFICATION,
        CLASSIFICATION_PREDICTED_COLUMN)
def __init__(self, dataframe, predicted_column, model_type, impute=True, grain_column=None, verbose=True,
             imputeStrategy='MeanMedian'):
    """
    Set up a SupervisedModelTrainer.

    Helps the user by checking for high cardinality features (such as IDs or
    other unique identifiers) and low cardinality features (a column where
    all values are equal).

    Args:
        dataframe (pandas.core.frame.DataFrame): The training data in a pandas dataframe
        predicted_column (str): The name of the prediction column
        model_type (str): The trainer type - 'classification' or 'regression'
        impute (bool): True to impute data (mean of numeric columns and mode of categorical ones).
            False to drop rows that contain any null values.
        grain_column (str): The name of the grain column
        verbose (bool): Set to True for verbose output. Defaults to True.
        imputeStrategy (str): Imputation strategy name, forwarded unchanged to
            ``hcai_pipelines.full_pipeline`` for both pipelines. Defaults to 'MeanMedian'.
    """
    self.predicted_column = predicted_column
    self.grain_column = grain_column

    # Two pipelines share the same positional settings and impute strategy:
    #   * the training pipeline honours the caller's `impute` choice;
    #   * the prediction pipeline always imputes, because dropping rows at
    #     prediction time would leave some rows without a prediction.
    pipeline_settings = (model_type, predicted_column, grain_column)
    training_pipeline = hcai_pipelines.full_pipeline(
        *pipeline_settings,
        impute=impute,
        verbose=True,
        imputeStrategy=imputeStrategy)
    prediction_pipeline = hcai_pipelines.full_pipeline(
        *pipeline_settings,
        impute=True,
        verbose=False,
        imputeStrategy=imputeStrategy)

    # Cardinality checks on the raw data: these warn the user and then let
    # them proceed.
    hcai_ordinality.check_high_cardinality(dataframe, self.grain_column)
    hcai_ordinality.check_one_cardinality(dataframe)

    # Push the raw data through both pipelines. Only the training output is
    # kept; the prediction pipeline is fitted here and its output discarded.
    prepared_dataframe = training_pipeline.fit_transform(dataframe)
    prediction_pipeline.fit_transform(dataframe)

    # Build the advanced trainer that this class delegates to
    trainer = self._advanced_trainer = AdvancedSupervisedModelTrainer(
        pipeline=training_pipeline,
        dataframe=prepared_dataframe,
        model_type=model_type,
        predicted_column=predicted_column,
        grain_column=grain_column,
        original_column_names=dataframe.columns.values,
        verbose=verbose)

    # Replace the trainer's pipeline with the fitted prediction pipeline
    trainer.pipeline = prediction_pipeline

    # Split into train and test sets, then record the categorical levels of
    # the raw data (grain and predicted columns excluded)
    trainer.train_test_split()
    trainer.categorical_column_info = get_categorical_levels(
        dataframe=dataframe,
        columns_to_ignore=[grain_column, predicted_column])
def __init__(self, dataframe, predicted_column, model_type, impute=True, grain_column=None, verbose=True):
    """
    Set up a SupervisedModelTrainer.

    Helps the user by checking for high cardinality features (such as IDs or
    other unique identifiers) and low cardinality features (a column where
    all values are equal).

    Args:
        dataframe (pandas.core.frame.DataFrame): The training data in a pandas dataframe
        predicted_column (str): The name of the prediction column
        model_type (str): The trainer type - 'classification' or 'regression'
        impute (bool): True to impute data (mean of numeric columns and mode of categorical ones).
            False to drop rows that contain any null values.
        grain_column (str): The name of the grain column
        verbose (bool): Set to True for verbose output. Defaults to True.
    """
    self.predicted_column = predicted_column
    self.grain_column = grain_column

    # Two pipelines share the same positional settings:
    #   * the training pipeline honours the caller's `impute` choice;
    #   * the prediction pipeline always imputes, because dropping rows at
    #     prediction time would leave some rows without a prediction.
    pipeline_settings = (model_type, predicted_column, grain_column)
    training_pipeline = hcai_pipelines.full_pipeline(*pipeline_settings, impute=impute, verbose=True)
    prediction_pipeline = hcai_pipelines.full_pipeline(*pipeline_settings, impute=True, verbose=False)

    # Cardinality checks on the raw data: these warn the user and then let
    # them proceed.
    hcai_ordinality.check_high_cardinality(dataframe, self.grain_column)
    hcai_ordinality.check_one_cardinality(dataframe)

    # Push the raw data through both pipelines. Only the training output is
    # kept; the prediction pipeline is fitted here and its output discarded.
    prepared_dataframe = training_pipeline.fit_transform(dataframe)
    prediction_pipeline.fit_transform(dataframe)

    # Build the advanced trainer that this class delegates to
    trainer = self._advanced_trainer = AdvancedSupervisedModelTrainer(
        pipeline=training_pipeline,
        dataframe=prepared_dataframe,
        model_type=model_type,
        predicted_column=predicted_column,
        grain_column=grain_column,
        original_column_names=dataframe.columns.values,
        verbose=verbose)

    # Replace the trainer's pipeline with the fitted prediction pipeline
    trainer.pipeline = prediction_pipeline

    # Split into train and test sets, then record the categorical levels of
    # the raw data (grain and predicted columns excluded)
    trainer.train_test_split()
    trainer.categorical_column_info = get_categorical_levels(
        dataframe=dataframe,
        columns_to_ignore=[grain_column, predicted_column])
def main():
    """Template script for ADVANCED USERS using healthcareai."""
    # Load the included diabetes sample data
    raw_data = healthcareai.load_diabetes()

    # ...or load your own data from a .csv file: Uncomment to pull data from your CSV
    # raw_data = healthcareai.load_csv('path/to/your.csv')

    # ...or load data from a MSSQL server: Uncomment to pull data from MSSQL server
    # server = 'localhost'
    # database = 'SAM'
    # query = """SELECT *
    #             FROM [SAM].[dbo].[DiabetesClincialSampleData]
    #             -- In this step, just grab rows that have a target
    #             WHERE ThirtyDayReadmitFLG is not null"""
    #
    # engine = hcai_db.build_mssql_engine_using_trusted_connections(server=server, database=database)
    # raw_data = pd.read_sql(query, engine)

    # Peek at the first 5 rows of data
    print(raw_data.head(5))

    # Drop columns that won't help machine learning
    raw_data.drop(['PatientID'], axis=1, inplace=True)

    # Step 1: Prepare the data using optional imputation. There are two options for this:

    # ## Option 1: Use the built-in data prep pipeline that does encoding, imputation, null filtering
    # and dummification
    prep_pipeline = hcai_pipelines.full_pipeline(
        'classification',
        'ThirtyDayReadmitFLG',
        'PatientEncounterID',
        impute=True)
    prepared_data = prep_pipeline.fit_transform(raw_data)

    # ## Option 2: Build your own pipeline using healthcare.ai methods, your own, or a combination of either.
    # - Please note this is intentionally spartan, so we don't hinder your creativity. :)
    # - Also note that many of the healthcare.ai transformers intentionally return dataframes, compared to
    #   scikit that return numpy arrays
    # custom_pipeline = Pipeline([
    #     ('remove_grain_column', hcai_filters.DataframeColumnRemover(columns_to_remove=['PatientEncounterID', 'PatientID'])),
    #     ('imputation', hcai_transformers.DataFrameImputer(impute=True)),
    #     ('convert_target_to_binary', hcai_transformers.DataFrameConvertTargetToBinary('classification', 'ThirtyDayReadmitFLG')),
    #     # ('prediction_to_numeric', hcai_transformers.DataFrameConvertColumnToNumeric('ThirtyDayReadmitFLG')),
    #     # ('create_dummy_variables', hcai_transformers.DataFrameCreateDummyVariables(excluded_columns=['ThirtyDayReadmitFLG'])),
    # ])
    #
    # prepared_data = custom_pipeline.fit_transform(raw_data)

    # Step 2: Instantiate an Advanced Trainer class with your clean and prepared training data
    trainer = healthcareai.AdvancedSupervisedModelTrainer(
        dataframe=prepared_data,
        model_type='classification',
        predicted_column='ThirtyDayReadmitFLG',
        grain_column='PatientEncounterID',
        verbose=False)

    # Step 3: split the data into train and test
    trainer.train_test_split()

    # Step 4: Train some models

    # ## Train a KNN classifier with a randomized search over custom hyperparameters
    knn_grid = {
        'algorithm': ['ball_tree', 'kd_tree'],
        'n_neighbors': [1, 4, 6, 8, 10, 15, 20, 30, 50, 100, 200],
        'weights': ['uniform', 'distance'],
    }

    # Set number_iteration_samples relative to the size of your hyperparameter space.
    # Higher will train more models and be slower; lower will be faster and possibly less performant.
    tuned_knn = trainer.knn(
        scoring_metric='accuracy',
        hyperparameter_grid=knn_grid,
        randomized_search=True,
        number_iteration_samples=10)

    # ## Train a random forest classifier with a randomized search over custom hyperparameters
    # TODO these are bogus hyperparams for random forest
    forest_grid = {
        'n_estimators': [50, 100, 200, 300],
        'max_features': [1, 2, 3, 4],
        'max_leaf_nodes': [None, 30, 400],
    }

    # The same advice on number_iteration_samples applies here.
    tuned_forest = trainer.random_forest_classifier(
        scoring_metric='accuracy',
        hyperparameter_grid=forest_grid,
        randomized_search=True,
        number_iteration_samples=10)

    # Show the random forest feature importance graph
    hcai_tsm.plot_rf_features_from_tsm(
        tuned_forest,
        trainer.x_train,
        feature_limit=20,
        save=False)

    # ## Train a custom ensemble of models
    # The ensemble methods take a dictionary of TrainedSupervisedModels by a name of your choice.
    # The members are trained one after another, in the order they appear in the dictionary.
    ensemble_knn = trainer.knn(
        hyperparameter_grid=knn_grid,
        randomized_search=False,
        scoring_metric='roc_auc')
    ensemble_logistic = trainer.logistic_regression()
    ensemble_forest = trainer.random_forest_classifier(
        randomized_search=False,
        scoring_metric='roc_auc')

    ensemble_members = {
        'KNN': ensemble_knn,
        'Logistic Regression': ensemble_logistic,
        'Random Forest Classifier': ensemble_forest,
    }

    tuned_ensemble = trainer.ensemble_classification(
        scoring_metric='roc_auc',
        trained_model_by_name=ensemble_members)

    # Step 5: Evaluate and compare the models

    # Create a list of all the models you just trained that you want to compare
    contenders = [tuned_knn, tuned_forest, tuned_ensemble]

    # Create a ROC plot, then a PR plot, each comparing all of them.
    for plot_type in ('ROC', 'PR'):
        hcai_tsm.tsm_classification_comparison_plots(
            trained_supervised_models=contenders,
            plot_type=plot_type,
            save=False)

    # Inspect the raw ROC or PR cutoffs
    roc_cutoffs = tuned_forest.roc(print_output=False)
    print(roc_cutoffs)
    pr_cutoffs = tuned_forest.pr(print_output=False)
    print(pr_cutoffs)
def main():
    """Template script for ADVANCED USERS using healthcareai."""
    # Names used repeatedly below, gathered in one place.
    model_type = 'classification'
    target_column = 'ThirtyDayReadmitFLG'
    grain_column = 'PatientEncounterID'

    # Load the included diabetes sample data
    diabetes = healthcareai.load_diabetes()

    # ...or load your own data from a .csv file: Uncomment to pull data from your CSV
    # diabetes = healthcareai.load_csv('path/to/your.csv')

    # ...or load data from a MSSQL server: Uncomment to pull data from MSSQL server
    # server = 'localhost'
    # database = 'SAM'
    # query = """SELECT *
    #             FROM [SAM].[dbo].[DiabetesClincialSampleData]
    #             -- In this step, just grab rows that have a target
    #             WHERE ThirtyDayReadmitFLG is not null"""
    #
    # engine = hcai_db.build_mssql_engine_using_trusted_connections(server=server, database=database)
    # diabetes = pd.read_sql(query, engine)

    # Peek at the first 5 rows of data
    print(diabetes.head(5))

    # Drop columns that won't help machine learning
    diabetes.drop(['PatientID'], axis=1, inplace=True)

    # Step 1: Prepare the data using optional imputation. There are two options for this:

    # ## Option 1: Use the built-in data prep pipeline that does encoding, imputation, null filtering
    # and dummification
    clean_training_data = hcai_pipelines.full_pipeline(
        model_type, target_column, grain_column, impute=True,
    ).fit_transform(diabetes)

    # ## Option 2: Build your own pipeline using healthcare.ai methods, your own, or a combination of either.
    # - Please note this is intentionally spartan, so we don't hinder your creativity. :)
    # - Also note that many of the healthcare.ai transformers intentionally return dataframes, compared to
    #   scikit that return numpy arrays
    # custom_pipeline = Pipeline([
    #     ('remove_grain_column', hcai_filters.DataframeColumnRemover(columns_to_remove=['PatientEncounterID', 'PatientID'])),
    #     ('imputation', hcai_transformers.DataFrameImputer(impute=True)),
    #     ('convert_target_to_binary', hcai_transformers.DataFrameConvertTargetToBinary('classification', 'ThirtyDayReadmitFLG')),
    #     # ('prediction_to_numeric', hcai_transformers.DataFrameConvertColumnToNumeric('ThirtyDayReadmitFLG')),
    #     # ('create_dummy_variables', hcai_transformers.DataFrameCreateDummyVariables(excluded_columns=['ThirtyDayReadmitFLG'])),
    # ])
    #
    # clean_training_data = custom_pipeline.fit_transform(diabetes)

    # Step 2: Instantiate an Advanced Trainer class with your clean and prepared training data
    advanced_trainer = healthcareai.AdvancedSupervisedModelTrainer(
        dataframe=clean_training_data,
        model_type=model_type,
        predicted_column=target_column,
        grain_column=grain_column,
        verbose=False)

    # Step 3: split the data into train and test
    advanced_trainer.train_test_split()

    # Step 4: Train some models

    # Settings shared by both randomized searches below.
    # Set number_iteration_samples relative to the size of your hyperparameter space.
    # Higher will train more models and be slower; lower will be faster and possibly less performant.
    randomized_search_settings = {
        'scoring_metric': 'accuracy',
        'randomized_search': True,
        'number_iteration_samples': 10,
    }

    # ## Train a KNN classifier with a randomized search over custom hyperparameters
    knn_search_space = {
        'algorithm': ['ball_tree', 'kd_tree'],
        'n_neighbors': [1, 4, 6, 8, 10, 15, 20, 30, 50, 100, 200],
        'weights': ['uniform', 'distance'],
    }
    knn_model = advanced_trainer.knn(
        hyperparameter_grid=knn_search_space,
        **randomized_search_settings)

    # ## Train a random forest classifier with a randomized search over custom hyperparameters
    # TODO these are bogus hyperparams for random forest
    forest_search_space = {
        'n_estimators': [50, 100, 200, 300],
        'max_features': [1, 2, 3, 4],
        'max_leaf_nodes': [None, 30, 400],
    }
    forest_model = advanced_trainer.random_forest_classifier(
        hyperparameter_grid=forest_search_space,
        **randomized_search_settings)

    # Show the random forest feature importance graph
    hcai_tsm.plot_rf_features_from_tsm(
        forest_model,
        advanced_trainer.x_train,
        feature_limit=20,
        save=False)

    # ## Train a custom ensemble of models
    # The ensemble methods take a dictionary of TrainedSupervisedModels by a name of your choice.
    # Entries are added, and therefore trained, in the order KNN, logistic regression, random forest.
    models_by_name = {}
    models_by_name['KNN'] = advanced_trainer.knn(
        hyperparameter_grid=knn_search_space,
        randomized_search=False,
        scoring_metric='roc_auc')
    models_by_name['Logistic Regression'] = advanced_trainer.logistic_regression()
    models_by_name['Random Forest Classifier'] = advanced_trainer.random_forest_classifier(
        randomized_search=False,
        scoring_metric='roc_auc')

    ensemble_model = advanced_trainer.ensemble_classification(
        scoring_metric='roc_auc',
        trained_model_by_name=models_by_name)

    # Step 5: Evaluate and compare the models

    # Create a list of all the models you just trained that you want to compare
    comparison_set = [knn_model, forest_model, ensemble_model]

    # Create a ROC plot that compares all of them.
    hcai_tsm.tsm_classification_comparison_plots(
        trained_supervised_models=comparison_set,
        plot_type='ROC',
        save=False)

    # Create a PR plot that compares all of them.
    hcai_tsm.tsm_classification_comparison_plots(
        trained_supervised_models=comparison_set,
        plot_type='PR',
        save=False)

    # Inspect the raw ROC or PR cutoffs
    for cutoff_report in (forest_model.roc, forest_model.pr):
        print(cutoff_report(print_output=False))
def __init__(self, dataframe, predicted_column, model_type, impute=True, grain_column=None, verbose=True,
             imputeStrategy='MeanMode', tunedRandomForest=False, numeric_columns_as_categorical=None):
    """
    Set up a SupervisedModelTrainer.

    Helps the user by checking for high cardinality features (such as IDs or
    other unique identifiers) and low cardinality features (a column where
    all values are equal).

    Args:
        dataframe (pandas.core.frame.DataFrame): The training data in a pandas dataframe
        predicted_column (str): The name of the prediction column
        model_type (str): The trainer type - 'classification' or 'regression'
        impute (bool): If True, missing values are imputed. If False, rows that contain any null
            values are dropped. Defaults to True.
        grain_column (str): The name of the grain column
        verbose (bool): Set to True for verbose output. Defaults to True.
        imputeStrategy (str): Technique used to impute missing values. Defaults to 'MeanMode'.
            'MeanMode': columns of dtype object or category (assumed categorical) are imputed with
            the column's mode; columns of other types (assumed continuous) with the column's mean.
            'RandomForest': columns of dtype object or category (assumed categorical) are imputed
            using RandomForestClassifier; columns of other types (assumed continuous) using
            RandomForestRegressor.
        tunedRandomForest (bool): If True, the RandomForestClassifier/RandomForestRegressor used for
            imputation is tuned using grid search and K-fold cross validation. Note that imputation
            may then take longer, depending on the size of the dataframe and the number of columns
            with missing values. Defaults to False.
        numeric_columns_as_categorical (list of str): Names of columns that are numeric (int/float)
            in the dataframe but should be treated as categorical. Defaults to None.
            Example: a column JobCode with levels 1, 2, 3, 4, 5, 6. If JobCode has missing values,
            pandas will by default convert the column to float.
            With numeric_columns_as_categorical=None the missing values are imputed with the mean of
            JobCode and the column stays float.
            With numeric_columns_as_categorical=['JobCode'] the missing values are imputed with the
            mode of JobCode and the final type of the column is numpy.object.
    """
    self.predicted_column = predicted_column
    self.grain_column = grain_column

    # Two pipelines share the same positional settings and imputation options:
    #   * the training pipeline honours the caller's `impute` choice;
    #   * the prediction pipeline always imputes, because dropping rows at
    #     prediction time would leave some rows without a prediction.
    pipeline_settings = (model_type, predicted_column, grain_column)
    imputation_options = {
        'imputeStrategy': imputeStrategy,
        'tunedRandomForest': tunedRandomForest,
        'numeric_columns_as_categorical': numeric_columns_as_categorical,
    }
    training_pipeline = hcai_pipelines.full_pipeline(
        *pipeline_settings,
        impute=impute,
        verbose=True,
        **imputation_options)
    prediction_pipeline = hcai_pipelines.full_pipeline(
        *pipeline_settings,
        impute=True,
        verbose=False,
        **imputation_options)

    # Cardinality checks on the raw data: these warn the user and then let
    # them proceed.
    hcai_ordinality.check_high_cardinality(dataframe, self.grain_column)
    hcai_ordinality.check_one_cardinality(dataframe)

    # Push the raw data through both pipelines. Only the training output is
    # kept; the prediction pipeline is fitted here and its output discarded.
    prepared_dataframe = training_pipeline.fit_transform(dataframe)
    prediction_pipeline.fit_transform(dataframe)

    # Build the advanced trainer that this class delegates to
    trainer = self._advanced_trainer = AdvancedSupervisedModelTrainer(
        pipeline=training_pipeline,
        dataframe=prepared_dataframe,
        model_type=model_type,
        predicted_column=predicted_column,
        grain_column=grain_column,
        original_column_names=dataframe.columns.values,
        verbose=verbose)

    # Replace the trainer's pipeline with the fitted prediction pipeline
    trainer.pipeline = prediction_pipeline

    # Split into train and test sets, then record the categorical levels of
    # the raw data (grain and predicted columns excluded)
    trainer.train_test_split()
    trainer.categorical_column_info = get_categorical_levels(
        dataframe=dataframe,
        columns_to_ignore=[grain_column, predicted_column])