def setUpClass(cls):
    df = hcai_datasets.load_diabetes()

    # Drop columns that won't help machine learning
    columns_to_remove = ['PatientID']
    df.drop(columns_to_remove, axis=1, inplace=True)

    cls.classification_trainer = SupervisedModelTrainer(
        dataframe=df,
        predicted_column='ThirtyDayReadmitFLG',
        model_type='classification',
        impute=True,
        grain_column='PatientEncounterID',
        verbose=False)

    cls.regression_trainer = SupervisedModelTrainer(
        df,
        'SystolicBPNBR',
        'regression',
        grain_column='PatientEncounterID',
        impute=True,
        verbose=False)

    cls.regression_trainer_impute_false = SupervisedModelTrainer(
        df,
        'SystolicBPNBR',
        'regression',
        grain_column='PatientEncounterID',
        impute=False,
        verbose=False)

def setUpClass(cls):
    cls.df = hcai_datasets.load_diabetes()

    # Drop columns that won't help machine learning
    columns_to_remove = ['PatientID']
    cls.df.drop(columns_to_remove, axis=1, inplace=True)

    np.random.seed(42)

    clean_regression_df = pipelines.full_pipeline(
        REGRESSION,
        REGRESION_PREDICTED_COLUMN,
        GRAIN_COLUMN_NAME,
        impute=True).fit_transform(cls.df)

    clean_classification_df = pipelines.full_pipeline(
        CLASSIFICATION,
        CLASSIFICATION_PREDICTED_COLUMN,
        GRAIN_COLUMN_NAME,
        impute=True).fit_transform(cls.df)

    cls.regression_trainer = AdvancedSupervisedModelTrainer(
        pipelines.full_pipeline(REGRESSION, REGRESION_PREDICTED_COLUMN, GRAIN_COLUMN_NAME, impute=True),
        clean_regression_df,
        REGRESSION,
        REGRESION_PREDICTED_COLUMN)

    cls.classification_trainer = AdvancedSupervisedModelTrainer(
        pipelines.full_pipeline(CLASSIFICATION, CLASSIFICATION_PREDICTED_COLUMN, GRAIN_COLUMN_NAME, impute=True),
        clean_classification_df,
        CLASSIFICATION,
        CLASSIFICATION_PREDICTED_COLUMN)

def test_impute_false_nan_data(self):
    # Train the linear regression model with impute=False
    trained_linear_model = self.regression_trainer_impute_false.linear_regression()

    # Load a new df for predicting
    prediction_df = hcai_datasets.load_diabetes()

    # Assert that the number of predictions equals the number of rows in the prediction dataframe
    self.assertEqual(len(trained_linear_model.make_predictions(prediction_df)), len(prediction_df))

def test_linear_regression_raises_error_on_missing_columns(self):
    # TODO how is this working since the model does not use the training df???
    training_df = hcai_datasets.load_diabetes()

    # Drop columns that won't help machine learning
    training_df.drop(['PatientID'], axis=1, inplace=True)

    # Train the linear regression model
    trained_linear_model = self.regression_trainer.linear_regression()

    # Load a new df for predicting
    prediction_df = hcai_datasets.load_diabetes()

    # Drop a column that the model expects
    prediction_df.drop('GenderFLG', axis=1, inplace=True)

    # Predicting on a dataframe with a missing column should raise an error
    self.assertRaises(HealthcareAIError, trained_linear_model.make_predictions, prediction_df)

def setUp(self):
    df = hcai_datasets.load_diabetes()

    # Drop uninformative columns
    df.drop(['PatientID'], axis=1, inplace=True)

    np.random.seed(42)
    clean_df = pipelines.full_pipeline(
        CLASSIFICATION,
        CLASSIFICATION_PREDICTED_COLUMN,
        GRAIN_COLUMN_NAME,
        impute=True).fit_transform(df)

    self.trainer = AdvancedSupervisedModelTrainer(clean_df, CLASSIFICATION, CLASSIFICATION_PREDICTED_COLUMN)
    self.trainer.train_test_split(random_seed=0)

def setUpClass(cls): """ Load a dataframe, train a linear model and prepare a prediction dataframe for assertions """ training_df = hcai_datasets.load_diabetes() # Drop columns that won't help machine learning training_df.drop(['PatientID'], axis=1, inplace=True) regression_trainer = SupervisedModelTrainer( training_df, 'SystolicBPNBR', 'regression', impute=True, grain_column='PatientEncounterID') classification_trainer = SupervisedModelTrainer( training_df, 'ThirtyDayReadmitFLG', 'classification', impute=True, grain_column='PatientEncounterID') # Train the models cls.trained_linear_model = regression_trainer.linear_regression() cls.trained_lr = classification_trainer.logistic_regression() # Load a new df for predicting cls.prediction_df = hcai_datasets.load_diabetes() # Drop columns that won't help machine learning columns_to_remove = ['PatientID'] cls.prediction_df.drop(columns_to_remove, axis=1, inplace=True) # Create various outputs cls.predictions = cls.trained_linear_model.make_predictions( cls.prediction_df) cls.factors = cls.trained_linear_model.make_factors( cls.prediction_df, number_top_features=3) cls.predictions_with_3_factors = cls.trained_linear_model.make_predictions_with_k_factors( cls.prediction_df, number_top_features=3) cls.original_with_predictions_3_factors = cls.trained_linear_model.make_original_with_predictions_and_factors( cls.prediction_df, number_top_features=3) cls.catalyst_dataframe = cls.trained_linear_model.create_catalyst_dataframe( cls.prediction_df)
def setUpClass(cls): """ Load a dataframe, train a linear model and prepare a prediction dataframe for assertions """ training_df = hcai_datasets.load_diabetes() # Drop columns that won't help machine learning training_df.drop(['PatientID'], axis=1, inplace=True) regression_trainer = SupervisedModelTrainer( training_df, 'SystolicBPNBR', 'regression', impute=True, grain_column='PatientEncounterID') classification_trainer = SupervisedModelTrainer( training_df, 'ThirtyDayReadmitFLG', 'classification', impute=True, grain_column='PatientEncounterID') # Train the models cls.trained_linear_model = regression_trainer.linear_regression() cls.trained_lr = classification_trainer.logistic_regression() # Load a new df for predicting cls.prediction_df = hcai_datasets.load_diabetes() # Drop columns that won't help machine learning columns_to_remove = ['PatientID'] cls.prediction_df.drop(columns_to_remove, axis=1, inplace=True) # Create various outputs cls.predictions = cls.trained_linear_model.make_predictions(cls.prediction_df) cls.factors = cls.trained_linear_model.make_factors(cls.prediction_df, number_top_features=3) cls.predictions_with_3_factors = cls.trained_linear_model.make_predictions_with_k_factors( cls.prediction_df, number_top_features=3) cls.original_with_predictions_3_factors = cls.trained_linear_model.make_original_with_predictions_and_factors( cls.prediction_df, number_top_features=3) cls.catalyst_dataframe = cls.trained_linear_model.create_catalyst_dataframe(cls.prediction_df)
def setUp(self):
    df = hcai_datasets.load_diabetes()

    # Drop uninformative columns
    df.drop(['PatientID'], axis=1, inplace=True)

    np.random.seed(42)
    clean_df = pipelines.full_pipeline(
        CLASSIFICATION,
        CLASSIFICATION_PREDICTED_COLUMN,
        GRAIN_COLUMN_NAME,
        impute=True).fit_transform(df)

    self.trainer = AdvancedSupervisedModelTrainer(
        pipelines.full_pipeline(CLASSIFICATION, CLASSIFICATION_PREDICTED_COLUMN, GRAIN_COLUMN_NAME, impute=True),
        clean_df,
        CLASSIFICATION,
        CLASSIFICATION_PREDICTED_COLUMN)
    self.trainer.train_test_split(random_seed=0)

def test_random_forest_tuning_2_column_raises_error(self):
    df_raw = hcai_datasets.load_diabetes()

    # Select only specific columns
    df = df_raw[['ThirtyDayReadmitFLG', 'SystolicBPNBR', 'LDLNBR']]

    np.random.seed(42)
    clean_df = pipelines.full_pipeline(
        CLASSIFICATION,
        CLASSIFICATION_PREDICTED_COLUMN,
        GRAIN_COLUMN_NAME,
        impute=True).fit_transform(df)

    trainer = AdvancedSupervisedModelTrainer(clean_df, CLASSIFICATION, CLASSIFICATION_PREDICTED_COLUMN)
    trainer.train_test_split()

    self.assertRaises(
        HealthcareAIError,
        trainer.random_forest_classifier,
        trees=200,
        randomized_search=True)

def test_random_forest_tuning_2_column_raises_error(self):
    df_raw = hcai_datasets.load_diabetes()

    # Select only specific columns
    df = df_raw[['ThirtyDayReadmitFLG', 'SystolicBPNBR', 'LDLNBR']]

    np.random.seed(42)
    clean_df = pipelines.full_pipeline(
        CLASSIFICATION,
        CLASSIFICATION_PREDICTED_COLUMN,
        GRAIN_COLUMN_NAME,
        impute=True).fit_transform(df)

    trainer = AdvancedSupervisedModelTrainer(
        pipelines.full_pipeline(CLASSIFICATION, CLASSIFICATION_PREDICTED_COLUMN, GRAIN_COLUMN_NAME, impute=True),
        clean_df,
        CLASSIFICATION,
        CLASSIFICATION_PREDICTED_COLUMN)
    trainer.train_test_split()

    self.assertRaises(
        HealthcareAIError,
        trainer.random_forest_classifier,
        trees=200,
        randomized_search=True)

def main():
    # Load the diabetes sample data
    dataframe = hcai_datasets.load_diabetes()

    # ## Load data from a MSSQL server: Uncomment to pull data from MSSQL server
    # server = 'localhost'
    # database = 'SAM'
    # query = """SELECT *
    #             FROM [SAM].[dbo].[DiabetesClincialSampleData]
    #             -- In this step, just grab rows that have a target
    #             WHERE ThirtyDayReadmitFLG is not null"""
    #
    # engine = hcai_db.build_mssql_engine(server=server, database=database)
    # dataframe = pd.read_sql(query, engine)

    # Drop columns that won't help machine learning
    dataframe.drop(['PatientID'], axis=1, inplace=True)

    # Step 1: Setup a healthcareai regression trainer. This prepares your data for model building
    regression_trainer = SupervisedModelTrainer(
        dataframe=dataframe,
        predicted_column='SystolicBPNBR',
        model_type='regression',
        grain_column='PatientEncounterID',
        impute=True,
        verbose=False)

    # Look at the first few rows of your dataframe after loading the data
    print('\n\n-------------------[ Cleaned Dataframe ]--------------------------')
    print(regression_trainer.clean_dataframe.head())

    # Step 2: train some models

    # Train and evaluate a linear regression model
    trained_linear_model = regression_trainer.linear_regression()

    # Train and evaluate a random forest model
    trained_random_forest = regression_trainer.random_forest_regression()

    # Once you are happy with the performance of any model, you can save it for use later in predicting new data.
    # File names are timestamped and look like '2017-05-31T12-36-21_regression_LinearRegression.pkl'.
    # Note the file you saved; it will be used in example_regression_2.py
    trained_linear_model.save()

def setUp(self):
    df = hcai_datasets.load_diabetes()

    # Drop columns that won't help machine learning
    columns_to_remove = ['PatientID']
    df.drop(columns_to_remove, axis=1, inplace=True)

    self.regression_trainer = SupervisedModelTrainer(
        df,
        'SystolicBPNBR',
        'regression',
        grain_column='PatientEncounterID',
        impute=True,
        verbose=False)

    # Bind an undecorated linear regression helper onto this trainer instance so tests can
    # call the underlying advanced trainer directly without randomized search
    def undecorated_lr(self):
        return self._advanced_trainer.linear_regression(randomized_search=False)

    self.regression_trainer.undecorated_lr = undecorated_lr.__get__(
        self.regression_trainer, self.regression_trainer.__class__)

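For reference, a test that exercises the monkey-patched helper above might look like the following sketch. The `regression_trainer` and `undecorated_lr` names come from the setUp; the test name and the assertion are illustrative assumptions, not code from the suite.

def test_undecorated_lr_trains_a_model(self):
    # Call the helper bound in setUp; it bypasses the decorated public API (hypothetical test)
    trained_model = self.regression_trainer.undecorated_lr()

    # Only assert that training returned something; the exact type depends on the library version
    self.assertIsNotNone(trained_model)
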
def setUpClass(cls):
    cls.df = hcai_datasets.load_diabetes()

    # Drop columns that won't help machine learning
    columns_to_remove = ['PatientID']
    cls.df.drop(columns_to_remove, axis=1, inplace=True)

    np.random.seed(42)

    clean_regression_df = pipelines.full_pipeline(
        REGRESSION,
        REGRESION_PREDICTED_COLUMN,
        GRAIN_COLUMN_NAME,
        impute=True).fit_transform(cls.df)

    clean_classification_df = pipelines.full_pipeline(
        CLASSIFICATION,
        CLASSIFICATION_PREDICTED_COLUMN,
        GRAIN_COLUMN_NAME,
        impute=True).fit_transform(cls.df)

    cls.regression_trainer = AdvancedSupervisedModelTrainer(
        pipelines.full_pipeline(REGRESSION, REGRESION_PREDICTED_COLUMN, GRAIN_COLUMN_NAME, impute=True),
        clean_regression_df,
        REGRESSION,
        REGRESION_PREDICTED_COLUMN)

    cls.classification_trainer = AdvancedSupervisedModelTrainer(
        pipelines.full_pipeline(CLASSIFICATION, CLASSIFICATION_PREDICTED_COLUMN, GRAIN_COLUMN_NAME, impute=True),
        clean_classification_df,
        CLASSIFICATION,
        CLASSIFICATION_PREDICTED_COLUMN)

def main():
    # Load the diabetes sample data
    dataframe = hcai_datasets.load_diabetes()

    # ## Load data from a MSSQL server: Uncomment to pull data from MSSQL server
    # server = 'localhost'
    # database = 'SAM'
    # query = """SELECT *
    #             FROM [SAM].[dbo].[DiabetesClincialSampleData]
    #             -- In this step, just grab rows that have a target
    #             WHERE ThirtyDayReadmitFLG is not null"""
    #
    # engine = hcai_db.build_mssql_engine(server=server, database=database)
    # dataframe = pd.read_sql(query, engine)

    # Drop columns that won't help machine learning
    dataframe.drop(['PatientID'], axis=1, inplace=True)

    # Step 1: Setup a healthcareai classification trainer. This prepares your data for model building
    classification_trainer = SupervisedModelTrainer(
        dataframe=dataframe,
        predicted_column='ThirtyDayReadmitFLG',
        model_type='classification',
        grain_column='PatientEncounterID',
        impute=True,
        verbose=False)

    # Look at the first few rows of your dataframe after loading the data
    print('\n\n-------------------[ Cleaned Dataframe ]--------------------------')
    print(classification_trainer.clean_dataframe.head())

    # Step 2: train some models

    # Train a KNN model
    trained_knn = classification_trainer.knn()

    # View the ROC and PR plots
    trained_knn.roc_plot()
    trained_knn.pr_plot()
    # Uncomment if you want to see all the ROC and/or PR thresholds
    # trained_knn.roc()
    # trained_knn.pr()

    # Train a logistic regression model
    trained_lr = classification_trainer.logistic_regression()

    # View the ROC and PR plots
    trained_lr.roc_plot()
    trained_lr.pr_plot()
    # Uncomment if you want to see all the ROC and/or PR thresholds
    # trained_lr.roc()
    # trained_lr.pr()

    # Train a random forest model and view the feature importance plot
    trained_random_forest = classification_trainer.random_forest(save_plot=False)

    # View the ROC and PR plots
    trained_random_forest.roc_plot()
    trained_random_forest.pr_plot()
    # Uncomment if you want to see all the ROC and/or PR thresholds
    # trained_random_forest.roc()
    # trained_random_forest.pr()

    # Create a list of all the models you just trained that you want to compare
    models_to_compare = [trained_knn, trained_lr, trained_random_forest]

    # Create a ROC plot that compares them.
    tsm_plots.tsm_classification_comparison_plots(
        trained_supervised_models=models_to_compare,
        plot_type='ROC',
        save=False)

    # Create a PR plot that compares them.
    tsm_plots.tsm_classification_comparison_plots(
        trained_supervised_models=models_to_compare,
        plot_type='PR',
        save=False)

    # Once you are happy with the performance of any model, you can save it for use later in predicting new data.
    # File names are timestamped and look like '2017-05-31T12-36-21_classification_RandomForestClassifier.pkl'.
    # Note the file you saved; it will be used in example_classification_2.py
    trained_random_forest.save()

def main():
    # Load the diabetes sample data
    prediction_dataframe = hcai_datasets.load_diabetes()

    # Load data from a MSSQL server: Uncomment to pull data from MSSQL server
    # server = 'localhost'
    # database = 'SAM'
    # query = """SELECT *
    #             FROM [SAM].[dbo].[DiabetesClincialSampleData]
    #             WHERE SystolicBPNBR is null"""
    #
    # engine = hcai_db.build_mssql_engine(server=server, database=database)
    # prediction_dataframe = pd.read_sql(query, engine)

    # Drop columns that won't help machine learning
    columns_to_remove = ['PatientID']
    prediction_dataframe.drop(columns_to_remove, axis=1, inplace=True)

    # Load the saved model using your filename.
    # File names are timestamped and look like '2017-05-31T12-36-21_classification_RandomForestClassifier.pkl'.
    # Note the file you saved in example_classification_1.py and set that here.
    trained_model = hcai_io_utilities.load_saved_model('your_filename_here.pkl')

    # Any saved model can be inspected for properties such as plots, metrics, columns, etc. (More examples in the docs)
    trained_model.roc_plot()
    print(trained_model.roc())
    # print(trained_model.column_names)
    # print(trained_model.grain_column)
    # print(trained_model.prediction_column)

    # Make predictions. Please note that there are four different formats you can choose from. All are shown
    # here, though you only need one.

    # ## Get predictions
    predictions = trained_model.make_predictions(prediction_dataframe)
    print('\n\n-------------------[ Predictions ]----------------------------------------------------\n')
    print(predictions.head())

    # ## Get the important factors
    factors = trained_model.make_factors(prediction_dataframe, number_top_features=3)
    print('\n\n-------------------[ Factors ]----------------------------------------------------\n')
    print(factors.head())

    # ## Get predictions with factors
    predictions_with_factors_df = trained_model.make_predictions_with_k_factors(
        prediction_dataframe,
        number_top_features=3)
    print('\n\n-------------------[ Predictions + factors ]----------------------------------------------------\n')
    print(predictions_with_factors_df.head())

    # ## Get original dataframe with predictions and factors
    original_plus_predictions_and_factors = trained_model.make_original_with_predictions_and_factors(
        prediction_dataframe,
        number_top_features=3)
    print('\n\n-------------------[ Original + predictions + factors ]-------------------------------------------\n')
    print(original_plus_predictions_and_factors.head())

    # Save your predictions. You can save predictions to a csv or database. Examples are shown below.
    # Please note that you will likely only need one of these output types. Feel free to delete the others.

    # Save results to csv
    predictions_with_factors_df.to_csv('ClinicalPredictions.csv')

def main():
    # Load the diabetes sample data
    dataframe = hcai_datasets.load_diabetes()

    # Drop columns that won't help machine learning
    dataframe.drop(['PatientID'], axis=1, inplace=True)

    # Step 1: Prepare the data using optional imputation. There are two options for this:

    # ## Option 1: Use the built-in data prep pipeline that does encoding, imputation, null filtering, dummification
    clean_training_dataframe = hcai_pipelines.full_pipeline(
        'classification',
        'ThirtyDayReadmitFLG',
        'PatientEncounterID',
        impute=True).fit_transform(dataframe)

    # ## Option 2: Build your own pipeline using healthcare.ai methods, your own, or a combination of either.
    # - Please note this is intentionally spartan, so we don't hinder your creativity. :)
    # - Also note that many of the healthcare.ai transformers intentionally return dataframes, compared to
    #   scikit-learn transformers that return numpy arrays
    # custom_pipeline = Pipeline([
    #     ('remove_grain_column', hcai_filters.DataframeColumnRemover(columns_to_remove=['PatientEncounterID', 'PatientID'])),
    #     ('imputation', hcai_transformers.DataFrameImputer(impute=True)),
    #     ('convert_target_to_binary', hcai_transformers.DataFrameConvertTargetToBinary('classification', 'ThirtyDayReadmitFLG')),
    #     # ('prediction_to_numeric', hcai_transformers.DataFrameConvertColumnToNumeric('ThirtyDayReadmitFLG')),
    #     # ('create_dummy_variables', hcai_transformers.DataFrameCreateDummyVariables(excluded_columns=['ThirtyDayReadmitFLG'])),
    # ])
    #
    # clean_training_dataframe = custom_pipeline.fit_transform(dataframe)

    # Step 2: Instantiate an Advanced Trainer class with your clean and prepared training data
    classification_trainer = AdvancedSupervisedModelTrainer(
        dataframe=clean_training_dataframe,
        model_type='classification',
        predicted_column='ThirtyDayReadmitFLG',
        grain_column='PatientEncounterID',
        verbose=False)

    # Step 3: split the data into train and test
    classification_trainer.train_test_split()

    # Step 4: Train some models

    # ## Train a KNN classifier with a randomized search over custom hyperparameters
    knn_hyperparameters = {
        'algorithm': ['ball_tree', 'kd_tree'],
        'n_neighbors': [1, 4, 6, 8, 10, 15, 20, 30, 50, 100, 200],
        'weights': ['uniform', 'distance']}

    trained_knn = classification_trainer.knn(
        scoring_metric='accuracy',
        hyperparameter_grid=knn_hyperparameters,
        randomized_search=True,
        # Set this relative to the size of your hyperparameter space. Higher will train more models and be slower.
        # Lower will be faster and possibly less performant.
        number_iteration_samples=10)

    # ## Train a random forest classifier with a randomized search over custom hyperparameters
    # TODO these are bogus hyperparams for random forest
    random_forest_hyperparameters = {
        'n_estimators': [50, 100, 200, 300],
        'max_features': [1, 2, 3, 4],
        'max_leaf_nodes': [None, 30, 400]}

    trained_random_forest = classification_trainer.random_forest_classifier(
        scoring_metric='accuracy',
        hyperparameter_grid=random_forest_hyperparameters,
        randomized_search=True,
        # Set this relative to the size of your hyperparameter space. Higher will train more models and be slower.
        # Lower will be faster and possibly less performant.
        number_iteration_samples=10)

    # Show the random forest feature importance graph
    hcai_tsm.plot_rf_features_from_tsm(
        trained_random_forest,
        classification_trainer.x_train,
        save=False)

    # ## Train a custom ensemble of models
    # The ensemble methods take a dictionary of TrainedSupervisedModels keyed by a name of your choice
    custom_ensemble = {
        'KNN': classification_trainer.knn(
            hyperparameter_grid=knn_hyperparameters,
            randomized_search=False,
            scoring_metric='roc_auc'),
        'Logistic Regression': classification_trainer.logistic_regression(),
        'Random Forest Classifier': classification_trainer.random_forest_classifier(
            randomized_search=False,
            scoring_metric='roc_auc')}

    trained_ensemble = classification_trainer.ensemble_classification(
        scoring_metric='roc_auc',
        trained_model_by_name=custom_ensemble)

    # Step 5: Evaluate and compare the models

    # Create a list of all the models you just trained that you want to compare
    models_to_compare = [trained_knn, trained_random_forest, trained_ensemble]

    # Create a ROC plot that compares all of them.
    hcai_tsm.tsm_classification_comparison_plots(
        trained_supervised_models=models_to_compare,
        plot_type='ROC',
        save=False)

    # Create a PR plot that compares all of them.
    hcai_tsm.tsm_classification_comparison_plots(
        trained_supervised_models=models_to_compare,
        plot_type='PR',
        save=False)

    # Inspect the raw ROC or PR cutoffs
    print(trained_random_forest.roc(print_output=False))
    print(trained_random_forest.pr(print_output=False))

def test_load_diabetes(self):
    df = ds.load_diabetes()
    self.assertEqual(1000, df.shape[0])
    self.assertEqual(7, df.shape[1])

def test_class_counter_on_many(self):
    df = hcai_datasets.load_diabetes()
    result = count_unique_elements_in_column(df, 'PatientEncounterID')
    self.assertEqual(result, 1000)

def test_class_counter_on_binary(self):
    df = hcai_datasets.load_diabetes()
    df.dropna(axis=0, how='any', inplace=True)
    result = count_unique_elements_in_column(df, 'ThirtyDayReadmitFLG')
    self.assertEqual(result, 2)