Example #1
    @classmethod
    def setUpClass(cls):
        cls.df = hcai_datasets.load_diabetes()

        # Drop columns that won't help machine learning
        columns_to_remove = ['PatientID']
        cls.df.drop(columns_to_remove, axis=1, inplace=True)

        np.random.seed(42)
        clean_regression_df = pipelines.full_pipeline(
            REGRESSION,
            REGRESION_PREDICTED_COLUMN,
            GRAIN_COLUMN_NAME,
            impute=True).fit_transform(cls.df)

        clean_classification_df = pipelines.full_pipeline(
            CLASSIFICATION,
            CLASSIFICATION_PREDICTED_COLUMN,
            GRAIN_COLUMN_NAME,
            impute=True).fit_transform(cls.df)

        cls.regression_trainer = AdvancedSupervisedModelTrainer(
            pipelines.full_pipeline(REGRESSION,
                                    REGRESION_PREDICTED_COLUMN,
                                    GRAIN_COLUMN_NAME,
                                    impute=True), clean_regression_df,
            REGRESSION, REGRESION_PREDICTED_COLUMN)

        cls.classification_trainer = AdvancedSupervisedModelTrainer(
            pipelines.full_pipeline(CLASSIFICATION,
                                    CLASSIFICATION_PREDICTED_COLUMN,
                                    GRAIN_COLUMN_NAME,
                                    impute=True),
            clean_classification_df, CLASSIFICATION,
            CLASSIFICATION_PREDICTED_COLUMN)
Example #2
    def __init__(self,
                 dataframe,
                 predicted_column,
                 model_type,
                 impute=True,
                 grain_column=None,
                 verbose=False):
        """
        Set up a SupervisedModelTrainer

        Args:
            dataframe (pandas.core.frame.DataFrame): The training data in a pandas dataframe
            predicted_column (str): The name of the prediction column 
            model_type (str): the trainer type - 'classification' or 'regression'
            impute (bool): True to impute data (mean of numeric columns and mode of categorical ones). False to drop rows
                that contain any null values.
            grain_column (str): The name of the grain column
            verbose (bool): Set to True for verbose output. Defaults to False.
        """
        self.predicted_column = predicted_column
        self.grain_column = grain_column

        # Build the pipeline
        # Note: Missing numeric values are imputed in prediction. If we don't impute, then some rows in the
        # prediction data frame will be removed, which results in missing predictions.
        pipeline = hcai_pipelines.full_pipeline(model_type,
                                                predicted_column,
                                                grain_column,
                                                impute=impute)
        prediction_pipeline = hcai_pipelines.full_pipeline(model_type,
                                                           predicted_column,
                                                           grain_column,
                                                           impute=True)

        # Run the raw data through the data preparation pipeline
        clean_dataframe = pipeline.fit_transform(dataframe)
        _ = prediction_pipeline.fit_transform(dataframe)

        # Instantiate the advanced class
        self._advanced_trainer = AdvancedSupervisedModelTrainer(
            dataframe=clean_dataframe,
            model_type=model_type,
            predicted_column=predicted_column,
            grain_column=grain_column,
            original_column_names=dataframe.columns.values,
            verbose=verbose)

        # Save the pipeline to the parent class
        self._advanced_trainer.pipeline = prediction_pipeline

        # Split the data into train and test
        self._advanced_trainer.train_test_split()

        self._advanced_trainer.categorical_column_info = get_categorical_levels(
            dataframe=dataframe,
            columns_to_ignore=[grain_column, predicted_column])
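
For context, a minimal sketch of how this constructor is typically invoked. It assumes the top-level healthcareai package and the column names of the bundled diabetes sample used throughout these examples:

import healthcareai

# Sketch only: the column names come from the diabetes sample shipped with
# healthcareai; substitute your own columns for real data.
df = healthcareai.load_diabetes()
trainer = healthcareai.SupervisedModelTrainer(
    dataframe=df,
    predicted_column='ThirtyDayReadmitFLG',
    model_type='classification',
    grain_column='PatientEncounterID',
    impute=True,
    verbose=False)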
Example #3
    def setUp(self):
        df = hcai_datasets.load_diabetes()
        # Drop uninformative columns
        df.drop(['PatientID'], axis=1, inplace=True)

        np.random.seed(42)
        clean_df = pipelines.full_pipeline(CLASSIFICATION, CLASSIFICATION_PREDICTED_COLUMN, GRAIN_COLUMN_NAME,
                                           impute=True).fit_transform(df)
        self.trainer = AdvancedSupervisedModelTrainer(
            pipelines.full_pipeline(CLASSIFICATION,
                                    CLASSIFICATION_PREDICTED_COLUMN,
                                    GRAIN_COLUMN_NAME,
                                    impute=True),
            clean_df, CLASSIFICATION, CLASSIFICATION_PREDICTED_COLUMN)
        self.trainer.train_test_split(random_seed=0)
Example #4
    def test_random_forest_tuning_2_column_raises_error(self):
        df_raw = hcai_datasets.load_diabetes()
        # select only specific columns
        df = df_raw[['ThirtyDayReadmitFLG', 'SystolicBPNBR', 'LDLNBR']]

        np.random.seed(42)
        clean_df = pipelines.full_pipeline(
            CLASSIFICATION,
            CLASSIFICATION_PREDICTED_COLUMN,
            GRAIN_COLUMN_NAME,
            impute=True).fit_transform(df)
        trainer = AdvancedSupervisedModelTrainer(
            pipelines.full_pipeline(CLASSIFICATION,
                                    CLASSIFICATION_PREDICTED_COLUMN,
                                    GRAIN_COLUMN_NAME,
                                    impute=True),
            clean_df, CLASSIFICATION, CLASSIFICATION_PREDICTED_COLUMN)

        trainer.train_test_split()

        self.assertRaises(HealthcareAIError, trainer.random_forest_classifier, trees=200, randomized_search=True)
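
As an aside, the same expectation can be written with pytest's context-manager style instead of unittest's assertRaises. A hedged equivalent sketch, reusing the trainer built in the test body above:

import pytest

# Equivalent to the assertRaises call above (sketch; reuses the same trainer).
with pytest.raises(HealthcareAIError):
    trainer.random_forest_classifier(trees=200, randomized_search=True)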
Example #5
    @classmethod
    def setUpClass(cls):
        cls.df = hcai_datasets.load_diabetes()

        # Drop columns that won't help machine learning
        columns_to_remove = ['PatientID']
        cls.df.drop(columns_to_remove, axis=1, inplace=True)

        np.random.seed(42)
        clean_regression_df = pipelines.full_pipeline(
            REGRESSION,
            REGRESION_PREDICTED_COLUMN,
            GRAIN_COLUMN_NAME,
            impute=True).fit_transform(cls.df)

        clean_classification_df = pipelines.full_pipeline(
            CLASSIFICATION,
            CLASSIFICATION_PREDICTED_COLUMN,
            GRAIN_COLUMN_NAME,
            impute=True).fit_transform(cls.df)

        cls.regression_trainer = AdvancedSupervisedModelTrainer(pipelines.full_pipeline(
            REGRESSION,
            REGRESION_PREDICTED_COLUMN,
            GRAIN_COLUMN_NAME,
            impute=True),
            clean_regression_df,
            REGRESSION,
            REGRESION_PREDICTED_COLUMN)

        cls.classification_trainer = AdvancedSupervisedModelTrainer(pipelines.full_pipeline(
            CLASSIFICATION,
            CLASSIFICATION_PREDICTED_COLUMN,
            GRAIN_COLUMN_NAME,
            impute=True),
            clean_classification_df, CLASSIFICATION,
            CLASSIFICATION_PREDICTED_COLUMN)
Example #6
    def __init__(self,
                 dataframe,
                 predicted_column,
                 model_type,
                 impute=True,
                 grain_column=None,
                 verbose=True,
                 imputeStrategy='MeanMedian'):
        """
        Set up a SupervisedModelTrainer.

        Helps the user by checking for high cardinality features (such as IDs or
        other unique identifiers) and low cardinality features (a column where
        all values are equal).

        Args:
            dataframe (pandas.core.frame.DataFrame): The training data in a
                pandas dataframe

            predicted_column (str): The name of the prediction column

            model_type (str): the trainer type - 'classification' or 'regression'

            impute (bool): True to impute data (mean of numeric columns and
                mode of categorical ones). False to drop rows that contain
                any null values.

            grain_column (str): The name of the grain column

            verbose (bool): Set to True for verbose output. Defaults to True.

            imputeStrategy (str): The imputation technique passed through to
                the data preparation pipeline. Defaults to 'MeanMedian'.
        """
        self.predicted_column = predicted_column
        self.grain_column = grain_column

        # Build the pipeline
        # Note: Missing numeric values are imputed in prediction. If we don't
        # impute, then some rows in the prediction data frame will be removed,
        # which results in missing predictions.
        pipeline = hcai_pipelines.full_pipeline(model_type,
                                                predicted_column,
                                                grain_column,
                                                impute=impute,
                                                verbose=True,
                                                imputeStrategy=imputeStrategy)

        prediction_pipeline = hcai_pipelines.full_pipeline(
            model_type,
            predicted_column,
            grain_column,
            impute=True,
            verbose=False,
            imputeStrategy=imputeStrategy)

        # Run a low and high cardinality check. Warn the user, and allow
        # them to proceed.
        hcai_ordinality.check_high_cardinality(dataframe, self.grain_column)
        hcai_ordinality.check_one_cardinality(dataframe)

        # Run the raw data through the data preparation pipeline
        clean_dataframe = pipeline.fit_transform(dataframe)
        _ = prediction_pipeline.fit_transform(dataframe)

        # Instantiate the advanced class
        self._advanced_trainer = AdvancedSupervisedModelTrainer(
            pipeline=pipeline,
            dataframe=clean_dataframe,
            model_type=model_type,
            predicted_column=predicted_column,
            grain_column=grain_column,
            original_column_names=dataframe.columns.values,
            verbose=verbose)

        # Save the pipeline to the parent class
        self._advanced_trainer.pipeline = prediction_pipeline

        # Split the data into train and test
        self._advanced_trainer.train_test_split()

        self._advanced_trainer.categorical_column_info = get_categorical_levels(
            dataframe=dataframe,
            columns_to_ignore=[grain_column, predicted_column])
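
Given the imputeStrategy parameter this variant adds, opting into a non-default strategy is one extra keyword argument. A sketch, assuming this fork's SupervisedModelTrainer and a prepared dataframe df with the diabetes sample columns:

# Sketch only: this fork defaults to imputeStrategy='MeanMedian';
# 'RandomForest' is the alternative documented in Example #10's variant.
trainer = SupervisedModelTrainer(
    dataframe=df,
    predicted_column='ThirtyDayReadmitFLG',
    model_type='classification',
    grain_column='PatientEncounterID',
    impute=True,
    imputeStrategy='RandomForest')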
Example #7
    def __init__(self, dataframe, predicted_column, model_type, impute=True, grain_column=None, verbose=True):
        """
        Set up a SupervisedModelTrainer.

        Helps the user by checking for high cardinality features (such as IDs or
        other unique identifiers) and low cardinality features (a column where
        all values are equal).

        Args:
            dataframe (pandas.core.frame.DataFrame): The training data in a
                pandas dataframe

            predicted_column (str): The name of the prediction column

            model_type (str): the trainer type - 'classification' or 'regression'

            impute (bool): True to impute data (mean of numeric columns and
                mode of categorical ones). False to drop rows that contain
                any null values.

            grain_column (str): The name of the grain column

            verbose (bool): Set to True for verbose output. Defaults to True.
        """
        self.predicted_column = predicted_column
        self.grain_column = grain_column

        # Build the pipeline
        # Note: Missing numeric values are imputed in prediction. If we don't
        # impute, then some rows in the prediction data frame will be removed,
        # which results in missing predictions.
        pipeline = hcai_pipelines.full_pipeline(model_type, predicted_column, grain_column, impute=impute,
                                                verbose=True)

        prediction_pipeline = hcai_pipelines.full_pipeline(model_type, predicted_column, grain_column, impute=True,
                                                           verbose=False)

        # Run a low and high cardinality check. Warn the user, and allow
        # them to proceed.
        hcai_ordinality.check_high_cardinality(dataframe, self.grain_column)
        hcai_ordinality.check_one_cardinality(dataframe)

        # Run the raw data through the data preparation pipeline
        clean_dataframe = pipeline.fit_transform(dataframe)
        _ = prediction_pipeline.fit_transform(dataframe)

        # Instantiate the advanced class
        self._advanced_trainer = AdvancedSupervisedModelTrainer(
            pipeline=pipeline,
            dataframe=clean_dataframe,
            model_type=model_type,
            predicted_column=predicted_column,
            grain_column=grain_column,
            original_column_names=dataframe.columns.values,
            verbose=verbose)

        # Save the pipeline to the parent class
        self._advanced_trainer.pipeline = prediction_pipeline

        # Split the data into train and test
        self._advanced_trainer.train_test_split()

        self._advanced_trainer.categorical_column_info = get_categorical_levels(
            dataframe=dataframe,
            columns_to_ignore=[grain_column, predicted_column])
Example #8
def main():
    """Template script for ADVANCED USERS using healthcareai."""
    # Load the included diabetes sample data
    dataframe = healthcareai.load_diabetes()

    # ...or load your own data from a .csv file: Uncomment to pull data from your CSV
    # dataframe = healthcareai.load_csv('path/to/your.csv')

    # ...or load data from an MSSQL server: Uncomment to pull data from an MSSQL server
    # server = 'localhost'
    # database = 'SAM'
    # query = """SELECT *
    #             FROM [SAM].[dbo].[DiabetesClincialSampleData]
    #             -- In this step, just grab rows that have a target
    #             WHERE ThirtyDayReadmitFLG is not null"""
    #
    # engine = hcai_db.build_mssql_engine_using_trusted_connections(server=server, database=database)
    # dataframe = pd.read_sql(query, engine)

    # Peek at the first 5 rows of data
    print(dataframe.head(5))

    # Drop columns that won't help machine learning
    dataframe.drop(['PatientID'], axis=1, inplace=True)

    # Step 1: Prepare the data using optional imputation. There are two options for this:

    # ## Option 1: Use the built-in data prep pipeline that does encoding, imputation, null filtering, dummification
    clean_training_dataframe = hcai_pipelines.full_pipeline(
        'classification',
        'ThirtyDayReadmitFLG',
        'PatientEncounterID',
        impute=True).fit_transform(dataframe)

    # ## Option 2: Build your own pipeline using healthcare.ai methods, your own, or a combination of either.
    # - Please note this is intentionally spartan, so we don't hinder your creativity. :)
    # - Also note that many of the healthcare.ai transformers intentionally return dataframes, whereas scikit-learn
    #   transformers return numpy arrays
    # custom_pipeline = Pipeline([
    #     ('remove_grain_column', hcai_filters.DataframeColumnRemover(columns_to_remove=['PatientEncounterID', 'PatientID'])),
    #     ('imputation', hcai_transformers.DataFrameImputer(impute=True)),
    #     ('convert_target_to_binary', hcai_transformers.DataFrameConvertTargetToBinary('classification', 'ThirtyDayReadmitFLG')),
    #     # ('prediction_to_numeric', hcai_transformers.DataFrameConvertColumnToNumeric('ThirtyDayReadmitFLG')),
    #     # ('create_dummy_variables', hcai_transformers.DataFrameCreateDummyVariables(excluded_columns=['ThirtyDayReadmitFLG'])),
    # ])
    #
    # clean_training_dataframe = custom_pipeline.fit_transform(dataframe)

    # Step 2: Instantiate an Advanced Trainer class with your clean and prepared training data
    classification_trainer = healthcareai.AdvancedSupervisedModelTrainer(
        dataframe=clean_training_dataframe,
        model_type='classification',
        predicted_column='ThirtyDayReadmitFLG',
        grain_column='PatientEncounterID',
        verbose=False)

    # Step 3: Split the data into train and test
    classification_trainer.train_test_split()

    # Step 4: Train some models

    # ## Train a KNN classifier with a randomized search over custom hyperparameters
    knn_hyperparameters = {
        'algorithm': ['ball_tree', 'kd_tree'],
        'n_neighbors': [1, 4, 6, 8, 10, 15, 20, 30, 50, 100, 200],
        'weights': ['uniform', 'distance']
    }

    trained_knn = classification_trainer.knn(
        scoring_metric='accuracy',
        hyperparameter_grid=knn_hyperparameters,
        randomized_search=True,
        # Set this relative to the size of your hyperparameter space. Higher will train more models and be slower
        # Lower will be faster and possibly less performant
        number_iteration_samples=10)

    # ## Train a random forest classifier with a randomized search over custom hyperparameters
    # TODO these are bogus hyperparams for random forest
    random_forest_hyperparameters = {
        'n_estimators': [50, 100, 200, 300],
        'max_features': [1, 2, 3, 4],
        'max_leaf_nodes': [None, 30, 400]
    }

    trained_random_forest = classification_trainer.random_forest_classifier(
        scoring_metric='accuracy',
        hyperparameter_grid=random_forest_hyperparameters,
        randomized_search=True,
        # Set this relative to the size of your hyperparameter space. Higher will train more models and be slower
        # Lower will be faster and possibly less performant
        number_iteration_samples=10)

    # Show the random forest feature importance graph
    hcai_tsm.plot_rf_features_from_tsm(trained_random_forest,
                                       classification_trainer.x_train,
                                       feature_limit=20,
                                       save=False)

    # ## Train a custom ensemble of models
    # The ensemble methods take a dictionary of TrainedSupervisedModels keyed by a name of your choice
    custom_ensemble = {
        'KNN':
        classification_trainer.knn(hyperparameter_grid=knn_hyperparameters,
                                   randomized_search=False,
                                   scoring_metric='roc_auc'),
        'Logistic Regression':
        classification_trainer.logistic_regression(),
        'Random Forest Classifier':
        classification_trainer.random_forest_classifier(
            randomized_search=False, scoring_metric='roc_auc')
    }

    trained_ensemble = classification_trainer.ensemble_classification(
        scoring_metric='roc_auc', trained_model_by_name=custom_ensemble)

    # Step 5: Evaluate and compare the models

    # Create a list of all the models you just trained that you want to compare
    models_to_compare = [trained_knn, trained_random_forest, trained_ensemble]

    # Create a ROC plot that compares all of them.
    hcai_tsm.tsm_classification_comparison_plots(
        trained_supervised_models=models_to_compare,
        plot_type='ROC',
        save=False)

    # Create a PR plot that compares all of them.
    hcai_tsm.tsm_classification_comparison_plots(
        trained_supervised_models=models_to_compare,
        plot_type='PR',
        save=False)

    # Inspect the raw ROC or PR cutoffs
    print(trained_random_forest.roc(print_output=False))
    print(trained_random_forest.pr(print_output=False))
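
The randomized-search comments above suggest sizing number_iteration_samples against the hyperparameter space. A small plain-Python sketch for counting the combinations in a grid, reusing the grids defined in main():

from functools import reduce
from operator import mul

def grid_size(hyperparameter_grid):
    """Return the total number of combinations in a hyperparameter grid."""
    return reduce(mul, (len(values) for values in hyperparameter_grid.values()), 1)

# For knn_hyperparameters above: 2 algorithms * 11 n_neighbors values * 2
# weights = 44 combinations, so number_iteration_samples=10 samples roughly
# a quarter of the space.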
Example #9
def main():
    """Template script for ADVANCED USERS using healthcareai."""
    # Load the included diabetes sample data
    dataframe = healthcareai.load_diabetes()

    # ...or load your own data from a .csv file: Uncomment to pull data from your CSV
    # dataframe = healthcareai.load_csv('path/to/your.csv')

    # ...or load data from an MSSQL server: Uncomment to pull data from an MSSQL server
    # server = 'localhost'
    # database = 'SAM'
    # query = """SELECT *
    #             FROM [SAM].[dbo].[DiabetesClincialSampleData]
    #             -- In this step, just grab rows that have a target
    #             WHERE ThirtyDayReadmitFLG is not null"""
    #
    # engine = hcai_db.build_mssql_engine_using_trusted_connections(server=server, database=database)
    # dataframe = pd.read_sql(query, engine)

    # Peek at the first 5 rows of data
    print(dataframe.head(5))

    # Drop columns that won't help machine learning
    dataframe.drop(['PatientID'], axis=1, inplace=True)

    # Step 1: Prepare the data using optional imputation. There are two options for this:

    # ## Option 1: Use the built-in data prep pipeline that does encoding, imputation, null filtering, dummification
    clean_training_dataframe = hcai_pipelines.full_pipeline(
        'classification',
        'ThirtyDayReadmitFLG',
        'PatientEncounterID',
        impute=True).fit_transform(dataframe)

    # ## Option 2: Build your own pipeline using healthcare.ai methods, your own, or a combination of either.
    # - Please note this is intentionally spartan, so we don't hinder your creativity. :)
    # - Also note that many of the healthcare.ai transformers intentionally return dataframes, whereas scikit-learn
    #   transformers return numpy arrays
    # custom_pipeline = Pipeline([
    #     ('remove_grain_column', hcai_filters.DataframeColumnRemover(columns_to_remove=['PatientEncounterID', 'PatientID'])),
    #     ('imputation', hcai_transformers.DataFrameImputer(impute=True)),
    #     ('convert_target_to_binary', hcai_transformers.DataFrameConvertTargetToBinary('classification', 'ThirtyDayReadmitFLG')),
    #     # ('prediction_to_numeric', hcai_transformers.DataFrameConvertColumnToNumeric('ThirtyDayReadmitFLG')),
    #     # ('create_dummy_variables', hcai_transformers.DataFrameCreateDummyVariables(excluded_columns=['ThirtyDayReadmitFLG'])),
    # ])
    #
    # clean_training_dataframe = custom_pipeline.fit_transform(dataframe)

    # Step 2: Instantiate an Advanced Trainer class with your clean and prepared training data
    classification_trainer = healthcareai.AdvancedSupervisedModelTrainer(
        dataframe=clean_training_dataframe,
        model_type='classification',
        predicted_column='ThirtyDayReadmitFLG',
        grain_column='PatientEncounterID',
        verbose=False)

    # Step 3: Split the data into train and test
    classification_trainer.train_test_split()

    # Step 4: Train some models

    # ## Train a KNN classifier with a randomized search over custom hyperparameters
    knn_hyperparameters = {
        'algorithm': ['ball_tree', 'kd_tree'],
        'n_neighbors': [1, 4, 6, 8, 10, 15, 20, 30, 50, 100, 200],
        'weights': ['uniform', 'distance']}

    trained_knn = classification_trainer.knn(
        scoring_metric='accuracy',
        hyperparameter_grid=knn_hyperparameters,
        randomized_search=True,
        # Set this relative to the size of your hyperparameter space. Higher will train more models and be slower
        # Lower will be faster and possibly less performant
        number_iteration_samples=10
    )

    # ## Train a random forest classifier with a randomized search over custom hyperparameters
    # TODO these are bogus hyperparams for random forest
    random_forest_hyperparameters = {
        'n_estimators': [50, 100, 200, 300],
        'max_features': [1, 2, 3, 4],
        'max_leaf_nodes': [None, 30, 400]}

    trained_random_forest = classification_trainer.random_forest_classifier(
        scoring_metric='accuracy',
        hyperparameter_grid=random_forest_hyperparameters,
        randomized_search=True,
        # Set this relative to the size of your hyperparameter space. Higher will train more models and be slower
        # Lower will be faster and possibly less performant
        number_iteration_samples=10
    )

    # Show the random forest feature importance graph
    hcai_tsm.plot_rf_features_from_tsm(
        trained_random_forest,
        classification_trainer.x_train,
        feature_limit=20,
        save=False)

    # ## Train a custom ensemble of models
    # The ensemble methods take a dictionary of TrainedSupervisedModels keyed by a name of your choice
    custom_ensemble = {
        'KNN': classification_trainer.knn(
            hyperparameter_grid=knn_hyperparameters,
            randomized_search=False,
            scoring_metric='roc_auc'),
        'Logistic Regression': classification_trainer.logistic_regression(),
        'Random Forest Classifier': classification_trainer.random_forest_classifier(
            randomized_search=False,
            scoring_metric='roc_auc')}

    trained_ensemble = classification_trainer.ensemble_classification(
        scoring_metric='roc_auc',
        trained_model_by_name=custom_ensemble)

    # Step 5: Evaluate and compare the models

    # Create a list of all the models you just trained that you want to compare
    models_to_compare = [trained_knn, trained_random_forest, trained_ensemble]

    # Create a ROC plot that compares all of them.
    hcai_tsm.tsm_classification_comparison_plots(
        trained_supervised_models=models_to_compare,
        plot_type='ROC',
        save=False)

    # Create a PR plot that compares all of them.
    hcai_tsm.tsm_classification_comparison_plots(
        trained_supervised_models=models_to_compare,
        plot_type='PR',
        save=False)

    # Inspect the raw ROC or PR cutoffs
    print(trained_random_forest.roc(print_output=False))
    print(trained_random_forest.pr(print_output=False))
Example #10
    def __init__(self, dataframe, predicted_column, model_type, impute=True,
                 grain_column=None, verbose=True, imputeStrategy='MeanMode',
                 tunedRandomForest=False, numeric_columns_as_categorical=None):
        """
        Set up a SupervisedModelTrainer.

        Helps the user by checking for high cardinality features (such as IDs or
        other unique identifiers) and low cardinality features (a column where
        all values are equal).

        Args:
            dataframe (pandas.core.frame.DataFrame): The training data in a
                pandas dataframe

            predicted_column (str): The name of the prediction column

            model_type (str): the trainer type - 'classification' or 'regression'

            impute (bool): True to impute data. False to drop rows that
                contain any null values.

            grain_column (str): The name of the grain column

            verbose (bool): Set to True for verbose output. Defaults to True.

            imputeStrategy (str): The technique used to impute missing values.
                Defaults to 'MeanMode'. With 'MeanMode', columns of dtype
                object or category (assumed categorical) are imputed by the
                mode of the column, and columns of other types (assumed
                continuous) by the mean. With 'RandomForest', categorical
                columns are imputed using a RandomForestClassifier and
                continuous columns using a RandomForestRegressor.

            tunedRandomForest (bool): Defaults to False. If True, the random
                forest models used for imputation are tuned using grid search
                and k-fold cross validation. Note that imputation may then
                take considerably longer, depending on the size of the
                dataframe and the number of columns with missing values.

            numeric_columns_as_categorical (list of str): Defaults to None.
                Names of columns that are numeric (int/float) in the dataframe
                but should by nature be treated as categorical. For example,
                given a JobCode column with levels 1-6: if the column has
                missing values, pandas will by default convert it to float.
                With numeric_columns_as_categorical=None, missing values are
                imputed with the column mean and the dtype stays float. With
                numeric_columns_as_categorical=['JobCode'], missing values are
                imputed with the column mode and the final dtype is
                numpy.object.
        """
        self.predicted_column = predicted_column
        self.grain_column = grain_column

        # Build the pipeline
        # Note: Missing numeric values are imputed in prediction. If we don't
        # impute, then some rows in the prediction data frame will be removed,
        # which results in missing predictions.
        pipeline = hcai_pipelines.full_pipeline(model_type, predicted_column, grain_column, impute=impute,
                                                verbose=True, imputeStrategy=imputeStrategy,
                                                tunedRandomForest=tunedRandomForest,
                                                numeric_columns_as_categorical=numeric_columns_as_categorical)

        prediction_pipeline = hcai_pipelines.full_pipeline(model_type, predicted_column, grain_column, impute=True,
                                                           verbose=False, imputeStrategy=imputeStrategy,
                                                           tunedRandomForest=tunedRandomForest,
                                                           numeric_columns_as_categorical=numeric_columns_as_categorical)

        # Run a low and high cardinality check. Warn the user, and allow
        # them to proceed.
        hcai_ordinality.check_high_cardinality(dataframe, self.grain_column)
        hcai_ordinality.check_one_cardinality(dataframe)

        # Run the raw data through the data preparation pipeline
        clean_dataframe = pipeline.fit_transform(dataframe)
        _ = prediction_pipeline.fit_transform(dataframe)

        # Instantiate the advanced class
        self._advanced_trainer = AdvancedSupervisedModelTrainer(
            pipeline=pipeline,
            dataframe=clean_dataframe,
            model_type=model_type,
            predicted_column=predicted_column,
            grain_column=grain_column,
            original_column_names=dataframe.columns.values,
            verbose=verbose)

        # Save the pipeline to the parent class
        self._advanced_trainer.pipeline = prediction_pipeline

        # Split the data into train and test
        self._advanced_trainer.train_test_split()

        self._advanced_trainer.categorical_column_info = get_categorical_levels(
            dataframe=dataframe,
            columns_to_ignore=[grain_column, predicted_column])
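
A usage sketch for this extended signature, reusing the JobCode illustration from the docstring above (assumes this fork's SupervisedModelTrainer and a training dataframe df containing a numeric JobCode column):

# Sketch only: listing JobCode in numeric_columns_as_categorical switches its
# imputation from mean (numeric) to mode (categorical), per the docstring.
trainer = SupervisedModelTrainer(
    dataframe=df,
    predicted_column='ThirtyDayReadmitFLG',
    model_type='classification',
    grain_column='PatientEncounterID',
    impute=True,
    imputeStrategy='RandomForest',
    tunedRandomForest=False,
    numeric_columns_as_categorical=['JobCode'])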