Example #1
    def setUpClass(cls):
        df = hcai_datasets.load_diabetes()

        # Drop columns that won't help machine learning
        columns_to_remove = ['PatientID']
        df.drop(columns_to_remove, axis=1, inplace=True)

        cls.classification_trainer = SupervisedModelTrainer(dataframe=df,
                                                            predicted_column='ThirtyDayReadmitFLG',
                                                            model_type='classification',
                                                            impute=True,
                                                            grain_column='PatientEncounterID',
                                                            verbose=False)
        cls.regression_trainer = SupervisedModelTrainer(df,
                                                        'SystolicBPNBR',
                                                        'regression',
                                                        grain_column='PatientEncounterID',
                                                        impute=True,
                                                        verbose=False)

        cls.regression_trainer_impute_false = SupervisedModelTrainer(df,
                                                                     'SystolicBPNBR',
                                                                     'regression',
                                                                     grain_column='PatientEncounterID',
                                                                     impute=False,
                                                                     verbose=False)
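    # A typical test method then consumes these class-level trainers (a sketch;
    # the trainer methods named here are the same ones used in the other
    # examples on this page):
    #
    # def test_trains_a_linear_regression(self):
    #     trained = self.regression_trainer.linear_regression()
    #     self.assertIsNotNone(trained)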
Example #2
    def setUpClass(cls):
        """ Load a dataframe, train a linear model and prepare a prediction dataframe for assertions """

        # Build a toy dataset with known correlations (and some noise) for testing
        # TODO Modify probs and cat_weights (and expected df) once top factors is made independent of dummification
        probs = (0.334, 0.333, 0.333)  # distribution of categorical
        cat_weights = (-3, -4, 7)
        # TODO Modify mu and sigma once feature scaling is built into the logistic regression
        mu = 0  # mean of negative_corr
        sigma = 1  # standard deviation of negative_corr
        noise = 0.5  # standard deviation of noise
        rows = 200
        # Set seed to guarantee data set is always the same
        np.random.seed(1066)
        factors_df = pd.DataFrame({
            'id': range(rows),  # grain col
            'positive_corr': np.random.normal(size=rows),
            'categorical': np.random.choice(['Common', 'Medium', 'Rare'], p=probs, size=rows),
            'negative_corr': np.random.normal(loc=mu, scale=sigma, size=rows),
            'useless_pred_1': np.random.normal(size=rows),
            'useless_pred_2': np.random.choice(['Y', 'N'], size=rows)},
            columns=['id', 'positive_corr', 'categorical', 'negative_corr', 'useless_pred_1', 'useless_pred_2'])

        # Set the true decision boundary using the chosen feature weights (importances)
        factors_df['dot_product'] = 4 * factors_df['positive_corr']
        factors_df.loc[factors_df['categorical'] == 'Common', 'dot_product'] += cat_weights[0]
        factors_df.loc[factors_df['categorical'] == 'Medium', 'dot_product'] += cat_weights[1]
        factors_df.loc[factors_df['categorical'] == 'Rare', 'dot_product'] += cat_weights[2]
        factors_df['dot_product'] += -2 * (factors_df['negative_corr'] - mu) / sigma

        # Add noise
        factors_df['dot_product'] += np.random.normal(scale=noise, size=rows)

        # Add labels
        factors_df['response'] = 'N'
        factors_df.loc[factors_df['dot_product'] > 0, 'response'] = 'Y'

        # Remove column defining decision boundary
        factors_df.drop('dot_product', axis=1, inplace=True)
        # Reset random seed
        np.random.seed()

        training_df = factors_df.copy()

        hcai = SupervisedModelTrainer(
            dataframe=training_df,
            predicted_column='response',
            model_type='classification',
            impute=True,
            grain_column='id')

        # Train the logistic regression model
        cls.trained_lr = hcai.logistic_regression()

        # Load a new df for predicting
        cls.prediction_df = factors_df.copy()

        # Create various outputs
        cls.factors = cls.trained_lr.make_factors(cls.prediction_df, number_top_features=3)
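        # Downstream tests can then assert on this output (a sketch; that
        # make_factors returns one row per prediction row, keyed by the grain
        # column, is an assumption about its output shape):
        #
        # assert len(cls.factors) == rows
        # assert 'id' in cls.factors.columns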
Example #3
def main():
    # Load the diabetes sample data
    dataframe = hcai_datasets.load_diabetes()

    # ## Load data from an MSSQL server: Uncomment to pull data from an MSSQL server
    # server = 'localhost'
    # database = 'SAM'
    # query = """SELECT *
    #             FROM [SAM].[dbo].[DiabetesClincialSampleData]
    #             -- In this step, just grab rows that have a target
    #             WHERE ThirtyDayReadmitFLG is not null"""
    #
    # engine = hcai_db.build_mssql_engine(server=server, database=database)
    # dataframe = pd.read_sql(query, engine)

    # Drop columns that won't help machine learning
    dataframe.drop(['PatientID'], axis=1, inplace=True)

    # Step 1: Set up a healthcareai regression trainer. This prepares your data for model building
    regression_trainer = SupervisedModelTrainer(
        dataframe=dataframe,
        predicted_column='SystolicBPNBR',
        model_type='regression',
        grain_column='PatientEncounterID',
        impute=True,
        verbose=False)

    # Look at the first few rows of your dataframe after loading the data
    print(
        '\n\n-------------------[ Cleaned Dataframe ]--------------------------'
    )
    print(regression_trainer.clean_dataframe.head())

    # Step 2: train some models

    # Train and evaluate linear regression model
    trained_linear_model = regression_trainer.linear_regression()

    # Train and evaluate random forest model
    trained_random_forest = regression_trainer.random_forest_regression()

    # Once you are happy with the performance of any model, you can save it for use later in predicting new data.
    # File names are timestamped and look like '2017-05-31T12-36-21_regression_LinearRegression.pkl'
    # Note the filename that is saved; it will be used in example_regression_2.py
    trained_linear_model.save()
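    # ## Reuse the saved model later: a minimal sketch of what example_regression_2.py
    # does, assuming healthcareai.load_saved_model and substituting the placeholder
    # filename below with the timestamped name that .save() printed
    # import healthcareai
    # trained_model = healthcareai.load_saved_model('2017-05-31T12-36-21_regression_LinearRegression.pkl')
    # predictions = trained_model.make_predictions(dataframe)
    # print(predictions.head())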
Example #4
    def setUp(self):
        df = hcai_datasets.load_diabetes()

        # Drop columns that won't help machine learning
        columns_to_remove = ['PatientID']
        df.drop(columns_to_remove, axis=1, inplace=True)

        self.regression_trainer = SupervisedModelTrainer(df,
                                                         'SystolicBPNBR',
                                                         'regression',
                                                         grain_column='PatientEncounterID',
                                                         impute=True,
                                                         verbose=False)

        def undecorated_lr(self):
            return self._advanced_trainer.linear_regression(randomized_search=False)

        self.regression_trainer.undecorated_lr = undecorated_lr.__get__(self.regression_trainer,
                                                                        self.regression_trainer.__class__)
Example #5
    def setUpClass(cls):
        """ Load a dataframe, train a linear model and prepare a prediction dataframe for assertions """
        training_df = hcai_datasets.load_diabetes()

        # Drop columns that won't help machine learning
        training_df.drop(['PatientID'], axis=1, inplace=True)

        regression_trainer = SupervisedModelTrainer(
            training_df,
            'SystolicBPNBR',
            'regression',
            impute=True,
            grain_column='PatientEncounterID')

        classification_trainer = SupervisedModelTrainer(
            training_df,
            'ThirtyDayReadmitFLG',
            'classification',
            impute=True,
            grain_column='PatientEncounterID')

        # Train the models
        cls.trained_linear_model = regression_trainer.linear_regression()
        cls.trained_lr = classification_trainer.logistic_regression()

        # Load a new df for predicting
        cls.prediction_df = hcai_datasets.load_diabetes()

        # Drop columns that won't help machine learning
        columns_to_remove = ['PatientID']
        cls.prediction_df.drop(columns_to_remove, axis=1, inplace=True)

        # Create various outputs
        cls.predictions = cls.trained_linear_model.make_predictions(cls.prediction_df)
        cls.factors = cls.trained_linear_model.make_factors(cls.prediction_df, number_top_features=3)
        cls.predictions_with_3_factors = cls.trained_linear_model.make_predictions_with_k_factors(
            cls.prediction_df,
            number_top_features=3)
        cls.original_with_predictions_3_factors = cls.trained_linear_model.make_original_with_predictions_and_factors(
            cls.prediction_df,
            number_top_features=3)
        cls.catalyst_dataframe = cls.trained_linear_model.create_catalyst_dataframe(cls.prediction_df)
Example #6
class TestTrainerDecorator(unittest.TestCase):

    """Tests for the training decorator.

    We compare the console output of a decorated linear regression against an undecorated one. The two outputs
    should not be the same.

    """

    def setUp(self):
        df = hcai_datasets.load_diabetes()

        # Drop columns that won't help machine learning
        columns_to_remove = ['PatientID']
        df.drop(columns_to_remove, axis=1, inplace=True)

        self.regression_trainer = SupervisedModelTrainer(df,
                                                         'SystolicBPNBR',
                                                         'regression',
                                                         grain_column='PatientEncounterID',
                                                         impute=True,
                                                         verbose=False)

        def undecorated_lr(self):
            return self._advanced_trainer.linear_regression(randomized_search=False)

        self.regression_trainer.undecorated_lr = undecorated_lr.__get__(self.regression_trainer,
                                                                        self.regression_trainer.__class__)

    def test_decorator(self):
        out = StringIO()
        sys.stdout = out
        self.regression_trainer.linear_regression()
        decorated_output = out.getvalue().strip()

        out = StringIO()
        sys.stdout = out
        self.regression_trainer.undecorated_lr()
        undecorated_output = out.getvalue().strip()

        # Restore stdout so later tests are not silenced
        sys.stdout = sys.__stdout__

        self.assertNotEqual(decorated_output, undecorated_output)
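    # An equivalent capture pattern that restores stdout automatically (a sketch
    # using only the standard library's contextlib.redirect_stdout):
    #
    # def test_decorator_prints_output(self):
    #     from contextlib import redirect_stdout
    #     buffer = StringIO()
    #     with redirect_stdout(buffer):
    #         self.regression_trainer.linear_regression()
    #     self.assertTrue(buffer.getvalue().strip())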
Example #7
    def setUpClass(cls):
        """ Load a dataframe, train a linear model and prepare a prediction dataframe for assertions """
        training_df = hcai_datasets.load_diabetes()

        # Drop columns that won't help machine learning
        training_df.drop(['PatientID'], axis=1, inplace=True)

        regression_trainer = SupervisedModelTrainer(
            training_df,
            'SystolicBPNBR',
            'regression',
            impute=True,
            grain_column='PatientEncounterID')

        classification_trainer = SupervisedModelTrainer(
            training_df,
            'ThirtyDayReadmitFLG',
            'classification',
            impute=True,
            grain_column='PatientEncounterID')

        # Train the models
        cls.trained_linear_model = regression_trainer.linear_regression()
        cls.trained_lr = classification_trainer.logistic_regression()

        # Load a new df for predicting
        cls.prediction_df = hcai_datasets.load_diabetes()

        # Drop columns that won't help machine learning
        columns_to_remove = ['PatientID']
        cls.prediction_df.drop(columns_to_remove, axis=1, inplace=True)

        # Create various outputs
        cls.predictions = cls.trained_linear_model.make_predictions(
            cls.prediction_df)
        cls.factors = cls.trained_linear_model.make_factors(
            cls.prediction_df, number_top_features=3)
        cls.predictions_with_3_factors = cls.trained_linear_model.make_predictions_with_k_factors(
            cls.prediction_df, number_top_features=3)
        cls.original_with_predictions_3_factors = cls.trained_linear_model.make_original_with_predictions_and_factors(
            cls.prediction_df, number_top_features=3)
        cls.catalyst_dataframe = cls.trained_linear_model.create_catalyst_dataframe(
            cls.prediction_df)
Example #8
def main():
    # Load the diabetes sample data
    dataframe = hcai_datasets.load_diabetes()

    # ## Load data from an MSSQL server: Uncomment to pull data from an MSSQL server
    # server = 'localhost'
    # database = 'SAM'
    # query = """SELECT *
    #             FROM [SAM].[dbo].[DiabetesClincialSampleData]
    #             -- In this step, just grab rows that have a target
    #             WHERE ThirtyDayReadmitFLG is not null"""
    #
    # engine = hcai_db.build_mssql_engine(server=server, database=database)
    # dataframe = pd.read_sql(query, engine)

    # Drop columns that won't help machine learning
    dataframe.drop(['PatientID'], axis=1, inplace=True)

    # Step 1: Set up a healthcareai classification trainer. This prepares your data for model building
    classification_trainer = SupervisedModelTrainer(
        dataframe=dataframe,
        predicted_column='ThirtyDayReadmitFLG',
        model_type='classification',
        grain_column='PatientEncounterID',
        impute=True,
        verbose=False)

    # Look at the first few rows of your dataframe after loading the data
    print(
        '\n\n-------------------[ Cleaned Dataframe ]--------------------------'
    )
    print(classification_trainer.clean_dataframe.head())

    # Step 2: train some models

    # Train a KNN model
    trained_knn = classification_trainer.knn()

    # View the ROC and PR plots
    trained_knn.roc_plot()
    trained_knn.pr_plot()

    # Uncomment if you want to see all the ROC and/or PR thresholds
    # trained_knn.roc()
    # trained_knn.pr()

    # Train a logistic regression model
    trained_lr = classification_trainer.logistic_regression()

    # View the ROC and PR plots
    trained_lr.roc_plot()
    trained_lr.pr_plot()

    # Uncomment if you want to see all the ROC and/or PR thresholds
    # trained_lr.roc()
    # trained_lr.pr()

    # Train a random forest model and view the feature importance plot
    trained_random_forest = classification_trainer.random_forest(
        save_plot=False)
    # View the ROC and PR plots
    trained_random_forest.roc_plot()
    trained_random_forest.pr_plot()

    # Uncomment if you want to see all the ROC and/or PR thresholds
    # trained_random_forest.roc()
    # trained_random_forest.pr()

    # Create a list of all the models you just trained that you want to compare
    models_to_compare = [trained_knn, trained_lr, trained_random_forest]

    # Create a ROC plot that compares them.
    tsm_plots.tsm_classification_comparison_plots(
        trained_supervised_models=models_to_compare,
        plot_type='ROC',
        save=False)

    # Create a PR plot that compares them.
    tsm_plots.tsm_classification_comparison_plots(
        trained_supervised_models=models_to_compare,
        plot_type='PR',
        save=False)

    # Once you are happy with the performance of any model, you can save it for use later in predicting new data.
    # File names are timestamped and look like '2017-05-31T12-36-21_classification_RandomForestClassifier.pkl'
    # Note the filename that is saved; it will be used in example_classification_2.py
    trained_random_forest.save()
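    # ## Reuse the saved model later: a minimal sketch of what example_classification_2.py
    # does, assuming healthcareai.load_saved_model and substituting the placeholder
    # filename below with the timestamped name that .save() printed
    # import healthcareai
    # trained_model = healthcareai.load_saved_model('2017-05-31T12-36-21_classification_RandomForestClassifier.pkl')
    # predictions_with_factors = trained_model.make_predictions_with_k_factors(dataframe, number_top_features=3)
    # print(predictions_with_factors.head())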
Example #9
    def setUpClass(cls):
        """ Load a dataframe, train a linear model and prepare prediction data frames for assertions """
        rows = 200
        np.random.seed(112358)
        train_df = pd.DataFrame({'id': range(rows),
                                 'x': np.random.uniform(low=-5, high=5, size=rows),
                                 'y': np.random.normal(loc=0, scale=1, size=rows),
                                 'color': np.random.choice(['red', 'blue', 'green'], size=rows),
                                 'gender': np.random.choice(['male', 'female'], size=rows)},
                                columns=['id', 'x', 'y', 'color', 'gender'])
        # Assign labels
        # build true decision boundary using temp variable
        train_df['temp'] = 2 * train_df['x'] - train_df['y']
        train_df.loc[train_df['color'] == 'red', 'temp'] += 1
        train_df.loc[train_df['color'] == 'blue', 'temp'] += -1
        train_df.loc[train_df['gender'] == 'male', 'temp'] += 2
        # Add noise to avoid perfect separation
        train_df['temp'] += np.random.normal(scale=1, size=rows)
        # Add label
        train_df['response'] = np.where(train_df['temp'] > 0, 'Y', 'N')
        # drop temp column
        train_df.drop('temp', axis=1, inplace=True)

        hcai = SupervisedModelTrainer(
            dataframe=train_df,
            predicted_column='response',
            model_type='classification',
            impute=True,
            grain_column='id')

        # Train the logistic regression model
        cls.trained_lr = hcai.logistic_regression()

        # single row prediction dataframe
        cls.one_row1 = pd.DataFrame({'id': [2017],
                                     'x': [1.2],
                                     'y': [0.7],
                                     'color': ['red'],
                                     'gender': ['female']},
                                    columns=['id', 'x', 'y', 'color', 'gender'])

        # single row prediction dataframe with different values
        cls.one_row2 = pd.DataFrame({'id': [1066],
                                     'x': [0],
                                     'y': [-1],
                                     'color': ['green'],
                                     'gender': ['male']},
                                    columns=['id', 'x', 'y', 'color', 'gender'])

        # put these rows in a dataframe with all of the training data
        cls.large = pd.concat([cls.one_row1, cls.one_row2, train_df.drop('response', axis=1)])
        # prediction dataframe missing a numeric column
        cls.missing_x = pd.DataFrame({'id': range(50),
                                      'y': np.random.normal(loc=0, scale=1, size=50),
                                      'color': np.random.choice(['red', 'blue', 'green'], size=50),
                                      'gender': np.random.choice(['male', 'female'], size=50)},
                                     columns=['id', 'y', 'color', 'gender'])

        # prediction dataframe missing a categorical column
        cls.missing_color = pd.DataFrame({'id': range(50),
                                          'x': np.random.uniform(low=-5, high=5, size=50),
                                          'y': np.random.normal(loc=0, scale=1, size=50),
                                          'gender': np.random.choice(['male', 'female'], size=50)},
                                         columns=['id', 'x', 'y', 'gender'])

        # dataframe with new category level in one column
        cls.new_color = pd.DataFrame({'id': [1728, 1729],
                                      'x': [1.2, 1.2],
                                      'y': [-0.3, -0.3],
                                      'color': ['purple', np.nan],
                                      'gender': ['female', 'female']},
                                     columns=['id', 'x', 'y', 'color', 'gender'])

        # dataframe with new category levels in two columns
        cls.new_color_and_gender = pd.DataFrame({'id': [1728, 1729],
                                                 'x': [1.2, 1.2],
                                                 'y': [-0.3, -0.3],
                                                 'color': ['purple', np.nan],
                                                 'gender': ['other', np.nan]},
                                                columns=['id', 'x', 'y', 'color', 'gender'])

        # dataframe with known distribution of category levels
        cls.get_levels_df = pd.DataFrame({'grain': range(10),
                                          'letters': ['A', 'A', 'A', 'A', 'B', 'B', 'B', 'C', 'C', 'D'],
                                          'numeric': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                                          'numbers_mod_3': ['1', '2', '0', '1', '2', '0', '1', '2', '0', '1'],
                                          'float': [1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0],
                                          'mathematicians': ['Gauss', 'Euler', 'Gauss', 'Galois', 'Gauss',
                                                             'Euler', 'Grothendiek', 'Wiles', 'Hilbert', 'Hilbert'],
                                          'predicted': ['Y', 'Y', 'N', 'Y', 'Y', 'N', 'N', 'N', 'Y', 'Y']},
                                         columns=['grain', 'letters', 'numeric', 'numbers_mod_3', 'float',
                                                  'mathematicians', 'predicted'])
        # Set the mathematicians column to a categorical dtype and choose the order in which the levels are
        # listed (default is alphabetical). Values absent from the category list become NaN.
        mathematician_dtype = pd.CategoricalDtype(
            categories=['Wiles', 'Euler', 'Grotheniek', 'Hilbert', 'Gauss'],
            ordered=False)
        cls.get_levels_df['mathematicians'] = cls.get_levels_df['mathematicians'].astype(mathematician_dtype)

        # Reset random seed
        np.random.seed()
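    # Tests built on these fixtures typically check that prediction succeeds on
    # cls.large and fails cleanly on the malformed frames (a sketch; the exact
    # exception type, HealthcareAIError, is an assumption about the library):
    #
    # def test_missing_numeric_column_raises(self):
    #     self.assertRaises(HealthcareAIError,
    #                       self.trained_lr.make_predictions,
    #                       self.missing_x)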
Example #10
    def setUpClass(cls):
        """ Load a dataframe, train a linear model and prepare prediction data frames for assertions """
        rows = 200
        np.random.seed(112358)
        train_df = pd.DataFrame(
            {
                'id': range(rows),
                'x': np.random.uniform(low=-5, high=5, size=rows),
                'y': np.random.normal(loc=0, scale=1, size=rows),
                'color': np.random.choice(['red', 'blue', 'green'], size=rows),
                'gender': np.random.choice(['male', 'female'], size=rows)
            },
            columns=['id', 'x', 'y', 'color', 'gender'])
        # Assign labels
        # build true decision boundary using temp variable
        train_df['temp'] = 2 * train_df['x'] - train_df['y']
        train_df.loc[train_df['color'] == 'red', 'temp'] += 1
        train_df.loc[train_df['color'] == 'blue', 'temp'] += -1
        train_df.loc[train_df['gender'] == 'male', 'temp'] += 2
        # Add noise to avoid perfect separation
        train_df['temp'] += np.random.normal(scale=1, size=rows)
        # Add label
        train_df['response'] = np.where(train_df['temp'] > 0, 'Y', 'N')
        # drop temp column
        train_df.drop('temp', axis=1, inplace=True)

        hcai = SupervisedModelTrainer(dataframe=train_df,
                                      predicted_column='response',
                                      model_type='classification',
                                      impute=True,
                                      grain_column='id')

        # Train the logistic regression model
        cls.trained_lr = hcai.logistic_regression()

        # single row prediction dataframe
        cls.one_row1 = pd.DataFrame(
            {
                'id': [2017],
                'x': [1.2],
                'y': [0.7],
                'color': ['red'],
                'gender': ['female']
            },
            columns=['id', 'x', 'y', 'color', 'gender'])

        # single row prediction dataframe with different values
        cls.one_row2 = pd.DataFrame(
            {
                'id': [1066],
                'x': [0],
                'y': [-1],
                'color': ['green'],
                'gender': ['male']
            },
            columns=['id', 'x', 'y', 'color', 'gender'])

        # put these rows in a dataframe with all of the training data
        cls.large = pd.concat(
            [cls.one_row1, cls.one_row2,
             train_df.drop('response', axis=1)])
        # prediction dataframe missing a numeric column
        cls.missing_x = pd.DataFrame(
            {
                'id': range(50),
                'y': np.random.normal(loc=0, scale=1, size=50),
                'color': np.random.choice(['red', 'blue', 'green'], size=50),
                'gender': np.random.choice(['male', 'female'], size=50)
            },
            columns=['id', 'y', 'color', 'gender'])

        # prediction dataframe missing a categorical column
        cls.missing_color = pd.DataFrame(
            {
                'id': range(50),
                'x': np.random.uniform(low=-5, high=5, size=50),
                'y': np.random.normal(loc=0, scale=1, size=50),
                'gender': np.random.choice(['male', 'female'], size=50)
            },
            columns=['id', 'x', 'y', 'gender'])

        # dataframe with new category level in one column
        cls.new_color = pd.DataFrame(
            {
                'id': [1728, 1729],
                'x': [1.2, 1.2],
                'y': [-0.3, -0.3],
                'color': ['purple', np.nan],
                'gender': ['female', 'female']
            },
            columns=['id', 'x', 'y', 'color', 'gender'])

        # dataframe with new category levels in two columns
        cls.new_color_and_gender = pd.DataFrame(
            {
                'id': [1728, 1729],
                'x': [1.2, 1.2],
                'y': [-0.3, -0.3],
                'color': ['purple', np.nan],
                'gender': ['other', np.nan]
            },
            columns=['id', 'x', 'y', 'color', 'gender'])

        # dataframe with known distribution of category levels
        cls.get_levels_df = pd.DataFrame(
            {
                'grain': range(10),
                'letters': ['A', 'A', 'A', 'A', 'B', 'B', 'B', 'C', 'C', 'D'],
                'numeric': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                'numbers_mod_3': ['1', '2', '0', '1', '2', '0', '1', '2', '0', '1'],
                'float': [1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0],
                'mathematicians': [
                    'Gauss', 'Euler', 'Gauss', 'Galois', 'Gauss', 'Euler',
                    'Grothendiek', 'Wiles', 'Hilbert', 'Hilbert'
                ],
                'predicted': ['Y', 'Y', 'N', 'Y', 'Y', 'N', 'N', 'N', 'Y', 'Y']
            },
            columns=[
                'grain', 'letters', 'numeric', 'numbers_mod_3', 'float',
                'mathematicians', 'predicted'
            ])
        # Set the mathematicians column to a categorical dtype and choose the order in which the levels are
        # listed (default is alphabetical). Values absent from the category list become NaN.
        mathematician_dtype = pd.CategoricalDtype(
            categories=['Wiles', 'Euler', 'Grotheniek', 'Hilbert', 'Gauss'],
            ordered=False)
        cls.get_levels_df['mathematicians'] = cls.get_levels_df[
            'mathematicians'].astype(mathematician_dtype)

        # Reset random seed
        np.random.seed()
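        # A quick sanity check on the category dtype (a sketch; with the
        # explicit category list, the levels keep the given order, and values
        # absent from the list, such as 'Galois', become NaN):
        #
        # assert list(cls.get_levels_df['mathematicians'].cat.categories) == \
        #     ['Wiles', 'Euler', 'Grotheniek', 'Hilbert', 'Gauss']
        # assert cls.get_levels_df['mathematicians'].isnull().any()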