Example #1
def main():
    # TODO List out which components are supplied by Automater
    # In this example, we're utilizing X and y generated by the Automater, auto.input_nub, auto.input_layers,
    # auto.output_nub, and auto.suggest_loss

    save_results = True

    # TODO Load data (placeholder: this skeleton needs a real DataFrame here,
    # e.g. observations = lib.load_titanic(), before it will run)
    observations = None
    print('Observation columns: {}'.format(list(observations.columns)))

    # TODO Train/test split
    train_observations, test_observations = train_test_split(observations)
    train_observations = train_observations.copy()
    test_observations = test_observations.copy()

    # TODO List out variable types

    data_type_dict = {
        'numerical': [],
        'categorical': [],
        'text': [],
        'timeseries': []
    }
    output_var = None

    # Create and fit Automater
    auto = Automater(data_type_dict=data_type_dict, output_var=output_var)
    auto.fit(train_observations)

    # Transform data (the Automater was fitted above, so a plain transform
    # avoids a redundant second fit)
    train_X, train_y = auto.transform(train_observations)
    test_X, test_y = auto.transform(test_observations)

    # TODO Create and fit keras (deep learning) model.

    x = auto.input_nub
    x = Dense(32)(x)
    x = Dense(32)(x)
    x = auto.output_nub(x)

    model = Model(inputs=auto.input_layers, outputs=x)
    model.compile(optimizer='adam', loss=auto.suggest_loss())

    model.fit(train_X, train_y)

    # Make model predictions and inverse transform model predictions, to get usable results
    pred_test_y = model.predict(test_X)
    pred_test_y = auto.inverse_transform_output(pred_test_y)

    # Save all results
    if save_results:
        temp_dir = lib.get_temp_dir()
        model.save(os.path.join(temp_dir, 'model.h5py'))
        pickle.dump(train_X, open(os.path.join(temp_dir, 'train_X.pkl'), 'wb'))
        pickle.dump(train_y, open(os.path.join(temp_dir, 'train_y.pkl'), 'wb'))
        pickle.dump(test_X, open(os.path.join(temp_dir, 'test_X.pkl'), 'wb'))
        pickle.dump(test_y, open(os.path.join(temp_dir, 'test_y.pkl'), 'wb'))
        pickle.dump(pred_test_y,
                    open(os.path.join(temp_dir, 'pred_test_y.pkl'), 'wb'))
Example #2
def main():
    # List out which components are supplied by Automater
    # In this example, we're utilizing X and y generated by the Automater, auto.input_nub, auto.input_layers,
    # auto.output_nub, and auto.suggest_loss

    save_results = True

    # Load data
    observations = lib.load_lending_club()
    print('Observation columns: {}'.format(list(observations.columns)))
    print('Class balance:\n {}'.format(observations['loan_status'].value_counts()))

    # Train/test split
    train_observations, test_observations = train_test_split(observations)
    train_observations = train_observations.copy()
    test_observations = test_observations.copy()

    # List out variable types
    data_type_dict = {'numerical': ['loan_amnt', 'annual_inc', 'open_acc', 'dti', 'delinq_2yrs',
                                    'inq_last_6mths', 'mths_since_last_delinq', 'pub_rec', 'revol_bal',
                                    'revol_util',
                                    'total_acc', 'pub_rec_bankruptcies'],
                      'categorical': ['term', 'grade', 'emp_length', 'home_ownership', 'loan_status', 'addr_state',
                                      'application_type', 'disbursement_method'],
                      'text': ['desc', 'purpose', 'title']}
    output_var = 'loan_status'

    # Create and fit Automater
    auto = Automater(data_type_dict=data_type_dict, output_var=output_var)
    auto.fit(train_observations)

    # Transform data (the Automater was fitted above, so a plain transform
    # avoids a redundant second fit)
    train_X, train_y = auto.transform(train_observations)
    test_X, test_y = auto.transform(test_observations)

    # Create and fit keras (deep learning) model.

    x = auto.input_nub
    x = Dense(32)(x)
    x = Dense(32)(x)
    x = auto.output_nub(x)

    model = Model(inputs=auto.input_layers, outputs=x)
    model.compile(optimizer='adam', loss=auto.suggest_loss())

    model.fit(train_X, train_y)

    # Make model predictions and inverse transform model predictions, to get usable results
    pred_test_y = model.predict(test_X)
    pred_test_y = auto.inverse_transform_output(pred_test_y)

    # Save all results
    if save_results:
        temp_dir = lib.get_temp_dir()
        model.save(os.path.join(temp_dir, 'model.h5py'))
        pickle.dump(train_X, open(os.path.join(temp_dir, 'train_X.pkl'), 'wb'))
        pickle.dump(train_y, open(os.path.join(temp_dir, 'train_y.pkl'), 'wb'))
        pickle.dump(test_X, open(os.path.join(temp_dir, 'test_X.pkl'), 'wb'))
        pickle.dump(test_y, open(os.path.join(temp_dir, 'test_y.pkl'), 'wb'))
        pickle.dump(pred_test_y, open(os.path.join(temp_dir, 'pred_test_y.pkl'), 'wb'))
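    # (Hedged follow-up, not part of the original example: the artifacts saved
    # above can be read back for later inference. This assumes a standard Keras
    # installation providing load_model; if suggest_loss returned a custom loss
    # object, load_model may additionally need custom_objects.)
    from keras.models import load_model

    model = load_model(os.path.join(temp_dir, 'model.h5py'))
    test_X = pickle.load(open(os.path.join(temp_dir, 'test_X.pkl'), 'rb'))
    pred_test_y = model.predict(test_X)
    pred_test_y = auto.inverse_transform_output(pred_test_y)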
Example #3
    def test_create_input_nub_numerical(self):
        iris_df = lib.load_iris()

        # Zero variables
        variable_type_dict = {'numerical_vars': []}
        input_layers, input_nub = Automater()._create_input_nub(
            variable_type_dict, iris_df)
        self.assertEqual(list(), input_layers)

        # One variable
        iris_numerical_cols = ['sepal_length']
        variable_type_dict = {'numerical_vars': iris_numerical_cols}
        input_layers, input_nub = Automater(
            numerical_vars=iris_numerical_cols)._create_input_nub(
                variable_type_dict, iris_df)
        self.assertEqual(1, len(input_layers))

        # Multiple numeric variables
        iris_numerical_cols = [
            'sepal_length', 'sepal_width', 'petal_length', 'petal_width'
        ]
        variable_type_dict = {'numerical_vars': iris_numerical_cols}
        input_layers, input_nub = Automater(
            numerical_vars=iris_numerical_cols)._create_input_nub(
                variable_type_dict, iris_df)
        self.assertEqual(4, len(input_layers))
Example #4
    def test_create_sklearn_pandas_mapper_pipeline_length(self):
        # Base case: No variables
        data = {}
        input_mapper, output_mapper = Automater()._create_mappers(data)
        self.assertCountEqual(list(), input_mapper.features)
        self.assertCountEqual(list(), output_mapper.features)

        # A single numerical
        data = {'numerical_vars': ['n1']}
        input_mapper, output_mapper = Automater()._create_mappers(data)
        self.assertEqual(1, len(input_mapper.features))

        # Two numerical
        data = {'numerical_vars': ['n1', 'n2']}
        input_mapper, output_mapper = Automater()._create_mappers(data)
        self.assertEqual(2, len(input_mapper.features))

        # Two variables of different types
        data = {'numerical_vars': ['n1'],
                'categorical_vars': ['c1']}
        input_mapper, output_mapper = Automater()._create_mappers(data)
        self.assertEqual(2, len(input_mapper.features))

        # Two variables with an unknown type key, and so no default pipelines
        data = {'NO_DEFAULT_ASDFSDA': ['x1', 'x2']}
        input_mapper, output_mapper = Automater()._create_mappers(data)
        self.assertEqual(2, len(input_mapper.features))

        mapper_pipelines = list(map(lambda x: list(x[1]), input_mapper.features))
        self.assertCountEqual([[], []], mapper_pipelines)
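        # (Reference note, added: in sklearn-pandas, DataFrameMapper.features is
        # a list of (column_selector, transformers, ...) tuples, which is why the
        # assertions above read x[0] as the selected columns and x[1] as the
        # per-column transformer pipeline)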
Example #5
def main():
    observations = load_titanic()

    # Transform the data set, using keras_pandas
    categorical_vars = ['pclass', 'sex', 'survived']
    numerical_vars = [
        'age', 'siblings_spouses_aboard', 'parents_children_aboard', 'fare'
    ]
    text_vars = ['name']

    auto = Automater(categorical_vars=categorical_vars,
                     numerical_vars=numerical_vars,
                     text_vars=text_vars,
                     response_var='survived')
    X, y = auto.fit_transform(observations)

    # Start model with provided input nub
    x = auto.input_nub

    # Fill in your own hidden layers
    x = Dense(256)(x)
    x = Dense(256, activation='relu')(x)
    x = Dense(256)(x)

    # End model with provided output nub
    x = auto.output_nub(x)

    model = Model(inputs=auto.input_layers, outputs=x)
    model.compile(optimizer='Adam', loss=auto.loss, metrics=['accuracy'])

    # Train model
    model.fit(X, y, epochs=15, validation_split=.2)
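    # (Hedged follow-up, not in the original example: map the fitted model's
    # predictions back to 'survived' labels with inverse_transform_output, as
    # the later examples do)
    pred_y = model.predict(X)
    print('Predicted labels: {}'.format(auto.inverse_transform_output(pred_y)))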
Example #6
def main():
    logging.getLogger().setLevel(logging.INFO)

    # Reference variables
    test_run = True

    observations = load_lending_club()

    if test_run:
        observations = observations.sample(n=100)

    # Transform the data set, using keras_pandas
    categorical_vars = [
        'term', 'grade', 'sub_grade', 'emp_length', 'home_ownership',
        'verification_status', 'issue_d', 'pymnt_plan', 'purpose',
        'addr_state', 'initial_list_status', 'application_type',
        'disbursement_method', 'loan_status'
    ]
    numerical_vars = [
        'loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'annual_inc',
        'installment', 'dti', 'inq_last_6mths', 'open_acc', 'pub_rec',
        'revol_bal', 'total_acc', 'pub_rec_bankruptcies', 'int_rate',
        'revol_util'
    ]
    text_vars = ['desc', 'title']

    for categorical_var in categorical_vars:
        observations[categorical_var] = observations[categorical_var].fillna(
            'None')
        observations[categorical_var] = observations[categorical_var].apply(
            str)

    auto = Automater(categorical_vars=categorical_vars,
                     numerical_vars=numerical_vars,
                     text_vars=text_vars,
                     response_var='loan_status')

    X, y = auto.fit_transform(observations)

    # Start model with provided input nub
    x = auto.input_nub

    # Fill in your own hidden layers
    x = Dense(8)(x)
    x = Dense(16, activation='relu')(x)
    x = Dense(8)(x)

    # End model with provided output nub
    x = auto.output_nub(x)

    model = Model(inputs=auto.input_layers, outputs=x)
    model.compile(optimizer='Adam', loss=auto.loss, metrics=['accuracy'])

    # Train model
    logging.warning(
        'Settle in! This training normally takes about 5-20 minutes on CPU')
    model.fit(X, y, epochs=1, validation_split=.2)

    pass
Example #7
    def test_create_input_nub(self):
        data = lib.load_titanic()

        # One variable
        text_vars = ['name']
        auto = Automater(text_vars=text_vars)
        auto.fit(data)

        self.assertEqual(1, len(auto.input_layers))
Example #8
    def test_inverse_transform_numerical_response(self):

        # Load data
        observations = lib.load_lending_club()

        # Set to test run
        observations = observations.sample(n=100)

        # Declare variable types
        categorical_vars = ['term', 'grade', 'sub_grade', 'emp_length', 'home_ownership', 'verification_status',
                            'issue_d',
                            'pymnt_plan', 'purpose', 'addr_state', 'initial_list_status', 'application_type',
                            'disbursement_method', 'loan_status']
        numerical_vars = ['loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'annual_inc', 'installment', 'dti',
                          'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal', 'total_acc', 'pub_rec_bankruptcies',
                          'int_rate', 'revol_util']

        text_vars = ['desc', 'title']

        # Manual null filling
        for categorical_var in categorical_vars:
            observations[categorical_var] = observations[categorical_var].fillna('None')
            observations[categorical_var] = observations[categorical_var].apply(str)

        auto = Automater(categorical_vars=categorical_vars, numerical_vars=numerical_vars, text_vars=text_vars,
                         response_var='funded_amnt')

        X, y = auto.fit_transform(observations)

        # Start model with provided input nub
        x = auto.input_nub

        # Fill in your own hidden layers
        x = Dense(8)(x)
        x = Dense(16, activation='relu')(x)
        x = Dense(8)(x)

        # End model with provided output nub
        x = auto.output_nub(x)

        model = Model(inputs=auto.input_layers, outputs=x)
        model.compile(optimizer='Adam', loss=auto.loss, metrics=['accuracy'])

        # Train model
        logging.warning('Settle in! This training normally takes about 5-20 minutes on CPU')
        model.fit(X, y, epochs=1, validation_split=.2)
        unscaled_preds = model.predict(X)

        logging.debug('unscaled_preds: {}'.format(list(unscaled_preds)))

        scaled_preds = auto.inverse_transform_output(unscaled_preds)

        logging.debug('scaled_preds: {}'.format(list(scaled_preds)))

        # Inverse-transformed predictions should be back in the original units,
        # i.e. no longer standardized to mean 0 / std 1
        self.assertNotAlmostEqual(0, numpy.mean(scaled_preds))
        self.assertNotAlmostEqual(1, numpy.std(scaled_preds))
Example #9
    def test_fit(self):
        train_df = lib.load_mushroom()

        # Two variables
        mushroom_categorical_cols = ['odor', 'habitat']
        auto = Automater(categorical_vars=mushroom_categorical_cols)
        auto.fit(train_df)

        self.assertEqual(Automater, type(auto))
        self.assertEqual(mushroom_categorical_cols, auto._user_provided_variables)
        self.assertTrue(auto.fitted)

        # Assert that transformation pipeline has been built / trained
        self.assertEqual([['odor'], ['habitat']], list(map(lambda x: x[0], auto.input_mapper.built_features)))
Example #10
    def test_fit(self):
        data = lib.load_titanic()
        # One variable
        text_vars = ['name']

        auto = Automater(text_vars=text_vars)
        auto.fit(data)

        self.assertEqual(Automater, type(auto))
        self.assertEqual(text_vars, auto._user_provided_variables)
        self.assertTrue(auto.fitted)

        self.assertEqual([['name']],
                         list(
                             map(lambda x: x[0],
                                 auto.input_mapper.built_features)))
Example #11
def main():

    # Load data
    observations = lib.load_mushroom()
    # observations = lib.load_lending_club(test_run=False)
    print('Observation columns: {}'.format(list(observations.columns)))
    print('Class balance:\n {}'.format(observations['class'].value_counts()))

    # List out variable types
    numerical_vars = []
    categorical_vars = [
        'class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
        'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
        'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
        'stalk-surface-below-ring', 'stalk-color-above-ring',
        'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
        'ring-type', 'spore-print-color', 'population', 'habitat'
    ]
    text_vars = []

    train_observations, test_observations = train_test_split(observations)
    train_observations = train_observations.copy()
    test_observations = test_observations.copy()

    # Create and fit Automater
    auto = Automater(numerical_vars=numerical_vars,
                     categorical_vars=categorical_vars,
                     text_vars=text_vars,
                     response_var='class')
    auto.fit(train_observations)

    # Create and fit keras (deep learning) model
    # The auto.transform, auto.input_nub, auto.input_layers, auto.output_nub, and auto.loss are provided
    # by keras-pandas, and everything else is core Keras
    train_X, train_y = auto.transform(train_observations)
    test_X, test_y = auto.transform(test_observations)

    x = auto.input_nub
    x = Dense(32)(x)
    x = Dense(32, activation='relu')(x)
    x = Dense(32)(x)
    x = auto.output_nub(x)

    model = Model(inputs=auto.input_layers, outputs=x)
    model.compile(optimizer='Adam', loss=auto.loss, metrics=['accuracy'])

    model.fit(train_X, train_y)

    test_y_pred = model.predict(test_X)

    # Inverse transform model output, to get usable results and save all results
    test_observations[auto.response_var +
                      '_pred'] = auto.inverse_transform_output(test_y_pred)
    print('Predictions: {}'.format(test_observations[auto.response_var +
                                                     '_pred']))

    pass
Example #12
    def test_fit(self):
        iris_df = lib.load_iris()

        # One variable
        iris_numerical_cols = ['sepal_length']
        auto = Automater(numerical_vars=iris_numerical_cols)
        auto.fit(iris_df)

        self.assertEqual(Automater, type(auto))
        self.assertEqual(iris_numerical_cols, auto._user_provided_variables)
        self.assertTrue(auto.fitted)

        # Assert that transformation pipeline has been built / trained
        self.assertEqual([['sepal_length']],
                         list(
                             map(lambda x: x[0],
                                 auto.input_mapper.built_features)))
Example #13
def main():

    # Load data
    observations = lib.load_titanic()
    # observations = lib.load_lending_club(test_run=False)
    print('Observation columns: {}'.format(list(observations.columns)))
    print('Class balance:\n {}'.format(
        observations['survived'].value_counts()))

    # List out variable types
    numerical_vars = [
        'age', 'siblings_spouses_aboard', 'parents_children_aboard', 'fare'
    ]
    categorical_vars = ['survived', 'pclass', 'sex']
    text_vars = ['name']

    for var in categorical_vars:
        observations[var] = observations[var].astype(str)

    train_observations, test_observations = train_test_split(observations)
    train_observations = train_observations.copy()
    test_observations = test_observations.copy()

    # Create and fit Automater
    auto = Automater(numerical_vars=numerical_vars,
                     categorical_vars=categorical_vars,
                     text_vars=text_vars,
                     response_var='survived')
    auto.fit(train_observations)

    # Create and fit keras (deep learning) model
    # The auto.transform, auto.input_nub, auto.input_layers, auto.output_nub, and auto.loss are provided
    # by keras-pandas, and everything else is core Keras
    train_X, train_y = auto.transform(train_observations)
    test_X, test_y = auto.transform(test_observations)

    x = auto.input_nub
    x = Dense(32)(x)
    x = Dense(32, activation='relu')(x)
    x = Dense(32)(x)
    x = auto.output_nub(x)

    model = Model(inputs=auto.input_layers, outputs=x)
    model.compile(optimizer='Adam', loss=auto.loss, metrics=['accuracy'])

    model.fit(train_X, train_y)

    test_y_pred = model.predict(test_X)

    # Inverse transform model output, to get usable results and save all results
    test_observations[auto.response_var +
                      '_pred'] = auto.inverse_transform_output(test_y_pred)
    print('Predictions: {}'.format(test_observations[auto.response_var +
                                                     '_pred']))

    pass
Example #14
    def test_initializer(self):
        # Base case: No variables
        auto = Automater()
        self.assertEqual({'numerical_vars': list(), 'categorical_vars': list(),
                          'boolean_vars': list(), 'datetime_vars': list(), 'text_vars': list(),
                          'non_transformed_vars': list()}, auto._variable_type_dict, )
        self.assertCountEqual(list(), auto._user_provided_variables)

        # Common use case: Variables in each
        data = {
            'numerical_vars': ['n1', 'n2', 'n3'],
            'categorical_vars': ['c1', 'c2', 'c3'],
            'datetime_vars': ['d1', 'd2']
        }

        response = copy.deepcopy(data)
        response['boolean_vars'] = list()
        response['non_transformed_vars'] = list()
        response['text_vars'] = list()

        auto = Automater(numerical_vars=data['numerical_vars'], categorical_vars=data['categorical_vars'],
                         datetime_vars=data['datetime_vars'])

        self.assertEqual(False, auto.fitted)
        self.assertEqual(response, auto._variable_type_dict)

        response_variable_list = [item for sublist in response.values() for item in sublist]
        self.assertCountEqual(response_variable_list, auto._user_provided_variables)

        # Overlapping variable lists
        data = {
            'numerical_vars': ['n1', 'n2', 'n3', 'x1'],
            'categorical_vars': ['c1', 'c2', 'c3'],
            'datetime_vars': ['d1', 'd2', 'x1']
        }

        response = copy.deepcopy(data)
        response['boolean_vars'] = list()
        response['non_transformed_vars'] = list()

        self.assertRaises(ValueError, Automater, numerical_vars=data['numerical_vars'],
                          categorical_vars=data['categorical_vars'],
                          datetime_vars=data['datetime_vars'])
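        # (Equivalent spelling, added for clarity: unittest's context-manager
        # form makes the expected failure more explicit)
        with self.assertRaises(ValueError):
            Automater(numerical_vars=data['numerical_vars'],
                      categorical_vars=data['categorical_vars'],
                      datetime_vars=data['datetime_vars'])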
Example #15
    def test_transform(self):
        iris_df = lib.load_iris()

        # Two numerical variables, df_out = False
        iris_numerical_cols = ['sepal_length', 'sepal_width']
        auto = Automater(numerical_vars=iris_numerical_cols, df_out=False)
        auto.fit(iris_df)

        (X, y) = auto.transform(iris_df)
        self.assertEqual((150, ), X[0].shape)

        # Two numerical variables, df_out = True
        iris_numerical_cols = ['sepal_length', 'sepal_width']
        auto = Automater(numerical_vars=iris_numerical_cols, df_out=True)
        auto.fit(iris_df)

        transformed = auto.transform(iris_df)
        self.assertEqual(150, len(transformed.index))
        self.assertEqual((150, 2), transformed.shape)
        self.assertCountEqual(iris_numerical_cols, transformed.columns)
Example #16
    def test_transform_no_response(self):
        train_df = lib.load_mushroom()

        # Two numerical variables, df_out = False
        test_columns = ['odor', 'habitat']
        auto = Automater(categorical_vars=test_columns, df_out=False)
        auto.fit(train_df)

        (X, y) = auto.transform(train_df)
        self.assertEqual((8124, ), X[0].shape)
        self.assertEqual(None, y)

        # Two numerical variables, df_out = True
        test_columns = ['odor', 'habitat']
        auto = Automater(categorical_vars=test_columns, df_out=True)
        auto.fit(train_df)

        transformed = auto.transform(train_df)
        self.assertEqual(8124, len(transformed.index))
        self.assertEqual((8124, 2), transformed.shape)
        self.assertCountEqual(test_columns, transformed.columns)
Example #17
    def test_unsupervised(self):
        observations = lib.load_lending_club()

        # Train/test split
        train_observations, test_observations = train_test_split(observations)
        train_observations = train_observations.copy()
        test_observations = test_observations.copy()

        # Unsupervised
        data_type_dict = {
            'numerical': [
                'loan_amnt', 'annual_inc', 'open_acc', 'dti', 'delinq_2yrs',
                'inq_last_6mths', 'mths_since_last_delinq', 'pub_rec',
                'revol_bal', 'revol_util', 'total_acc', 'pub_rec_bankruptcies'
            ],
            'categorical': [
                'term', 'grade', 'emp_length', 'home_ownership', 'loan_status',
                'addr_state', 'application_type'
            ],
            'text': ['desc', 'purpose', 'title']
        }
        auto = Automater(data_type_dict=data_type_dict)
        self.assertFalse(auto.supervised)

        expected_input_vars = reduce(lambda x, y: x + y,
                                     data_type_dict.values())
        self.assertCountEqual(expected_input_vars, auto.input_vars)
        self.assertEqual(None, auto.output_var)
        self.assertTrue(isinstance(auto.input_mapper, DataFrameMapper))
        self.assertIsNone(auto.output_mapper)
        self.assertFalse(auto.fitted)

        self.assertRaises(AssertionError, auto._check_has_response_var)

        # Test fit
        auto.fit(train_observations)
        self.assertTrue(auto.fitted)

        self.assertIsNotNone(auto.input_mapper.built_features)
        self.assertTrue(isinstance(auto.input_layers, list))
        self.assertEqual(len(expected_input_vars), len(auto.input_layers))
        self.assertIsNotNone(auto.input_nub)

        self.assertIsNone(auto.output_nub)
        self.assertIsNone(auto.output_mapper)

        # Test transform, df_out=False
        X, y = auto.transform(test_observations)
        self.assertTrue(isinstance(X, list))
        self.assertIsNone(y)
        self.assertEqual(test_observations.shape[0],
                         X[0].shape[0])  # Correct number of rows back

        # Test transform, df_out=True
        transformed_observations = auto.transform(test_observations,
                                                  df_out=True)
        self.assertTrue(isinstance(transformed_observations, pandas.DataFrame))
        self.assertEqual(
            test_observations.shape[0],
            transformed_observations.shape[0])  # Correct number of rows back
Example #18
def main():
    logging.getLogger().setLevel(logging.DEBUG)

    observations = load_mushrooms()

    # Transform the data set, using keras_pandas
    auto = Automater(categorical_vars=observations.columns,
                     response_var='class')
    X, y = auto.fit_transform(observations)

    # Create model
    x = auto.input_nub
    x = Dense(30)(x)
    x = auto.output_nub(x)

    model = Model(inputs=auto.input_layers, outputs=x)
    model.compile(optimizer='Adam', loss=auto.loss, metrics=['accuracy'])

    # Train model
    model.fit(X, y, epochs=10, validation_split=.5)

    pass
Example #19
    def test_create_input_nub_numerical(self):
        # TODO rename function, there is no numerical input
        train_df = lib.load_mushroom()

        # Zero variables
        variable_type_dict = {'categorical_vars': []}
        input_layers, input_nub = Automater()._create_input_nub(variable_type_dict, train_df)
        self.assertEqual(list(), input_layers)

        # One variable
        iris_numerical_cols = ['odor']
        variable_type_dict = {'numerical_vars': iris_numerical_cols}
        input_layers, input_nub = Automater(numerical_vars=iris_numerical_cols).\
            _create_input_nub(variable_type_dict, train_df)
        # TODO Check layer type
        self.assertEqual(1, len(input_layers))

        # Multiple numeric variables
        iris_numerical_cols = ['odor', 'habitat', 'population']
        variable_type_dict = {'numerical_vars': iris_numerical_cols}
        input_layers, input_nub = Automater(numerical_vars=iris_numerical_cols).\
            _create_input_nub(variable_type_dict, train_df)
        self.assertEqual(3, len(input_layers))
Example #20
def main():
    # Load data
    observations = lib.load_instanbul_stocks(as_ts=True)
    print('Observation columns: {}'.format(list(observations.columns)))

    # Train/test split
    train_observations, test_observations = train_test_split(observations)
    train_observations = train_observations.copy()
    test_observations = test_observations.copy()

    # List out variable types
    timeseries_vars = ['ise_lagged', 'ise.1_lagged', 'sp_lagged', 'dax_lagged']
    numerical_vars = ['ise']

    # Create and fit Automater
    auto = Automater(numerical_vars=numerical_vars, timeseries_vars=timeseries_vars,
                     response_var='ise')
    auto.fit(train_observations)

    # Create and fit keras (deep learning) model.
    # The auto.transform, auto.input_nub, auto.input_layers, auto.output_nub, and auto.loss are provided by
    # keras-pandas, and everything else is core Keras

    x = auto.input_nub
    x = Dense(16)(x)
    x = Dense(16, activation='relu')(x)
    x = Dense(16)(x)
    x = auto.output_nub(x)

    model = Model(inputs=auto.input_layers, outputs=x)
    model.compile(optimizer='adam', loss=auto.loss)

    train_X, train_y = auto.transform(train_observations)
    model.fit(train_X, train_y)

    # Inverse transform model output, to get usable results
    test_X, test_y = auto.transform(test_observations)
    test_y_pred = model.predict(test_X)
    test_observations[auto.response_var + '_pred'] = auto.inverse_transform_output(test_y_pred)
    print('Predictions: {}'.format(test_observations[auto.response_var + '_pred']))

    # TODO Save all results

    pass
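    # (A sketch completing the save TODO above, mirroring the pattern the other
    # examples use; lib.get_temp_dir, os, pickle, and the filenames are the same
    # assumptions those examples make)
    temp_dir = lib.get_temp_dir()
    model.save(os.path.join(temp_dir, 'model.h5py'))
    pickle.dump(test_observations, open(os.path.join(temp_dir, 'test_observations.pkl'), 'wb'))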
Example #21
    def test_whole(self):
        data = lib.load_titanic()

        msk = numpy.random.rand(len(data)) < 0.95
        data_train = data[msk]
        data_test = data[~msk]

        text_vars = ['name']
        categorical_vars = ['survived']

        # Create auto
        auto = Automater(text_vars=text_vars,
                         categorical_vars=categorical_vars,
                         response_var='survived')

        # Train auto
        auto.fit(data_train)
        X_train, y_train = auto.transform(data_train)

        # Create model

        x = auto.input_nub
        x = Dense(30, activation='relu')(x)
        x = auto.output_nub(x)

        model = Model(inputs=auto.input_layers, outputs=x)
        model.compile(optimizer='Adam', loss=auto.loss)

        # Train DL model
        model.fit(X_train, y_train)

        # Transform test set
        data_test = data_test.drop('survived', axis=1)
        X_test, y_test = auto.transform(data_test)
        model.predict(X_test)

        pass
Example #22
    def test_categorical_whole(self):
        # Set up data set
        mushroom_df = lib.load_mushroom()
        msk = numpy.random.rand(len(mushroom_df)) < 0.95
        mushroom_train = mushroom_df[msk]
        mushroom_test = mushroom_df[~msk]
        categorical_vars = ['odor', 'habitat', 'population', 'class']

        # Create auto
        auto = Automater(categorical_vars=categorical_vars,
                         response_var='class')

        # Train auto
        auto.fit(mushroom_train)
        X_train, y_train = auto.transform(mushroom_train)

        # Extract input_nub from auto
        input_nub = auto.input_nub

        # Extract output_nub from auto
        output_nub = auto.output_nub

        # Create DL model
        x = input_nub
        x = Dense(30)(x)
        x = output_nub(x)

        model = Model(inputs=auto.input_layers, outputs=x)
        model.compile(optimizer='Adam', loss=auto.loss)

        # Train DL model
        model.fit(X_train, y_train)

        # Transform test set
        mushroom_test = mushroom_test.drop('class', axis=1)
        X_test, y_test = auto.transform(mushroom_test)
        model.predict(X_test)

        pass
Example #23
    def test_boolean(self):
        observations = lib.load_mushroom()
        observations['population_bool'] = observations['population'] == 's'

        msk = numpy.random.rand(len(observations)) < 0.95
        mushroom_train = observations[msk]
        mushroom_test = observations[~msk]

        categorical_vars = ['odor', 'habitat', 'class']
        boolean_vars = ['population_bool']

        auto = Automater(categorical_vars=categorical_vars,
                         boolean_vars=boolean_vars,
                         response_var='class')

        auto.fit(mushroom_train)
        X_train, y_train = auto.transform(mushroom_train)

        # Extract input_nub from auto
        input_nub = auto.input_nub

        # Extract output_nub from auto
        output_nub = auto.output_nub

        # Create DL model
        x = input_nub
        x = Dense(30)(x)
        x = output_nub(x)

        model = Model(inputs=auto.input_layers, outputs=x)
        model.compile(optimizer='Adam', loss=auto.loss)

        # Train DL model
        model.fit(X_train, y_train)

        # Transform test set
        mushroom_test = mushroom_test.drop('class', axis=1)
        X_test, y_test = auto.transform(mushroom_test)
        model.predict(X_test)
Example #24
    def test_numerical_whole(self):
        # Set up data set
        iris = lib.load_iris()
        iris_train = iris[:100]
        iris_test = iris[100:]
        iris_numerical_cols = ['sepal_length', 'petal_length']

        # Create auto
        auto = Automater(numerical_vars=iris_numerical_cols,
                         response_var='sepal_length')

        # Train auto
        auto.fit(iris_train)
        X_train, y_train = auto.transform(iris_train)

        # Extract input_nub from auto
        input_nub = auto.input_nub

        # Extract output_nub from auto
        output_nub = auto.output_nub

        # Create DL model
        x = input_nub
        x = Dense(30)(x)
        x = output_nub(x)

        model = Model(inputs=auto.input_layers, outputs=x)
        model.compile(optimizer='Adam', loss=auto.loss)

        # Train DL model
        model.fit(X_train, y_train)

        # Transform test set
        iris_test = iris_test.drop('sepal_length', axis=1)
        X_test, y_test = auto.transform(iris_test)
        model.predict(X_test)

        pass
Example #25
    def test_transform_no_response(self):
        data = pandas.DataFrame(
            data=['john clark', 'sue fox', 'mary lastname'], columns=['name'])

        # One variable
        text_vars = ['name']
        auto = Automater(text_vars=text_vars)
        auto.fit(data)

        (X, y) = auto.transform(data)

        # Find correct shape
        self.assertEqual((3, 2), X[0].shape)

        # Test output values
        self.assertEqual(None, y)

        # Test with unseen terms
        test_data = pandas.DataFrame(data=['Brendan Herger'], columns=['name'])
        (X_test, y_test) = auto.transform(test_data)
        self.assertTrue(numpy.array_equal([[0, 0]], X_test[0]))

        pass
Example #26
    def test_timeseries_whole(self):
        observations = lib.load_instanbul_stocks(as_ts=True)

        # Train test split
        train_observations, test_observations = train_test_split(observations)
        train_observations = train_observations.copy()
        test_observations = test_observations.copy()

        # Create data type lists
        timeseries_vars = ['ise_lagged', 'sp_lagged']
        numerical_vars = ['ise']

        # Create automater
        auto = Automater(numerical_vars=numerical_vars,
                         timeseries_vars=timeseries_vars,
                         response_var='ise')

        # Fit automater
        auto.fit(train_observations)

        # Create model
        x = auto.input_nub
        x = Dense(32)(x)
        x = auto.output_nub(x)

        model = Model(inputs=auto.input_layers, outputs=x)
        model.compile(optimizer='adam', loss=auto.loss)

        # Train model
        train_X, train_y = auto.transform(train_observations)
        print(len(train_X))
        print(train_X[0].shape)
        model.fit(train_X, train_y)

        # TODO Use model to predict
        pass
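        # (A sketch completing the TODO above, following Example #20, which runs
        # the same Istanbul-stocks pipeline: transform the held-out set, predict,
        # and inverse-transform back into the response's original units)
        test_X, test_y = auto.transform(test_observations)
        test_y_pred = model.predict(test_X)
        print(auto.inverse_transform_output(test_y_pred))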
Example #27
def main():
    # List out which components are supplied by Automater
    # In this example, we're utilizing X and y generated by the Automater, auto.input_nub, auto.input_layers,
    # auto.output_nub, and auto.suggest_loss

    save_results = False

    # Load data
    observations = lib.load_titanic()
    print('Observation columns: {}'.format(list(observations.columns)))

    # Train/test split
    train_observations, test_observations = train_test_split(observations)
    train_observations = train_observations.copy()
    test_observations = test_observations.copy()

    # List out variable types

    data_type_dict = {'numerical': ['age', 'siblings_spouses_aboard', 'parents_children_aboard', 'fare'],
                      'categorical': ['survived', 'pclass', 'sex'],
                      'text': ['name'],
                      'timeseries': []
                      }
    output_var = 'survived'

    # Create and fit Automater
    auto = Automater(data_type_dict=data_type_dict, output_var=output_var)
    auto.fit(train_observations)

    # Transform data (the Automater was fitted above, so a plain transform
    # avoids a redundant second fit)
    train_X, train_y = auto.transform(train_observations)
    test_X, test_y = auto.transform(test_observations)

    # Create and fit keras (deep learning) model.

    x = auto.input_nub
    x = Dense(32)(x)
    x = Dense(32)(x)
    x = auto.output_nub(x)

    model = Model(inputs=auto.input_layers, outputs=x)
    print(f'Suggested loss: {auto.suggest_loss()}\n\n')
    model.compile(optimizer='adam', loss=auto.suggest_loss(), metrics=['acc'])

    model.summary()

    print('\n\n' + '^' * 21)
    print(train_X)

    print('\n\n' + '^' * 21)
    print(train_y)
    model.fit(train_X, train_y, batch_size=32, epochs=1, validation_split=0.1)

    # Make model predictions and inverse transform model predictions, to get usable results
    pred_test_y = model.predict(test_X)
    pred_test_y = auto.inverse_transform_output(pred_test_y)

    # Save all results
    if save_results:
        temp_dir = lib.get_temp_dir()
        model.save(os.path.join(temp_dir, 'model.h5py'))
        pickle.dump(train_X, open(os.path.join(temp_dir, 'train_X.pkl'), 'wb'))
        pickle.dump(train_y, open(os.path.join(temp_dir, 'train_y.pkl'), 'wb'))
        pickle.dump(test_X, open(os.path.join(temp_dir, 'test_X.pkl'), 'wb'))
        pickle.dump(test_y, open(os.path.join(temp_dir, 'test_y.pkl'), 'wb'))
        pickle.dump(pred_test_y, open(os.path.join(temp_dir, 'pred_test_y.pkl'), 'wb'))
Example #28
def main():

    # Load data
    observations = lib.load_lending_club()
    print('Observation columns: {}'.format(list(observations.columns)))
    print('Class balance:\n {}'.format(
        observations['loan_status'].value_counts()))

    # Heuristic data transformations
    for var in ['int_rate', 'revol_util']:

        # Strip out percent signs
        observations[var] = observations[var].apply(
            lambda x: str(x).replace('%', ''))
        observations[var] = pandas.to_numeric(observations[var],
                                              errors='coerce')
    for var in ['mths_since_last_delinq', 'annual_inc_joint']:

        # Heuristic null filling for some variables
        observations[var] = observations[var].fillna(0)

    # List out variable types
    numerical_vars = [
        'loan_amnt', 'annual_inc', 'open_acc', 'dti', 'delinq_2yrs',
        'inq_last_6mths', 'mths_since_last_delinq', 'pub_rec', 'revol_bal',
        'revol_util', 'total_acc', 'pub_rec_bankruptcies'
    ]
    categorical_vars = [
        'term', 'grade', 'emp_length', 'home_ownership', 'addr_state',
        'application_type', 'disbursement_method'
    ]
    text_vars = ['desc', 'purpose', 'title']

    # Train/test split
    train_observations, test_observations = train_test_split(observations)
    train_observations = train_observations.copy()
    test_observations = test_observations.copy()

    # Create and fit Automater
    auto = Automater(numerical_vars=numerical_vars,
                     categorical_vars=categorical_vars,
                     text_vars=text_vars,
                     response_var='loan_amnt')
    auto.fit(train_observations)

    # Create and fit keras (deep learning) model
    # The auto.transform, auto.input_nub, auto.input_layers, auto.output_nub, and auto.loss are provided
    # by keras-pandas, and everything else is core Keras
    train_X, train_y = auto.transform(train_observations)
    test_X, test_y = auto.transform(test_observations)

    x = auto.input_nub
    x = Dense(32)(x)
    x = Dense(32, activation='relu')(x)
    x = Dense(32)(x)
    x = auto.output_nub(x)

    model = Model(inputs=auto.input_layers, outputs=x)
    model.compile(optimizer='Adam', loss=auto.loss, metrics=['accuracy'])

    model.fit(train_X, train_y)

    test_y_pred = model.predict(test_X)

    # Inverse transform model output, to get usable results and save all results
    test_observations[auto.response_var +
                      '_pred'] = auto.inverse_transform_output(test_y_pred)
    print('Predictions: {}'.format(test_observations[auto.response_var +
                                                     '_pred']))

    pass
Example #29
    def test_supervised(self):
        observations = lib.load_lending_club()

        # Train/test split
        train_observations, test_observations = train_test_split(observations)
        train_observations = train_observations.copy()
        test_observations = test_observations.copy()

        # Supervised
        data_type_dict = {
            'numerical': [
                'loan_amnt', 'annual_inc', 'open_acc', 'dti', 'delinq_2yrs',
                'inq_last_6mths', 'mths_since_last_delinq', 'pub_rec',
                'revol_bal', 'revol_util', 'total_acc', 'pub_rec_bankruptcies'
            ],
            'categorical': [
                'term', 'grade', 'emp_length', 'home_ownership', 'loan_status',
                'addr_state', 'application_type'
            ],
            'text': ['desc', 'purpose', 'title']
        }
        output_var = 'loan_status'

        auto = Automater(data_type_dict=data_type_dict, output_var=output_var)

        self.assertTrue(auto.supervised)
        expected_input_vars = reduce(lambda x, y: x + y,
                                     data_type_dict.values())
        expected_input_vars.remove(output_var)
        self.assertCountEqual(expected_input_vars, auto.input_vars)
        self.assertEqual(output_var, auto.output_var)
        self.assertTrue(isinstance(auto.input_mapper, DataFrameMapper))
        self.assertTrue(isinstance(auto.output_mapper, DataFrameMapper))
        self.assertFalse(auto.fitted)
        self.assertRaises(AssertionError, auto._check_fitted)

        # Test fit
        auto.fit(train_observations)
        self.assertTrue(auto.fitted)

        self.assertIsNotNone(auto.input_mapper.built_features)
        self.assertTrue(isinstance(auto.input_layers, list))
        self.assertEqual(len(expected_input_vars), len(auto.input_layers))
        self.assertIsNotNone(auto.input_nub)

        self.assertIsNotNone(auto.output_nub)
        self.assertIsNotNone(auto.output_mapper.built_features)

        # Test transform, df_out=False
        train_X, train_y = auto.transform(train_observations)
        test_X, test_y = auto.transform(test_observations)
        self.assertTrue(isinstance(test_X, list))
        self.assertTrue(isinstance(test_y, numpy.ndarray))
        self.assertEqual(test_observations.shape[0],
                         test_X[0].shape[0])  # Correct number of rows back
        self.assertEqual(test_observations.shape[0],
                         test_y.shape[0])  # Correct number of rows back

        # Test transform, df_out=True
        transformed_observations = auto.transform(test_observations,
                                                  df_out=True)
        self.assertTrue(isinstance(transformed_observations, pandas.DataFrame))
        self.assertEqual(
            test_observations.shape[0],
            transformed_observations.shape[0])  # Correct number of rows back

        # Test suggest_loss
        suggested_loss = auto.suggest_loss()
        self.assertTrue(callable(suggested_loss))

        # Test model building

        x = auto.input_nub
        x = Dense(32)(x)
        x = auto.output_nub(x)

        model = Model(inputs=auto.input_layers, outputs=x)
        model.compile(optimizer='Adam', loss=auto.suggest_loss())
        model.fit(train_X, train_y)

        pred_y = model.predict(test_X)

        # Test inverse_transform_output
        inv_transformed_pred_y = auto.inverse_transform_output(pred_y)
        self.assertEqual(test_observations.shape[0],
                         inv_transformed_pred_y.shape[0])
Example #30
def main():
    # List out which components are supplied by Automater
    # In this example, we're utilizing X and y generated by the Automater, auto.input_nub, auto.input_layers,
    # auto.output_nub, and auto.suggest_loss

    save_results = True

    # Load data
    observations = lib.load_instanbul_stocks(as_ts=True)
    print('Observation columns: {}'.format(list(observations.columns)))

    # Notice that the lagged variables are an array of values
    print('One of the lagged variables: \n{}'.format(
        observations['ise_lagged']))

    # Train/test split
    train_observations, test_observations = train_test_split(observations)
    train_observations = train_observations.copy()
    test_observations = test_observations.copy()

    # List out variable types
    data_type_dict = {
        'numerical':
        ['ise', 'ise.1', 'sp', 'dax', 'ftse', 'nikkei', 'bovespa', 'eu', 'em'],
        'categorical': [],
        'text': [],
        'timeseries':
        ['ise_lagged', 'ise.1_lagged', 'sp_lagged', 'dax_lagged']
    }
    output_var = 'ise'

    # Create and fit Automater
    auto = Automater(data_type_dict=data_type_dict, output_var=output_var)
    auto.fit(train_observations)

    # Transform data (the Automater was fitted above, so a plain transform
    # avoids a redundant second fit)
    train_X, train_y = auto.transform(train_observations)
    test_X, test_y = auto.transform(test_observations)

    # Create and fit keras (deep learning) model.

    x = auto.input_nub
    x = Dense(32)(x)
    x = Dense(32)(x)
    x = auto.output_nub(x)

    model = Model(inputs=auto.input_layers, outputs=x)
    model.compile(optimizer='adam', loss=auto.suggest_loss())

    model.fit(train_X, train_y)

    # Make model predictions and inverse transform model predictions, to get usable results
    pred_test_y = model.predict(test_X)
    pred_test_y = auto.inverse_transform_output(pred_test_y)

    # Save all results
    if save_results:
        temp_dir = lib.get_temp_dir()
        model.save(os.path.join(temp_dir, 'model.h5py'))
        pickle.dump(train_X, open(os.path.join(temp_dir, 'train_X.pkl'), 'wb'))
        pickle.dump(train_y, open(os.path.join(temp_dir, 'train_y.pkl'), 'wb'))
        pickle.dump(test_X, open(os.path.join(temp_dir, 'test_X.pkl'), 'wb'))
        pickle.dump(test_y, open(os.path.join(temp_dir, 'test_y.pkl'), 'wb'))
        pickle.dump(pred_test_y,
                    open(os.path.join(temp_dir, 'pred_test_y.pkl'), 'wb'))