# Exemplo n.º 1
    def test_transform(self):
        """OutliersClipper must clip the fitted columns ('a', 'b', 'c', 'd')
        to mean +/- 3*std and leave the unfitted column 'e' untouched; the
        constant column 'c' has std 0 and must come through unchanged.
        """
        before = pd.DataFrame({
            'a': np.random.normal(100, 10, 1000),
            'b': np.random.rand(1000),
            'c': [0] * 1000,
            'd': np.random.randint(-5, 5, 1000),
            'e': np.random.rand(1000),
        })

        # Manually add outliers. Use .loc instead of chained assignment
        # (before['e'][0] = 5): chained assignment may silently write to a
        # temporary copy and is removed entirely in pandas 3.0.
        before.loc[0, 'e'] = 5
        before.loc[1, 'e'] = -5
        before.loc[2, 'd'] = 100
        before.loc[3, 'd'] = -100
        before.loc[4, 'b'] = 5
        before.loc[5, 'b'] = -5
        before.loc[4, 'a'] = 0
        before.loc[5, 'a'] = 1000

        # Expected clipping bounds, computed AFTER the outliers were injected
        # so they match what the clipper will see during fit().
        means = before.mean()
        stds = before.std()
        mins = means - 3 * stds
        maxes = means + 3 * stds

        # fit and transform ('e' deliberately excluded from the clipper)
        clipper = OutliersClipper(['a', 'b', 'c', 'd'])
        clipper.fit(before)
        after = clipper.transform(before.copy())

        # test output
        pd.testing.assert_index_equal(before.index, after.index)
        pd.testing.assert_series_equal(before['c'], after['c'])
        pd.testing.assert_series_equal(before['e'], after['e'])
        self.assertEqual(after['d'][2], maxes['d'])
        self.assertEqual(after['d'][3], mins['d'])
        self.assertFalse((after['d'] > maxes['d']).any())
        self.assertFalse((after['d'] < mins['d']).any())
        self.assertFalse((after['a'] > maxes['a']).any())
        self.assertFalse((after['a'] < mins['a']).any())
        self.assertFalse((after['b'] > maxes['b']).any())
        self.assertFalse((after['b'] < mins['b']).any())
# Exemplo n.º 2
def get_test_config_houses():
    """Build the evaluation setup for the houses dataset.

    Returns ``(data, test, test_labels, scorer, model, params, target)``.
    ``model`` is a Pipeline in which every optional preprocessing step
    defaults to None (disabled); ``params`` maps experiment names to the
    pipeline parameters that enable/configure those steps, together with
    the previously recorded CV 'score' and 'std' for that configuration.
    """
    data, labels, continuous, discrete, dummy, categorical, target, missing = get_houses(test=False)
    test_data, test_labels = get_houses(test=True)[0:2]
    train = data.drop(target, axis=1)
    test = test_data.drop(target, axis=1)
    scorer = rmse
    # binner flags, per numeric column, values equal to the training-set maximum
    binner = CustomBinaryBinner({ col: {'values': [train[col].max()]} for col in continuous + discrete })

    one_hot = CustomOneHotEncoder(columns=categorical)
    # all optional steps start disabled; experiments switch them on via params
    model = Pipeline([
        ('onehot', one_hot),
        ('clipper', None),
        ('binner', None),
        ('binner2', None),
        ('simple_imputer', None),
        ('zero_filler', ZeroFiller()),
        ('main_imputer', None),
        ('dropper', FeatureDropper(drop=[])),
        ('poly', None),
        ('combinations', None),
        ('boxcox', None),
        ('scaler', None),
        ('reduce_dim', None),
        ('predictor', None)
    ])
    # '*_base' entries are minimal baselines, '*_best' entries are tuned setups
    params = {
        'LinearRegression_best': {
                'params': {'binner2': binner,
               'boxcox': BoxCoxTransformer(lambdas_per_column={'BsmtFinSF1': 0, 'LowQualFinSF': 0, 'GrLivArea': 0, 'WoodDeckSF': 0}),
               'clipper': None,
               'combinations': FeatureProduct(columns=['LotFrontage', 'BsmtFinSF1', 'MasVnrArea', '1stFlrSF', 'GarageArea', 'TotalBsmtSF', 'GrLivArea']),
               'dropper__drop': ['LotFrontage_nan', 'MasVnrArea_nan', 'GarageYrBlt_nan'],
               'main_imputer': ModelBasedFullImputer(columns=['LotFrontage', 'MasVnrArea', 'GarageYrBlt'],
                          model=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=7, normalize=False)),
               'poly': PolynomialsAdder(powers_per_column={'LotFrontage': [3], 'BsmtFinSF1': [3], 'MasVnrArea': [3], '1stFlrSF': [3], 'GarageArea': [3], 'TotalBsmtSF': [3], 'GrLivArea': [3]}),
               'predictor': LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False),
               'reduce_dim': PCA(copy=True, iterated_power='auto', n_components=80, random_state=None,
                 svd_solver='auto', tol=0.0, whiten=False),
               'scaler': RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
                      with_scaling=True),
               'simple_imputer': FillNaTransformer(from_dict={}, mean=[], median=[],
                        nan_flag=['LotFrontage', 'MasVnrArea', 'GarageYrBlt'],
                        zero=['LotFrontage', 'MasVnrArea', 'GarageYrBlt'])},
          'score': 23615.61547841841,
          'std': 1575.0296698711768},
     #    same as baseline
     'XGBRegressor_best': {'params': {'binner2': None,
       'boxcox': None,
       'clipper': None,
       'combinations': None,
       'dropper__drop': ['LotFrontage_nan', 'MasVnrArea_nan', 'GarageYrBlt_nan'],
       'main_imputer': None,
       'poly': None,
       'predictor': XGBRegressor(
              colsample_bytree=1, learning_rate=0.07,
              max_depth=3, n_estimators=1000, n_jobs=7),
       'reduce_dim': None,
       'scaler': None,
       'simple_imputer': FillNaTransformer(from_dict={},
                mean=['LotFrontage', 'MasVnrArea', 'GarageYrBlt'], median=[],
                nan_flag=['LotFrontage', 'MasVnrArea', 'GarageYrBlt'], zero=[])},
      'score': 25011.258367120317,
      'std': 1390.9293424644447},
    'DecisionTreeRegressor_base': {'params': {'predictor': DecisionTreeRegressor(),
       'scaler': None,
       'simple_imputer': FillNaTransformer(from_dict={},
                mean=['LotFrontage', 'MasVnrArea', 'GarageYrBlt'], median=[],
                nan_flag=[], zero=[])},
      'score': 41942.13985251157,
      'std': 4160.532009551353},
     'KNeighborsRegressor_base': {'params': {'predictor': KNeighborsRegressor(n_neighbors=7, n_jobs=7),
       'scaler': None,
       'simple_imputer': FillNaTransformer(from_dict={}, mean=[], median=[], nan_flag=[],
                zero=['LotFrontage', 'MasVnrArea', 'GarageYrBlt'])},
      'score': 49074.08623384354,
      'std': 4863.286944918721},
     'LinearRegression_base': {'params': {'predictor': LinearRegression(n_jobs=7),
       'scaler': None,
       'simple_imputer': FillNaTransformer(from_dict={},
                mean=['LotFrontage', 'MasVnrArea', 'GarageYrBlt'], median=[],
                nan_flag=[], zero=[])},
      'score': 58196.855144405956,
      'std': 28243.597268210826},

        'XGBRegressor_base': {'params': {
        'predictor': XGBRegressor(max_depth=3, n_jobs=7),
       'scaler': None,
       'simple_imputer': FillNaTransformer(from_dict={}, mean=[], median=[], nan_flag=[],
                zero=['LotFrontage', 'MasVnrArea', 'GarageYrBlt'])},
      'score': 26368.673265668913,
      'std': 1310.375215389942},
    'XGBRegressor_tuned_base': {'params': {'predictor': XGBRegressor(
              colsample_bytree=1,learning_rate=0.07,
              max_depth=3, n_estimators=1000, n_jobs=7),
       'scaler': None,
       'simple_imputer': FillNaTransformer(from_dict={},
                mean=['LotFrontage', 'MasVnrArea', 'GarageYrBlt'], median=[],
                nan_flag=[], zero=[])},
      'score': 24894.281304255797,
      'std': 1177.011047285853},

    'DecisionTreeRegressor_best': {'params': {'binner': None,
    'binner2': binner,
    'clipper': OutliersClipper(columns=['LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal']),
    'combinations': FeatureProduct(columns=['LotFrontage', 'BsmtFinSF1', 'MasVnrArea', '1stFlrSF', 'GarageArea', 'TotalBsmtSF', 'GrLivArea']),
    'dropper__drop': ['LotFrontage_nan', 'MasVnrArea_nan', 'GarageYrBlt_nan'],
    'main_imputer': HotDeckFullImputer(col_k_pairs=[('LotFrontage', None), ('MasVnrArea', None), ('GarageYrBlt', None)], default_k=5),
    'poly': PolynomialsAdder(powers_per_column={'LotFrontage': [2], 'LotArea': [2], 'MasVnrArea': [2], 'BsmtFinSF1': [2], 'BsmtFinSF2': [2], 'BsmtUnfSF': [2], 'TotalBsmtSF': [2], '1stFlrSF': [2], '2ndFlrSF': [2], 'LowQualFinSF': [2], 'GrLivArea': [2], 'GarageArea': [2], 'WoodDeckSF': [2], 'OpenPorchSF': [2], 'EnclosedPorch': [2], '3SsnPorch': [2], 'ScreenPorch': [2], 'PoolArea': [2], 'MiscVal': [2]}),
    'predictor': DecisionTreeRegressor(),
    'reduce_dim': SelectFromModel(estimator=RandomForestRegressor(max_depth=8)),
    'simple_imputer': FillNaTransformer(from_dict={},
        mean=['LotFrontage', 'MasVnrArea', 'GarageYrBlt'], median=[],
        nan_flag=['LotFrontage', 'MasVnrArea', 'GarageYrBlt'], zero=[])},
    'score': 37866.96889728187,
    'std': 5359.25597193946},

    'Lasso_best': {'params': {
        'binner': None,
        'binner2': binner,
        'clipper': None,
        'combinations': FeatureProduct(
            columns=['LotFrontage', 'BsmtFinSF1', 'MasVnrArea', '1stFlrSF', 'GarageArea', 'TotalBsmtSF', 'GrLivArea']),
        'dropper__drop': ['LotFrontage_nan', 'MasVnrArea_nan', 'GarageYrBlt_nan'],
        'main_imputer': HotDeckFullImputer(col_k_pairs=[('LotFrontage', None), ('MasVnrArea', None), ('GarageYrBlt', None)],
                                           default_k=7),
        'poly': None,
        'predictor': Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
                           normalize=False, positive=False, precompute=False, random_state=None,
                           selection='cyclic', tol=0.0001, warm_start=False),
        'reduce_dim': PCA(copy=True, iterated_power='auto', n_components=80, random_state=None,
                          svd_solver='auto', tol=0.0, whiten=False),
        'simple_imputer': FillNaTransformer(from_dict={},
                                            mean=['LotFrontage', 'MasVnrArea', 'GarageYrBlt'], median=[],
                                            nan_flag=['LotFrontage', 'MasVnrArea', 'GarageYrBlt'], zero=[])},
        'score': 24138.931080813963,
        'std': 639.0998169991468} ,

        'Lasso_base': {'params': {
            'predictor': Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
                               normalize=False, positive=False, precompute=False, random_state=None,
                               selection='cyclic', tol=0.0001, warm_start=False),
            'scaler': RobustScaler(),
            'simple_imputer': FillNaTransformer(from_dict={},
                                                mean=['LotFrontage', 'MasVnrArea', 'GarageYrBlt'])},
            # NOTE(review): zero score/std look like not-yet-recorded placeholders — confirm
            'score': 0,
            'std': 0}


    }

    return data, test, test_labels, scorer, model, params, target
# Exemplo n.º 3
def get_test_config_boston(missing=True):
    """Build the evaluation setup for the Boston housing dataset.

    Returns ``(data, test, test_labels, scorer, model, params, target)``.
    ``model`` is a Pipeline in which every optional preprocessing step
    defaults to None (disabled); ``params`` maps experiment names to the
    pipeline parameters that enable/configure those steps, together with
    the previously recorded CV 'score' and 'std' for that configuration.

    NOTE(review): the ``missing`` parameter is forwarded to get_boston and
    then immediately rebound by the unpacking below — presumably the
    returned value carries the same meaning; verify against get_boston.
    """
    data, labels, continuous, discrete, dummy, categorical, target, missing = get_boston(
        test=False, missing=missing)
    test_data, test_labels = get_boston(test=True, missing=missing)[0:2]
    test = test_data.drop(target, axis=1)
    scorer = rmse

    # all optional steps start disabled; experiments switch them on via params
    model = Pipeline([('clipper', None), ('binner', None), ('binner2', None),
                      ('simple_imputer', None), ('zero_filler', ZeroFiller()),
                      ('main_imputer', None),
                      ('dropper', FeatureDropper(drop=[])), ('poly', None),
                      ('combinations', None), ('boxcox', None),
                      ('scaler', None), ('reduce_dim', None),
                      ('predictor', None)])

    # '*_base' entries are minimal baselines, '*_best' entries are tuned setups
    params = {
        #     0
        'XGBRegressor_best': {
            'params': {
                'binner2':
                None,
                'boxcox':
                BoxCoxTransformer(lambdas_per_column={
                    'age': 2,
                    'tax': 0,
                    'lstat': 0
                }),
                'clipper':
                OutliersClipper(columns=[
                    'crim', 'zn', 'nox', 'indus', 'rm', 'age', 'tax',
                    'ptratio', 'b', 'lstat', 'dis'
                ]),
                'combinations':
                None,
                'dropper__drop': [],
                'main_imputer':
                ModelBasedFullImputer(
                    columns=[
                        'crim', 'zn', 'nox', 'indus', 'rm', 'age', 'tax',
                        'ptratio', 'b', 'dis'
                    ],
                    model=DecisionTreeRegressor(max_depth=None)),
                'poly':
                PolynomialsAdder(
                    powers_per_column={
                        'crim': [2, 3],
                        'zn': [2, 3],
                        'nox': [2, 3],
                        'indus': [2, 3],
                        'rm': [2, 3],
                        'age': [2, 3],
                        'tax': [2, 3],
                        'ptratio': [2, 3],
                        'b': [2, 3],
                        'lstat': [2, 3],
                        'dis': [2, 3]
                    }),
                'predictor':
                XGBRegressor(learning_rate=0.05, max_depth=6,
                             n_estimators=500),
                'reduce_dim':
                None,
                'scaler':
                None,
                'simple_imputer':
                FillNaTransformer(from_dict={},
                                  mean=[],
                                  median=[
                                      'crim', 'zn', 'nox', 'indus', 'rm',
                                      'age', 'tax', 'ptratio', 'b', 'dis'
                                  ],
                                  nan_flag=[
                                      'crim', 'zn', 'nox', 'indus', 'rm',
                                      'age', 'tax', 'ptratio', 'b', 'dis'
                                  ],
                                  zero=[])
            },
            'score': 3.7821358047127682,
            'std': 0.4967512627490983
        },

        # 0
        'Lasso_best': {
            'params': {
                'binner2':
                None,
                'boxcox':
                BoxCoxTransformer(lambdas_per_column={
                    'age': 2,
                    'tax': 0,
                    'lstat': 0
                }),
                'clipper':
                OutliersClipper(columns=[
                    'crim', 'zn', 'nox', 'indus', 'rm', 'age', 'tax',
                    'ptratio', 'b', 'lstat', 'dis'
                ]),
                'combinations':
                FeatureProduct(columns=[
                    'crim', 'zn', 'nox', 'indus', 'rm', 'age', 'tax',
                    'ptratio', 'b', 'lstat', 'dis'
                ]),
                'dropper__drop': [
                    'crim_nan', 'zn_nan', 'nox_nan', 'indus_nan', 'rm_nan',
                    'age_nan', 'tax_nan', 'ptratio_nan', 'b_nan', 'dis_nan'
                ],
                'main_imputer':
                ModelBasedFullImputer(
                    columns=[
                        'crim', 'zn', 'nox', 'indus', 'rm', 'age', 'tax',
                        'ptratio', 'b', 'dis'
                    ],
                    model=DecisionTreeRegressor(max_depth=None)),
                'poly':
                None,
                'predictor':
                Lasso(alpha=0.01),
                'reduce_dim':
                None,
                'scaler':
                RobustScaler(),
                'simple_imputer':
                FillNaTransformer(from_dict={},
                                  mean=[],
                                  median=[
                                      'crim', 'zn', 'nox', 'indus', 'rm',
                                      'age', 'tax', 'ptratio', 'b', 'dis'
                                  ],
                                  nan_flag=[
                                      'crim', 'zn', 'nox', 'indus', 'rm',
                                      'age', 'tax', 'ptratio', 'b', 'dis'
                                  ],
                                  zero=[])
            },
            'score': 3.993797473454735,
            'std': 0.5808956921355953
        },

        #     0
        'LinearRegression_best': {
            'params': {
                'binner2':
                None,
                'boxcox':
                BoxCoxTransformer(lambdas_per_column={
                    'age': 2,
                    'tax': 0,
                    'lstat': 0
                }),
                'clipper':
                OutliersClipper(columns=[
                    'crim', 'zn', 'nox', 'indus', 'rm', 'age', 'tax',
                    'ptratio', 'b', 'lstat', 'dis'
                ]),
                'combinations':
                None,
                'dropper__drop': [
                    'crim_nan', 'zn_nan', 'nox_nan', 'indus_nan', 'rm_nan',
                    'age_nan', 'tax_nan', 'ptratio_nan', 'b_nan', 'dis_nan'
                ],
                'main_imputer':
                ModelBasedFullImputer(
                    columns=[
                        'crim', 'zn', 'nox', 'indus', 'rm', 'age', 'tax',
                        'ptratio', 'b', 'dis'
                    ],
                    model=DecisionTreeRegressor(max_depth=8)),
                'poly':
                PolynomialsAdder(
                    powers_per_column={
                        'crim': [2, 3],
                        'zn': [2, 3],
                        'nox': [2, 3],
                        'indus': [2, 3],
                        'rm': [2, 3],
                        'age': [2, 3],
                        'tax': [2, 3],
                        'ptratio': [2, 3],
                        'b': [2, 3],
                        'lstat': [2, 3],
                        'dis': [2, 3]
                    }),
                'predictor':
                LinearRegression(),
                'reduce_dim':
                None,
                'scaler':
                None,
                'simple_imputer':
                FillNaTransformer(from_dict={},
                                  mean=[],
                                  median=[
                                      'crim', 'zn', 'nox', 'indus', 'rm',
                                      'age', 'tax', 'ptratio', 'b', 'dis'
                                  ],
                                  nan_flag=[
                                      'crim', 'zn', 'nox', 'indus', 'rm',
                                      'age', 'tax', 'ptratio', 'b', 'dis'
                                  ],
                                  zero=[])
            },
            'score': 4.514645815970899,
            'std': 0.7631593234069367
        },
        'DecisionTreeRegressor_base': {
            'params': {
                'predictor':
                DecisionTreeRegressor(criterion='mse',
                                      max_depth=4,
                                      max_features=None,
                                      max_leaf_nodes=None,
                                      min_impurity_decrease=0.0,
                                      min_impurity_split=None,
                                      min_samples_leaf=1,
                                      min_samples_split=2,
                                      min_weight_fraction_leaf=0.0,
                                      presort=False,
                                      random_state=None,
                                      splitter='best'),
                'scaler':
                None,
                'simple_imputer':
                FillNaTransformer(from_dict={},
                                  mean=[],
                                  median=[],
                                  nan_flag=[],
                                  zero=[
                                      'crim', 'zn', 'nox', 'indus', 'rm',
                                      'age', 'tax', 'ptratio', 'b', 'dis'
                                  ])
            },
            'score': 5.5088106991425985,
            'std': 0.293662905734789
        },
        'KNeighborsRegressor_base': {
            'params': {
                'predictor':
                KNeighborsRegressor(algorithm='auto',
                                    leaf_size=30,
                                    metric='minkowski',
                                    metric_params=None,
                                    n_jobs=1,
                                    n_neighbors=7,
                                    p=2,
                                    weights='uniform'),
                'scaler':
                RobustScaler(copy=True,
                             quantile_range=(25.0, 75.0),
                             with_centering=True,
                             with_scaling=True),
                'simple_imputer':
                FillNaTransformer(from_dict={},
                                  mean=[
                                      'crim', 'zn', 'nox', 'indus', 'rm',
                                      'age', 'tax', 'ptratio', 'b', 'dis'
                                  ],
                                  median=[],
                                  nan_flag=[],
                                  zero=[])
            },
            'score': 5.859771905373064,
            'std': 0.90721907618626
        },
        'LinearRegression_base': {
            'params': {
                'predictor':
                LinearRegression(copy_X=True,
                                 fit_intercept=True,
                                 n_jobs=1,
                                 normalize=False),
                'scaler':
                RobustScaler(copy=True,
                             quantile_range=(25.0, 75.0),
                             with_centering=True,
                             with_scaling=True),
                'simple_imputer':
                FillNaTransformer(from_dict={},
                                  mean=[],
                                  median=[
                                      'crim', 'zn', 'nox', 'indus', 'rm',
                                      'age', 'tax', 'ptratio', 'b', 'dis'
                                  ],
                                  nan_flag=[],
                                  zero=[])
            },
            'score': 5.494688479501426,
            'std': 0.5377531716144219
        },
        'Lasso_base': {
            'params': {
                'predictor':
                Lasso(alpha=0.01),
                'scaler':
                RobustScaler(copy=True,
                             quantile_range=(25.0, 75.0),
                             with_centering=True,
                             with_scaling=True),
                'simple_imputer':
                FillNaTransformer(from_dict={},
                                  mean=[],
                                  median=[
                                      'crim', 'zn', 'nox', 'indus', 'rm',
                                      'age', 'tax', 'ptratio', 'b', 'dis'
                                  ],
                                  nan_flag=[],
                                  zero=[])
            },
            # NOTE(review): zero score/std look like not-yet-recorded placeholders — confirm
            'score': 0,
            'std': 0
        },

        #     'XGBRegressor_base': {'params': {
        #         'predictor': XGBRegressor(max_depth=4),
        #         'scaler': None,
        #         'simple_imputer': FillNaTransformer(from_dict={},
        #                 mean=['crim', 'zn', 'nox', 'indus', 'rm', 'age', 'tax', 'ptratio', 'b', 'dis'],
        #                 median=[], nan_flag=[], zero=[])},
        #   'score': 4.088217989236429,
        #   'std': 0.5303490714753816,
        # },
        'XGBRegressor_tuned_base': {
            'params': {
                'predictor':
                XGBRegressor(learning_rate=0.05, max_depth=6,
                             n_estimators=500),
                'scaler':
                None,
                'simple_imputer':
                FillNaTransformer(from_dict={},
                                  mean=[
                                      'crim', 'zn', 'nox', 'indus', 'rm',
                                      'age', 'tax', 'ptratio', 'b', 'dis'
                                  ],
                                  median=[],
                                  nan_flag=[],
                                  zero=[])
            },
            'score': 3.942697483564859,
            'std': 0.6029251513214098
        },
        'DecisionTreeRegressor_best': {
            'params': {
                'binner2':
                None,
                'boxcox':
                BoxCoxTransformer(lambdas_per_column={
                    'age': 2,
                    'tax': 0,
                    'lstat': 0
                }),
                'clipper':
                OutliersClipper(columns=[
                    'crim', 'zn', 'nox', 'indus', 'rm', 'age', 'tax',
                    'ptratio', 'b', 'lstat', 'dis'
                ]),
                'combinations':
                None,
                'dropper__drop': [],
                'main_imputer':
                ModelBasedFullImputer(columns=[
                    'crim', 'zn', 'nox', 'indus', 'rm', 'age', 'tax',
                    'ptratio', 'b', 'dis'
                ],
                                      model=DecisionTreeRegressor(
                                          criterion='mse',
                                          max_depth=None,
                                          max_features=None,
                                          max_leaf_nodes=None,
                                          min_impurity_decrease=0.0,
                                          min_impurity_split=None,
                                          min_samples_leaf=1,
                                          min_samples_split=2,
                                          min_weight_fraction_leaf=0.0,
                                          presort=False,
                                          random_state=None,
                                          splitter='best')),
                'poly':
                None,
                'predictor':
                DecisionTreeRegressor(criterion='mse',
                                      max_depth=8,
                                      max_features=None,
                                      max_leaf_nodes=None,
                                      min_impurity_decrease=0.0,
                                      min_impurity_split=None,
                                      min_samples_leaf=1,
                                      min_samples_split=2,
                                      min_weight_fraction_leaf=0.0,
                                      presort=False,
                                      random_state=None,
                                      splitter='best'),
                'reduce_dim':
                None,
                'scaler':
                None,
                'simple_imputer':
                FillNaTransformer(from_dict={},
                                  mean=[],
                                  median=[
                                      'crim', 'zn', 'nox', 'indus', 'rm',
                                      'age', 'tax', 'ptratio', 'b', 'dis'
                                  ],
                                  nan_flag=[
                                      'crim', 'zn', 'nox', 'indus', 'rm',
                                      'age', 'tax', 'ptratio', 'b', 'dis'
                                  ],
                                  zero=[])
            },
            'score': 4.840076015850126,
            'std': 0.718619669114889
        },
    }

    return data, test, test_labels, scorer, model, params, target
# Exemplo n.º 4
def get_test_config_heart(missing=True):
    """Build the evaluation configuration for the heart-disease dataset.

    Parameters
    ----------
    missing : bool, default True
        Forwarded to ``get_heart`` to select the dataset variant with (or
        without) missing values.  NOTE(review): the parameter name is
        immediately shadowed by the ``missing`` column list unpacked from
        ``get_heart`` below — confirm this is intentional.

    Returns
    -------
    tuple
        ``(data, test, test_labels, scorer, model, params, target)`` where
        ``model`` is a preprocessing/prediction ``Pipeline`` with mostly
        ``None`` placeholder steps, and ``params`` maps estimator labels to
        previously tuned step settings together with their recorded
        cross-validation ``score`` and ``std``.
    """
    # ``missing`` (the parameter) is consumed here and its name rebound to
    # the list of columns containing missing values returned by get_heart.
    data, labels, continuous, discrete, dummy, categorical, target, missing = get_heart(
        test=False, missing=missing)
    test_data, test_labels = get_heart(test=True, missing=missing)[0:2]
    test = test_data.drop(target, axis=1)
    # Classification task: score by error rate (lower is better).
    scorer = error_rate
    one_hot = CustomOneHotEncoder(columns=categorical)
    # Pipeline skeleton: every ``None`` step is a placeholder that the
    # configurations in ``params`` below may override via set_params.
    model = Pipeline([
        ('onehot', one_hot),
        ('clipper', None),
        ('binner', None),
        ('binner2', None),
        ('simple_imputer', None),
        ('zero_filler', ZeroFiller()),  # just in case there are any left
        ('main_imputer', None),
        ('dropper', FeatureDropper(drop=[])),
        ('poly', None),
        ('combinations', None),
        ('boxcox', None),
        ('scaler', None),
        ('reduce_dim', None),
        ('predictor', None)
    ])
    # Tuned configurations: ``*_base`` entries are simple baselines, ``*_best``
    # entries are the best settings found by search.  Scores/stds are the
    # recorded cross-validation results for each configuration.
    params = {
        'DecisionTreeClassifier_base': {
            'params': {
                'predictor':
                DecisionTreeClassifier(max_depth=None),
                'scaler':
                None,
                'simple_imputer':
                FillNaTransformer(
                    from_dict={},
                    mean=['trestbps', 'chol', 'thalach', 'oldpeak'],
                    median=[],
                    nan_flag=[],
                    zero=[])
            },
            'score': 0.1974390243902439,
            'std': 0.0756691348271984
        },
        'KNeighborsClassifier_base': {
            'params': {
                'predictor':
                KNeighborsClassifier(n_neighbors=7
                                     # ,n_jobs=7
                                     ),
                'scaler':
                RobustScaler(),
                'simple_imputer':
                FillNaTransformer(
                    from_dict={},
                    mean=[],
                    median=['trestbps', 'chol', 'thalach', 'oldpeak'],
                    nan_flag=[],
                    zero=[])
            },
            'score': 0.1924390243902439,
            'std': 0.04087385740197896
        },
        'LogisticRegression_base': {
            'params': {
                'predictor':
                LogisticRegression(
                    # n_jobs=7,
                ),
                'scaler':
                None,
                'simple_imputer':
                FillNaTransformer(
                    from_dict={},
                    mean=['trestbps', 'chol', 'thalach', 'oldpeak'],
                    median=[],
                    nan_flag=[],
                    zero=[])
            },
            'score': 0.1825609756097561,
            'std': 0.04141604180434009
        },

        #   'XGBClassifier_base': {'params': {'predictor': XGBClassifier(
        #              base_score=0.5,
        #       # n_jobs=7,
        #
        #         colsample_bytree=0.8, learning_rate=0.07,
        #         max_depth=7, n_estimators=200,),
        #  'scaler': None,
        #  'simple_imputer': FillNaTransformer(from_dict={}, mean=[], median=[], nan_flag=[],
        #           zero=['trestbps', 'chol', 'thalach', 'oldpeak'])},
        # 'score': 0.16743902439024388,
        # 'std': 0.04176646782554455},
        'DecisionTreeClassifier_best': {
            'params': {
                'binner2':
                CustomBinner(
                    configuration={
                        'chol': {
                            'bins': 3
                        },
                        'thalach': {
                            'bins': 3
                        },
                        'oldpeak': {
                            'bins': 3
                        },
                        'trestbps': {
                            'bins': 3
                        },
                        'age': {
                            'bins': 3
                        },
                        'slope': {
                            'bins': 3
                        },
                        'ca': {
                            'bins': 3
                        }
                    }),
                'boxcox':
                None,
                'clipper':
                OutliersClipper(
                    columns=['chol', 'thalach', 'oldpeak', 'trestbps']),
                'combinations':
                FeatureProduct(
                    columns=['chol', 'thalach', 'oldpeak', 'trestbps']),
                'dropper__drop':
                ['trestbps_nan', 'chol_nan', 'thalach_nan', 'oldpeak_nan'],
                'main_imputer':
                HotDeckFullImputer(col_k_pairs=[('trestbps', None),
                                                ('chol', None),
                                                ('thalach', None),
                                                ('oldpeak', None)],
                                   default_k=7),
                'poly':
                None,
                'predictor':
                DecisionTreeClassifier(max_depth=4),
                'reduce_dim':
                None,
                'scaler':
                None,
                'simple_imputer':
                FillNaTransformer(
                    from_dict={},
                    mean=[],
                    median=[],
                    nan_flag=['trestbps', 'chol', 'thalach', 'oldpeak'],
                    zero=['trestbps', 'chol', 'thalach', 'oldpeak'])
            },
            'score': 0.14780487804878048,
            'std': 0.03090350740255695
        },
        'LogisticRegression_best': {
            'params': {
                'binner2':
                None,
                'boxcox':
                BoxCoxTransformer(lambdas_per_column={
                    'chol': 0,
                    'thalach': 2,
                    'trestbps': 0
                }),
                'clipper':
                OutliersClipper(
                    columns=['chol', 'thalach', 'oldpeak', 'trestbps']),
                'combinations':
                None,
                'dropper__drop': [],
                'main_imputer':
                ModelBasedFullImputer(
                    columns=['trestbps', 'chol', 'thalach', 'oldpeak'],
                    model=LinearRegression(
                        # n_jobs=7
                    )),
                'poly':
                PolynomialsAdder(powers_per_column={
                    'chol': [2],
                    'thalach': [2],
                    'oldpeak': [2],
                    'trestbps': [2]
                }),
                'predictor':
                LogisticRegression(),
                'reduce_dim':
                PCA(n_components=10),
                'scaler':
                None,
                'simple_imputer':
                FillNaTransformer(
                    from_dict={},
                    mean=['trestbps', 'chol', 'thalach', 'oldpeak'],
                    median=[],
                    nan_flag=['trestbps', 'chol', 'thalach', 'oldpeak'],
                    zero=[])
            },
            'score': 0.14280487804878048,
            'std': 0.03915450868377355
        },
        #   'XGBClassifier_best': {'params':
        #       {
        #           'binner2': CustomBinner(configuration={'chol': {'bins': 3}, 'thalach': {'bins': 3}, 'oldpeak': {'bins': 3}, 'trestbps': {'bins': 3}, 'age': {'bins': 3}, 'slope': {'bins': 3}, 'ca': {'bins': 3}},
        #                 drop=False, nan=False),
        #          'boxcox': BoxCoxTransformer(lambdas_per_column={'chol': 0, 'thalach': 2, 'trestbps': 0}),
        #          'clipper': OutliersClipper(columns=['chol', 'thalach', 'oldpeak', 'trestbps']),
        #          'combinations': None,
        #          'dropper__drop': ['trestbps_nan', 'chol_nan', 'thalach_nan', 'oldpeak_nan'],
        #          'main_imputer': HotDeckFullImputer(col_k_pairs=[('trestbps', None), ('chol', None), ('thalach', None), ('oldpeak', None)],
        #                    default_k=7),
        #          'poly': None,
        #          'predictor': XGBClassifier(
        #              # n_jobs=7,
        #              base_score=0.5,
        #              colsample_bytree=0.8, learning_rate=0.07,
        #                 max_depth=7, n_estimators=200,),
        #          'reduce_dim': SelectFromModel(estimator=LogisticRegression(C=0.999, penalty='l1',
        #                                                                     # n_jobs=7
        #                                                                     )),
        #          'scaler': None,
        #          'simple_imputer': FillNaTransformer(from_dict={},
        #                   mean=['trestbps', 'chol', 'thalach', 'oldpeak'], median=[],
        #                   nan_flag=['trestbps', 'chol', 'thalach', 'oldpeak'], zero=[])},
        # 'score': 0.15243902439024387,
        # 'std': 0.04655758333858798}
    }

    # NOTE(review): unlike the houses config, ``data`` is returned without
    # dropping ``target`` — confirm the caller expects the target column here.
    return data, test, test_labels, scorer, model, params, target
Exemplo n.º 5
0
             'bins': 3
         },
         'age': {
             'bins': 3
         },
         'slope': {
             'bins': 3
         },
         'ca': {
             'bins': 3
         }
     }),
 'boxcox':
 None,
 'clipper':
 OutliersClipper(
     columns=['chol', 'thalach', 'oldpeak', 'trestbps']),
 'combinations':
 FeatureProduct(columns=['chol', 'thalach', 'oldpeak', 'trestbps']),
 'dropper__drop':
 ['trestbps_nan', 'chol_nan', 'thalach_nan', 'oldpeak_nan'],
 'main_imputer':
 HotDeckFullImputer(col_k_pairs=[('trestbps', None), ('chol', None),
                                 ('thalach', None),
                                 ('oldpeak', None)],
                    default_k=7),
 'poly':
 None,
 'predictor':
 DecisionTreeClassifier(max_depth=4),
 'reduce_dim':
 None,
Exemplo n.º 6
0
  'std': 1310.375215389942},
'XGBRegressor_tuned': {'params': {'predictor': XGBRegressor(
          colsample_bytree=1,learning_rate=0.07,
          max_depth=3, n_estimators=1000),
   'scaler': None,
   'simple_imputer': FillNaTransformer(from_dict={},
            mean=['LotFrontage', 'MasVnrArea', 'GarageYrBlt'], median=[],
            nan_flag=[], zero=[])},
  'score': 24894.281304255797,
  'std': 1177.011047285853}
}

best2 = {'DecisionTreeRegressor': {'params': {'binner': None,
   'binner2': CustomBinaryBinner(configuration={'LotFrontage': {'values': [182.0]}, 'LotArea': {'values': [215245]}, 'MasVnrArea': {'values': [1378.0]}, 'BsmtFinSF1': {'values': [2188]}, 'BsmtFinSF2': {'values': [1120]}, 'BsmtUnfSF': {'values': [2336]}, 'TotalBsmtSF': {'values': [3206]}, '1stFlrSF': {'values': [3228]}, '2ndFlrSF': ... [2010.0]}, 'GarageCars': {'values': [4]}, 'MoSold': {'values': [12]}, 'YrSold': {'values': [2010]}},
             drop=False, nan=False),
   'clipper': OutliersClipper(columns=['LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal']),
   'combinations': FeatureProduct(columns=['LotFrontage', 'BsmtFinSF1', 'MasVnrArea', '1stFlrSF', 'GarageArea', 'TotalBsmtSF', 'GrLivArea']),
   'dropper__drop': ['LotFrontage_nan', 'MasVnrArea_nan', 'GarageYrBlt_nan'],
   'main_imputer': HotDeckFullImputer(col_k_pairs=[('LotFrontage', None), ('MasVnrArea', None), ('GarageYrBlt', None)],
             default_k=5),
   'poly': PolynomialsAdder(powers_per_column={'LotFrontage': [2], 'LotArea': [2], 'MasVnrArea': [2], 'BsmtFinSF1': [2], 'BsmtFinSF2': [2], 'BsmtUnfSF': [2], 'TotalBsmtSF': [2], '1stFlrSF': [2], '2ndFlrSF': [2], 'LowQualFinSF': [2], 'GrLivArea': [2], 'GarageArea': [2], 'WoodDeckSF': [2], 'OpenPorchSF': [2], 'EnclosedPorch': [2], '3SsnPorch': [2], 'ScreenPorch': [2], 'PoolArea': [2], 'MiscVal': [2]}),
   'predictor': DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
              max_leaf_nodes=None, min_impurity_decrease=0.0,
              min_impurity_split=None, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              presort=False, random_state=None, splitter='best'),
   'reduce_dim': SelectFromModel(estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=8,
              max_features='auto', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
Exemplo n.º 7
0
best = {
    #     0
    'XGBRegressor': {
        'params': {
            'binner2':
            None,
            'boxcox':
            BoxCoxTransformer(lambdas_per_column={
                'age': 2,
                'tax': 0,
                'lstat': 0
            }),
            'clipper':
            OutliersClipper(columns=[
                'crim', 'zn', 'nox', 'indus', 'rm', 'age', 'tax', 'ptratio',
                'b', 'lstat', 'dis'
            ]),
            'combinations':
            None,
            'dropper__drop': [],
            'main_imputer':
            ModelBasedFullImputer(columns=[
                'crim', 'zn', 'nox', 'indus', 'rm', 'age', 'tax', 'ptratio',
                'b', 'dis'
            ],
                                  model=DecisionTreeRegressor(max_depth=None)),
            'poly':
            PolynomialsAdder(
                powers_per_column={
                    'crim': [2, 3],
                    'zn': [2, 3],
Exemplo n.º 8
0
        'trestbps': 0
    }),
    'clipper':
    None,
    'scaler':
    None
}

# Default preprocessing choices for the Boston pipeline: an (empty) binary
# binner placeholder, no Box-Cox step, outlier clipping on the numeric
# feature columns, and plain standardisation.
BASE_BOSTON = {
    'binner': CustomBinaryBinner(configuration={}),
    'boxcox': None,
    'clipper': OutliersClipper(
        columns=['crim', 'zn', 'nox', 'indus', 'rm', 'age', 'tax',
                 'ptratio', 'b', 'lstat']),
    'scaler': StandardScaler(copy=True, with_mean=True, with_std=True),
}

BASE_HOUSES = {
    'binner':
    CustomBinaryBinner(configuration={}),
    'boxcox':
    None,
    'clipper':
    OutliersClipper(columns=[
        'crim', 'zn', 'nox', 'indus', 'rm', 'age', 'tax', 'ptratio', 'b',
        'lstat'
    ]),