            'values': [train[col].max()]} for col in continuous + discrete})
    # BINNER_CONFIG = [
    #     {col: {'bins': 3} for col in continuous + discrete},
    #     # {col: {'bins': 5} for col in continuous + discrete},
    #     {col: {'bins': 7} for col in continuous + discrete},
    #     {col: {'values': [train[col].max()]} for col in continuous + discrete},
    # ]
    top_cont = [
        'LotFrontage', 'BsmtFinSF1', 'MasVnrArea', '1stFlrSF', 'GarageArea',
        'TotalBsmtSF', 'GrLivArea'
    ]
    BOX_COX = BOX_COX_HO
elif dataset == 'heart':
    data, labels, continuous, discrete, dummy, categorical, target, missing = get_heart(
        missing=MISSING)
    train = data.drop(target, axis=1)
    cv = KFold(5, shuffle=True, random_state=0)
    scorer = error_rate
    predictors = [
        LogisticRegression(),
        SVC(),
        RandomForestClassifier(),
        DecisionTreeClassifier(),
        KNeighborsClassifier(n_neighbors=5),
    ]
    # BINNER_CONFIG = {col: {'bins': 3} for col in continuous + discrete}
    binner = CustomBinner({col: {'bins': 3} for col in continuous + discrete})
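
# Illustrative sketch (an assumption, not part of the original code): one
# plausible reading of the binner configuration convention used above, where
# {'bins': n} requests n equal-width bins and {'values': [v]} flags rows whose
# value is in the list (e.g. the column maximum). CustomBinner's real
# behaviour may differ; the helper name below is hypothetical.
def _apply_binner_spec_sketch(series, spec):
    import pandas as pd
    if 'bins' in spec:
        # n equal-width bins encoded as integer labels 0..n-1
        return pd.cut(series, bins=spec['bins'], labels=False)
    if 'values' in spec:
        # binary flag: 1 where the value is in the listed values, else 0
        return series.isin(spec['values']).astype(int)
    raise ValueError('unrecognised binner spec: %r' % (spec,))
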
def get_test_config_heart(missing=True):
    # NB: the unpacking below rebinds `missing` to get_heart's returned value.
    data, labels, continuous, discrete, dummy, categorical, target, missing = get_heart(
        test=False, missing=missing)
    test_data, test_labels = get_heart(test=True, missing=missing)[0:2]
    test = test_data.drop(target, axis=1)
    scorer = error_rate
    one_hot = CustomOneHotEncoder(columns=categorical)
    model = Pipeline([
        ('onehot', one_hot),
        ('clipper', None),
        ('binner', None),
        ('binner2', None),
        ('simple_imputer', None),
        ('zero_filler', ZeroFiller()),  # just in case any NaNs are left
        ('main_imputer', None),
        ('dropper', FeatureDropper(drop=[])),
        ('poly', None),
        ('combinations', None),
        ('boxcox', None),
        ('scaler', None),
        ('reduce_dim', None),
        ('predictor', None),
    ])
    params = {
        'DecisionTreeClassifier_base': {
            'params': {
                'predictor': DecisionTreeClassifier(max_depth=None),
                'scaler': None,
                'simple_imputer': FillNaTransformer(
                    from_dict={},
                    mean=['trestbps', 'chol', 'thalach', 'oldpeak'],
                    median=[],
                    nan_flag=[],
                    zero=[])
            },
            'score': 0.1974390243902439,
            'std': 0.0756691348271984
        },
        'KNeighborsClassifier_base': {
            'params': {
                'predictor': KNeighborsClassifier(
                    n_neighbors=7,
                    # n_jobs=7,
                ),
                'scaler': RobustScaler(),
                'simple_imputer': FillNaTransformer(
                    from_dict={},
                    mean=[],
                    median=['trestbps', 'chol', 'thalach', 'oldpeak'],
                    nan_flag=[],
                    zero=[])
            },
            'score': 0.1924390243902439,
            'std': 0.04087385740197896
        },
        'LogisticRegression_base': {
            'params': {
                'predictor': LogisticRegression(
                    # n_jobs=7,
                ),
                'scaler': None,
                'simple_imputer': FillNaTransformer(
                    from_dict={},
                    mean=['trestbps', 'chol', 'thalach', 'oldpeak'],
                    median=[],
                    nan_flag=[],
                    zero=[])
            },
            'score': 0.1825609756097561,
            'std': 0.04141604180434009
        },
        # 'XGBClassifier_base': {
        #     'params': {
        #         'predictor': XGBClassifier(
        #             base_score=0.5,
        #             # n_jobs=7,
        #             # colsample_bytree=0.8, learning_rate=0.07,
        #             # max_depth=7, n_estimators=200,
        #         ),
        #         'scaler': None,
        #         'simple_imputer': FillNaTransformer(
        #             from_dict={}, mean=[], median=[], nan_flag=[],
        #             zero=['trestbps', 'chol', 'thalach', 'oldpeak'])
        #     },
        #     'score': 0.16743902439024388,
        #     'std': 0.04176646782554455
        # },
        'DecisionTreeClassifier_best': {
            'params': {
                'binner2': CustomBinner(
                    configuration={
                        'chol': {'bins': 3},
                        'thalach': {'bins': 3},
                        'oldpeak': {'bins': 3},
                        'trestbps': {'bins': 3},
                        'age': {'bins': 3},
                        'slope': {'bins': 3},
                        'ca': {'bins': 3}
                    }),
                'boxcox': None,
                'clipper': OutliersClipper(
                    columns=['chol', 'thalach', 'oldpeak', 'trestbps']),
                'combinations': FeatureProduct(
                    columns=['chol', 'thalach', 'oldpeak', 'trestbps']),
                'dropper__drop':
                    ['trestbps_nan', 'chol_nan', 'thalach_nan', 'oldpeak_nan'],
                'main_imputer': HotDeckFullImputer(
                    col_k_pairs=[('trestbps', None), ('chol', None),
                                 ('thalach', None), ('oldpeak', None)],
                    default_k=7),
                'poly': None,
                'predictor': DecisionTreeClassifier(max_depth=4),
                'reduce_dim': None,
                'scaler': None,
                'simple_imputer': FillNaTransformer(
                    from_dict={},
                    mean=[],
                    median=[],
                    nan_flag=['trestbps', 'chol', 'thalach', 'oldpeak'],
                    zero=['trestbps', 'chol', 'thalach', 'oldpeak'])
            },
            'score': 0.14780487804878048,
            'std': 0.03090350740255695
        },
        'LogisticRegression_best': {
            'params': {
                'binner2': None,
                'boxcox': BoxCoxTransformer(lambdas_per_column={
                    'chol': 0,
                    'thalach': 2,
                    'trestbps': 0
                }),
                'clipper': OutliersClipper(
                    columns=['chol', 'thalach', 'oldpeak', 'trestbps']),
                'combinations': None,
                'dropper__drop': [],
                'main_imputer': ModelBasedFullImputer(
                    columns=['trestbps', 'chol', 'thalach', 'oldpeak'],
                    model=LinearRegression(
                        # n_jobs=7
                    )),
                'poly': PolynomialsAdder(powers_per_column={
                    'chol': [2],
                    'thalach': [2],
                    'oldpeak': [2],
                    'trestbps': [2]
                }),
                'predictor': LogisticRegression(),
                'reduce_dim': PCA(n_components=10),
                'scaler': None,
                'simple_imputer': FillNaTransformer(
                    from_dict={},
                    mean=['trestbps', 'chol', 'thalach', 'oldpeak'],
                    median=[],
                    nan_flag=['trestbps', 'chol', 'thalach', 'oldpeak'],
                    zero=[])
            },
            'score': 0.14280487804878048,
            'std': 0.03915450868377355
        },
        # 'XGBClassifier_best': {
        #     'params': {
        #         'binner2': CustomBinner(
        #             configuration={'chol': {'bins': 3}, 'thalach': {'bins': 3},
        #                            'oldpeak': {'bins': 3}, 'trestbps': {'bins': 3},
        #                            'age': {'bins': 3}, 'slope': {'bins': 3},
        #                            'ca': {'bins': 3}},
        #             drop=False, nan=False),
        #         'boxcox': BoxCoxTransformer(
        #             lambdas_per_column={'chol': 0, 'thalach': 2, 'trestbps': 0}),
        #         'clipper': OutliersClipper(
        #             columns=['chol', 'thalach', 'oldpeak', 'trestbps']),
        #         'combinations': None,
        #         'dropper__drop': ['trestbps_nan', 'chol_nan', 'thalach_nan', 'oldpeak_nan'],
        #         'main_imputer': HotDeckFullImputer(
        #             col_k_pairs=[('trestbps', None), ('chol', None),
        #                          ('thalach', None), ('oldpeak', None)],
        #             default_k=7),
        #         'poly': None,
        #         'predictor': XGBClassifier(
        #             # n_jobs=7,
        #             base_score=0.5,
        #             colsample_bytree=0.8, learning_rate=0.07,
        #             max_depth=7, n_estimators=200,
        #         ),
        #         'reduce_dim': SelectFromModel(
        #             estimator=LogisticRegression(C=0.999, penalty='l1',
        #                                          # n_jobs=7
        #                                          )),
        #         'scaler': None,
        #         'simple_imputer': FillNaTransformer(
        #             from_dict={},
        #             mean=['trestbps', 'chol', 'thalach', 'oldpeak'],
        #             median=[],
        #             nan_flag=['trestbps', 'chol', 'thalach', 'oldpeak'],
        #             zero=[])
        #     },
        #     'score': 0.15243902439024387,
        #     'std': 0.04655758333858798
        # }
    }
    return data, test, test_labels, scorer, model, params, target
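
# Usage sketch (an assumption, not code from the original repo): the keys in
# `params` mirror the Pipeline step names, including nested keys such as
# 'dropper__drop', so a stored configuration can be applied directly with
# Pipeline.set_params. `error_rate` is assumed to take (y_true, y_pred).
if __name__ == '__main__':
    data, test, test_labels, scorer, model, params, target = get_test_config_heart()
    best = params['LogisticRegression_best']
    model.set_params(**best['params'])  # plug the tuned steps into the pipeline
    model.fit(data.drop(target, axis=1), data[target])
    test_error = scorer(test_labels, model.predict(test))
    print('test error: %.4f (CV: %.4f +/- %.4f)'
          % (test_error, best['score'], best['std']))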