def use_debug_parameters(self, reduced_selected_features):
        # Define parameters as an array of dicts in case different parameters are used for different optimizations
        params_debug = [
            {
                'scaler': [StandardScaler()],
                'sampling': [modelutil.Nosampler(),
                             SMOTE(), SMOTEENN(),
                             ADASYN()],
                'feat__cols': reduced_selected_features[0:2],
                'model__kernel': ['linear'],
                'model__C': [0.1, 1, 10],
                'model__gamma': [0.1, 1, 10],  # ignored by the linear kernel
            },
            {
                'scaler': [StandardScaler(), Normalizer()],
                'sampling': [modelutil.Nosampler()],
                'feat__cols': reduced_selected_features[0:1],
                'model__C': [1],  # default C=1
                'model__kernel': ['rbf'],
                'model__gamma': [1]
                # gamma is only relevant for 'rbf'; the old sklearn default 'auto' equals 1/n_features
            }
        ]

        return params_debug
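
A minimal, self-contained sketch of how such a debug grid is typically consumed (hedged: the modelutil helpers and column selection are omitted, the data and grid values are synthetic, and Pipeline is taken from imblearn so that the sampler step is legal):

import numpy as np
import pandas as pd
from imblearn.pipeline import Pipeline  # supports fit_resample steps such as SMOTE
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold

X = pd.DataFrame(np.random.rand(200, 4), columns=list('abcd'))  # synthetic data
y = np.random.randint(0, 2, 200)

pipe = Pipeline([('imputer', SimpleImputer(strategy='median')),
                 ('scaler', StandardScaler()),
                 ('sampling', SMOTE()),
                 ('model', SVC())])
params = [{'model__kernel': ['linear'], 'model__C': [0.1, 1, 10]}]
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
grid = GridSearchCV(pipe, params, cv=skf, scoring='f1', n_jobs=-1).fit(X, y)
print(grid.best_params_)
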
    def use_debug_parameters(self, reduced_selected_features):
        ### XGBoost parameter grid
        params_debug = [{
            'scaler': [StandardScaler()],
            'sampling': [modelutil.Nosampler(),
                         SMOTE(),
                         SMOTEENN(),
                         ADASYN()],
            'feat__cols': reduced_selected_features[0:2],
            'model__nthread': [4],  # with hyperthreading, XGBoost may become slower
            'model__objective': ['binary:logistic'],
            'model__learning_rate': [0.05, 0.5],  # the so-called `eta` value
            'model__max_depth': [6, 7, 8],
            'model__min_child_weight': [11],
            'model__silent': [1],  # deprecated in recent XGBoost versions (use `verbosity` instead)
            'model__subsample': [0.8],
            'model__colsample_bytree': [0.7],
            'model__n_estimators': [5, 10],  # number of trees; increase (e.g. to 1000) for better results
            'model__missing': [-999],
            'model__seed': [1337]
        }]

        return params_debug
    def use_parameters(self, X_train, selected_features):
        """
        Default Parameter

        """

        test_scaler = [
            StandardScaler(),
            RobustScaler(),
            QuantileTransformer(),
            Normalizer()
        ]
        test_sampling = [
            modelutil.Nosampler(),
            ClusterCentroids(),
            RandomUnderSampler(),
            # NearMiss(version=1),
            # EditedNearestNeighbours(),
            # AllKNN(),
            # CondensedNearestNeighbour(random_state=0),
            # InstanceHardnessThreshold(random_state=0,
            #                          estimator=LogisticRegression(solver='lbfgs', multi_class='auto')),
            RandomOverSampler(random_state=0),
            SMOTE(),
            BorderlineSMOTE(),
            SMOTEENN(),
            SMOTETomek(),
            ADASYN()
        ]
        test_C = [1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3]
        test_C_linear = [1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2]

        # gamma default parameters
        param_scale = 1 / (X_train.shape[1] * np.mean(X_train.var()))

        parameters = [{
            'scaler': test_scaler,
            'sampling': test_sampling,
            'feat__cols': selected_features,
            'model__C': test_C_linear,  # default C=1
            'model__kernel': ['linear']
        }]

        # If there are missing values, test several imputer strategies; otherwise the default imputer suffices
        if X_train.isna().sum().sum() > 0:
            for param_set in parameters:
                param_set['imputer__strategy'] = [
                    'mean', 'median', 'most_frequent'
                ]
            print("Missing values found. Testing different imputer strategies")
        else:
            print("No missing values. No imputer necessary")

        print("Selected Parameters: ", parameters)

        return parameters
    def create_pipeline(self):
        pipe_run = Pipeline([('imputer',
                              SimpleImputer(missing_values=np.nan,
                                            strategy='median')),
                             ('scaler', StandardScaler()),
                             ('sampling', modelutil.Nosampler()),
                             ('feat', modelutil.ColumnExtractor(cols=None)),
                             ('model', SVC())])
        return pipe_run
    def create_pipeline(self):
        pipe_run = Pipeline([('imputer',
                              SimpleImputer(missing_values=np.nan,
                                            strategy='median')),
                             ('scaler', StandardScaler()),
                             ('sampling', modelutil.Nosampler()),
                             ('feat', modelutil.ColumnExtractor(cols=None)),
                             ('model',
                              XGBClassifier(use_label_encoder=False,
                                            eval_metric='logloss'))])
        return pipe_run
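
The pipelines above rely on two small helpers from modelutil. A hedged sketch of what they plausibly look like (not the original implementation): Nosampler is a pass-through sampler so that "no resampling" can be one grid choice next to SMOTE and friends, and ColumnExtractor selects the column subset supplied via 'feat__cols'.

from sklearn.base import BaseEstimator, TransformerMixin


class Nosampler(BaseEstimator):
    """Pass-through sampler: returns the data unchanged (assumed behaviour)."""

    def fit_resample(self, X, y):
        return X, y


class ColumnExtractor(BaseEstimator, TransformerMixin):
    """Select columns by position; cols=None keeps all columns (assumed behaviour)."""

    def __init__(self, cols=None):
        self.cols = cols

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if self.cols is None:
            return X
        # Support both pandas DataFrames and plain numpy arrays
        return X.iloc[:, self.cols] if hasattr(X, 'iloc') else X[:, self.cols]
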
    def use_debug_parameters(self, reduced_selected_features):
        # Define parameters as an array of dicts in case different parameters are used for different optimizations
        params_debug = [{
            'scaler': [StandardScaler()],
            'sampling': [modelutil.Nosampler(),
                         SMOTE(),
                         SMOTEENN(),
                         ADASYN()],
            'feat__cols': reduced_selected_features[0:2],
            'model__var_smoothing': np.logspace(0, -9, num=100)  # GaussianNB smoothing parameter
        }]

        return params_debug
    def use_debug_parameters(self, reduced_selected_features):
        # Define parameters as an array of dicts in case different parameters are used for different optimizations
        params_debug = [{
            'scaler': [StandardScaler()],
            'sampling': [modelutil.Nosampler(),
                         SMOTE(),
                         SMOTEENN(),
                         ADASYN()],
            'feat__cols': reduced_selected_features[0:2],
            'model__n_neighbors': [3, 5],
            'model__weights': ['uniform', 'distance']
        }]

        return params_debug
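
For context, a hedged sketch (assuming the same pipeline layout as create_pipeline above): the var_smoothing grid targets a GaussianNB model step, while the n_neighbors/weights grid targets a KNeighborsClassifier; only the 'model' step changes.

import numpy as np
from imblearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier


def make_pipeline(model):
    # modelutil.Nosampler()/ColumnExtractor() steps omitted here for brevity
    return Pipeline([('imputer', SimpleImputer(missing_values=np.nan, strategy='median')),
                     ('scaler', StandardScaler()),
                     ('model', model)])


nb_pipe = make_pipeline(GaussianNB())             # pairs with 'model__var_smoothing'
knn_pipe = make_pipeline(KNeighborsClassifier())  # pairs with 'model__n_neighbors', 'model__weights'
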
    def create_pipeline(self):
        # Use one worker per CPU core minus one, capped at 10 (the original code hard-coded 10, overriding the CPU-based value)
        n_jobs = max(1, min(multiprocessing.cpu_count() - 1, 10))
        print("Number of CPUs: {}. Using {}".format(
            multiprocessing.cpu_count(), n_jobs))

        pipe_run = Pipeline([('imputer',
                              SimpleImputer(missing_values=np.nan,
                                            strategy='median')),
                             ('scaler', StandardScaler()),
                             ('sampling', modelutil.Nosampler()),
                             ('feat', modelutil.ColumnExtractor(cols=None)),
                             ('model',
                              XGBClassifier(use_label_encoder=False,
                                            eval_metric='logloss',
                                            n_jobs=n_jobs))])
        return pipe_run
def run_basic_svm(X_train,
                  y_train,
                  selected_features,
                  scorers,
                  refit_scorer_name,
                  subset_share=0.1,
                  n_splits=5,
                  parameters=None):
    '''Run an extensive grid search over all parameters to find the best parameters for the SVM classifier.
    The search is done only on a subset of the training data (default subset_share=0.1).

    '''

    # Create a subset to train on
    print("[Step 1]: Create a data subset")
    subset_min = 300  # Minimal subset is 300 samples.

    if subset_share * X_train.shape[0] < subset_min:
        number_of_samples = subset_min
        print("Minimal number of samples used: ", number_of_samples)
    else:
        number_of_samples = int(subset_share * X_train.shape[0])

    X_train_subset, y_train_subset = modelutil.extract_data_subset(
        X_train, y_train, number_of_samples)
    print("Got subset sizes X train: {} and y train: {}".format(
        X_train_subset.shape, y_train_subset.shape))

    print("[Step 2]: Define test parameters")
    if parameters is None:  # If no parameters have been defined, then do full definition
        # Guides used from
        # https://www.kaggle.com/evanmiller/pipelines-gridsearch-awesome-ml-pipelines
        # Main set of parameters for the grid search run 1: Select scaler, sampler and kernel for the problem
        test_scaler = [
            StandardScaler(),
            RobustScaler(),
            QuantileTransformer(),
            Normalizer()
        ]
        test_sampling = [
            modelutil.Nosampler(),
            #ClusterCentroids(),
            #RandomUnderSampler(),
            #NearMiss(version=1),
            #EditedNearestNeighbours(),
            #AllKNN(),
            #CondensedNearestNeighbour(random_state=0),
            #InstanceHardnessThreshold(random_state=0,
            #                          estimator=LogisticRegression(solver='lbfgs', multi_class='auto')),
            SMOTE(),
            SMOTEENN(),
            SMOTETomek(),
            ADASYN()
        ]
        test_C = [1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3]
        test_C_linear = [1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2]

        # gamma default parameters
        param_scale = 1 / (X_train.shape[1] * np.mean(X_train.var()))

        parameters = [
            {
                'scaler': test_scaler,
                'sampling': test_sampling,
                'feat__cols': selected_features,
                'svm__C': test_C,  # default C=1
                'svm__kernel': ['sigmoid']
            },
            {
                'scaler': test_scaler,
                'sampling': test_sampling,
                'feat__cols': selected_features,
                'svm__C': test_C_linear,  # default C=1
                'svm__kernel': ['linear']
            },
            {
                'scaler': test_scaler,
                'sampling': test_sampling,
                'feat__cols': selected_features,
                'svm__C': test_C,  # default C=1
                'svm__kernel': ['poly'],
                'svm__degree': [2, 3]  # Only relevant for poly
            },
            {
                'scaler': test_scaler,
                'sampling': test_sampling,
                'feat__cols': selected_features,
                'svm__C': test_C,  # default C=1
                'svm__kernel': ['rbf'],
                'svm__gamma':
                [param_scale, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3]
                # Only relevant in rbf, default='auto'=1/n_features
            }
        ]

        # If there are missing values, test several imputer strategies; otherwise the default imputer suffices
        if X_train.isna().sum().sum() > 0:
            for param_set in parameters:
                param_set['imputer__strategy'] = [
                    'mean', 'median', 'most_frequent'
                ]
            print("Missing values found. Testing different imputer strategies")
        else:
            print("No missing values. No imputer necessary")

        print("Selected Parameters: ", parameters)
    else:
        print("Parameters defined in the input: ", parameters)

    # Main pipeline for the grid search
    pipe_run1 = Pipeline([('imputer',
                           SimpleImputer(missing_values=np.nan,
                                         strategy='median')),
                          ('scaler', StandardScaler()),
                          ('sampling', modelutil.Nosampler()),
                          ('feat', modelutil.ColumnExtractor(cols=None)),
                          ('svm', SVC())])

    print("Pipeline: ", pipe_run1)

    print("Stratified KFold={} used.".format(n_splits))
    #INFO: KFold Splitter with shuffle=True to get random values
    skf = StratifiedKFold(n_splits=n_splits, random_state=3, shuffle=True)

    params_run1 = parameters
    grid_search_run1 = GridSearchCV(pipe_run1,
                                    params_run1,
                                    verbose=2,
                                    cv=skf,
                                    scoring=scorers,
                                    refit=refit_scorer_name,
                                    return_train_score=True,
                                    n_jobs=-1).fit(X_train_subset,
                                                   y_train_subset)

    results_run1 = modelutil.generate_result_table(grid_search_run1,
                                                   params_run1,
                                                   refit_scorer_name)
    print("Result size=", results_run1.shape)
    print("Number of NaN results: {}. Replace them with 0".format(
        np.sum(results_run1['mean_test_' + refit_scorer_name].isna())))

    return grid_search_run1, params_run1, pipe_run1, results_run1
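
A hedged usage sketch for run_basic_svm (synthetic data; the scorer names and feature subsets are illustrative, and refit_scorer_name must be a key of the scorers dict):

import numpy as np
import pandas as pd
from sklearn.metrics import make_scorer, f1_score, precision_score

X_train = pd.DataFrame(np.random.rand(500, 3), columns=['a', 'b', 'c'])
y_train = pd.Series(np.random.randint(0, 2, 500))

scorers = {'f1': make_scorer(f1_score),
           'precision': make_scorer(precision_score)}
selected_features = [[0, 1], [0, 1, 2]]   # candidate column subsets for 'feat__cols'

grid, params, pipe, results = run_basic_svm(X_train, y_train, selected_features,
                                            scorers, refit_scorer_name='f1',
                                            subset_share=0.2, n_splits=3)
print(results.head())
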
    def use_parameters(self, X_train, selected_features):
        '''
        Define the default parameter grid for the XGBoost classifier.

        Returns
        -------
        list of dict
            Parameter grid for GridSearchCV.
        '''
        test_scaler = [
            StandardScaler(),
            RobustScaler(),
            QuantileTransformer(),
            Normalizer()
        ]
        test_sampling = [
            modelutil.Nosampler(),
            ClusterCentroids(),
            RandomUnderSampler(),
            # NearMiss(version=1),
            # EditedNearestNeighbours(),
            # AllKNN(),
            # CondensedNearestNeighbour(random_state=0),
            # InstanceHardnessThreshold(random_state=0,
            #                          estimator=LogisticRegression(solver='lbfgs', multi_class='auto')),
            RandomOverSampler(random_state=0),
            SMOTE(),
            BorderlineSMOTE(),
            SMOTEENN(),
            SMOTETomek(),
            ADASYN()
        ]

        ### XGBOOST
        parameters = [{
            'scaler': test_scaler,
            'sampling': test_sampling,
            'feat__cols': selected_features,
            'model__nthread': [4],  # with hyperthreading, XGBoost may become slower
            'model__objective': ['binary:logistic'],
            'model__learning_rate': [0.005, 0.01, 0.05, 0.1, 0.5],  # the so-called `eta` value
            'model__max_depth': [6, 7, 8],
            'model__min_child_weight': [11],
            'model__silent': [0],  # deprecated in recent XGBoost versions (use `verbosity` instead)
            'model__subsample': [0.8],
            'model__colsample_bytree': [0.7],
            'model__n_estimators': [5, 100, 1000],  # number of trees
            'model__missing': [-999],
            'model__seed': [1337]
        }]

        # If there are missing values, test several imputer strategies; otherwise the default imputer suffices
        if X_train.isna().sum().sum() > 0:
            for param_set in parameters:
                param_set['imputer__strategy'] = [
                    'mean', 'median', 'most_frequent'
                ]
            print("Missing values found. Testing different imputer strategies")
        else:
            print("No missing values. No imputer necessary")

        print("Selected Parameters: ", parameters)

        ### XGBOOST
        return parameters