def create_pipeline(self):
    """Assemble the default, untuned SVC pipeline.

    The steps run in this order: median imputation of NaN values,
    standard scaling, the ``modelutil.Nosampler`` sampling step,
    ``modelutil.ColumnExtractor`` constructed with ``cols=None`` and
    finally an ``SVC`` with default arguments.

    NOTE(review): ``Nosampler`` presumably passes data through unchanged
    and ``cols=None`` presumably keeps all columns -- confirm in modelutil.

    Returns:
        The newly built ``Pipeline`` instance.
    """
    median_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
    steps = [
        ('imputer', median_imputer),
        ('scaler', StandardScaler()),
        ('sampling', modelutil.Nosampler()),
        ('feat', modelutil.ColumnExtractor(cols=None)),
        ('model', SVC()),
    ]
    return Pipeline(steps)
Пример #2
0
 def create_pipeline(self):
     """Assemble the default, untuned XGBoost classification pipeline.

     The steps run in this order: median imputation of NaN values,
     standard scaling, the ``modelutil.Nosampler`` sampling step,
     ``modelutil.ColumnExtractor`` constructed with ``cols=None`` and
     finally an ``XGBClassifier`` created with ``use_label_encoder=False``
     and ``eval_metric='logloss'``.

     NOTE(review): ``Nosampler`` presumably passes data through unchanged
     and ``cols=None`` presumably keeps all columns -- confirm in modelutil.

     Returns:
         The newly built ``Pipeline`` instance.
     """
     median_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
     classifier = None  # placeholder so the step list below reads top-down
     steps = [
         ('imputer', median_imputer),
         ('scaler', StandardScaler()),
         ('sampling', modelutil.Nosampler()),
         ('feat', modelutil.ColumnExtractor(cols=None)),
     ]
     # The classifier is created last, matching the order of the other steps.
     classifier = XGBClassifier(use_label_encoder=False,
                                eval_metric='logloss')
     steps.append(('model', classifier))
     return Pipeline(steps)
    def define_best_pipeline(self, best_values_dict, best_columns,
                             models_run1):
        """Build a pipeline from the winners of the first selection run.

        Args:
            best_values_dict: Mapping queried with ``.get`` for the keys
                ``'scaler'``, ``'sampling'`` and ``'model__kernel'``.
            best_columns: Passed as ``cols`` to ``modelutil.ColumnExtractor``.
            models_run1: Object exposing ``set_params``; whatever
                ``set_params(kernel=...)`` returns becomes the ``'model'``
                step. (Presumably an sklearn estimator that is updated in
                place and returns itself -- confirm against the caller.)

        Returns:
            A ``Pipeline`` with the steps ``scaler``, ``sampling``, ``feat``
            and ``model``. Note that no imputer step is included.
        """
        chosen_scaler = best_values_dict.get('scaler')
        chosen_sampler = best_values_dict.get('sampling')
        column_step = modelutil.ColumnExtractor(cols=best_columns)
        chosen_kernel = best_values_dict.get('model__kernel')
        tuned_model = models_run1.set_params(kernel=chosen_kernel)

        steps = [
            ('scaler', chosen_scaler),
            ('sampling', chosen_sampler),
            ('feat', column_step),
            ('model', tuned_model),
        ]
        return Pipeline(steps)
    def define_best_pipeline(self, best_values_dict, best_columns,
                             models_run1):
        """Build a pipeline from the winners of the first selection run.

        Args:
            best_values_dict: Mapping queried with ``.get`` for the keys
                ``'scaler'``, ``'sampling'``, ``'model__n_estimators'``,
                ``'model__learning_rate'`` and ``'model__max_depth'``.
            best_columns: Passed as ``cols`` to ``modelutil.ColumnExtractor``.
            models_run1: Object exposing ``set_params``; whatever
                ``set_params(...)`` returns becomes the ``'model'`` step.
                (Presumably a boosting estimator that is updated in place
                and returns itself -- confirm against the caller.)

        Returns:
            A ``Pipeline`` with the steps ``scaler``, ``sampling``, ``feat``
            and ``model``. Note that no imputer step is included.
        """
        # The estimator parameter names must match the grid keys without the
        # ``model__`` prefix. The former ``n_learning_rate``/``n_max_depth``
        # names are not estimator parameters, so the tuned learning rate and
        # depth never reached the model.
        tuned_model = models_run1.set_params(
            n_estimators=best_values_dict.get('model__n_estimators'),
            learning_rate=best_values_dict.get('model__learning_rate'),
            max_depth=best_values_dict.get('model__max_depth'))

        pipe_run_best_first_selection = Pipeline([
            ('scaler', best_values_dict.get('scaler')),
            ('sampling', best_values_dict.get('sampling')),
            ('feat', modelutil.ColumnExtractor(cols=best_columns)),
            ('model', tuned_model),
        ])

        return pipe_run_best_first_selection
    def create_pipeline(self):
        """Assemble the default, untuned XGBoost pipeline with parallelism.

        The steps run in this order: median imputation of NaN values,
        standard scaling, the ``modelutil.Nosampler`` sampling step,
        ``modelutil.ColumnExtractor`` constructed with ``cols=None`` and
        finally an ``XGBClassifier`` created with ``use_label_encoder=False``,
        ``eval_metric='logloss'`` and the computed ``n_jobs``.

        The number of worker threads is capped at 10 and additionally limited
        to the CPU count minus one (but never below one), so machines with
        fewer cores are not oversubscribed. On machines with 11 or more CPUs
        this yields the previously hard-coded value of 10.

        Returns:
            The newly built ``Pipeline`` instance.
        """
        max_jobs = 10
        available_cpus = multiprocessing.cpu_count()
        # Leave one core free, stay within the cap, always use at least one.
        n_jobs = max(1, min(max_jobs, available_cpus - 1))
        print("Number of CPUs: {}. Using {}".format(available_cpus, n_jobs))

        pipe_run = Pipeline([('imputer',
                              SimpleImputer(missing_values=np.nan,
                                            strategy='median')),
                             ('scaler', StandardScaler()),
                             ('sampling', modelutil.Nosampler()),
                             ('feat', modelutil.ColumnExtractor(cols=None)),
                             ('model',
                              XGBClassifier(use_label_encoder=False,
                                            eval_metric='logloss',
                                            n_jobs=n_jobs))])
        return pipe_run
def _build_default_svm_grid(X_train, selected_features):
    """Return the default list of SVC parameter grids, one dict per kernel."""
    # Guides used from
    # https://www.kaggle.com/evanmiller/pipelines-gridsearch-awesome-ml-pipelines
    # Main set of parameters for the grid search run 1: select scaler,
    # sampler and kernel for the problem.
    test_scaler = [
        StandardScaler(),
        RobustScaler(),
        QuantileTransformer(),
        Normalizer()
    ]
    # Only "no sampling" and over-/combined samplers are tested. The
    # under-samplers (ClusterCentroids, RandomUnderSampler, NearMiss,
    # EditedNearestNeighbours, AllKNN, CondensedNearestNeighbour,
    # InstanceHardnessThreshold) were disabled in the original code.
    test_sampling = [
        modelutil.Nosampler(),
        SMOTE(),
        SMOTEENN(),
        SMOTETomek(),
        ADASYN()
    ]
    test_C = [1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3]
    test_C_linear = [1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2]

    # Data-dependent gamma candidate: 1 / (n_features * mean column variance).
    param_scale = 1 / (X_train.shape[1] * np.mean(X_train.var()))

    # Entries shared by every kernel-specific grid.
    common = {
        'scaler': test_scaler,
        'sampling': test_sampling,
        'feat__cols': selected_features,
    }

    return [
        dict(common, **{
            'svm__C': test_C,  # default C=1
            'svm__kernel': ['sigmoid']
        }),
        dict(common, **{
            'svm__C': test_C_linear,  # default C=1
            'svm__kernel': ['linear']
        }),
        dict(common, **{
            'svm__C': test_C,  # default C=1
            'svm__kernel': ['poly'],
            'svm__degree': [2, 3]  # Only relevant for poly
        }),
        dict(common, **{
            'svm__C': test_C,  # default C=1
            'svm__kernel': ['rbf'],
            # Only relevant for rbf
            'svm__gamma':
            [param_scale, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3]
        }),
    ]


def run_basic_svm(X_train,
                  y_train,
                  selected_features,
                  scorers,
                  refit_scorer_name,
                  subset_share=0.1,
                  n_splits=5,
                  parameters=None):
    """Grid-search SVC pipeline parameters on a subset of the training data.

    A subset of ``X_train``/``y_train`` is drawn with
    ``modelutil.extract_data_subset`` and a ``GridSearchCV`` with a shuffled
    ``StratifiedKFold`` is fitted on it.

    Args:
        X_train: Training features. Must provide ``shape``, ``var()`` and
            ``isna()`` (presumably a pandas DataFrame -- confirm with caller).
        y_train: Training labels matching ``X_train``.
        selected_features: Candidate values for the ``feat__cols`` grid
            entry of the default parameter grid.
        scorers: Passed to ``GridSearchCV`` as ``scoring``.
        refit_scorer_name: Passed to ``GridSearchCV`` as ``refit`` and used
            to look up the ``mean_test_<name>`` result column.
        subset_share: Share of the training rows to use. Default is 0.1.
            At least 300 samples are always requested.
        n_splits: Number of stratified folds.
        parameters: Parameter grid for ``GridSearchCV``. If ``None``, a
            default grid over scaler, sampler, kernel and kernel-specific
            parameters is built; imputer strategies are added to it only if
            ``X_train`` contains missing values.

    Returns:
        Tuple ``(grid_search, parameters, pipeline, results)`` where
        ``results`` is what ``modelutil.generate_result_table`` returns.
    """

    # Create a subset to train on
    print("[Step 1]: Create a data subset")
    subset_min = 300  # Lower bound on the number of requested samples.

    if subset_share * X_train.shape[0] < subset_min:
        number_of_samples = subset_min
        print("minimal number of samples used: ", number_of_samples)
    else:
        # The product is a float; a sample count has to be a whole number,
        # consistent with the integer used in the branch above.
        number_of_samples = int(subset_share * X_train.shape[0])

    X_train_subset, y_train_subset = modelutil.extract_data_subset(
        X_train, y_train, number_of_samples)
    print("Got subset sizes X train: {} and y train: {}".format(
        X_train_subset.shape, y_train_subset.shape))

    print("[Step 2]: Define test parameters")
    if parameters is None:  # If no parameters have been defined, then do full definition
        parameters = _build_default_svm_grid(X_train, selected_features)

        # If no missing values, only one imputer strategy shall be used
        if X_train.isna().sum().sum() > 0:
            # ``parameters`` is a list of grids, so the imputer options have
            # to be added to every grid dict (indexing the list with a string
            # key would raise a TypeError).
            for grid in parameters:
                grid['imputer__strategy'] = [
                    'mean', 'median', 'most_frequent'
                ]
            print("Missing values used. Test different imputer strategies")
        else:
            print("No missing values. No imputer necessary")

        print("Selected Parameters: ", parameters)
    else:
        print("Parameters defined in the input: ", parameters)

    # Main pipeline for the grid search
    pipe_run1 = Pipeline([('imputer',
                           SimpleImputer(missing_values=np.nan,
                                         strategy='median')),
                          ('scaler', StandardScaler()),
                          ('sampling', modelutil.Nosampler()),
                          ('feat', modelutil.ColumnExtractor(cols=None)),
                          ('svm', SVC())])

    print("Pipeline: ", pipe_run1)

    print("Stratified KFold={} used.".format(n_splits))
    # INFO: KFold Splitter with shuffle=True to get random values
    skf = StratifiedKFold(n_splits=n_splits, random_state=3, shuffle=True)

    params_run1 = parameters
    grid_search_run1 = GridSearchCV(pipe_run1,
                                    params_run1,
                                    verbose=2,
                                    cv=skf,
                                    scoring=scorers,
                                    refit=refit_scorer_name,
                                    return_train_score=True,
                                    n_jobs=-1).fit(X_train_subset,
                                                   y_train_subset)

    results_run1 = modelutil.generate_result_table(grid_search_run1,
                                                   params_run1,
                                                   refit_scorer_name)
    print("Result size=", results_run1.shape)
    # NOTE: this only reports the NaN count; nothing is replaced here.
    print("Number of NaN results: {}. Replace them with 0".format(
        np.sum(results_run1['mean_test_' + refit_scorer_name].isna())))

    return grid_search_run1, params_run1, pipe_run1, results_run1