def create_pipeline(self):
    """Build the default SVC training pipeline.

    Steps: median imputation of NaNs -> standard scaling -> no
    resampling -> column selection (all columns until configured) ->
    SVC model.

    Returns:
        sklearn Pipeline ready for fitting or grid search.
    """
    steps = [
        ('imputer', SimpleImputer(missing_values=np.nan, strategy='median')),
        ('scaler', StandardScaler()),
        ('sampling', modelutil.Nosampler()),
        ('feat', modelutil.ColumnExtractor(cols=None)),
        ('model', SVC()),
    ]
    return Pipeline(steps)
def create_pipeline(self):
    """Build the default XGBoost training pipeline.

    Steps: median imputation of NaNs -> standard scaling -> no
    resampling -> column selection (all columns until configured) ->
    XGBClassifier (label encoder off, logloss eval metric).

    Returns:
        sklearn Pipeline ready for fitting or grid search.
    """
    classifier = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
    steps = [
        ('imputer', SimpleImputer(missing_values=np.nan, strategy='median')),
        ('scaler', StandardScaler()),
        ('sampling', modelutil.Nosampler()),
        ('feat', modelutil.ColumnExtractor(cols=None)),
        ('model', classifier),
    ]
    return Pipeline(steps)
def define_best_pipeline(self, best_values_dict, best_columns, models_run1):
    """Assemble the refined SVM pipeline from the run-1 grid-search winners.

    Args:
        best_values_dict: mapping of pipeline-step names to the best
            estimators/values found (keys 'scaler', 'sampling',
            'model__kernel').
        best_columns: column subset the ColumnExtractor should keep.
        models_run1: the SVM estimator from run 1; its kernel is updated
            in place via set_params.

    Returns:
        sklearn Pipeline with the selected scaler, sampler, columns and
        kernel.
    """
    best_scaler = best_values_dict.get('scaler')
    best_sampler = best_values_dict.get('sampling')
    tuned_model = models_run1.set_params(
        kernel=best_values_dict.get('model__kernel'))
    return Pipeline([
        ('scaler', best_scaler),
        ('sampling', best_sampler),
        ('feat', modelutil.ColumnExtractor(cols=best_columns)),
        ('model', tuned_model),
    ])
def define_best_pipeline(self, best_values_dict, best_columns, models_run1):
    """Assemble the refined XGBoost pipeline from the run-1 grid-search winners.

    Args:
        best_values_dict: mapping of pipeline-step names to the best
            estimators/values found (keys 'scaler', 'sampling',
            'model__n_estimators', 'model__learning_rate',
            'model__max_depth').
        best_columns: column subset the ColumnExtractor should keep.
        models_run1: the XGBClassifier from run 1; its hyperparameters are
            updated in place via set_params.

    Returns:
        sklearn Pipeline with the selected scaler, sampler, columns and
        tuned model.

    Note:
        Bug fix: the keywords previously passed to set_params were
        'n_learning_rate' and 'n_max_depth', which are not XGBClassifier
        parameters — sklearn's set_params raises ValueError on unknown
        names. The correct parameters are 'learning_rate' and 'max_depth'.
    """
    pipe_run_best_first_selection = Pipeline([
        ('scaler', best_values_dict.get('scaler')),
        ('sampling', best_values_dict.get('sampling')),
        ('feat', modelutil.ColumnExtractor(cols=best_columns)),
        ('model', models_run1.set_params(
            n_estimators=best_values_dict.get('model__n_estimators'),
            learning_rate=best_values_dict.get('model__learning_rate'),
            max_depth=best_values_dict.get('model__max_depth'))),
    ])
    return pipe_run_best_first_selection
def create_pipeline(self):
    """Build the parallel XGBoost training pipeline.

    Steps: median imputation of NaNs -> standard scaling -> no
    resampling -> column selection (all columns until configured) ->
    XGBClassifier running on multiple worker threads.

    Returns:
        sklearn Pipeline ready for fitting or grid search.
    """
    # Use all-but-one CPU, capped at 10 workers. The previous code
    # computed cpu_count() - 1 and then unconditionally overwrote it with
    # the debug value 10, leaving the first assignment dead and
    # oversubscribing machines with fewer than 11 cores.
    n_jobs = min(10, max(1, multiprocessing.cpu_count() - 1))
    print("Number of CPUs: {}. Using {}".format(
        multiprocessing.cpu_count(), n_jobs))
    pipe_run = Pipeline([
        ('imputer', SimpleImputer(missing_values=np.nan, strategy='median')),
        ('scaler', StandardScaler()),
        ('sampling', modelutil.Nosampler()),
        ('feat', modelutil.ColumnExtractor(cols=None)),
        ('model', XGBClassifier(use_label_encoder=False,
                                eval_metric='logloss',
                                n_jobs=n_jobs)),
    ])
    return pipe_run
def run_basic_svm(X_train, y_train, selected_features, scorers,
                  refit_scorer_name, subset_share=0.1, n_splits=5,
                  parameters=None):
    '''Run an extensive grid search over all parameters to find the best
    parameters for an SVM classifier.

    The search is performed on a subset of the training data only
    (default share 0.1), because the full SVM grid is expensive.

    Args:
        X_train: training features (pandas DataFrame; NaNs allowed).
        y_train: training labels.
        selected_features: candidate column subsets for the
            ColumnExtractor step ('feat__cols').
        scorers: dict of scorers passed to GridSearchCV.
        refit_scorer_name: key in `scorers` used to refit the best model.
        subset_share: share of the training rows used for the search.
        n_splits: number of StratifiedKFold splits.
        parameters: optional pre-defined parameter grid (list of dicts);
            when None, the full default grid is constructed here.

    Returns:
        Tuple (grid_search_run1, params_run1, pipe_run1, results_run1).
    '''
    # Create a subset to train on
    print("[Step 1]: Create a data subset")
    subset_min = 300  # Never train on fewer than 300 samples.

    if subset_share * X_train.shape[0] < subset_min:
        number_of_samples = subset_min
        print("minimal number of samples used: ", number_of_samples)
    else:
        # Cast to int: a fractional share times the row count is a float.
        number_of_samples = int(subset_share * X_train.shape[0])

    X_train_subset, y_train_subset = modelutil.extract_data_subset(
        X_train, y_train, number_of_samples)
    print("Got subset sizes X train: {} and y train: {}".format(
        X_train_subset.shape, y_train_subset.shape))

    print("[Step 2]: Define test parameters")
    if parameters is None:
        # If no parameters have been defined, then do full definition.
        # Guides used from
        # https://www.kaggle.com/evanmiller/pipelines-gridsearch-awesome-ml-pipelines

        # Main set of parameters for grid-search run 1: select scaler,
        # sampler and kernel for the problem.
        test_scaler = [
            StandardScaler(),
            RobustScaler(),
            QuantileTransformer(),
            Normalizer()
        ]
        # NOTE(review): additional under-samplers (ClusterCentroids,
        # RandomUnderSampler, NearMiss, EditedNearestNeighbours, AllKNN,
        # CondensedNearestNeighbour, InstanceHardnessThreshold) were
        # deliberately disabled in the original grid.
        test_sampling = [
            modelutil.Nosampler(),
            SMOTE(),
            SMOTEENN(),
            SMOTETomek(),
            ADASYN()
        ]
        test_C = [1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3]
        test_C_linear = [1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2]

        # gamma default parameters
        param_scale = 1 / (X_train.shape[1] * np.mean(X_train.var()))

        parameters = [
            {
                'scaler': test_scaler,
                'sampling': test_sampling,
                'feat__cols': selected_features,
                'svm__C': test_C,  # default C=1
                'svm__kernel': ['sigmoid']
            },
            {
                'scaler': test_scaler,
                'sampling': test_sampling,
                'feat__cols': selected_features,
                'svm__C': test_C_linear,  # default C=1
                'svm__kernel': ['linear']
            },
            {
                'scaler': test_scaler,
                'sampling': test_sampling,
                'feat__cols': selected_features,
                'svm__C': test_C,  # default C=1
                'svm__kernel': ['poly'],
                'svm__degree': [2, 3]  # Only relevant for poly
            },
            {
                'scaler': test_scaler,
                'sampling': test_sampling,
                'feat__cols': selected_features,
                'svm__C': test_C,  # default C=1
                'svm__kernel': ['rbf'],
                # Only relevant in rbf, default='auto'=1/n_features
                'svm__gamma': [param_scale, 1e-3, 1e-2, 1e-1, 1e0, 1e1,
                               1e2, 1e3]
            }
        ]

        # If no missing values, only one imputer strategy shall be used
        if X_train.isna().sum().sum() > 0:
            # BUG FIX: `parameters` is a *list* of grid dicts; the
            # original `parameters['imputer__strategy'] = ...` indexed the
            # list with a string and raised TypeError. Add the imputer
            # strategies to every grid in the list instead.
            for param_grid in parameters:
                param_grid['imputer__strategy'] = [
                    'mean', 'median', 'most_frequent'
                ]
            print("Missing values used. Test different imputer strategies")
        else:
            print("No missing values. No imputer necessary")

        print("Selected Parameters: ", parameters)
    else:
        print("Parameters defined in the input: ", parameters)

    # Main pipeline for the grid search
    pipe_run1 = Pipeline([
        ('imputer', SimpleImputer(missing_values=np.nan, strategy='median')),
        ('scaler', StandardScaler()),
        ('sampling', modelutil.Nosampler()),
        ('feat', modelutil.ColumnExtractor(cols=None)),
        ('svm', SVC())
    ])
    print("Pipeline: ", pipe_run1)

    print("Stratified KFold={} used.".format(n_splits))
    # INFO: KFold splitter with shuffle=True to get random splits
    skf = StratifiedKFold(n_splits=n_splits, random_state=3, shuffle=True)

    params_run1 = parameters
    grid_search_run1 = GridSearchCV(
        pipe_run1, params_run1, verbose=2, cv=skf, scoring=scorers,
        refit=refit_scorer_name, return_train_score=True,
        n_jobs=-1).fit(X_train_subset, y_train_subset)

    results_run1 = modelutil.generate_result_table(
        grid_search_run1, params_run1, refit_scorer_name)
    print("Result size=", results_run1.shape)
    print("Number of NaN results: {}. Replace them with 0".format(
        np.sum(results_run1['mean_test_' + refit_scorer_name].isna())))

    return grid_search_run1, params_run1, pipe_run1, results_run1