def use_debug_parameters(self, reduced_selected_features):
    """Return a reduced SVM parameter grid for fast debug runs.

    The grid is a list of dicts so that different search spaces can be
    combined in a single GridSearchCV call: one linear-kernel sweep and
    one minimal rbf baseline.
    """
    linear_space = {
        'scaler': [StandardScaler()],
        'sampling': [modelutil.Nosampler(), SMOTE(), SMOTEENN(), ADASYN()],
        'feat__cols': reduced_selected_features[0:2],
        'model__kernel': ['linear'],
        'model__C': [0.1, 1, 10],
        'model__gamma': [0.1, 1, 10],
    }
    rbf_space = {
        'scaler': [StandardScaler(), Normalizer()],
        'sampling': [modelutil.Nosampler()],
        'feat__cols': reduced_selected_features[0:1],
        'model__C': [1],  # default C=1
        'model__kernel': ['rbf'],
        'model__gamma': [1],  # only relevant for rbf; default='auto'=1/n_features
    }
    return [linear_space, rbf_space]
def use_debug_parameters(self, reduced_selected_features):
    """Return a reduced XGBoost parameter grid for fast debug runs."""
    ### XGBOOST CODE start
    xgb_space = {
        'scaler': [StandardScaler()],
        'sampling': [modelutil.Nosampler(), SMOTE(), SMOTEENN(), ADASYN()],
        'feat__cols': reduced_selected_features[0:2],
        'model__nthread': [4],  # when use hyperthread, xgboost may become slower
        'model__objective': ['binary:logistic'],
        'model__learning_rate': [0.05, 0.5],  # so called `eta` value
        'model__max_depth': [6, 7, 8],
        'model__min_child_weight': [11],
        'model__silent': [1],
        'model__subsample': [0.8],
        'model__colsample_bytree': [0.7],
        'model__n_estimators': [5, 10],  # number of trees, change it to 1000 for better results
        'model__missing': [-999],
        'model__seed': [1337],
    }
    return [xgb_space]
def use_parameters(self, X_train, selected_features):
    """Build the default linear-SVM parameter grid for a grid search.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features; only used to detect missing values.
    selected_features : list
        Candidate feature-column subsets for the ColumnExtractor step.

    Returns
    -------
    list of dict
        GridSearchCV-compatible parameter grid.
    """
    test_scaler = [
        StandardScaler(),
        RobustScaler(),
        QuantileTransformer(),
        Normalizer()
    ]
    test_sampling = [
        modelutil.Nosampler(),
        ClusterCentroids(),
        RandomUnderSampler(),
        # NearMiss(version=1),
        # EditedNearestNeighbours(),
        # AllKNN(),
        # CondensedNearestNeighbour(random_state=0),
        # InstanceHardnessThreshold(random_state=0,
        #                           estimator=LogisticRegression(solver='lbfgs', multi_class='auto')),
        RandomOverSampler(random_state=0),
        SMOTE(),
        BorderlineSMOTE(),
        SMOTEENN(),
        SMOTETomek(),
        ADASYN()
    ]
    test_C_linear = [1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2]

    parameters = [{
        'scaler': test_scaler,
        'sampling': test_sampling,
        'feat__cols': selected_features,
        'model__C': test_C_linear,  # default C=1
        'model__kernel': ['linear']
    }]

    # If no missing values, only one imputer strategy shall be used.
    # BUGFIX: `parameters` is a list of dicts; the original code assigned a
    # string key to the list itself, which raises TypeError. The strategy
    # must be added to every grid dict instead.
    if X_train.isna().sum().sum() > 0:
        for param_dict in parameters:
            param_dict['imputer__strategy'] = [
                'mean', 'median', 'most_frequent'
            ]
        print("Missing values used. Test different imputer strategies")
    else:
        print("No missing values. No imputer necessary")

    print("Selected Parameters: ", parameters)
    # else: print("Parameters defined in the input: ", parameters)
    return parameters
def create_pipeline(self):
    """Assemble the SVC modelling pipeline.

    Steps: impute (median) -> scale -> resample -> select columns -> SVC.
    The scaler/sampler/column placeholders are replaced during grid search.
    """
    steps = [
        ('imputer', SimpleImputer(missing_values=np.nan, strategy='median')),
        ('scaler', StandardScaler()),
        ('sampling', modelutil.Nosampler()),
        ('feat', modelutil.ColumnExtractor(cols=None)),
        ('model', SVC()),
    ]
    return Pipeline(steps)
def create_pipeline(self):
    """Assemble the XGBoost modelling pipeline.

    Steps: impute (median) -> scale -> resample -> select columns ->
    XGBClassifier. The scaler/sampler/column placeholders are replaced
    during grid search.
    """
    steps = [
        ('imputer', SimpleImputer(missing_values=np.nan, strategy='median')),
        ('scaler', StandardScaler()),
        ('sampling', modelutil.Nosampler()),
        ('feat', modelutil.ColumnExtractor(cols=None)),
        ('model', XGBClassifier(use_label_encoder=False, eval_metric='logloss')),
    ]
    return Pipeline(steps)
def use_debug_parameters(self, reduced_selected_features):
    """Return a reduced Gaussian-naive-Bayes grid for fast debug runs.

    Sweeps `var_smoothing` over 100 log-spaced values in [1e-9, 1].
    """
    nb_space = {
        'scaler': [StandardScaler()],
        'sampling': [modelutil.Nosampler(), SMOTE(), SMOTEENN(), ADASYN()],
        'feat__cols': reduced_selected_features[0:2],
        'model__var_smoothing': np.logspace(0, -9, num=100),
    }
    return [nb_space]
def use_debug_parameters(self, reduced_selected_features):
    """Return a reduced k-nearest-neighbours grid for fast debug runs."""
    knn_space = {
        'scaler': [StandardScaler()],
        'sampling': [modelutil.Nosampler(), SMOTE(), SMOTEENN(), ADASYN()],
        'feat__cols': reduced_selected_features[0:2],
        'model__n_neighbors': [3, 5],
        'model__weights': ['uniform', 'distance'],
    }
    return [knn_space]
def create_pipeline(self):
    """Assemble the XGBoost modelling pipeline with parallel training.

    Uses all available CPU cores but one (floor of 1), so the machine
    stays responsive while XGBoost trains.

    Returns
    -------
    Pipeline
        impute (median) -> scale -> resample -> select columns -> XGBClassifier.
    """
    # BUGFIX: n_jobs was computed from cpu_count() and then immediately
    # dead-overwritten by a hard-coded `n_jobs = 10` (debug leftover),
    # which oversubscribes small machines. max(1, ...) also guards the
    # single-core case where cpu_count() - 1 would be 0.
    n_jobs = max(1, multiprocessing.cpu_count() - 1)
    print("Number of CPUs: {}. Using {}".format(
        multiprocessing.cpu_count(), n_jobs))
    pipe_run = Pipeline([('imputer',
                          SimpleImputer(missing_values=np.nan,
                                        strategy='median')),
                         ('scaler', StandardScaler()),
                         ('sampling', modelutil.Nosampler()),
                         ('feat', modelutil.ColumnExtractor(cols=None)),
                         ('model', XGBClassifier(use_label_encoder=False,
                                                 eval_metric='logloss',
                                                 n_jobs=n_jobs))])
    return pipe_run
def run_basic_svm(X_train, y_train, selected_features, scorers, refit_scorer_name,
                  subset_share=0.1, n_splits=5, parameters=None):
    '''Run an extensive grid search over all parameters to find the best parameters for
    SVM Classifier. The search shall be done only with a subset of the data. Default
    subset is 0.1. Input is training and test data.

    Returns the fitted GridSearchCV object, the parameter grid, the pipeline
    and the result table.
    '''
    # Create a subset to train on
    print("[Step 1]: Create a data subset")
    subset_min = 300  # Minimal subset is 300 samples.

    if subset_share * X_train.shape[0] < subset_min:
        number_of_samples = subset_min
        print("minimal number of samples used: ", number_of_samples)
    else:
        number_of_samples = subset_share * X_train.shape[0]

    X_train_subset, y_train_subset = modelutil.extract_data_subset(
        X_train, y_train, number_of_samples)
    print("Got subset sizes X train: {} and y train: {}".format(
        X_train_subset.shape, y_train_subset.shape))

    print("[Step 2]: Define test parameters")
    if parameters is None:  # If no parameters have been defined, then do full definition
        # Guides used from
        # https://www.kaggle.com/evanmiller/pipelines-gridsearch-awesome-ml-pipelines
        # Main set of parameters for the grid search run 1: Select scaler, sampler and kernel for the problem
        test_scaler = [
            StandardScaler(),
            RobustScaler(),
            QuantileTransformer(),
            Normalizer()
        ]
        test_sampling = [
            modelutil.Nosampler(),
            #ClusterCentroids(),
            #RandomUnderSampler(),
            #NearMiss(version=1),
            #EditedNearestNeighbours(),
            #AllKNN(),
            #CondensedNearestNeighbour(random_state=0),
            #InstanceHardnessThreshold(random_state=0,
            #                          estimator=LogisticRegression(solver='lbfgs', multi_class='auto')),
            SMOTE(),
            SMOTEENN(),
            SMOTETomek(),
            ADASYN()
        ]
        test_C = [1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3]
        test_C_linear = [1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2]

        # gamma default parameters
        param_scale = 1 / (X_train.shape[1] * np.mean(X_train.var()))

        parameters = [
            {
                'scaler': test_scaler,
                'sampling': test_sampling,
                'feat__cols': selected_features,
                'svm__C': test_C,  # default C=1
                'svm__kernel': ['sigmoid']
            },
            {
                'scaler': test_scaler,
                'sampling': test_sampling,
                'feat__cols': selected_features,
                'svm__C': test_C_linear,  # default C=1
                'svm__kernel': ['linear']
            },
            {
                'scaler': test_scaler,
                'sampling': test_sampling,
                'feat__cols': selected_features,
                'svm__C': test_C,  # default C=1
                'svm__kernel': ['poly'],
                'svm__degree': [2, 3]  # Only relevant for poly
            },
            {
                'scaler': test_scaler,
                'sampling': test_sampling,
                'feat__cols': selected_features,
                'svm__C': test_C,  # default C=1
                'svm__kernel': ['rbf'],
                'svm__gamma': [param_scale, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3]
                # Only relevant in rbf, default='auto'=1/n_features
            }
        ]

        # If no missing values, only one imputer strategy shall be used.
        # BUGFIX: `parameters` is a list of dicts; the original code assigned
        # a string key to the list itself, raising TypeError whenever the
        # data contained missing values. Add the strategy to each grid dict.
        if X_train.isna().sum().sum() > 0:
            for param_dict in parameters:
                param_dict['imputer__strategy'] = [
                    'mean', 'median', 'most_frequent'
                ]
            print("Missing values used. Test different imputer strategies")
        else:
            print("No missing values. No imputer necessary")

        print("Selected Parameters: ", parameters)
    else:
        print("Parameters defined in the input: ", parameters)

    # Main pipeline for the grid search
    pipe_run1 = Pipeline([('imputer',
                           SimpleImputer(missing_values=np.nan,
                                         strategy='median')),
                          ('scaler', StandardScaler()),
                          ('sampling', modelutil.Nosampler()),
                          ('feat', modelutil.ColumnExtractor(cols=None)),
                          ('svm', SVC())])

    print("Pipeline: ", pipe_run1)
    print("Stratified KFold={} used.".format(n_splits))
    # INFO: KFold splitter with shuffle=True to get random values
    skf = StratifiedKFold(n_splits=n_splits, random_state=3, shuffle=True)

    params_run1 = parameters  # params_debug for debugging runs
    grid_search_run1 = GridSearchCV(pipe_run1, params_run1, verbose=2, cv=skf,
                                    scoring=scorers, refit=refit_scorer_name,
                                    return_train_score=True,
                                    n_jobs=-1).fit(X_train_subset, y_train_subset)
    #grid_search_run1 = GridSearchCV(pipe_run1, params_run1, verbose=1, cv=skf, scoring=scorers, refit=refit_scorer_name,
    #                                return_train_score=True, iid=True, n_jobs=-1).fit(X_train_subset, y_train_subset)

    results_run1 = modelutil.generate_result_table(grid_search_run1,
                                                   params_run1, refit_scorer_name)
    print("Result size=", results_run1.shape)
    print("Number of NaN results: {}. Replace them with 0".format(
        np.sum(results_run1['mean_test_' + refit_scorer_name].isna())))

    return grid_search_run1, params_run1, pipe_run1, results_run1
def use_parameters(self, X_train, selected_features):
    """Build the default XGBoost parameter grid for a grid search.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features; only used to detect missing values.
    selected_features : list
        Candidate feature-column subsets for the ColumnExtractor step.

    Returns
    -------
    list of dict
        GridSearchCV-compatible parameter grid.
    """
    test_scaler = [
        StandardScaler(),
        RobustScaler(),
        QuantileTransformer(),
        Normalizer()
    ]
    test_sampling = [
        modelutil.Nosampler(),
        ClusterCentroids(),
        RandomUnderSampler(),
        # NearMiss(version=1),
        # EditedNearestNeighbours(),
        # AllKNN(),
        # CondensedNearestNeighbour(random_state=0),
        # InstanceHardnessThreshold(random_state=0,
        #                           estimator=LogisticRegression(solver='lbfgs', multi_class='auto')),
        RandomOverSampler(random_state=0),
        SMOTE(),
        BorderlineSMOTE(),
        SMOTEENN(),
        SMOTETomek(),
        ADASYN()
    ]

    ### XGBOOST
    parameters = [{
        'scaler': test_scaler,
        'sampling': test_sampling,
        'feat__cols': selected_features,
        'model__nthread': [4],  # when use hyperthread, xgboost may become slower
        'model__objective': ['binary:logistic'],
        'model__learning_rate': [0.005, 0.01, 0.05, 0.1, 0.5],  # so called `eta` value
        'model__max_depth': [6, 7, 8],
        'model__min_child_weight': [11],
        'model__silent': [0],
        'model__subsample': [0.8],
        'model__colsample_bytree': [0.7],
        'model__n_estimators': [5, 100, 1000],  # number of trees, change it to 1000 for better results
        'model__missing': [-999],
        'model__seed': [1337]
    }]

    # If no missing values, only one imputer strategy shall be used.
    # BUGFIX: `parameters` is a list of dicts; the original code assigned a
    # string key to the list itself, which raises TypeError. The strategy
    # must be added to every grid dict instead.
    if X_train.isna().sum().sum() > 0:
        for param_dict in parameters:
            param_dict['imputer__strategy'] = [
                'mean', 'median', 'most_frequent'
            ]
        print("Missing values used. Test different imputer strategies")
    else:
        print("No missing values. No imputer necessary")

    print("Selected Parameters: ", parameters)
    # else: print("Parameters defined in the input: ", parameters)
    ### XGBOOST
    return parameters