# Percentages matching ratio_list one-to-one; only used to label each trained model.
percentage_list = [4, 10, 25, 35, 50]

# Run the ADASYN oversampling strategy for every ratio that was tested.
# `ratio_list`, `X`, `y` and `xg_boost` are defined earlier in this file.
#
# The stratified train/test split is deterministic (fixed random_state, and
# X/y are not modified below), so it is computed once here instead of being
# recomputed identically on every loop iteration.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=47, stratify=y)

for ratio, percentage in zip(ratio_list, percentage_list):
    # Initialize an ADASYN sampler with the ratio under test.
    over = ADASYN(sampling_strategy=ratio)
    # Wrap it in a pipeline (extra steps can be added here if required).
    steps = [('o', over)]
    pipeline = Pipeline(steps)
    # Resample the training data only; the held-out test set stays untouched.
    x_res, y_res = pipeline.fit_resample(x_train, y_train)
    print('resample finished')
    # Train an xg_boost model with the resampled data.
    # NOTE(review): `xgb` does not appear to be read within this fragment,
    # but the assignment is kept in case later code uses it — confirm.
    xgb = xg_boost(x_res, y_res, x_test, y_test, f"ADASYN_{percentage}")

# The code below was used to calculate the running times.
# Since some running times were very long, we let the code time-out after 10 hours.
# It is less relevant for WWF, hence it is commented out.
# List of sub-sample sizes that were evaluated to calculate running times.
# subset_list = [30000, 50000, 75000, 100000, 200000, 300000, 400000, 500000, 600000, 700000, 800000, 900000, 1000000, 1500000, 2000000]
# times_subsetsize_list = []
# def calculate_running_times():
#     for i in subset_list:
#         start = time.time()
#         x_rest, x_sub, y_rest, y_sub = train_test_split(X, y, test_size=i/len(X), stratify=y, random_state=47)
#         print("ADASYN", i)
#Create a train-test split where the ratio of target class is maintained x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=47, stratify=y) #Initialize a SMOTETomek sampler with ratio that will be tested over = imblearn.combine.SMOTETomek(sampling_strategy=ratio) #Initialize a pipeline (One can add extra steps here if required) steps = [('o', over)] pipeline = Pipeline(steps) #Resample data x_train_res, y_train_res = pipeline.fit_resample(x_train, y_train) print('resample finished') #Train an xg_boost model with resampled data xg_boost(x_train_res, y_train_res, x_test, y_test, f"smote_tomek{percentage}") # The code below was used to calculate the running times. # Since some running times were very long, we let the code time-out after 10 hours. # It is less relevant for WWF, hence it is commented out. #List of sub-sample sizes that were evaluated to calculate running times. # subset_list = [30000, 50000, 75000, 100000, 200000, 300000, 400000, 500000, 600000, 700000, 800000, 900000, 1000000, 1500000, 2000000] # times_subsetsize_list = [] # # def calculate_running_times(): # for i in subset_list: # start = time.time() # print("SMOTETomek", i) # x_rest, x_sub, y_rest, y_sub = train_test_split(X, y, test_size=i/len(X), stratify=y, random_state=47) # over = imblearn.combine.SMOTETomek(sampling_strategy=0.042)
#1 = random undersampling, 2 = random oversampling, 3 = random under and oversampling combined. sampler_choice = 2 #test #Sampling_strategy variable is used to set the sampling ratio. sampling_strategy = 1 if sampler_choice == 1: #Initialize a random over sampler with ratio that will be tested under = imb.under_sampling.RandomUnderSampler( sampling_strategy=sampling_strategy) #Initialize a pipeline (One can add extra steps here if required) steps = [('u', under)] pipeline = imb.pipeline.Pipeline(steps) #Resample data x_train, y_train = pipeline.fit_resample(x_train, y_train) #Train an xg_boost model with resampled data xg_boost(x_train, y_train, x_test, y_test, 'Random Undersampling') elif sampler_choice == 2: #Initialize a random under sampler with ratio that will be tested over = imb.over_sampling.RandomOverSampler( sampling_strategy=sampling_strategy) #Initialize a pipeline (One can add extra steps here if required) steps = [('o', over)] pipeline = imb.pipeline.Pipeline(steps) #Resample data x_train, y_train = pipeline.fit_resample(x_train, y_train) #Train an xg_boost model with resampled data xg_boost(x_train, y_train, x_test, y_test, 'Random Oversampling') elif sampler_choice == 3: #Initialize a random over sampler, then a random undersampler with ratios that will be tested over = imb.over_sampling.RandomOverSampler(sampling_strategy=0.10) under = imb.under_sampling.RandomUnderSampler(
# Create a train-test split where the ratio of target class is maintained x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=47, stratify=y) #Initialize a SMOTE sampler with ratio that will be tested over = SMOTE(sampling_strategy=ratio) #Initialize a pipeline (One can add extra steps here if required) steps = [('o', over)] pipeline = Pipeline(steps) #Resample data x_res, y_res = pipeline.fit_resample(x_train, y_train) print('resample finished') #Train an xg_boost model with resampled data xgb = xg_boost(x_res, y_res, x_test, y_test, f"SMOTE_{percentage}") # The code below was used to calculate the running times. # Since some running times were very long, we let the code time-out after 10 hours. # # It is less relevant for WWF, hence it is commented out. #List of sub-sample sizes that were evaluated to calculate running times. # subset_list = [30000, 50000, 75000, 100000, 200000, 300000, 400000, 500000, 600000, 700000, 800000, 900000, 1000000, 1500000, 2000000] # times_subsetsize_list = [] # def calculate_running_times(): # for i in subset_list: # start = time.time() # x_rest, x_sub, y_rest, y_sub = train_test_split(X, y, test_size=i/len(X), stratify=y, random_state=47) # # print("SMOTE", i)
# Create a train-test split where the ratio of target class is maintained x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=47, stratify=y) # Initialize a TomekLinks sampler under = imblearn.under_sampling.TomekLinks(sampling_strategy='majority') # Initialize a pipeline (One can add extra steps here if required) steps = [('o', under)] pipeline = Pipeline(steps) # Resample data x_train_res, y_train_res = pipeline.fit_resample(x_train, y_train) # Train an xg_boost model with resampled data xg_boost(x_train_res, y_train_res, x_test, y_test, f"tomek_links{len(X)}") # The code below was used to calculate the running times. # Since some running times were very long, we let the code time-out after 10 hours. # It is less relevant for WWF, hence it is commented out. #List of sub-sample sizes that were evaluated to calculate running times. # subset_list = [30000, 50000, 75000, 100000, 200000, 300000, 400000, 500000, 600000, 700000, 800000, 900000, 1000000, 1500000, 2000000] # times_subsetsize_list = [] # # def calculate_running_times(): # for i in subset_list: # start = time.time() # x_rest, x_sub, y_rest, y_sub = train_test_split(X, y, test_size=i/len(X), stratify=y, random_state=47) # # Third pipeline Tomek links # print("Tomek", i)