def _ANASYN(self):
    """Rebalance the training set: under-sample, then over-sample with ADASYN.

    ADAptive SYNthetic (ADASYN) sampling generates minority-class samples
    adaptively, according to their distribution, using K nearest
    neighbours; it makes no assumption about the underlying distribution
    of the data.

    Steps applied to ``self.x_train`` / ``self.y_train``:

    1. Under-sample with ``InstanceHardnessThreshold``
       (``sampling_strategy=0.2``, seeded with ``self.seed``).
    2. Over-sample the result with ``ADASYN`` (seeded with ``self.seed``).

    The raw output of each step is kept on the instance
    (``X_train_smote2`` / ``y_train_smote2`` for step 1,
    ``X_train_smote`` / ``y_train_smote`` for step 2), and after each step
    ``self.x_train`` / ``self.y_train`` are rebuilt as DataFrames.
    Progress is reported with ``print``.

    Returns:
        None. The instance is modified in place.
    """
    target_column = "Local Relapse Y(1) /N(0)"
    # Captured once: both rebuilds below reuse the original feature names.
    feature_columns = self.x_train.columns

    # Step 1: under-sampling.
    print("before: ", len(self.x_train))
    resampler = uns.InstanceHardnessThreshold(
        sampling_strategy=0.2, random_state=self.seed
    )
    self.X_train_smote2, self.y_train_smote2 = resampler.fit_resample(
        self.x_train, self.y_train
    )
    self.x_train = pd.DataFrame(self.X_train_smote2, columns=feature_columns)
    self.y_train = pd.DataFrame(self.y_train_smote2, columns=[target_column])
    print("after: ", len(self.x_train))

    # Step 2: ADASYN over-sampling. ``fit_resample`` is used here as well,
    # matching step 1; ``fit_sample`` was only a deprecated alias of it.
    adasyn = ADASYN(random_state=self.seed)
    self.X_train_smote, self.y_train_smote = adasyn.fit_resample(
        self.x_train, self.y_train
    )
    # Preview the second resampled row by position. Plain ``[1]`` is a
    # column lookup when the sampler hands back a DataFrame.
    print("X_train_SMOTE:\n", pd.DataFrame(self.X_train_smote).iloc[1].values)
    self.x_train = pd.DataFrame(self.X_train_smote, columns=feature_columns)
    self.y_train = pd.DataFrame(self.y_train_smote, columns=[target_column])

    print("len smote: \n", len(self.X_train_smote))
    print("len new x_train: \n", len(self.x_train))
    positives = self.y_train.loc[self.y_train[target_column] == 1]
    print("number positive responses y_train:\n", len(positives))
# Root directory under which the extracted features are kept
path_features = '/data/prostate/extraction/mp-mri-prostate'
# Feature sub-directories to use
dce_features = ['ese-dce']
# File extension associated with each feature
ext_features = ['_ese__dce.npy']

# Ground-truth directories and the label provided for each of them
path_gt = [
    'GT_inv/prostate',
    'GT_inv/pz',
    'GT_inv/cg',
    'GT_inv/cap',
]
label_gt = [
    'prostate',
    'pz',
    'cg',
    'cap',
]

# Destination directory for the balanced data
path_store = '/data/prostate/balanced/mp-mri-prostate/exp-3'

# Value passed as ``n_jobs`` to every sampler that takes it
N_JOBS = -1

# Samplers to run: the under-samplers first, then the over-samplers.
# NOTE(review): ``SMOTE(kind=...)`` and the string ``estimator`` look like an
# older imbalanced-learn API -- confirm the pinned library version.
samplers = [
    under_sampling.InstanceHardnessThreshold(
        n_jobs=N_JOBS, estimator='random-forest'
    ),
    under_sampling.NearMiss(version=1, n_jobs=N_JOBS),
    under_sampling.NearMiss(version=2, n_jobs=N_JOBS),
    under_sampling.NearMiss(version=3, n_jobs=N_JOBS),
    under_sampling.RandomUnderSampler(),
] + [
    over_sampling.SMOTE(kind='regular', n_jobs=N_JOBS),
    over_sampling.SMOTE(kind='borderline1', n_jobs=N_JOBS),
    over_sampling.SMOTE(kind='borderline2', n_jobs=N_JOBS),
    over_sampling.RandomOverSampler(),
]

# Sub-folder names, listed in the same order as ``samplers``
sub_folder = [
    'iht', 'nm1', 'nm2', 'nm3', 'rus',
] + [
    'smote', 'smote-b1', 'smote-b2', 'ros',
]

# Generate the different paths to be treated later
def main():
    """Build an ``InstanceHardnessThreshold`` sampler and wrap it.

    NOTE(review): only the construction of the two objects is visible here;
    ``wrapped_est`` is not used in the lines shown, so the function may
    continue beyond this excerpt -- confirm against the full file.
    """
    # test the estimators
    # Job count forwarded to the imbalanced-learn sampler.
    n_jobs = 10
    imblearn_est = under_sampling.InstanceHardnessThreshold(n_jobs=n_jobs)
    # Wrap the sampler in ImblearnWrapper (defined outside this excerpt).
    wrapped_est = ImblearnWrapper(imblearn_est)