def test_enn_not_good_object():
    """An ``n_neighbors`` that is neither an int nor an estimator must fail.

    The sampler is built with a plain string for ``n_neighbors`` and
    ``fit_resample`` is expected to raise ``ValueError``.
    """
    # The spelling below is part of the text being matched; it is kept
    # verbatim on purpose.
    expected_message = (
        "n_neighbors must be an interger or an object compatible with the "
        "KNeighborsMixin API of scikit-learn"
    )
    sampler = EditedNearestNeighbours(n_neighbors="rnd", kind_sel="mode")
    with pytest.raises(ValueError, match=expected_message):
        sampler.fit_resample(X, Y)
def edited_KNN(X, Y):
    """Flag the samples that Edited Nearest Neighbours keeps.

    Parameters
    ----------
    X : array-like
        Feature matrix handed to ``EditedNearestNeighbours.fit_resample``.
    Y : sequence
        Labels, one per sample; its length defines the mask length.

    Returns
    -------
    tuple
        ``(True, mask)`` where ``mask`` is an integer array of ``len(Y)``
        entries holding 1 for every index in ``sample_indices_`` and 0
        elsewhere.
    """
    from imblearn.under_sampling import EditedNearestNeighbours

    enn = EditedNearestNeighbours()
    enn.fit_resample(X, Y)

    # One vectorized assignment instead of testing ``i in indexes`` for
    # every sample, which scanned the index array once per row.
    mask = np.zeros(len(Y), dtype=int)
    mask[enn.sample_indices_] = 1
    return True, mask
def edited_KNN(X, Y):
    """Flag the samples that Edited Nearest Neighbours keeps.

    Parameters
    ----------
    X : array-like
        Feature matrix; ``len(X)`` defines the mask length.
    Y : array-like
        Labels handed to ``EditedNearestNeighbours.fit_resample``.

    Returns
    -------
    tuple
        ``(True, mask)`` where ``mask`` is an integer array with 1 for every
        index in ``sample_indices_`` and 0 elsewhere.
    """
    from imblearn.under_sampling import EditedNearestNeighbours

    enn = EditedNearestNeighbours()
    enn.fit_resample(X, Y)

    # Build the mask in one vectorized step rather than appending to a list
    # while testing ``i in indexes`` for each row.
    mask = np.zeros(len(X), dtype=int)
    mask[enn.sample_indices_] = 1
    return True, mask
def fit_resample(self, X, y):
    """Undersample mixed continuous/categorical data with ENN.

    The categorical columns are one-hot encoded only to compute
    neighbourhoods; the returned rows are taken from the original ``X`` so
    categorical values come back un-encoded.

    Parameters
    ----------
    X : 2-D array or sparse matrix
        Input samples. Columns listed in ``self.categorical_features`` are
        treated as categorical, all others as continuous.
        (assumes ``self.categorical_features`` holds integer column
        indices, since it is compared against ``np.arange`` -- TODO confirm)
    y : array-like
        Target labels.

    Returns
    -------
    X_resampled, y_resampled
        Rows of ``X`` selected by the sampler and their labels.
    """
    n_features = X.shape[1]
    continuous_features = np.setdiff1d(np.arange(n_features),
                                       self.categorical_features)

    X_continuous = check_array(X[:, continuous_features],
                               accept_sparse=['csr', 'csc'])
    # Fixed: the original indexed with the bare name ``categorical_features``,
    # which is not defined in this scope.
    X_categorical = X[:, self.categorical_features]

    # Encode with the dtype of the continuous part unless that is ``object``.
    if X_continuous.dtype.name != 'object':
        dtype_ohe = X_continuous.dtype
    else:
        dtype_ohe = np.float64

    # NOTE(review): newer scikit-learn releases rename ``sparse`` to
    # ``sparse_output``; confirm against the pinned version.
    ohe = OneHotEncoder(sparse=True, handle_unknown='ignore', dtype=dtype_ohe)
    if sparse.issparse(X_categorical):
        X_categorical = X_categorical.toarray()
    X_ohe = ohe.fit_transform(X_categorical)

    X_encoded = sparse.hstack((X_continuous, X_ohe), format='csr')

    enn_balancer = EditedNearestNeighbours(sampling_strategy='all')
    _, y_resampled = enn_balancer.fit_resample(X_encoded, y)

    # Return the original (un-encoded) rows that were kept.
    selected_indices = enn_balancer.sample_indices_
    X_resampled = X[selected_indices, :]
    return X_resampled, y_resampled
def test_enn_fit_resample_with_nn_object():
    """ENN accepts a ``NearestNeighbors`` instance as ``n_neighbors``.

    Resamples the module-level ``X``/``Y`` fixtures with ``kind_sel="mode"``
    and compares the output against the expected samples and labels.
    """
    expected_X = np.array(
        [
            [-0.10903849, -0.12085181],
            [0.01936241, 0.17799828],
            [2.59928271, 0.93323465],
            [1.42772181, 0.526027],
            [1.92365863, 0.82718767],
            [0.25738379, 0.95564169],
            [-0.284881, -0.62730973],
            [0.57062627, 1.19528323],
            [0.78318102, 2.59153329],
            [0.35831463, 1.33483198],
            [-0.14313184, -1.0412815],
            [-0.09816301, -0.74662486],
            [0.52726792, -0.38735648],
            [0.2821046, -0.07862747],
        ]
    )
    expected_y = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2])

    neighbours = NearestNeighbors(n_neighbors=4)
    sampler = EditedNearestNeighbours(n_neighbors=neighbours, kind_sel="mode")
    actual_X, actual_y = sampler.fit_resample(X, Y)

    assert_array_equal(actual_X, expected_X)
    assert_array_equal(actual_y, expected_y)
def get_resample(X, Y):
    """Undersample ``(X, Y)`` with Edited Nearest Neighbours.

    Prints a start and an end message (in Japanese: "starting adjustment of
    imbalanced data" / "adjustment finished") around the resampling.

    Returns
    -------
    tuple
        ``(X_res, Y_res)`` as returned by ``fit_resample``.
    """
    print("不均衡データの調節開始.....")
    # SMOTEENN was tried as an alternative sampler and left disabled.
    sampler = EditedNearestNeighbours()
    X_res, Y_res = sampler.fit_resample(X, Y)
    print("調節終了")
    return (X_res, Y_res)
def test_enn_check_kind_selection():
    """Check that `kind_sel="all"` keeps fewer samples than `kind_sel="mode"`."""
    X, y = make_classification(
        n_samples=1000,
        n_classes=2,
        weights=[0.3, 0.7],
        random_state=0,
    )

    # Fit one sampler per selection rule and record how many samples survive.
    kept = {}
    for kind in ("all", "mode"):
        sampler = EditedNearestNeighbours(kind_sel=kind)
        sampler.fit_resample(X, y)
        kept[kind] = sampler.sample_indices_.size

    assert kept["all"] < kept["mode"]
def test_enn_fit_resample():
    """Default ENN on the module fixtures yields the expected subset."""
    expected_X = np.array([
        [-0.10903849, -0.12085181],
        [0.01936241, 0.17799828],
        [2.59928271, 0.93323465],
        [1.92365863, 0.82718767],
        [0.25738379, 0.95564169],
        [0.78318102, 2.59153329],
        [0.52726792, -0.38735648],
    ])
    expected_y = np.array([0, 0, 1, 1, 2, 2, 2])

    actual_X, actual_y = EditedNearestNeighbours().fit_resample(X, Y)

    assert_array_equal(actual_X, expected_X)
    assert_array_equal(actual_y, expected_y)
def load_from_csv(input_dir: str, counts_file: str = "normalized_counts.csv.gz", n_jobs=1, low_expression=0.1) -> (AnnData, AnnData, AnnData):
    """
    Load a counts matrix and its metadata, then build ENN/RENN-cleaned copies.

    :param input_dir: directory holding ``counts_file`` and ``meta.csv.gz``
    :param counts_file: name of the counts CSV inside ``input_dir``
    :param n_jobs: ``n_jobs`` forwarded to both samplers
    :param low_expression: a row of the counts matrix is kept only when its
        sum divided by the number of columns exceeds this threshold
    :return: ``(data, data_enn, data_renn)`` -- the full AnnData object and
        the subsets retained by EditedNearestNeighbours and
        RepeatedEditedNearestNeighbours respectively (classes taken from
        ``meta["Stage"]``)
    """
    logger.info("Reading {0}".format(input_dir))

    input_file = os.path.join(input_dir, counts_file)
    mtx = pd.read_csv(input_file, index_col=0)
    meta = pd.read_csv(os.path.join(input_dir, "meta.csv.gz"), index_col=0)
    # NOTE(review): re-indexing meta by its own index is a no-op for a unique
    # index; possibly meant to align with the columns of mtx -- confirm.
    meta = meta.loc[meta.index, :]

    logger.info(mtx.shape)

    # Filter low expressed genes (rows) with a vectorized comparison.
    expressed = (mtx.sum(axis=1) / mtx.shape[1] > low_expression).values
    mtx = mtx.loc[expressed, :]

    logger.info(mtx.shape)

    mtx = mtx.transpose()

    data = AnnData(mtx, obs=meta)
    data.obs = meta

    # ``return_indices`` is no longer accepted by the samplers; the kept row
    # positions are read from ``sample_indices_`` after fitting instead.
    logger.info("Perform ENN")
    enn = EditedNearestNeighbours(n_jobs=n_jobs)
    enn.fit_resample(mtx, meta["Stage"])
    idx_enn = enn.sample_indices_

    data_enn = AnnData(mtx.iloc[list(idx_enn), :], meta.iloc[idx_enn, :])
    data_enn.obs = meta.iloc[idx_enn, :]

    logger.info("Perform RENN")
    renn = RepeatedEditedNearestNeighbours(n_jobs=n_jobs)
    renn.fit_resample(mtx, meta["Stage"])
    idx_renn = renn.sample_indices_

    data_renn = AnnData(mtx.iloc[list(idx_renn), :], meta.iloc[idx_renn, :])
    data_renn.obs = meta.iloc[idx_renn, :]

    return data, data_enn, data_renn
def undersample_ENN(df, debug=True):
    """Undersample a DataFrame whose last column is the class label.

    Parameters
    ----------
    df : pandas.DataFrame
        All columns but the last are features; the last column is cast to
        ``int`` and used as the target.
    debug : bool
        When true, print the class counts before and after resampling.

    Returns
    -------
    pandas.DataFrame
        Resampled features with the target re-attached as the last column,
        using the original column names.
    """
    values = df.values
    X = values[:, :-1]
    y = values[:, -1].astype(int)
    if debug:
        print('Original dataset shape %s' % Counter(y))

    sampler = EditedNearestNeighbours(sampling_strategy="auto")
    X_res, y_res = sampler.fit_resample(X, y)

    feature_names = df.columns[:-1]
    target_name = df.columns[-1]
    df_resampled = pd.DataFrame(X_res, columns=feature_names)
    df_resampled.insert(len(df_resampled.columns), target_name, y_res)

    if debug:
        print('Resampled dataset shape %s' % Counter(y_res))
    return df_resampled
def load_data(filename_classes, filename_features, with_enn=False, denoise=False):
    """
    Load class labels and features from two ';'-separated files.

    :param filename_classes: file whose second column holds the class name
        of each row (first line is skipped as a header)
    :param filename_features: file whose columns 1..80 hold integer features
        (first line and first column are skipped)
    :param with_enn: run EditedNearestNeighbours on the loaded data
    :param denoise: with ``with_enn``, return only the samples ENN kept
    :return: ``(data, targets, class_labels, sample_indices)``.
        ``targets`` are indices into ``class_labels``. ``sample_indices`` are
        the rows kept by ENN, or an empty list when ``with_enn`` is false.
        ``data``/``targets`` are the resampled versions only when both
        ``with_enn`` and ``denoise`` are true.
    """
    print("loading classes")
    # Open through a context manager so the handle is closed after parsing.
    with open(filename_classes, "rb") as classes_file:
        class_data = numpy.loadtxt(classes_file,
                                   delimiter=";",
                                   dtype=numpy.dtype('U20'),
                                   skiprows=1,
                                   usecols=1)
    print("loading features")
    # remove first (index) column when loading the csv
    with open(filename_features, "rb") as features_file:
        feature_data = numpy.loadtxt(features_file,
                                     delimiter=";",
                                     dtype=int,
                                     skiprows=1,
                                     usecols=range(1, 81))

    # Sorted so the label -> index mapping is the same on every run; plain
    # set iteration order for strings varies between interpreter runs.
    class_labels = sorted(set(class_data.tolist()))
    class_dict = {name: index for index, name in enumerate(class_labels)}

    # Class assignment of every row as an index into class_labels.
    class_assignments = [class_dict[entry] for entry in class_data]

    if not with_enn:
        return feature_data, class_assignments, class_labels, list()

    # clean up the data to limit chances of noisy training samples
    print("ENN: cleaning up data for instance selection")
    # ``return_indices`` is no longer accepted by the sampler; the kept rows
    # are exposed as ``sample_indices_`` after fitting.
    enn = EditedNearestNeighbours()
    data_resampled, target_resampled = enn.fit_resample(
        feature_data, class_assignments)
    sample_indices = enn.sample_indices_

    if denoise:
        # do not return any data points considered noise by enn
        return data_resampled, target_resampled, class_labels, sample_indices
    return feature_data, class_assignments, class_labels, sample_indices
def edited_nearest_neighbour(X, y, visualize=False, pca2d=True, pca3d=True, tsne=True, pie_evr=True):
    """Undersample ``(X, y)`` with ENN and optionally plot the result.

    When ``visualize`` equals ``True`` the resampled labels are passed to
    ``hist_over_and_undersampling`` and the resampled data to ``pca_general``
    (with ``pca2d``, ``pca3d`` and ``pie_evr`` forwarded as ``d2``, ``d3``
    and ``pie_evr``). ``tsne`` is accepted but not used here.

    Returns
    -------
    tuple
        ``(X_res, y_res)`` from ``fit_resample``.
    """
    sampler = EditedNearestNeighbours()
    X_res, y_res = sampler.fit_resample(X, y)

    show_plots = visualize == True  # noqa: E712 -- equality kept on purpose
    if show_plots:
        hist_over_and_undersampling(y_res)
        pca_general(X_res, y_res, d2=pca2d, d3=pca3d, pie_evr=pie_evr)

    return X_res, y_res
def readFile(path, y_label, method, encode_features=[], skew_exempted=[], training_ratio=0.7, shuffle=True, needSkew=False, fea_eng=True):
    """Read a CSV, preprocess it, split it and optionally rebalance the train set.

    Parameters
    ----------
    path : str
        CSV file to read.
    y_label : str
        Name of the target column.
    method : str
        ``'OverSample'`` applies ADASYN to the training split,
        ``'UnderSample'`` applies EditedNearestNeighbours; any other value
        leaves the split untouched.
    encode_features : list
        Columns to one-hot encode with ``pd.get_dummies``.
        (The default list is never mutated here.)
    skew_exempted : list
        Non-object columns to exclude from the skewness correction.
    training_ratio : float
        Forwarded as the third argument of ``split``.
    shuffle : bool
        Shuffle the rows before any other processing.
    needSkew : bool
        Apply ``log1p`` to non-object columns whose skewness exceeds 0.75.
    fea_eng : bool
        Currently unused.

    Returns
    -------
    X_train, X_test, y_train, y_test
    """
    raw = pd.read_csv(path)

    if shuffle:
        raw = raw.sample(frac=1).reset_index(drop=True)

    if needSkew:
        candidates = raw.dtypes[raw.dtypes != "object"].index.drop(skew_exempted)
        skewness = raw[candidates].apply(lambda col: skew(col.dropna()))
        skewed = skewness[skewness > 0.75].index
        raw[skewed] = np.log1p(raw[skewed])  # reduce skewness

    raw = pd.get_dummies(raw, columns=encode_features)  # encode categorical features
    # numeric_only avoids a TypeError on recent pandas when a non-numeric
    # column is left un-encoded; numeric columns are filled as before.
    raw = raw.fillna(raw.mean(numeric_only=True))

    X = raw.drop(y_label, axis=1)
    y = raw[y_label]
    X_train, X_test, y_train, y_test = split(X, y, training_ratio)

    if method == 'OverSample':
        X_train, y_train = ADASYN(random_state=42).fit_resample(X_train, y_train)
    elif method == 'UnderSample':
        # EditedNearestNeighbours has no ``random_state`` parameter in
        # current imbalanced-learn releases; passing it raises TypeError.
        X_train, y_train = EditedNearestNeighbours().fit_resample(X_train, y_train)

    return X_train, X_test, y_train, y_test
def under_sample_data(matrix, y_train):
    """Clean the majority class with Tomek links followed by ENN.

    The class distribution is written to the log (via ``add_to_log``) before
    resampling and after each of the two steps.

    Parameters
    ----------
    matrix : array-like
        Feature matrix.
    y_train : array-like
        Class labels.

    Returns
    -------
    tuple
        ``(X_res, y_res)`` after both cleaning steps.
    """
    add_to_log('Under Sampling')
    add_to_log('Sample distribution %s' % Counter(y_train))

    # Neither sampler takes ``random_state`` in current imbalanced-learn
    # releases, so it is not passed.

    # clean proximity samples using TomekLinks
    tl = TomekLinks(sampling_strategy='majority', n_jobs=-1)
    X_res, y_res = tl.fit_resample(matrix, y_train)
    add_to_log('TomekLinks distribution %s' % Counter(y_res))

    enn = EditedNearestNeighbours(sampling_strategy='majority', n_jobs=-1)
    X_res, y_res = enn.fit_resample(X_res, y_res)
    add_to_log('EditedNearestNeighbours distribution %s' % Counter(y_res))

    return X_res, y_res
def get_data(force_reload=False, strategy='oversampling', test_size=0.15):
    """Return train/validation arrays, using an on-disk cache per strategy.

    If all four ``.npy`` cache files for ``strategy`` exist under
    ``DATA_DIR`` and ``force_reload`` is false, they are loaded and returned.
    Otherwise ``train.csv`` is read, converted with ``to_data_format``,
    split, the training part is resampled according to ``strategy``
    (``'oversampling'``, ``'combine'``, ``'undersampling'`` or
    ``'condensed-undersampling'``; any other value means no resampling), and
    the four arrays are saved before being returned.

    Returns
    -------
    X_train, X_val, y_train, y_val
    """
    train_data_file = os.path.join(DATA_DIR, 'train_data.{}.npy'.format(strategy))
    train_labels_file = os.path.join(DATA_DIR, 'train_labels.{}.npy'.format(strategy))
    val_data_file = os.path.join(DATA_DIR, 'val_data.{}.npy'.format(strategy))
    val_labels_file = os.path.join(DATA_DIR, 'val_labels.{}.npy'.format(strategy))

    cache_files = (train_data_file, train_labels_file, val_data_file, val_labels_file)
    cache_complete = all(os.path.exists(path) for path in cache_files)

    # Fast path: everything is cached and a reload was not requested.
    if cache_complete and not force_reload:
        X_train = np.load(train_data_file)
        y_train = np.load(train_labels_file)
        X_val = np.load(val_data_file)
        y_val = np.load(val_labels_file)
        return X_train, X_val, y_train, y_val

    train_df = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
    X, y = to_data_format(train_df)
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=test_size)
    print('Shapes before: {}, {}'.format(X_train.shape, y_train.shape))

    # Samplers are built lazily so only the selected one is constructed.
    sampler_factories = {
        'oversampling': lambda: SMOTE(n_jobs=n_jobs),
        'combine': lambda: SMOTEENN(smote=SMOTE(n_jobs=n_jobs),
                                    enn=EditedNearestNeighbours(n_jobs=n_jobs)),
        'undersampling': lambda: EditedNearestNeighbours(n_jobs=n_jobs),
        'condensed-undersampling': lambda: CondensedNearestNeighbour(n_jobs=n_jobs, n_neighbors=3),
    }
    make_sampler = sampler_factories.get(strategy)
    if make_sampler is not None:
        X_train, y_train = make_sampler().fit_resample(X_train, y_train)

    print('Shapes after: {}, {}'.format(X_train.shape, y_train.shape))

    np.save(train_data_file, X_train)
    np.save(train_labels_file, y_train)
    np.save(val_data_file, X_val)
    np.save(val_labels_file, y_val)

    return X_train, X_val, y_train, y_val
def ENN_us(X_train, Y_train, seed, sampling_strategy, n_neighbors=3, kind_sel='all'):
    """Undersample with Edited Nearest Neighbours, then shuffle the result.

    Parameters
    ----------
    X_train, Y_train : array-like
        Training samples and labels.
    seed : int
        Passed to ``shuffle_dataset`` for the final shuffle.
    sampling_strategy : str, list or callable
        Forwarded to ``EditedNearestNeighbours``.
    n_neighbors : int
        Neighbourhood size forwarded to the sampler.
    kind_sel : str
        Selection rule forwarded to the sampler.

    Returns
    -------
    tuple
        ``(X_train_resampled, Y_train_resampled)`` after resampling and
        shuffling. Class counts before and after are printed.
    """
    # ``random_state`` is not passed: current imbalanced-learn releases do
    # not accept it for this sampler. ``seed`` is still used for shuffling.
    enn = EditedNearestNeighbours(n_jobs=-1,
                                  n_neighbors=n_neighbors,
                                  kind_sel=kind_sel,
                                  sampling_strategy=sampling_strategy)

    print('Before ENN undersampling : ', sorted(Counter(Y_train).items()))
    X_train_resampled, Y_train_resampled = enn.fit_resample(X_train, Y_train)
    print('After ENN undersampling : ',
          sorted(Counter(Y_train_resampled).items()))

    X_train_resampled, Y_train_resampled = shuffle_dataset(
        X_train_resampled, Y_train_resampled, seed)
    return X_train_resampled, Y_train_resampled
def load_from_csv(input_dir: str, n_jobs: int = 10) -> (AnnData, AnnData, AnnData):
    """
    Load counts and metadata, then build ENN/RENN-cleaned copies.

    :param input_dir: directory holding ``normalized_counts.csv.gz`` and
        ``meta.csv.gz``
    :param n_jobs: ``n_jobs`` forwarded to both samplers (default 10, the
        value previously hard-coded)
    :return: ``(data, data_enn, data_renn)`` -- the full AnnData object and
        the subsets retained by EditedNearestNeighbours and
        RepeatedEditedNearestNeighbours respectively (classes taken from
        ``meta["Stage"]``)
    """
    logger.info("read")
    mtx = pd.read_csv(os.path.join(input_dir, "normalized_counts.csv.gz"),
                      index_col=0, engine="c")
    meta = pd.read_csv(os.path.join(input_dir, "meta.csv.gz"),
                       index_col=0, engine="c")
    # NOTE(review): re-indexing meta by its own index is a no-op for a unique
    # index; possibly meant to align with the columns of mtx -- confirm.
    meta = meta.loc[meta.index, :]

    mtx = mtx.transpose()

    data = AnnData(mtx, obs=meta)
    data.obs = meta

    # ``return_indices`` is no longer accepted by the samplers; the kept row
    # positions are read from ``sample_indices_`` after fitting instead.
    logger.info("enn")
    enn = EditedNearestNeighbours(n_jobs=n_jobs)
    enn.fit_resample(mtx, meta["Stage"])
    idx_enn = enn.sample_indices_

    data_enn = AnnData(mtx.iloc[list(idx_enn), :], meta.iloc[idx_enn, :])
    data_enn.obs = meta.iloc[idx_enn, :]

    logger.info("Repeated enn")
    renn = RepeatedEditedNearestNeighbours(n_jobs=n_jobs)
    renn.fit_resample(mtx, meta["Stage"])
    idx_renn = renn.sample_indices_

    data_renn = AnnData(mtx.iloc[list(idx_renn), :], meta.iloc[idx_renn, :])
    data_renn.obs = meta.iloc[idx_renn, :]

    return data, data_enn, data_renn
def sampling(X_train, y_train):
    """Resample the training data with five different strategies.

    Applies random over-sampling, random under-sampling, Tomek links, SMOTE
    and Edited Nearest Neighbours to the same input, and prints the class
    counts of the original data and of the first four results.

    Returns
    -------
    tuple
        ``(X_over, y_over, X_under, y_under, X_tl, y_tl, X_sm, y_sm,
        X_enn, y_enn)``
    """
    ran_over = RandomOverSampler(random_state=42)
    X_train_oversample, y_train_oversample = ran_over.fit_resample(X_train, y_train)

    ran_under = RandomUnderSampler(random_state=42)
    X_train_undersample, y_train_undersample = ran_under.fit_resample(X_train, y_train)

    # ``fit_resample`` is used everywhere: the older ``fit_sample`` alias is
    # gone from current imbalanced-learn releases.
    tl = TomekLinks(n_jobs=6)
    X_train_tl, y_train_tl = tl.fit_resample(X_train, y_train)

    sm = SMOTE(random_state=42, n_jobs=5)
    X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)

    enn = EditedNearestNeighbours()
    X_train_enn, y_train_enn = enn.fit_resample(X_train, y_train)

    print(np.unique(y_train, return_counts=True))
    print("after sampling")
    print("randomg over sampling")
    print(np.unique(y_train_oversample, return_counts=True))
    print("SMOTE sampling")
    print(np.unique(y_train_sm, return_counts=True))
    print("random under sampling")
    print(np.unique(y_train_undersample, return_counts=True))
    print("TomekLinks under sampling")
    print(np.unique(y_train_tl, return_counts=True))

    return (X_train_oversample, y_train_oversample,
            X_train_undersample, y_train_undersample,
            X_train_tl, y_train_tl,
            X_train_sm, y_train_sm,
            X_train_enn, y_train_enn)
# Bin the transaction amount of the dev test set into BinNum quantiles
# labelled Q1..Q<BinNum>.
TestSet_dev['QuantileAmt']=pd.qcut(x=TestSet_dev['TransactionAmt'], q=BinNum, labels=['Q'+str(X) for X in range(1,(BinNum+1))])

############################################################
# Build the training feature frame: columns C1..C14 plus one-hot encodings
# of ProductCD, P_emaildomain and QuantileAmt.
ColumnSelect=np.asarray(["C"+str(X) for X in range(1,15)])
TempTrain=TrainTransaction[ColumnSelect]
TempTrain=TempTrain.join([pd.get_dummies(data=TrainTransaction["ProductCD"]),
                          pd.get_dummies(data=TrainTransaction["P_emaildomain"]),
                          pd.get_dummies(data=TrainTransaction["QuantileAmt"])])

#Undersample (rows 1..999 only)
enn = EditedNearestNeighbours()
X_resampled, y_resampled = enn.fit_resample(TempTrain.iloc[1:1000:], TrainTransaction['isFraud'].iloc[1:1000:])

#Train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.1, random_state=42)

#Fit a LightGBM classifier with default parameters
LGBMModel=LGBMClassifier()
LGBMModel.fit(X_train, y_train)

#Predict on the held-out split. Predicting on TempTrain (as before) gave one
#prediction per training row, which cannot be scored against y_test.
Predictions=LGBMModel.predict(X_test)

#Metrics
print(confusion_matrix(y_test, Predictions))
print(classification_report(y_test, Predictions))
def test_enn_not_good_object():
    """A string ``n_neighbors`` must make ``fit_resample`` raise ValueError."""
    sampler = EditedNearestNeighbours(n_neighbors='rnd', kind_sel='mode')
    expected_fragment = "has to be one of"
    with raises(ValueError, match=expected_fragment):
        sampler.fit_resample(X, Y)
X_best_train = f_classif_select.fit_transform(X_train, y_train) X_best_test = f_classif_select.fit_transform(X_test, y_test) knn.fit(X_best_train, y_train) y_pred = knn.predict(X_best_test) scores[i].append(metric(y_test, y_pred)) for dataset_score in scores: print(np.mean(dataset_score)) if not SKIP_ENN: scores = [[] for _ in range(len(datasets))] for i, dataset in enumerate(datasets): X, y = dataset X, y = enn.fit_resample(X, y) for train_index, test_index in rskf.split(X, y): X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index] f_classif_select = SelectKBest(k=K_BEST) X_best_train = f_classif_select.fit_transform(X_train, y_train) X_best_test = f_classif_select.fit_transform(X_test, y_test) knn.fit(X_best_train, y_train) y_pred = knn.predict(X_best_test) scores[i].append(metric(y_test, y_pred)) for dataset_score in scores: print(np.mean(dataset_score))
def test_deprecation_random_state():
    """Passing ``random_state`` must emit a DeprecationWarning on fit."""
    expected_message = "'random_state' is deprecated from 0.4"
    sampler = EditedNearestNeighbours(random_state=0)
    with warns(DeprecationWarning, match=expected_message):
        sampler.fit_resample(X, Y)
def validateFitModel(X_train, y_train, X_test=None, y_test=None, cv=False, target=None):
    """Select training samples with ENN, then validate or fit an XGBoost model.

    The features are scaled (RobustScaler followed by MinMaxScaler) only to
    run EditedNearestNeighbours; the kept indices are shuffled, pickled to
    ``sampled_dfs_<target>.bin`` under ``MODELS_PATH`` and used to subset
    the *unscaled* ``X_train``/``y_train``.

    Parameters
    ----------
    X_train, y_train : array-like supporting integer-array indexing
        Training samples and labels.
    X_test, y_test : optional
        Forwarded to ``validate`` when ``cv`` is true.
    cv : bool
        True: call ``validate`` with a one-point parameter grid.
        False: fit the model and pickle it to
        ``<target>_fitted_classifier.bin`` under ``MODELS_PATH``.
    target : str
        Used in the output file names and forwarded to ``validate``.

    Returns
    -------
    None
    """
    # NOTE(review): quantile_range is expressed in percent by scikit-learn,
    # so (0.1, 0.90) is the 0.1th..0.9th percentile -- confirm that (10, 90)
    # was not intended.
    rs = RobustScaler(quantile_range=(0.1, 0.90))
    mms = MinMaxScaler()
    X_train_mms = mms.fit_transform(rs.fit_transform(X_train))

    # ``return_indices`` and ``random_state`` are not accepted by current
    # imbalanced-learn releases; kept rows come from ``sample_indices_``.
    enn = EditedNearestNeighbours(n_neighbors=1, sampling_strategy=[7, 10])
    enn.fit_resample(X_train_mms, y_train)
    indexes = enn.sample_indices_

    # Random permutation of the kept indices.
    resampling_index = random.sample(range(len(indexes)), len(indexes))
    sampled_indexes = indexes[resampling_index]

    with open(os.path.join(MODELS_PATH, 'sampled_dfs_%s.bin' % target), 'wb') as f:
        pickle.dump(sampled_indexes, f)

    model = XGBClassifier(verbosity=2,
                          n_estimators=100,
                          objective='multi:softprob',
                          learning_rate=0.125,
                          min_child_weight=1,
                          max_depth=13,
                          gamma=0.6,
                          max_delta_step=0,
                          subsample=1,
                          colsample_bytree=0.9,
                          reg_lambda=2,
                          scale_pos_weight=0.05)

    if cv:
        param_grid = {
            'n_estimators': [10],
            'objective': ['multi:softprob'],
            'learning_rate': [0.125],
            # Fixed key: was misspelt 'min_child_weigth'.
            'min_child_weight': [1],
            'max_depth': [13],
            'gamma': [0.6],
            'max_delta_step': [0],
            'subsample': [1],
            'colsample_bytree': [0.9],
            'reg_lambda': [2],
            'scale_pos_weight': [0.05]
        }
        validate(X_train[sampled_indexes], y_train[sampled_indexes],
                 X_test, y_test,
                 target=target, model=model, parameters=param_grid,
                 model_name='XGB')
    else:
        model.fit(X_train[sampled_indexes], y_train[sampled_indexes])
        # NOTE(review): the fitted model is saved only on this branch;
        # confirm this matches the intended layout.
        with open(os.path.join(MODELS_PATH, '%s_fitted_classifier.bin' % target), 'wb') as f:
            pickle.dump(model, f)
    return
# ---------- ABALONE ----- # dataset = pd.read_csv('data/abalone.txt') # X_data = dataset.iloc[:, 0:].values # y_data = dataset.iloc[:, 8].values print(X_data.shape) print('-------') # ------- CNN -------- cnn = CondensedNearestNeighbour() X_cnn, y_cnn = cnn.fit_resample(X_data, y_data) print(X_cnn.shape) # ------- ENN -------- enn = EditedNearestNeighbours() X_enn, y_enn = enn.fit_resample(X_data, y_data) print(X_enn.shape) # ------- RENN -------- renn = RepeatedEditedNearestNeighbours() X_renn, y_renn = renn.fit_resample(X_data, y_data) print(X_renn.shape) # ------- Tomek -------- tl = TomekLinks() X_t, y_t = tl.fit_resample(X_data, y_data) print(X_t.shape) # ------- RUS -------- rus = RandomUnderSampler(random_state=42) X_rus, y_rus = rus.fit_resample(X_data, y_data)
# Load the saved test split.
X_test = np.load('FUMSECK_L3/X_test610.npy')
y_test = np.load('FUMSECK_L3/y_test610.npy')

#========================================
# (Optional) ENN : delete dirty examples
#========================================

# Integrate X_train along axis 1 to obtain one value per named feature; ENN
# is run on these integrated features, not on the raw X_train.
# NOTE(review): assumes axis 1 of X_train is the sampling axis and that five
# features remain after integration (five column names are given) -- confirm.
X_integrated = trapz(X_train, axis=1)
X_integrated = pd.DataFrame(
    X_integrated,
    columns=['SWS', 'FWS', 'FL Orange', 'FL Red', 'Curvature'])
# Class index per sample, taken as the argmax over axis 1 of y_train.
y = y_train.argmax(1)

# ENN for cleaning data
enn = EditedNearestNeighbours()
X_rs, y_rs = enn.fit_resample(X_integrated, y)

# Keep only the rows ENN retained, in the original (non-integrated) arrays.
X_train = X_train.take(enn.sample_indices_, axis=0)
y_train = y_train.take(enn.sample_indices_, axis=0)

#plot_2Dcyto(X_rs, y_rs, tn, 'FWS', 'FL Red')
#plot_2Dcyto(X_integrated, y, tn, 'FWS', 'FL Red')

#========================================================
# RUS: Delete random observations from majority classes
#========================================================

# Per-class sample counts after cleaning, capped at 3000 per class.
balancing_dict = Counter(np.argmax(y_train, axis=1))
for class_, obs_nb in balancing_dict.items():
    if obs_nb > 3000:
        balancing_dict[class_] = 3000
random_state=10) # Instanciate a PCA object for the sake of easy visualisation pca = PCA(n_components=2) # Fit and transform x to visualise inside a 2D feature space X_vis = pca.fit_transform(X) # Three subplots, unpack the axes array immediately f, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2) c0, c1 = plot_resampling(ax1, X_vis, y, 'Original set') # Apply the ENN print('ENN') enn = EditedNearestNeighbours(return_indices=True) X_resampled, y_resampled, idx_resampled = enn.fit_resample(X, y) X_res_vis = pca.transform(X_resampled) idx_samples_removed = np.setdiff1d(np.arange(X_vis.shape[0]), idx_resampled) reduction_str = ('Reduced {:.2f}%'.format( 100 * (1 - float(len(X_resampled)) / len(X)))) print(reduction_str) c3 = ax2.scatter(X_vis[idx_samples_removed, 0], X_vis[idx_samples_removed, 1], alpha=.2, label='Removed samples', c='g') plot_resampling(ax2, X_res_vis, y_resampled, 'ENN - ' + reduction_str) # Apply the RENN print('RENN') renn = RepeatedEditedNearestNeighbours(return_indices=True)
def splitrepeat_mcn(X, y, model_list, splits, repeats, num_classes, feature_list=None, mc_strategy='ovr', test_ratio=0.25, class_labels=None, stacked_model=None, imbalanced=None, categorical_features=None, over_strategy='auto', under_strategy='auto', avg_strategy='macro', initial_split_seed=None, initial_split_ratio=0.5, verbose=0):
    '''
    Run a j-split k-repeat random sub-sampling cross-validation for
    multiclass classification using one one-vs-rest classifier per class.

    Parameters
    ----------
    X : pandas DataFrame
        Independent data
    y : pandas Series or numpy array
        Dependent data or labels
    model_list : sequence or mapping of scikit-learn classifiers
        ``model_list[c]`` is the one-vs-rest classifier for class ``c``. Each
        must accept a ``random_state`` parameter and implement
        ``predict_proba``.
    splits : array
        Seed values, one train/test split per seed.
    repeats : array
        Seed values, one model fit per seed within every split.
    num_classes : int
        Number of classes. If classes are not 0..num_classes-1, specify
        class_labels.
    feature_list : sequence or mapping, optional
        ``feature_list[c]`` lists the columns used by the classifier of
        class ``c``. If None, every classifier uses all columns of X.
    mc_strategy : str
        Currently unused.
    test_ratio : float
        ``test_size`` passed to train_test_split for every split.
    class_labels : list, optional
        Class labels; defaults to ``list(range(num_classes))``. Labels are
        used to index model_list and feature_list.
    stacked_model : classifier, optional
        If given, its ``predict_proba`` is applied to the normalised
        one-vs-rest probabilities to produce the final probabilities.
    imbalanced : default=None
        'over' : use imblearn's SMOTE (or SMOTENC if categorical_features
                 are defined) to oversample the train set
        'under' : use imblearn's EditedNearestNeighbours to undersample the
                  train set and the test set
    categorical_features : list of categorical column names, used in SMOTENC
    over_strategy : see "sampling_strategy" of imblearn's SMOTE
    under_strategy : see "sampling_strategy" of imblearn's
        EditedNearestNeighbours
    avg_strategy : see 'average' in sklearn's recall/precision/f1 scores
        (default = 'macro')
    initial_split_seed : int, optional
        If specified, the data is first split once with this seed. The train
        side of that split is appended to every training set and never
        reaches a test set; only the other side is re-split per seed in
        ``splits``. If not specified, all data is shuffled.
    initial_split_ratio : float
        ``train_size`` of the initial split.
    verbose : 0, 1, or 2
        0 : disables all output
        1 : shows split/repeat number
        2 : adds confusion_matrix

    Returns
    -------
    DataFrame indexed by (j, k) with sensitivity, specificity, PPV, NPV and
    accuracy for each class, plus the same metrics and F1 computed after
    collapsing every class above 1 into class 1.
    '''
    if class_labels is None:
        class_labels = list(range(num_classes))
    if feature_list is None:
        # No per-class feature selection requested: use every column.
        feature_list = {label: list(X.columns) for label in class_labels}
    if not hasattr(y, 'values'):
        # Accept plain arrays/lists as documented.
        y = pd.Series(np.asarray(y).ravel())

    has_initial_split = initial_split_seed is not None
    if has_initial_split:
        _X_train, X, _y_train, y = train_test_split(
            X, y.values.ravel(), train_size=initial_split_ratio,
            random_state=initial_split_seed, stratify=y)
        y = pd.Series(y)

    reports = []

    # Begin j-split k-repeat loop
    for j in splits:
        X_, X_test, y_, y_test = train_test_split(
            X, y.values.ravel(), test_size=test_ratio, random_state=j,
            stratify=y)
        if has_initial_split:
            # DataFrame.append no longer exists in pandas 2; concat is the
            # equivalent row-wise append.
            X_ = pd.concat([X_, _X_train])
            y_ = np.append(y_, _y_train)

        if imbalanced == 'under':
            # ``random_state`` is not accepted by current imbalanced-learn
            # releases for this sampler.
            enn = EditedNearestNeighbours(sampling_strategy=under_strategy)
            X_, y_ = enn.fit_resample(X_, y_)
            # The test set is resampled as well (no option to disable yet).
            X_test, y_test = enn.fit_resample(X_test, y_test)
        if imbalanced == 'over':
            if categorical_features is None:
                sm = SMOTE(random_state=j, sampling_strategy=over_strategy)
            else:
                # Kept in a local: overwriting the ``categorical_features``
                # argument made every split after the first compare column
                # names against a boolean mask.
                categorical_mask = np.isin(X_.columns.values,
                                           categorical_features)
                sm = SMOTENC(categorical_features=categorical_mask,
                             sampling_strategy=over_strategy, random_state=j)
            X_, y_ = sm.fit_resample(X_, y_)

        # Run models
        for k in repeats:
            np.random.seed(k)
            y_output = pd.DataFrame()
            y_ = pd.DataFrame(y_)
            for i in class_labels:
                model = model_list[i]
                model.set_params(random_state=k)
                X_i = X_[feature_list[i]]  # Select feature list
                # Set selected class=1, others=0 (OneVsRest)
                replace_y = {label: int(label == i) for label in class_labels}
                y_i = y_.replace(replace_y)
                model.fit(X_i, y_i.values.ravel())
                y_pred = model.predict_proba(X_test[feature_list[i]])
                y_output['Clf' + str(i)] = y_pred[:, 1]

            # Use each classifier's target-class probability as that class's
            # score, then divide by the row total so that every row sums to 1.
            y_output = y_output.div(y_output.sum(axis=1), axis=0)

            if stacked_model is not None:
                y_output = stacked_model.predict_proba(y_output)

            # np.asarray handles both the DataFrame and the ndarray returned
            # by a stacked model.
            predicted_class = np.argmax(np.asarray(y_output), axis=1)
            cmat = multilabel_confusion_matrix(y_test, predicted_class)

            if verbose >= 1:
                print('Split: ', j, ', Repeat: ', k)
            if verbose >= 2:
                print(cmat)

            report = pd.DataFrame({'j': j, 'k': k}, index=[0])
            for c in range(num_classes):
                TP = cmat[c][1][1]
                FP = cmat[c][0][1]
                TN = cmat[c][0][0]
                FN = cmat[c][1][0]
                report['Sensitivity' + str(c)] = (TP / (TP + FN))
                report['Specificity' + str(c)] = (TN / (FP + TN))
                report['PPV' + str(c)] = (TP / (TP + FP))
                report['NPV' + str(c)] = (TN / (TN + FN))
                report['Accuracy' + str(c)] = (TP + TN) / len(y_test)

            # Set class 2 (and above) to class 1, to enable comparison with
            # two-class results.
            y_test1 = y_test.copy()
            y_test1[y_test1 > 1] = 1
            y_pred = predicted_class.copy()
            y_pred[y_pred > 1] = 1

            report['Sensitivity'] = recall_score(y_test1, y_pred, average=avg_strategy, labels=[1])
            report['Specificity'] = recall_score(y_test1, y_pred, average=avg_strategy, labels=[0])
            report['PPV'] = precision_score(y_test1, y_pred, average=avg_strategy, labels=[1])
            report['NPV'] = precision_score(y_test1, y_pred, average=avg_strategy, labels=[0])
            report['F1_Score'] = f1_score(y_test1, y_pred, average=avg_strategy)
            report['Accuracy'] = accuracy_score(y_test1, y_pred)
            report.set_index(['j', 'k'], inplace=True)
            reports.append(report)

    if not reports:
        return pd.DataFrame()
    return pd.concat(reports)
# Zero-argument factories for every supported resampling technique.
# Lambdas (rather than the bare classes) keep name resolution lazy: only the
# sampler that is actually requested has to be importable at call time, which
# matches the behavior of the original if/elif chain.
_SAMPLER_FACTORIES = {
    "ADASYN": lambda: ADASYN(),
    "ALLKNN": lambda: AllKNN(),
    "CNN": lambda: CondensedNearestNeighbour(),
    "ENN": lambda: EditedNearestNeighbours(),
    "IHT": lambda: InstanceHardnessThreshold(),
    "NCR": lambda: NeighbourhoodCleaningRule(),
    "NM": lambda: NearMiss(),
    "OSS": lambda: OneSidedSelection(),
    "RENN": lambda: RepeatedEditedNearestNeighbours(),
    "SMOTE": lambda: SMOTE(),
    "BSMOTE": lambda: BorderlineSMOTE(),
    "SMOTEENN": lambda: SMOTEENN(),
    "SMOTETOMEK": lambda: SMOTETomek(),
    "TOMEK": lambda: TomekLinks(),
    "ROS": lambda: RandomOverSampler(),
    "RUS": lambda: RandomUnderSampler(),
}

# Label that is mapped to 0 (everything else to 1) for each training set when
# the "NCR" technique is used, in AA, AI, AW, CC, QA order.
_NCR_TARGET_LABELS = (
    "Accepted/Assigned",
    "Accepted/In Progress",
    "Accepted/Wait",
    "Completed/Closed",
    "Queued/Awaiting Assignment",
)


def resampling_assigner(imb_technique, AA_ova_X_train, AA_ova_y_train,
                        AI_ova_X_train, AI_ova_y_train, AW_ova_X_train,
                        AW_ova_y_train, CC_ova_X_train, CC_ova_y_train,
                        QA_ova_X_train, QA_ova_y_train):
    """Resample five training sets with the requested imbalance technique.

    A fresh, default-configured sampler is created for each of the five
    (X, y) training sets (AA, AI, AW, CC, QA) and ``fit_resample`` is called
    on them in that order.

    Parameters
    ----------
    imb_technique : str
        One of "ADASYN", "ALLKNN", "CNN", "ENN", "IHT", "NCR", "NM", "OSS",
        "RENN", "SMOTE", "BSMOTE", "SMOTEENN", "SMOTETOMEK", "TOMEK", "ROS"
        or "RUS". The value is printed before any work is done.
    AA_ova_X_train, AA_ova_y_train, ..., QA_ova_X_train, QA_ova_y_train
        Feature matrix and label vector of each training set. They are
        passed to ``fit_resample`` unchanged, except for "NCR" (see Notes).
        NOTE(review): "ova" presumably means one-vs-all - confirm.

    Returns
    -------
    tuple
        ``(AA_X_res, AA_y_res, AI_X_res, AI_y_res, AW_X_res, AW_y_res,
        CC_X_res, CC_y_res, QA_X_res, QA_y_res)``.

    Raises
    ------
    ValueError
        If ``imb_technique`` is not one of the supported names. Previously
        this surfaced as an ``UnboundLocalError`` at the ``return``
        statement.

    Notes
    -----
    For "NCR" the labels of every training set are binarised before
    resampling: the set's own status label (for example "Accepted/Assigned"
    for AA) becomes 0 and every other label becomes 1, so the returned
    ``*_y_res`` values are 0/1 for that technique.
    """
    print(imb_technique)

    try:
        make_sampler = _SAMPLER_FACTORIES[imb_technique]
    except KeyError:
        raise ValueError(
            "Unknown imbalance technique {!r}; expected one of: {}".format(
                imb_technique, ", ".join(_SAMPLER_FACTORIES))) from None

    training_sets = (
        (AA_ova_X_train, AA_ova_y_train),
        (AI_ova_X_train, AI_ova_y_train),
        (AW_ova_X_train, AW_ova_y_train),
        (CC_ova_X_train, CC_ova_y_train),
        (QA_ova_X_train, QA_ova_y_train),
    )
    # One independent sampler per training set, all created up front as the
    # original code did.
    samplers = [make_sampler() for _ in training_sets]

    resampled = []
    for sampler, (X_train, y_train), target_label in zip(
            samplers, training_sets, _NCR_TARGET_LABELS):
        if imb_technique == "NCR":
            y_train = [0 if label == target_label else 1
                       for label in y_train]
        X_res, y_res = sampler.fit_resample(X_train, y_train)
        resampled.append(X_res)
        resampled.append(y_res)
    return tuple(resampled)
folds_origin.append([X_train, X_test, y_train, y_test]) #resampling 1: under + SMOTE, final ratio 0.5: under = RandomUnderSampler(sampling_strategy=0.0026) X_train_re, y_train_re = under.fit_resample(X_train, y_train) start1 = time.time() over = SMOTE(sampling_strategy=0.5) X_train_re1, y_train_re1 = over.fit_resample(X_train_re, y_train_re) end1 = time.time() exe_time1 = end1 - start1 folds_re1.append([X_train_re1, X_test, y_train_re1, y_test]) #resampling 2: under + SMOTE + ENN, final ratio 0.5: start2 = time.time() enn = EditedNearestNeighbours(sampling_strategy='all') X_train_re2, y_train_re2 = enn.fit_resample(X_train_re1, y_train_re1) end2 = time.time() exe_time2 = end2 - start2 + exe_time1 folds_re2.append([X_train_re2, X_test, y_train_re2, y_test]) #resampling 3: under + SMOTE + ENN, final ratio 1: start3 = time.time() over = SMOTEENN() X_train_re3, y_train_re3 = over.fit_resample(X_train_re, y_train_re) end3 = time.time() exe_time3 = end3 - start3 folds_re3.append([X_train_re3, X_test, y_train_re3, y_test]) times.append([exe_time1, exe_time2, exe_time3]) # store the folds for original and different re-samplings
# Modes accepted by `imbalance_handler` after lower-casing and removing "-".
_SUPPORTED_MODES = ("smote", "enn", "smoteenn")

# Classes with this many articles or fewer are dropped before resampling.
# NOTE(review): earlier documentation said "5 or fewer" while the code has
# always compared with `<= 6`; the existing threshold is kept unchanged.
_MAX_DROPPED_CLASS_SIZE = 6


def imbalance_handler(
        mode: str,
        parent_class_label: str):
    """
    Purpose
    -------
    The purpose of this function is to provide the user a tool that
    allows them to easily manipulate their training and/or test dataset
    so that it is significantly more balanced between its classes. One
    would want to do this in order to improve the reliability of their
    classifier that will get trained on this dataset (see 1. in the
    References section for more information about this).

    **Note, however, that if a class has only 6 or fewer article
    instances that belong to it, it will be dropped completely, because
    the neighbor-based SMOTE and ENN algorithms used in this function
    need enough samples of a class to exist. If this class label is
    particularly important and you would like to keep it around, then
    obtain more data for it.**

    Parameters
    ----------
    mode : str
        This string allows the user to specify how they would like the
        imbalancing of the dataset to be handled. Matching is
        case-insensitive and hyphens are ignored. The available options
        include:
            1. "smote" - In this mode, the only algorithm that will be
                         implemented to make the dataset more balanced
                         is the over-sampling algorithm SMOTE. See 1.,
                         3., 4., and 5. in the References section for
                         more information about this algorithm.
            2. "enn" - In this mode, the only algorithm that will be
                       implemented to make the dataset more balanced is
                       the under-sampling algorithm Edited-Nearest
                       Neighbors (ENN). See 1. and 6. for more
                       information about this algorithm.
            3. "smote-enn" - In this mode, this function will implement
                             both the SMOTE and ENN algorithms; SMOTE
                             will over-sample to make the classes
                             balanced and ENN will under-sample to
                             remove any newly generated samples in the
                             minority class(es) that are not helpful.
                             See 1. and 7. for more information about
                             the benefits of using this 2-step process
                             and for how this is implemented in the
                             imbalanced-learn module.

    parent_class_label : str
        This string represents the class label that is the Parent class
        of all of the sub-classes that will be distinguished and
        predicted by a classifier that you wish to build. I.e., if you
        want to build a classifier for the children of the "Auto Type"
        class (which includes "Budget Cars", "Concept Cars", and
        "Luxury Cars" to name a few), then you simply have to pass in
        the "Auto Type" string to this parameter.

    Returns
    -------
    to_return : tuple
        A `(feature_matrix, labels)` pair exactly as returned by the
        selected sampler's `fit_resample`. The former element is the new
        feature matrix (where some rows correspond to article instances
        that were synthetically generated if the user specified
        over-sampling) and the latter element holds the new class
        labels. Each row of the two corresponds to the same (real or
        synthetic) article instance.

    Raises
    ------
    ValueError
        If `mode` is not one of the supported options. This is checked
        before any data is loaded.

    References
    ----------
    1. https://towardsdatascience.com/guide-to-classification-on-imbalanced-datasets-d6653aa5fa23
    2. https://imbalanced-learn.readthedocs.io/en/stable/index.html
    3. https://machinelearningmastery.com/smote-oversampling-for-imbalanced-classification/
    4. https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.over_sampling.SMOTE.html
    5. https://www.kite.com/blog/python/smote-python-imbalanced-learn-for-oversampling/
    6. https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.EditedNearestNeighbours.html#imblearn.under_sampling.EditedNearestNeighbours
    7. https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.combine.SMOTEENN.html
    """
    # Normalize the mode and reject unsupported values up front, instead
    # of failing with an unbound local after all the expensive work.
    normalized_mode = mode.lower().replace("-", "")
    if normalized_mode not in _SUPPORTED_MODES:
        raise ValueError(
            "Unsupported mode {!r}; expected one of 'smote', 'enn' or "
            "'smote-enn'.".format(mode))

    child_tier_lvl, raw_articles_df = class_data_retrival(
        parent_class_label,
        give_child_tier_lvl=True)

    # Before performing any transformations on our data, drop every class
    # that has too few articles for the neighbor-based samplers.
    child_tier_label = "Tier{}".format(child_tier_lvl)
    counts_of_classes = raw_articles_df[child_tier_label].value_counts()
    too_small = counts_of_classes.values <= _MAX_DROPPED_CLASS_SIZE
    labels_to_drop = counts_of_classes.index.values[too_small]
    if labels_to_drop.size > 0:
        # A single membership test replaces the chained "!=" comparisons;
        # it keeps exactly the rows whose label is not being dropped.
        rows_to_keep = ~raw_articles_df[child_tier_label].isin(
            labels_to_drop)
        articles_df = raw_articles_df[rows_to_keep]
    else:
        # All the article counts for each class pass the test.
        articles_df = raw_articles_df

    # Next, obtain the X (features) matrix and the y (labels) vector.
    _, feature_matrix = bag_of_words_converter(mode="tfidf",
                                               parent_class_label=None,
                                               articles_df=articles_df,
                                               upper_n_gram=2,
                                               upper_features=300,
                                               apply_PCA=True)
    labels_arr = np.array(
        articles_df[child_tier_label].tolist())

    # Next, implement the algorithm the user has specified.
    if normalized_mode == "smote":
        # Over-sample with the SMOTE algorithm.
        sampler = SMOTE(random_state=169, n_jobs=3)
    elif normalized_mode == "enn":
        # Under-sample with the Edited Nearest Neighbours algorithm.
        sampler = EditedNearestNeighbours(sampling_strategy="auto",
                                          n_jobs=3)
    else:
        # "smoteenn": over-sample with SMOTE, then clean the result by
        # under-sampling with ENN (both steps are done by SMOTEENN).
        sampler = SMOTEENN(random_state=169, n_jobs=3)

    final_feature_matrix, final_labels_arr = sampler.fit_resample(
        feature_matrix, labels_arr)

    to_return = (final_feature_matrix, final_labels_arr)
    return to_return