def rep_edited_KNN(X, Y):
    """Undersample with Repeated Edited Nearest Neighbours and build a keep-mask.

    :param X: feature matrix of shape (n_samples, n_features)
    :param Y: class labels of shape (n_samples,)
    :return: tuple ``(True, mask)`` where ``mask`` is an int array of length
        ``len(Y)`` holding 1 for samples kept by RENN and 0 for removed ones.
    """
    from imblearn.under_sampling import RepeatedEditedNearestNeighbours

    renn = RepeatedEditedNearestNeighbours()
    renn.fit_resample(X, Y)
    indexes = renn.sample_indices_
    # fix: vectorized mask construction replaces the original O(n * k)
    # per-sample `i in indexes` membership loop.
    mask = np.zeros(len(Y), dtype=int)
    mask[indexes] = 1
    return True, mask
def rep_edited_KNN(X, Y):
    """Undersample with Repeated Edited Nearest Neighbours and build a keep-mask.

    NOTE(review): this duplicates an earlier ``rep_edited_KNN`` definition in the
    file; unlike that one, the mask here is sized by ``len(X)`` (kept as-is).

    :param X: feature matrix of shape (n_samples, n_features)
    :param Y: class labels of shape (n_samples,)
    :return: tuple ``(True, mask)`` where ``mask`` is an int array of length
        ``len(X)`` holding 1 for samples kept by RENN and 0 for removed ones.
    """
    from imblearn.under_sampling import RepeatedEditedNearestNeighbours

    renn = RepeatedEditedNearestNeighbours()
    renn.fit_resample(X, Y)
    indexes = renn.sample_indices_
    # fix: vectorized mask construction replaces the original O(n * k)
    # per-sample `i in indexes` membership loop with list appends.
    mask = np.zeros(len(X), dtype=int)
    mask[indexes] = 1
    return True, mask
def test_renn_fit_resample_with_indices():
    """RENN with ``return_indices=True`` returns resampled X, y and the indices
    of the kept samples; all three are checked against a recorded reference run.
    """
    renn = RepeatedEditedNearestNeighbours(return_indices=True)
    X_resampled, y_resampled, idx_under = renn.fit_resample(X, Y)
    # Expected resampled feature matrix, recorded from a reference execution.
    X_gt = np.array([[-0.53171468, -0.53735182], [-0.88864036, -0.33782387],
                     [-0.46226554, -0.50481004], [-0.34474418, 0.21969797],
                     [1.02956816, 0.36061601], [1.12202806, 0.33811558],
                     [0.73489726, 0.43915195], [0.50307437, 0.498805],
                     [0.84929742, 0.41042894], [0.62649535, 0.46600596],
                     [0.98382284, 0.37184502], [0.69804044, 0.44810796],
                     [0.04296502, -0.37981873], [0.28294738, -1.00125525],
                     [0.34218094, -0.58781961], [0.2096964, -0.61814058],
                     [1.59068979, -0.96622933], [0.73418199, -0.02222847],
                     [0.79270821, -0.41386668], [1.16606871, -0.25641059],
                     [1.0304995, -0.16955962], [0.48921682, -1.38504507],
                     [-0.03918551, -0.68540745], [0.24991051, -1.00864997],
                     [0.80541964, -0.34465185], [0.1732627, -1.61323172]])
    # Expected class labels of the kept samples.
    y_gt = np.array([
        0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2
    ])
    # Expected positions of the kept samples within the original X/Y.
    idx_gt = np.array([
        6, 13, 32, 39, 4, 5, 16, 22, 23, 24, 30, 37, 2, 11, 12, 17, 20, 21,
        25, 26, 28, 31, 33, 34, 35, 36
    ])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
def test_renn_fit_resample_with_indices():
    """Duplicate variant of the indices test: RENN with ``return_indices=True``
    must reproduce the recorded resampled data and kept-sample indices.
    """
    renn = RepeatedEditedNearestNeighbours(return_indices=True)
    X_resampled, y_resampled, idx_under = renn.fit_resample(X, Y)
    # Reference resampled feature matrix.
    X_gt = np.array([[-0.53171468, -0.53735182], [-0.88864036, -0.33782387],
                     [-0.46226554, -0.50481004], [-0.34474418, 0.21969797],
                     [1.02956816, 0.36061601], [1.12202806, 0.33811558],
                     [0.73489726, 0.43915195], [0.50307437, 0.498805],
                     [0.84929742, 0.41042894], [0.62649535, 0.46600596],
                     [0.98382284, 0.37184502], [0.69804044, 0.44810796],
                     [0.04296502, -0.37981873], [0.28294738, -1.00125525],
                     [0.34218094, -0.58781961], [0.2096964, -0.61814058],
                     [1.59068979, -0.96622933], [0.73418199, -0.02222847],
                     [0.79270821, -0.41386668], [1.16606871, -0.25641059],
                     [1.0304995, -0.16955962], [0.48921682, -1.38504507],
                     [-0.03918551, -0.68540745], [0.24991051, -1.00864997],
                     [0.80541964, -0.34465185], [0.1732627, -1.61323172]])
    # Reference labels of the kept samples.
    y_gt = np.array([
        0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2
    ])
    # Reference indices of the kept samples in the original arrays.
    idx_gt = np.array([
        6, 13, 32, 39, 4, 5, 16, 22, 23, 24, 30, 37, 2, 11, 12, 17, 20, 21,
        25, 26, 28, 31, 33, 34, 35, 36
    ])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
def test_renn_fit_resample_mode():
    """RENN with a user-supplied NearestNeighbors estimator and
    ``kind_sel='mode'`` must reproduce the recorded resampled data.
    """
    nn = NearestNeighbors(n_neighbors=4)
    renn = RepeatedEditedNearestNeighbours(n_neighbors=nn, kind_sel='mode')
    X_resampled, y_resampled = renn.fit_resample(X, Y)
    # Expected resampled feature matrix, recorded from a reference execution;
    # 'mode' selection keeps more samples than the default 'all' strategy.
    X_gt = np.array([[-0.53171468, -0.53735182], [-0.88864036, -0.33782387],
                     [-0.46226554, -0.50481004], [-0.34474418, 0.21969797],
                     [-0.12840393, 0.66446571], [1.02956816, 0.36061601],
                     [1.12202806, 0.33811558], [-0.35946678, 0.72510189],
                     [2.94290565, -0.13986434], [-1.10146139, 0.91782682],
                     [0.73489726, 0.43915195], [-0.28479268, 0.70459548],
                     [1.84864913, 0.14729596], [0.50307437, 0.498805],
                     [0.84929742, 0.41042894], [0.62649535, 0.46600596],
                     [1.67314371, 0.19231498], [0.98382284, 0.37184502],
                     [0.69804044, 0.44810796], [1.32319756, -0.13181616],
                     [0.04296502, -0.37981873], [0.28294738, -1.00125525],
                     [0.34218094, -0.58781961], [0.2096964, -0.61814058],
                     [1.59068979, -0.96622933], [0.73418199, -0.02222847],
                     [0.79270821, -0.41386668], [1.16606871, -0.25641059],
                     [1.0304995, -0.16955962], [0.48921682, -1.38504507],
                     [-0.03918551, -0.68540745], [0.24991051, -1.00864997],
                     [0.80541964, -0.34465185], [0.1732627, -1.61323172]])
    # Expected class labels of the kept samples.
    y_gt = np.array([
        0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
    ])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def test_renn_fit_resample_mode():
    """Duplicate variant of the 'mode' selection test: RENN driven by an
    explicit NearestNeighbors estimator must match the recorded output.
    """
    nn = NearestNeighbors(n_neighbors=4)
    renn = RepeatedEditedNearestNeighbours(n_neighbors=nn, kind_sel='mode')
    X_resampled, y_resampled = renn.fit_resample(X, Y)
    # Reference resampled feature matrix.
    X_gt = np.array([[-0.53171468, -0.53735182], [-0.88864036, -0.33782387],
                     [-0.46226554, -0.50481004], [-0.34474418, 0.21969797],
                     [-0.12840393, 0.66446571], [1.02956816, 0.36061601],
                     [1.12202806, 0.33811558], [-0.35946678, 0.72510189],
                     [2.94290565, -0.13986434], [-1.10146139, 0.91782682],
                     [0.73489726, 0.43915195], [-0.28479268, 0.70459548],
                     [1.84864913, 0.14729596], [0.50307437, 0.498805],
                     [0.84929742, 0.41042894], [0.62649535, 0.46600596],
                     [1.67314371, 0.19231498], [0.98382284, 0.37184502],
                     [0.69804044, 0.44810796], [1.32319756, -0.13181616],
                     [0.04296502, -0.37981873], [0.28294738, -1.00125525],
                     [0.34218094, -0.58781961], [0.2096964, -0.61814058],
                     [1.59068979, -0.96622933], [0.73418199, -0.02222847],
                     [0.79270821, -0.41386668], [1.16606871, -0.25641059],
                     [1.0304995, -0.16955962], [0.48921682, -1.38504507],
                     [-0.03918551, -0.68540745], [0.24991051, -1.00864997],
                     [0.80541964, -0.34465185], [0.1732627, -1.61323172]])
    # Reference labels of the kept samples.
    y_gt = np.array([
        0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
    ])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
def train_decisiontree_with(configurationname, train_data, k, score_function, undersam=False, oversam=False, export=False, **kwargs):
    """Train an entropy decision tree on the top-k features, with optional resampling.

    :param configurationname: label used for logging and exported artifacts
    :param train_data: tuple ``(X_train, y_train, id_to_a_train)``
    :param k: number of features kept by SelectKBest (must be > 0)
    :param score_function: scoring function handed to SelectKBest
    :param undersam: undersample with RENN when True (and ``oversam`` is False)
    :param oversam: oversample with SMOTE when True (and ``undersam`` is False);
        when both flags are True, SMOTEENN is applied instead
    :param export: when True, export the trained tree to a graphviz .dot file
    :param kwargs: optional ``max_depth`` for the DecisionTreeClassifier
    :return: tuple ``(fitted SelectKBest selector, fitted DecisionTreeClassifier)``
    """
    assert k > 0
    print("Training with configuration " + configurationname)
    X_train, y_train, id_to_a_train = train_data
    max_depth = kwargs.get("max_depth")  # defaults to None (unbounded tree)
    dtc = DecisionTreeClassifier(criterion="entropy", random_state=0, max_depth=max_depth)
    print("Feature Selection")
    # selector = SelectFpr(score_function)
    # fix: the selector was constructed twice in a row; the redundant
    # duplicate instantiation has been removed.
    selector = SelectKBest(score_function, k=k)
    selector = selector.fit(X_train, y_train)
    X_train = selector.transform(X_train)
    fitted_ids = list(selector.get_support(indices=True))
    print("Apply Resampling")
    print(Counter(y_train))
    if undersam and not oversam:
        renn = RepeatedEditedNearestNeighbours()
        X_train, y_train = renn.fit_resample(X_train, y_train)
    if oversam and not undersam:
        # feature_indices_array = list(range(len(f_to_id)))
        # smote_nc = SMOTENC(categorical_features=feature_indices_array, random_state=0)
        # X_train, y_train = smote_nc.fit_resample(X_train, y_train)
        sm = SMOTE(random_state=42)
        X_train, y_train = sm.fit_resample(X_train, y_train)
    if oversam and undersam:
        smote_enn = SMOTEENN(random_state=0)
        X_train, y_train = smote_enn.fit_resample(X_train, y_train)
    print(Counter(y_train))
    print("Train Classifier")
    dtc = dtc.fit(X_train, y_train, check_input=True)
    if export:
        print("Exporting tree to graph...")
        export_graphviz(dtc, out_file=DATAP + "/temp/trees/sltree_" + configurationname + ".dot", filled=True)
        transform(fitted_ids, configurationname)
    print("Self Accuracy: " + str(dtc.score(X_train, y_train)))
    return selector, dtc
def classification_results(train,test):
    #Derivation of NBDriver using training data
    """
    Arguments:
        train = feature matrix derived from Brown et al.
        test= feature matrix derived from Martelotto et al.
    Returns:
        best_model = Best ensemble model derived using the training data
        X_red= Dataframe derived after sampling that was used to train the model
        scores= probability based classification scores
    """
    # NOTE(review): these start as lists but sen/spe are later rebound to
    # scalars inside the loop; c, m, s, acc and auc appear unused — confirm.
    sen=[];spe=[];acc=[];auc=[];c=[];m=[];s=[]
    train_x=train.drop('Label',axis=1);train_y=train['Label'];
    test_x=test.drop('Label',axis=1);test_y=test['Label'];
    #Random undersampling to reduce the majority class size
    samp=RepeatedEditedNearestNeighbours(random_state=42)
    X_samp,y_samp=samp.fit_resample(train_x,train_y)
    X_samp = pd.DataFrame(X_samp, columns = train_x.columns)
    #Experimenting with different numbers of top features derived from the tree-based feature extraction method
    top_n_feats=[30,40,50,60,70]
    X_r=feature_reduction_using_trees(X_samp,y_samp)
    cols=X_r.columns
    for n in top_n_feats:
        print("For top: ",n," features")
        # Keep only the n highest-ranked features.
        X_red=X_r[cols[0:n]]
        sv=SVC(kernel="linear",probability=True,C=0.01,random_state=42) #chosen from 5foldCV based grid search
        kde=KDEClassifier(bandwidth=1.27) #chosen from 5foldCV based grid search
        best_model = VotingClassifier(estimators=[('sv', sv), ('kde', kde)], voting='soft',weights=[4, 7]) #best combination of weights selected by a brute force search (possible weights 1-10) using a cross-validation approach on the training data
        best_model.fit(X_red,y_samp)
        y_probs = best_model.predict_proba(test_x[X_red.columns])[:,1]
        # Sweep the decision threshold and keep the one maximizing ROC AUC.
        thresholds = arange(0, 1, 0.001)
        scores = [roc_auc_score(test_y, to_labels(y_probs, t)) for t in thresholds]
        ix= argmax(scores)
        # Labels appear to be coded 2 = positive, 1 = negative — confirm.
        y_test_predictions = np.where(best_model.predict_proba(test_x[X_red.columns])[:,1] > thresholds[ix], 2, 1)
        print("Thresh: ",thresholds[ix])
        sensi= sensitivity_score(test_y, y_test_predictions, pos_label=2)
        speci=specificity_score(test_y,y_test_predictions,pos_label=2)
        accu=accuracy_score(test_y,y_test_predictions)
        auro=roc_auc_score(test_y,y_test_predictions)
        mcc=metrics.matthews_corrcoef(test_y,y_test_predictions)
        tn, fp, fn, tp = confusion_matrix(test_y, y_test_predictions).ravel()
        # Derived confusion-matrix rates; note sen/spe shadow the lists above.
        ppv=tp/(tp+fp)
        npv=tn/(tn+fn)
        sen=tp/(tp+fn)
        spe=tn/(tn+fp)
        score=ppv+npv+sen+spe
        print("For kmer size: ",len(train.columns[0]))
        print("for top ",n," features")
        print(list(X_red.columns.values),"\n")
        score_dict={"Sen":sen,"Spe":spe,"PPV":ppv,"NPV":npv,"AUC":auro,"MCC":mcc,"ACC":accu}
        print(score)
        print(score_dict)
    # NOTE(review): indentation reconstructed — these two statements are
    # assumed to run after the loop; df appears unused. Confirm against the
    # original (uncollapsed) source.
    df=pd.DataFrame(y_test_predictions)
    y_samp = pd.DataFrame(y_samp, columns = ['x'])
    return best_model,X_red,scores
def load_from_csv(input_dir: str, counts_file: str = "normalized_counts.csv.gz", n_jobs=1, low_expression=0.1) -> (AnnData, AnnData, AnnData):
    u"""
    Load a counts matrix and metadata from csv files and build ENN/RENN-filtered views.

    :param input_dir: directory containing the csv files
    :param counts_file: name of the (gzipped) normalized-counts csv inside input_dir
    :param n_jobs: worker count passed to the imblearn samplers
    :param low_expression: genes whose total count divided by the number of
        columns is not above this value are dropped
    :return: tuple of (full AnnData, ENN-filtered AnnData, RENN-filtered AnnData)
    """
    logger.info("Reading {0}".format(input_dir))
    input_file = os.path.join(input_dir, counts_file)
    # if not os.path.exists(input_file):
    #     input_file += ".gz"
    mtx = pd.read_csv(input_file, index_col=0)
    meta = pd.read_csv(os.path.join(input_dir, "meta.csv.gz"), index_col=0)
    # NOTE(review): self-indexing no-op — possibly meant to align meta to
    # mtx's columns; confirm against callers.
    meta = meta.loc[meta.index, :]
    logger.info(mtx.shape)
    # filter low expressed genes
    genes_sum = [x / mtx.shape[1] > low_expression for x in mtx.sum(axis=1)]
    mtx = mtx.loc[genes_sum, :]
    logger.info(mtx.shape)
    # Transpose so samples become rows, as AnnData expects.
    mtx = mtx.transpose()
    data = AnnData(mtx, obs=meta)
    data.obs = meta
    logger.info("Perform ENN")
    # return_indices=True makes fit_resample also yield the kept row indices.
    enn = EditedNearestNeighbours(n_jobs=n_jobs, return_indices=True)
    mtx_enn, group_enn, idx_enn = enn.fit_resample(mtx, meta["Stage"])
    data_enn = AnnData(mtx.iloc[list(idx_enn), :], meta.iloc[idx_enn, :])
    data_enn.obs = meta.iloc[idx_enn, :]
    logger.info("Perform RENN")
    renn = RepeatedEditedNearestNeighbours(n_jobs=n_jobs, return_indices=True)
    mtx_renn, group_renn, idx_renn = renn.fit_resample(mtx, meta["Stage"])
    data_renn = AnnData(mtx.iloc[list(idx_renn), :], meta.iloc[idx_renn, :])
    data_renn.obs = meta.iloc[idx_renn, :]
    return data, data_enn, data_renn
def repeated_edited_nearest_neighbours(X, y, visualize=False, pca2d=True, pca3d=True, tsne=True, pie_evr=True):
    """Undersample (X, y) with RENN; optionally plot the resampled distribution.

    Returns the resampled (X_res, y_res) pair. When ``visualize`` is True, a
    class-distribution histogram and PCA plots are produced as side effects.
    """
    sampler = RepeatedEditedNearestNeighbours()
    X_res, y_res = sampler.fit_resample(X, y)
    if visualize == True:  # noqa: E712 — comparison kept to match module convention
        hist_over_and_undersampling(y_res)
        pca_general(X_res, y_res, d2=pca2d, d3=pca3d, pie_evr=pie_evr)
    return X_res, y_res
def train_decisiontree_FPR(configurationname, train_data, score_function, undersam=False, oversam=False, export=False):
    """Train a decision tree using SelectFpr feature selection and optional resampling.

    :param configurationname: label used for logging and exported artifacts
    :param train_data: tuple (X_train, y_train, id_to_a_train)
    :param score_function: scoring function handed to SelectFpr
    :param undersam: undersample with RENN when True (and oversam is False)
    :param oversam: oversample with SMOTE when True (and undersam is False);
        both flags True applies SMOTEENN instead
    :param export: NOTE(review): the export guard is commented out below, so
        the tree image is always exported — confirm intended.
    :return: (fitted SelectFpr selector, fitted DecisionTreeClassifier)
    """
    print("Training with configuration " + configurationname)
    X_train, y_train, id_to_a_train = train_data
    dtc = DecisionTreeClassifier(random_state=0)
    print("Feature Selection")
    # selector = SelectFpr(score_function)
    selector = SelectFpr(score_function)
    result = selector.fit(X_train, y_train)
    X_train = selector.transform(X_train)
    fitted_ids = [i for i in result.get_support(indices=True)]
    print("Apply Resampling")
    print(Counter(y_train))
    if undersam and not oversam:
        renn = RepeatedEditedNearestNeighbours()
        X_train, y_train = renn.fit_resample(X_train, y_train)
    if oversam and not undersam:
        # feature_indices_array = list(range(len(f_to_id)))
        # smote_nc = SMOTENC(categorical_features=feature_indices_array, random_state=0)
        # X_train, y_train = smote_nc.fit_resample(X_train, y_train)
        sm = SMOTE(random_state=42)
        X_train, y_train = sm.fit_resample(X_train, y_train)
    if oversam and undersam:
        smote_enn = SMOTEENN(random_state=0)
        X_train, y_train = smote_enn.fit_resample(X_train, y_train)
    print(Counter(y_train))
    print("Train Classifier")
    dtc = dtc.fit(X_train, y_train, check_input=True)
    # if export:
    print("Exporting decision tree image...")
    export_graphviz(dtc, out_file=DATAP + "/temp/trees/sltree_" + configurationname + ".dot", filled=True)
    transform(fitted_ids)
    print("Self Accuracy: " + str(dtc.score(X_train, y_train)))
    return selector, dtc
def resample(X, y):
    """Rebalance classes: RENN undersampling of the majority class followed by
    ADASYN oversampling of the minority classes, printing per-class counts
    before and after.
    """
    print('\nOriginal')
    n_classes = len(y.unique())
    for label in range(n_classes):
        print(str(label) + ': ' + str(class_size(y, label)))
    # under = RandomUnderSampler( sampling_strategy = 'majority' )
    undersampler = RepeatedEditedNearestNeighbours(sampling_strategy='majority')
    X_res, y_res = undersampler.fit_resample(X, np.ravel(y, order='C'))
    # Grow classes 1 and 2 to 70% of the (undersampled) majority class 0.
    majority_count = class_size(y_res, 0)
    oversampler = ADASYN(
        sampling_strategy={
            0: majority_count,
            1: int(majority_count * 0.7),
            2: int(majority_count * 0.7)
        })
    # over = ADASYN( sampling_strategy='not majority' )
    X_res, y_res = oversampler.fit_resample(X_res, np.ravel(y_res, order='C'))
    print('\nResampled')
    for label in range(n_classes):
        print(str(label) + ': ' + str(class_size(y_res, label)))
    return X_res, y_res
def train_decisiontree_with(configurationname, train_data, k, score_function, undersam=False, oversam=False, export=False):
    """Train a decision tree on the top-k features, with optional resampling.

    :param configurationname: label used for logging and exported artifacts
    :param train_data: tuple (X_train, y_train, id_to_a_train)
    :param k: number of features kept by SelectKBest (must be > 0)
    :param score_function: scoring function handed to SelectKBest
    :param undersam: undersample with RENN when True (and oversam is False)
    :param oversam: oversample with SMOTE when True (and undersam is False);
        both flags True applies SMOTEENN instead
    :param export: export the trained tree as a graphviz .dot file when True
    :return: (fitted SelectKBest selector, fitted DecisionTreeClassifier)
    """
    assert k > 0
    print("Training with configuration " + configurationname)
    X_train, y_train, id_to_a_train = train_data
    dtc = DecisionTreeClassifier(random_state=0)
    print("Feature Selection")
    # selector = SelectFpr(score_function)
    selector = SelectKBest(score_function, k=k)
    result = selector.fit(X_train, y_train)
    X_train = selector.transform(X_train)
    fitted_ids = [i for i in result.get_support(indices=True)]
    print("Apply Resampling")
    print(Counter(y_train))
    if undersam and not oversam:
        renn = RepeatedEditedNearestNeighbours()
        X_train, y_train = renn.fit_resample(X_train, y_train)
    if oversam and not undersam:
        # feature_indices_array = list(range(len(f_to_id)))
        # smote_nc = SMOTENC(categorical_features=feature_indices_array, random_state=0)
        # X_train, y_train = smote_nc.fit_resample(X_train, y_train)
        sm = SMOTE(random_state=42)
        X_train, y_train = sm.fit_resample(X_train, y_train)
    if oversam and undersam:
        smote_enn = SMOTEENN(random_state=0)
        X_train, y_train = smote_enn.fit_resample(X_train, y_train)
    print(Counter(y_train))
    print("Train Classifier")
    dtc = dtc.fit(X_train, y_train, check_input=True)
    # NOTE(review): indentation reconstructed — export_graphviz and transform
    # are assumed to be guarded by `export`, matching the sibling
    # train_decisiontree_with variant; confirm against the original source.
    if export:
        export_graphviz(dtc, out_file=DATAP + "/temp/trees/sltree_" + configurationname + ".dot", filled=True)
        transform(fitted_ids, configurationname)
    print("Self Accuracy: " + str(dtc.score(X_train, y_train)))
    return selector, dtc
def load_from_csv(input_dir: str) -> (AnnData, AnnData, AnnData):
    u"""
    Load normalized counts and metadata from csv files, returning the full
    dataset plus ENN- and RENN-filtered AnnData views.

    :param input_dir: directory containing normalized_counts.csv.gz and meta.csv.gz
    :return: tuple of (full AnnData, ENN-filtered AnnData, RENN-filtered AnnData)
    """
    logger.info("read")
    mtx = pd.read_csv(os.path.join(input_dir, "normalized_counts.csv.gz"), index_col=0, engine="c")
    meta = pd.read_csv(os.path.join(input_dir, "meta.csv.gz"), index_col=0, engine="c")
    # NOTE(review): self-indexing no-op — possibly intended to align to the
    # matrix columns; confirm.
    meta = meta.loc[meta.index, :]
    # Transpose so samples become rows, as AnnData expects.
    mtx = mtx.transpose()
    data = AnnData(mtx, obs=meta)
    data.obs = meta
    logger.info("enn")
    # return_indices=True makes fit_resample also yield the kept row indices.
    enn = EditedNearestNeighbours(n_jobs=10, return_indices=True)
    mtx_enn, group_enn, idx_enn = enn.fit_resample(mtx, meta["Stage"])
    data_enn = AnnData(mtx.iloc[list(idx_enn), :], meta.iloc[idx_enn, :])
    data_enn.obs = meta.iloc[idx_enn, :]
    logger.info("Repeated enn")
    renn = RepeatedEditedNearestNeighbours(n_jobs=10, return_indices=True)
    mtx_renn, group_renn, idx_renn = renn.fit_resample(mtx, meta["Stage"])
    data_renn = AnnData(mtx.iloc[list(idx_renn), :], meta.iloc[idx_renn, :])
    data_renn.obs = meta.iloc[idx_renn, :]
    return data, data_enn, data_renn
# Compare several undersampling techniques on the same dataset by printing
# how many samples each one keeps.
print(X_data.shape)
print('-------')

# ------- CNN --------
cnn = CondensedNearestNeighbour()
X_cnn, y_cnn = cnn.fit_resample(X_data, y_data)
print(X_cnn.shape)

# ------- ENN --------
enn = EditedNearestNeighbours()
X_enn, y_enn = enn.fit_resample(X_data, y_data)
print(X_enn.shape)

# ------- RENN --------
renn = RepeatedEditedNearestNeighbours()
X_renn, y_renn = renn.fit_resample(X_data, y_data)
print(X_renn.shape)

# ------- Tomek --------
tl = TomekLinks()
X_t, y_t = tl.fit_resample(X_data, y_data)
print(X_t.shape)

# ------- RUS --------
rus = RandomUnderSampler(random_state=42)
X_rus, y_rus = rus.fit_resample(X_data, y_data)
print(X_rus.shape)

print('\n\n')

# Per-sampler dataset registry; the literal continues beyond this chunk.
datasets = [{
def resampling_assigner(imb_technique, AA_ova_X_train, AA_ova_y_train,
                        AI_ova_X_train, AI_ova_y_train, AW_ova_X_train,
                        AW_ova_y_train, CC_ova_X_train, CC_ova_y_train,
                        QA_ova_X_train, QA_ova_y_train):
    """Resample each of the five one-vs-all training splits with one technique.

    :param imb_technique: one of "ADASYN", "ALLKNN", "CNN", "ENN", "IHT",
        "NCR", "NM", "OSS", "RENN", "SMOTE", "BSMOTE", "SMOTEENN",
        "SMOTETOMEK", "TOMEK", "ROS", "RUS"
    :param *_ova_X_train / *_ova_y_train: feature matrix and one-vs-all labels
        for the AA, AI, AW, CC and QA class splits
    :return: 10-tuple (AA_X_res, AA_y_res, AI_X_res, AI_y_res, AW_X_res,
        AW_y_res, CC_X_res, CC_y_res, QA_X_res, QA_y_res)
    :raises ValueError: for an unknown technique name (the original code fell
        through to a NameError at the return statement instead)
    """
    print(imb_technique)
    # One factory per technique; a fresh sampler instance is created for every
    # split, mirroring the original code that built five samplers per branch.
    factories = {
        "ADASYN": ADASYN,
        "ALLKNN": AllKNN,
        "CNN": CondensedNearestNeighbour,
        "ENN": EditedNearestNeighbours,
        "IHT": InstanceHardnessThreshold,
        "NCR": NeighbourhoodCleaningRule,
        "NM": NearMiss,
        "OSS": OneSidedSelection,
        "RENN": RepeatedEditedNearestNeighbours,
        "SMOTE": SMOTE,
        "BSMOTE": BorderlineSMOTE,
        "SMOTEENN": SMOTEENN,
        "SMOTETOMEK": SMOTETomek,
        "TOMEK": TomekLinks,
        "ROS": RandomOverSampler,
        "RUS": RandomUnderSampler,
    }
    if imb_technique not in factories:
        raise ValueError("Unknown imbalance technique: " + str(imb_technique))
    factory = factories[imb_technique]
    # (X, y, positive-class label) for each one-vs-all split, in return order.
    splits = [
        (AA_ova_X_train, AA_ova_y_train, "Accepted/Assigned"),
        (AI_ova_X_train, AI_ova_y_train, "Accepted/In Progress"),
        (AW_ova_X_train, AW_ova_y_train, "Accepted/Wait"),
        (CC_ova_X_train, CC_ova_y_train, "Completed/Closed"),
        (QA_ova_X_train, QA_ova_y_train, "Queued/Awaiting Assignment"),
    ]
    resampled = []
    for X_train, y_train, positive_label in splits:
        if imb_technique == "NCR":
            # The NCR branch of the original code binarized labels first:
            # 0 for the split's own class, 1 for everything else.
            y_train = [0 if i == positive_label else 1 for i in y_train]
        X_res, y_res = factory().fit_resample(X_train, y_train)
        resampled.extend([X_res, y_res])
    return tuple(resampled)
def all_imblearn(xx, yy):
    """Apply a battery of imbalanced-learn resamplers to (xx, yy).

    :param xx: feature matrix
    :param yy: class labels
    :return: list of ``[X_resampled, y_resampled, description]`` triples, one
        per technique, in a fixed order: oversampling, undersampling, then
        combined over/under sampling.
    """
    imblearnlist = []
    # (sampler instance, human-readable label) pairs, in the original order.
    # fix: the original mislabeled the RandomUnderSampler step with a
    # "Random Over Sampler" section header; the labels below are authoritative.
    samplers = [
        # --- over sampling ---
        (RandomOverSampler(random_state=0), 'random over sampler'),
        (SMOTE(), 'smote'),
        # NOTE(review): SMOTE(kind=...) only exists in imblearn < 0.4 —
        # confirm the pinned version before upgrading.
        (SMOTE(kind='borderline1'), 'smote borderline1'),
        (SMOTE(kind='borderline2'), 'smote borderline2'),
        (SMOTE(kind='svm'), 'smote svm'),
        (SMOTENC(categorical_features=[0, 2], random_state=0), 'smotenc'),
        # ADASYN was disabled in the original:
        # X_resampled, y_resampled = ADASYN.fit_resample(xx, yy)
        # imblearnlist.append([X_resampled, y_resampled, 'adasyn'])
        # --- under sampling ---
        (ClusterCentroids(random_state=0), 'cluster centroids'),
        (RandomUnderSampler(random_state=0), 'random under sampler'),
        (NearMiss(version=1), 'near miss 1'),
        (NearMiss(version=2), 'near miss 2'),
        (NearMiss(version=3), 'near miss 3'),
        (EditedNearestNeighbours(), 'edited nearest neighbours'),
        (RepeatedEditedNearestNeighbours(), 'repeated edited nearest neighbours'),
        (AllKNN(), 'allKNN'),
        (CondensedNearestNeighbour(random_state=0), 'Condensed Nearest Neighbour'),
        (OneSidedSelection(random_state=0), 'One Sided Selection'),
        (NeighbourhoodCleaningRule(), 'Neighbourhood Cleaning Rule'),
        # --- combined over- and under sampling ---
        (SMOTEENN(random_state=0), 'SMOTEENN'),
        (SMOTETomek(random_state=0), 'SMOTETomek'),
    ]
    for sampler, label in samplers:
        X_resampled, y_resampled = sampler.fit_resample(xx, yy)
        imblearnlist.append([X_resampled, y_resampled, label])
    return imblearnlist
def test_renn_not_good_object():
    """A plain string passed as n_neighbors must be rejected at resample time."""
    bad_neighbors = 'rnd'
    sampler = RepeatedEditedNearestNeighbours(n_neighbors=bad_neighbors, kind_sel='mode')
    with raises(ValueError):
        sampler.fit_resample(X, Y)
def test_renn_iter_wrong():
    """A negative max_iter must raise ValueError when resampling."""
    sampler = RepeatedEditedNearestNeighbours(max_iter=-1)
    with pytest.raises(ValueError):
        sampler.fit_resample(X, Y)
def test_deprecation_random_state():
    """Passing random_state should emit the 0.4 deprecation warning."""
    sampler = RepeatedEditedNearestNeighbours(random_state=0)
    expected_message = "'random_state' is deprecated from 0.4"
    with warns(DeprecationWarning, match=expected_message):
        sampler.fit_resample(X, Y)
def test_renn_not_good_object():
    """n_neighbors given as a bare string is rejected with ValueError."""
    sampler = RepeatedEditedNearestNeighbours(n_neighbors="rnd", kind_sel="mode")
    with pytest.raises(ValueError):
        sampler.fit_resample(X, Y)
# Next a nearest neighbors undersampling technique is applied to the majority. # In[27]: from imblearn.under_sampling import RepeatedEditedNearestNeighbours # First with n_neighbors = 25 # In[28]: enn = RepeatedEditedNearestNeighbours(sampling_strategy='majority', n_neighbors=25, n_jobs=3, random_state=101) X_resamp2, y_resamp2 = enn.fit_resample(X_train, y_train) # In[29]: #number of resampled majority samples y_resamp2.shape[0] - y_resamp2.sum() # Now n_neighbors is decreased to 20 # In[30]: enn = RepeatedEditedNearestNeighbours(sampling_strategy='majority', n_neighbors=20, n_jobs=3, random_state=101) X_resamp3, y_resamp3 = enn.fit_resample(X_train, y_train)
# advanced undersampling ====================================================== ''' ALLKNN ''' from imblearn.under_sampling import AllKNN, NeighbourhoodCleaningRule # define undersampling strategy under_allknn = AllKNN() # fit and apply the transform X, y = under_allknn.fit_resample(X, y) # summarize class distribution print(Counter(y)) ''' RENN ''' from imblearn.under_sampling import CondensedNearestNeighbour, EditedNearestNeighbours, RepeatedEditedNearestNeighbours # define undersampling strategy under_renn = RepeatedEditedNearestNeighbours() # fit and apply the transform X, y = under_renn.fit_resample(X, y) # summarize class distribution print(Counter(y)) # advanced oversampling ======================================================= ''' ADASYN ''' from imblearn.over_sampling import ADASYN # define oversampling strategy over_ada = ADASYN(random_state=42) # fit and apply the transform X, y = over_ada.fit_resample(X, y) # summarize class distribution print(Counter(y)) ''' SMOTE ''' from imblearn.over_sampling import SMOTE, SMOTENC, BorderlineSMOTE # define oversampling strategy
def repeated_edited_nearest_neighbours(X, y):
    """Undersample (X, y) using imblearn's RepeatedEditedNearestNeighbours.

    Returns the resampled feature matrix and target vector as a tuple,
    exactly as produced by ``fit_resample``.
    """
    resampler = RepeatedEditedNearestNeighbours()
    return resampler.fit_resample(X, y)
def resampling_assigner(imb_technique, AP_ova_X_train, AP_ova_y_train,
                        PM_ova_X_train, PM_ova_y_train,
                        SC_ova_X_train, SC_ova_y_train):
    """Resample the three one-vs-all training sets with the chosen technique.

    Parameters
    ----------
    imb_technique : str
        Name of the imbalance-handling technique: one of "ADASYN", "ALLKNN",
        "CNN", "ENN", "IHT", "NCR", "NM", "OSS", "RENN", "SMOTE", "BSMOTE",
        "SMOTEENN", "SMOTETOMEK", "TOMEK", "ROS", "RUS".
    AP_ova_X_train, AP_ova_y_train : array-like
        One-vs-all training data for the "Add penalty" class.
    PM_ova_X_train, PM_ova_y_train : array-like
        One-vs-all training data for the "Payment" class.
    SC_ova_X_train, SC_ova_y_train : array-like
        One-vs-all training data for the "Send for Credit Collection" class.

    Returns
    -------
    tuple
        (AP_X_res, AP_y_res, PM_X_res, PM_y_res, SC_X_res, SC_y_res): the
        resampled versions of the three training sets.

    Raises
    ------
    KeyError
        If ``imb_technique`` is not a recognised technique name.  (The
        original code instead crashed with a NameError at the final return.)
    """
    print(imb_technique)
    # One sampler class per supported technique.  A fresh default-configured
    # instance is created for each of the three one-vs-all datasets, which
    # matches the original per-branch behaviour of building three samplers.
    sampler_classes = {
        "ADASYN": ADASYN,
        "ALLKNN": AllKNN,
        "CNN": CondensedNearestNeighbour,
        "ENN": EditedNearestNeighbours,
        "IHT": InstanceHardnessThreshold,
        # BUG FIX: the original "NCR" branch stored its samplers in
        # AP_iht/PM_iht/SC_iht but then called AP_ncr/PM_ncr/SC_ncr,
        # raising NameError.  Dispatching on the class fixes the mismatch.
        "NCR": NeighbourhoodCleaningRule,
        "NM": NearMiss,
        "OSS": OneSidedSelection,
        "RENN": RepeatedEditedNearestNeighbours,
        "SMOTE": SMOTE,
        "BSMOTE": BorderlineSMOTE,
        "SMOTEENN": SMOTEENN,
        "SMOTETOMEK": SMOTETomek,
        "TOMEK": TomekLinks,
        "ROS": RandomOverSampler,
        "RUS": RandomUnderSampler,
    }
    sampler_cls = sampler_classes[imb_technique]

    if imb_technique == "NCR":
        # Only the NCR branch binarises the labels (0 = target class,
        # 1 = everything else) before resampling, exactly as the original
        # code did.  NOTE(review): confirm this asymmetry is intentional —
        # no other technique converts the string labels.
        AP_ova_y_train = [0 if i == "Add penalty" else 1
                          for i in AP_ova_y_train]
        PM_ova_y_train = [0 if i == "Payment" else 1 for i in PM_ova_y_train]
        SC_ova_y_train = [0 if i == "Send for Credit Collection" else 1
                          for i in SC_ova_y_train]

    AP_X_res, AP_y_res = sampler_cls().fit_resample(AP_ova_X_train,
                                                    AP_ova_y_train)
    PM_X_res, PM_y_res = sampler_cls().fit_resample(PM_ova_X_train,
                                                    PM_ova_y_train)
    SC_X_res, SC_y_res = sampler_cls().fit_resample(SC_ova_X_train,
                                                    SC_ova_y_train)
    return AP_X_res, AP_y_res, PM_X_res, PM_y_res, SC_X_res, SC_y_res
# Project the ENN-resampled data into the 2-D PCA space for plotting.
# NOTE(review): `pca`, `X_vis`, `ax2`/`ax3`, `X_resampled`, `idx_resampled`
# and `plot_resampling` are defined earlier in the file, outside this chunk.
X_res_vis = pca.transform(X_resampled)
# Indices of the original samples that the sampler dropped.
idx_samples_removed = np.setdiff1d(np.arange(X_vis.shape[0]), idx_resampled)
reduction_str = ('Reduced {:.2f}%'.format(
    100 * (1 - float(len(X_resampled)) / len(X))))
print(reduction_str)
# Plot removed samples faintly (alpha=.2) behind the kept ones.
c3 = ax2.scatter(X_vis[idx_samples_removed, 0],
                 X_vis[idx_samples_removed, 1],
                 alpha=.2, label='Removed samples', c='g')
plot_resampling(ax2, X_res_vis, y_resampled, 'ENN - ' + reduction_str)
# Apply the RENN
print('RENN')
# return_indices=True makes fit_resample also return the kept-sample indices.
renn = RepeatedEditedNearestNeighbours(return_indices=True)
X_resampled, y_resampled, idx_resampled = renn.fit_resample(X, y)
X_res_vis = pca.transform(X_resampled)
idx_samples_removed = np.setdiff1d(np.arange(X_vis.shape[0]), idx_resampled)
reduction_str = ('Reduced {:.2f}%'.format(
    100 * (1 - float(len(X_resampled)) / len(X))))
print(reduction_str)
ax3.scatter(X_vis[idx_samples_removed, 0],
            X_vis[idx_samples_removed, 1],
            alpha=.2, label='Removed samples', c='g')
plot_resampling(ax3, X_res_vis, y_resampled, 'RENN - ' + reduction_str)
# Apply the AllKNN
print('AllKNN')
allknn = AllKNN(return_indices=True)
def test_renn_not_good_object():
    """fit_resample must reject a bogus n_neighbors with ValueError."""
    resampler = RepeatedEditedNearestNeighbours(n_neighbors='rnd',
                                                kind_sel='mode')
    with pytest.raises(ValueError):
        resampler.fit_resample(X, Y)
def test_deprecation_random_state():
    """Setting random_state is deprecated and must warn during resampling."""
    resampler = RepeatedEditedNearestNeighbours(random_state=0)
    with warns(DeprecationWarning,
               match="'random_state' is deprecated from 0.4"):
        resampler.fit_resample(X, Y)
def test_renn_iter_attribute(max_iter, n_iter):
    """n_iter_ must report the number of editing passes actually performed."""
    sampler = RepeatedEditedNearestNeighbours(max_iter=max_iter)
    sampler.fit_resample(X, Y)
    assert sampler.n_iter_ == n_iter
def test_renn_fit_resample():
    """Check RENN resampling against a fixed ground-truth result.

    Runs RepeatedEditedNearestNeighbours with default parameters on the
    module-level fixture (X, Y) and compares the resampled data to
    precomputed expected arrays, then sanity-checks the iteration counter.
    """
    renn = RepeatedEditedNearestNeighbours()
    X_resampled, y_resampled = renn.fit_resample(X, Y)
    # Expected surviving samples (26 rows) after repeated editing.
    X_gt = np.array([
        [-0.53171468, -0.53735182],
        [-0.88864036, -0.33782387],
        [-0.46226554, -0.50481004],
        [-0.34474418, 0.21969797],
        [1.02956816, 0.36061601],
        [1.12202806, 0.33811558],
        [0.73489726, 0.43915195],
        [0.50307437, 0.498805],
        [0.84929742, 0.41042894],
        [0.62649535, 0.46600596],
        [0.98382284, 0.37184502],
        [0.69804044, 0.44810796],
        [0.04296502, -0.37981873],
        [0.28294738, -1.00125525],
        [0.34218094, -0.58781961],
        [0.2096964, -0.61814058],
        [1.59068979, -0.96622933],
        [0.73418199, -0.02222847],
        [0.79270821, -0.41386668],
        [1.16606871, -0.25641059],
        [1.0304995, -0.16955962],
        [0.48921682, -1.38504507],
        [-0.03918551, -0.68540745],
        [0.24991051, -1.00864997],
        [0.80541964, -0.34465185],
        [0.1732627, -1.61323172],
    ])
    # Expected labels for the surviving samples (4 / 8 / 14 per class).
    y_gt = np.array([
        0,
        0,
        0,
        0,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
    ])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    # The editing loop must have run at least once and stopped within bounds.
    assert 0 < renn.n_iter_ <= renn.max_iter