def rnn_undersampling(
        self,
        x: pandas.DataFrame,
        y: numpy.ndarray,
        neighbors: int) -> typing.Tuple[numpy.ndarray, numpy.ndarray]:
    """Undersample the training data with Repeated Edited Nearest Neighbours.

    Args:
        x: training covariates for the ML model; passed through
            ``self.check_id`` before resampling.
        y: binary training outcomes for the ML model.
        neighbors: forwarded to the undersampler as ``n_neighbors``.

    Returns:
        The resampled covariates and the resampled outcomes. The indices of
        the retained observations are requested from the undersampler but
        are not returned.
    """
    x = self.check_id(x)

    undersampler_options = dict(
        random_state=82,
        n_neighbors=neighbors,
        return_indices=True,
        kind_sel="mode",
        max_iter=400,
        ratio="majority",
    )
    undersampler = RepeatedEditedNearestNeighbours(**undersampler_options)

    # Deep copies keep the caller's objects untouched by the resampler.
    X_resampled, y_resampled, _ = undersampler.fit_sample(
        copy.deepcopy(x), copy.deepcopy(y))

    reports = (
        (X_resampled,
         "RNN undersampling yielded {} number of X_resampled observations"),
        (y_resampled,
         "RNN undersampling yielded {} number of y_resampled observations"),
    )
    for resampled, template in reports:
        LOGGER.info(resampled)
        LOGGER.info(template.format(len(resampled)))

    return X_resampled, y_resampled
def under_sampling(X, y, method):
    """Under-sample ``(X, y)`` with the imbalanced-learn sampler named ``method``.

    Args:
        X: feature matrix handed to the sampler's ``fit_resample``.
        y: target labels handed to the sampler's ``fit_resample``.
        method: class name of the sampler to use. Supported values are
            'ClusterCentroids', 'RandomUnderSampler', 'NearMiss',
            'EditedNearestNeighbours', 'RepeatedEditedNearestNeighbours',
            'AllKNN', 'NeighbourhoodCleaningRule' and 'OneSidedSelection'.
            Every sampler is built with its default arguments.

    Returns:
        The pair ``(X_resampled, y_resampled)`` produced by the sampler.

    Raises:
        ValueError: if ``method`` is not one of the supported names.
    """
    if method == 'ClusterCentroids':
        model = ClusterCentroids()
    elif method == 'RandomUnderSampler':
        model = RandomUnderSampler()
    elif method == 'NearMiss':
        model = NearMiss()
    elif method == 'EditedNearestNeighbours':
        model = EditedNearestNeighbours()
    elif method == 'RepeatedEditedNearestNeighbours':
        model = RepeatedEditedNearestNeighbours()
    elif method == 'AllKNN':
        model = AllKNN()
    elif method == 'NeighbourhoodCleaningRule':
        model = NeighbourhoodCleaningRule()
    elif method == 'OneSidedSelection':
        model = OneSidedSelection()
    else:
        # Previously an unknown name fell through every branch and crashed
        # with UnboundLocalError on the return statement.
        raise ValueError(
            "Unknown under-sampling method: {!r}".format(method))

    X_resampled, y_resampled = model.fit_resample(X, y)
    return X_resampled, y_resampled
class ResamplingAlgorithms(Enum):
    """Catalogue of resampling algorithms.

    Each member's value is a ``(display name, sampler instance)`` pair.
    """

    # Over-sampling and combined methods.
    RO = ("Random Over-sampling", RandomOverSampler(random_state=1))
    SMOTE = ("Smote", SMOTE(random_state=1))
    ADASYN = ("ADASYN", ADASYN(random_state=1))
    SMOTE_TL = ('SMOTE+TL', SMOTETomek(random_state=1))
    SMOTE_ENN = ('SMOTE+ENN', SMOTEENN(random_state=1))
    SMOTE_BOOST = ("SMOTEBoost", smote_boost.SMOTEBoost())

    # Under-sampling methods.
    RU = ("Random Under-sampling", RandomUnderSampler(random_state=1))
    CLUSTERCENTROIDS = ("ClusterCentroids", ClusterCentroids(random_state=1))
    TOMEK_LINKS = ("TomekLinks", TomekLinks())
    NM1 = ("NM1", NearMiss(version=1))
    NM2 = ("NM2", NearMiss(version=2))
    NM3 = ("NM3", NearMiss(version=3))
    CNN = ("CNN", CondensedNearestNeighbour(random_state=1))
    OSS = ("OneSidedSelection", OneSidedSelection(random_state=1))
    ENN = ('ENN', EditedNearestNeighbours())
    NCL = ('NCL', NeighbourhoodCleaningRule())
    IHT = ('IHT', (InstanceHardnessThreshold(random_state=1)))
    RENN = ('RENN', RepeatedEditedNearestNeighbours())
    AllKNN = ('AllKNN', AllKNN())

    @classmethod
    def get_algorithm_by_name(cls, name):
        """Return the first member whose display name equals ``name``.

        Falls back to ``ResamplingAlgorithms.RO`` when no member matches.
        """
        for algorithm in ResamplingAlgorithms:
            if algorithm.value[0] == name:
                return algorithm
        return ResamplingAlgorithms.RO
def test_renn_fit_sample():
    """Check fit_sample output against the stored reference arrays."""
    sampler = RepeatedEditedNearestNeighbours(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Reference samples expected to survive the cleaning.
    expected_X = np.array([
        [-0.53171468, -0.53735182],
        [-0.88864036, -0.33782387],
        [-0.46226554, -0.50481004],
        [-0.34474418, 0.21969797],
        [1.02956816, 0.36061601],
        [1.12202806, 0.33811558],
        [0.73489726, 0.43915195],
        [0.50307437, 0.498805],
        [0.84929742, 0.41042894],
        [0.62649535, 0.46600596],
        [0.98382284, 0.37184502],
        [0.69804044, 0.44810796],
        [0.04296502, -0.37981873],
        [0.28294738, -1.00125525],
        [0.34218094, -0.58781961],
        [0.2096964, -0.61814058],
        [1.59068979, -0.96622933],
        [0.73418199, -0.02222847],
        [0.79270821, -0.41386668],
        [1.16606871, -0.25641059],
        [1.0304995, -0.16955962],
        [0.48921682, -1.38504507],
        [-0.03918551, -0.68540745],
        [0.24991051, -1.00864997],
        [0.80541964, -0.34465185],
        [0.1732627, -1.61323172],
    ])
    expected_y = np.array([
        0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2
    ])

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def test_renn_init():
    """Check the attribute values set by the constructor."""
    sampler = RepeatedEditedNearestNeighbours(random_state=RND_SEED)

    expected_attributes = (
        ('n_neighbors', 3),
        ('kind_sel', 'all'),
        ('n_jobs', 1),
        ('random_state', RND_SEED),
    )
    for attribute, expected in expected_attributes:
        assert getattr(sampler, attribute) == expected
def test_renn_fit_sample_mode():
    """Check fit_sample with kind_sel='mode' and a NearestNeighbors object."""
    neighbours = NearestNeighbors(n_neighbors=4)
    sampler = RepeatedEditedNearestNeighbours(
        n_neighbors=neighbours, random_state=RND_SEED, kind_sel='mode')
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Reference samples expected to survive the cleaning.
    expected_X = np.array([
        [-0.53171468, -0.53735182],
        [-0.88864036, -0.33782387],
        [-0.46226554, -0.50481004],
        [-0.34474418, 0.21969797],
        [-0.12840393, 0.66446571],
        [1.02956816, 0.36061601],
        [1.12202806, 0.33811558],
        [-0.35946678, 0.72510189],
        [2.94290565, -0.13986434],
        [-1.10146139, 0.91782682],
        [0.73489726, 0.43915195],
        [-0.28479268, 0.70459548],
        [1.84864913, 0.14729596],
        [0.50307437, 0.498805],
        [0.84929742, 0.41042894],
        [0.62649535, 0.46600596],
        [1.67314371, 0.19231498],
        [0.98382284, 0.37184502],
        [0.69804044, 0.44810796],
        [1.32319756, -0.13181616],
        [0.04296502, -0.37981873],
        [0.28294738, -1.00125525],
        [0.34218094, -0.58781961],
        [0.2096964, -0.61814058],
        [1.59068979, -0.96622933],
        [0.73418199, -0.02222847],
        [0.79270821, -0.41386668],
        [1.16606871, -0.25641059],
        [1.0304995, -0.16955962],
        [0.48921682, -1.38504507],
        [-0.03918551, -0.68540745],
        [0.24991051, -1.00864997],
        [0.80541964, -0.34465185],
        [0.1732627, -1.61323172],
    ])
    expected_y = np.array([
        0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
    ])

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def under_sample(X, y, sampler="RandomUnderSampler"):
    """Under-sample ``(X, y)`` with the sampler selected by name.

    Args:
        X: feature matrix handed to the sampler's ``fit_resample``.
        y: target labels; their class counts are printed before and after
            resampling.
        sampler: class name of the sampler to use (default
            "RandomUnderSampler"). Every sampler is built with its default
            arguments.

    Returns:
        The pair ``(X_resampled, y_resampled)`` produced by the sampler.

    Raises:
        KeyError: if ``sampler`` is not one of the supported names.
    """
    # Map names to classes so that only the requested sampler is instantiated
    # (the table used to build all ten estimators on every call).
    sampler_classes = {
        "RandomUnderSampler": RandomUnderSampler,
        "ClusterCentroids": ClusterCentroids,
        "NearMiss": NearMiss,
        "InstanceHardnessThreshold": InstanceHardnessThreshold,
        "CondensedNearestNeighbour": CondensedNearestNeighbour,
        "EditedNearestNeighbours": EditedNearestNeighbours,
        "RepeatedEditedNearestNeighbours": RepeatedEditedNearestNeighbours,
        "AllKNN": AllKNN,
        "NeighbourhoodCleaningRule": NeighbourhoodCleaningRule,
        "OneSidedSelection": OneSidedSelection,
    }

    # List of all supported samplers, in case the caller wants to iterate
    # over them; derived from the table so the two cannot drift apart.
    print(list(sampler_classes))

    resampler = sampler_classes[sampler]()

    # Show the class counts of y before and after resampling.
    print("before", sorted(Counter(y).items()))
    X_resampled, y_resampled = resampler.fit_resample(X, y)
    print("after", sorted(Counter(y_resampled).items()))
    print('===' * 4, 'under_sample finished')
    return X_resampled, y_resampled
def test_renn_fit_sample_with_indices():
    """Check fit_sample output, including the returned sample indices."""
    sampler = RepeatedEditedNearestNeighbours(
        return_indices=True, random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    # Reference samples expected to survive the cleaning.
    expected_X = np.array([
        [-0.53171468, -0.53735182],
        [-0.88864036, -0.33782387],
        [-0.46226554, -0.50481004],
        [-0.34474418, 0.21969797],
        [1.02956816, 0.36061601],
        [1.12202806, 0.33811558],
        [0.73489726, 0.43915195],
        [0.50307437, 0.498805],
        [0.84929742, 0.41042894],
        [0.62649535, 0.46600596],
        [0.98382284, 0.37184502],
        [0.69804044, 0.44810796],
        [0.04296502, -0.37981873],
        [0.28294738, -1.00125525],
        [0.34218094, -0.58781961],
        [0.2096964, -0.61814058],
        [1.59068979, -0.96622933],
        [0.73418199, -0.02222847],
        [0.79270821, -0.41386668],
        [1.16606871, -0.25641059],
        [1.0304995, -0.16955962],
        [0.48921682, -1.38504507],
        [-0.03918551, -0.68540745],
        [0.24991051, -1.00864997],
        [0.80541964, -0.34465185],
        [0.1732627, -1.61323172],
    ])
    expected_y = np.array([
        0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2
    ])
    expected_idx = np.array([
        6, 13, 32, 39, 4, 5, 16, 22, 23, 24, 30, 37, 2, 11, 12, 17, 20, 21,
        25, 26, 28, 31, 33, 34, 35, 36
    ])

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
    assert_array_equal(idx_under, expected_idx)
def getsampler(self, type):
    """Build the sampler registered under the name ``type``.

    Args:
        type: sampler name. One of 'none', 'randomunder', 'nearmiss',
            'allknn', 'condensednn', 'editednn', 'repeatededitednn',
            'tomeklinks', 'randomover', 'smote', 'adasyn', 'smotenc' or
            'quality'. (The parameter shadows the ``type`` builtin; the name
            is kept so keyword callers keep working.)

    Returns:
        The sampler instance. Except for 'none' and 'quality', a sampler
        that exposes a ``random_state`` parameter gets it set to
        ``self.random_state``.

    Raises:
        SystemExit: with exit code 1, after printing a message, when
            ``type`` is not a supported name.
    """
    if type == 'none':
        sampler = NoSampler()
    elif type == 'randomunder':
        sampler = RandomUnderSampler()
    elif type == 'nearmiss':
        sampler = NearMiss()
    elif type == 'allknn':
        sampler = AllKNN()
    elif type == 'condensednn':
        sampler = CondensedNearestNeighbour()
    elif type == 'editednn':
        sampler = EditedNearestNeighbours()
    elif type == 'repeatededitednn':
        sampler = RepeatedEditedNearestNeighbours()
    elif type == 'tomeklinks':
        sampler = TomekLinks()
    elif type == 'randomover':
        sampler = RandomOverSampler()
    elif type == 'smote':
        sampler = SMOTE()
    elif type == 'adasyn':
        sampler = ADASYN()
    elif type == 'smotenc':
        sampler = SMOTENC()
    elif type == 'quality':
        sampler = QualitySampler(self.n_init)
    else:
        print("Unsupported sampler %s" % type)
        # ``exit`` is an interactive-shell helper injected by ``site`` and
        # may be absent; raising SystemExit gives the same exit code.
        raise SystemExit(1)

    # The name check comes first so get_params() is never called on the
    # 'none' and 'quality' samplers.
    if (type not in ('none', 'quality')
            and 'random_state' in sampler.get_params()):
        sampler.set_params(random_state=self.random_state)
    return sampler
def test_renn_sample_wt_fit():
    """Check that calling sample before fit raises RuntimeError."""
    unfitted_sampler = RepeatedEditedNearestNeighbours(random_state=RND_SEED)
    assert_raises(RuntimeError, unfitted_sampler.sample, X, Y)
def sampler(name, ratio, random_state=0, return_indices=True, **kwargs):
    """Build an under-sampler from its short name.

    Args:
        name: one of "rus" (RandomUnderSampler), "nm" (NearMiss),
            "enn" (EditedNearestNeighbours),
            "renn" (RepeatedEditedNearestNeighbours), "allknn" (AllKNN)
            or "tl" (TomekLinks).
        ratio: forwarded as ``ratio`` to the "rus" and "nm" samplers only;
            the other samplers are built without it.
        random_state: forwarded to every sampler.
        return_indices: forwarded to every sampler.
        **kwargs: extra keyword arguments forwarded to the sampler.

    Returns:
        The configured (unfitted) sampler instance.

    Raises:
        ValueError: if ``name`` is not one of the supported short names.
    """
    if name == "rus":
        return RandomUnderSampler(
            ratio=ratio,
            return_indices=return_indices,
            random_state=random_state,
            **kwargs
        )
    if name == "nm":
        return NearMiss(
            ratio=ratio,
            return_indices=return_indices,
            random_state=random_state,
            **kwargs
        )
    if name == "enn":
        return EditedNearestNeighbours(
            return_indices=return_indices,
            random_state=random_state,
            **kwargs
        )
    if name == "renn":
        return RepeatedEditedNearestNeighbours(
            return_indices=return_indices,
            random_state=random_state,
            **kwargs
        )
    if name == "allknn":
        return AllKNN(
            return_indices=return_indices,
            random_state=random_state,
            **kwargs
        )
    if name == "tl":
        return TomekLinks(
            return_indices=return_indices,
            random_state=random_state,
            **kwargs
        )
    # The error used to be raised without any message.
    raise ValueError(
        "Unknown sampler name {!r}; expected one of "
        "'rus', 'nm', 'enn', 'renn', 'allknn', 'tl'".format(name))
def test_renn_init():
    """Check the attribute values set by the constructor."""
    sampler = RepeatedEditedNearestNeighbours(random_state=RND_SEED)

    expected_attributes = (
        ('n_neighbors', 3),
        ('kind_sel', 'all'),
        ('n_jobs', -1),
        ('random_state', RND_SEED),
    )
    for attribute, expected in expected_attributes:
        assert_equal(getattr(sampler, attribute), expected)
def undersampled_data_split(df, test_size=0.3):
    """Split ``df`` into train/test sets and clean the training part with RENN.

    The column named 'class' is used as the target; every other column is a
    feature. Only the training split is resampled.

    Returns:
        ``(X_train, X_test, y_train, y_test)``.
    """
    is_target_column = df.columns.isin(['class'])
    features = df.loc[:, ~is_target_column]
    target = df.loc[:, is_target_column]

    X_train, X_test, y_train, y_test = train_test_split(
        features.values,
        target.values.flatten(),
        test_size=test_size,
        random_state=42,
    )

    cleaner = RepeatedEditedNearestNeighbours()
    X_train, y_train = cleaner.fit_resample(X_train, y_train)
    return X_train, X_test, y_train, y_test
def test_continuous_error():
    """Check that fitting on a continuous target emits a UserWarning."""
    continuous_target = np.linspace(0, 1, 40)
    sampler = RepeatedEditedNearestNeighbours(random_state=RND_SEED)
    assert_warns(UserWarning, sampler.fit, X, continuous_target)
def train_decisiontree_with(configurationname, train_data, k, score_function,
                            undersam=False, oversam=False, export=False,
                            **kwargs):
    """Select the ``k`` best features, optionally resample, and fit a tree.

    Args:
        configurationname: label used in log output and in the exported
            ``.dot`` file name.
        train_data: triple ``(X_train, y_train, id_to_a_train)``; the third
            element is not used here.
        k: number of features kept by ``SelectKBest``; must be positive.
        score_function: scoring function handed to ``SelectKBest``.
        undersam: if set without ``oversam``, clean the training data with
            RepeatedEditedNearestNeighbours.
        oversam: if set without ``undersam``, over-sample with SMOTE. When
            both flags are set, SMOTEENN is used instead.
        export: if set, export the fitted tree to a Graphviz ``.dot`` file
            and post-process it with ``transform``.
        **kwargs: only ``max_depth`` is read (default ``None``).

    Returns:
        The fitted ``(selector, classifier)`` pair.
    """
    assert k > 0
    print("Training with configuration " + configurationname)
    X_train, y_train, _ = train_data

    dtc = DecisionTreeClassifier(criterion="entropy", random_state=0,
                                 max_depth=kwargs.get("max_depth"))

    print("Feature Selection")
    # The selector used to be constructed twice in a row; once is enough.
    selector = SelectKBest(score_function, k=k)
    selector = selector.fit(X_train, y_train)
    X_train = selector.transform(X_train)
    fitted_ids = list(selector.get_support(indices=True))

    print("Apply Resampling")
    print(Counter(y_train))
    # The three cases are mutually exclusive.
    if undersam and not oversam:
        renn = RepeatedEditedNearestNeighbours()
        X_train, y_train = renn.fit_resample(X_train, y_train)
    elif oversam and not undersam:
        sm = SMOTE(random_state=42)
        X_train, y_train = sm.fit_resample(X_train, y_train)
    elif oversam and undersam:
        smote_enn = SMOTEENN(random_state=0)
        X_train, y_train = smote_enn.fit_resample(X_train, y_train)
    print(Counter(y_train))

    print("Train Classifier")
    dtc = dtc.fit(X_train, y_train, check_input=True)

    if export:
        print("Exporting tree to graph...")
        export_graphviz(
            dtc,
            out_file=DATAP + "/temp/trees/sltree_" + configurationname + ".dot",
            filled=True)
        # NOTE(review): assumed to belong to the export step because it takes
        # the same configuration name -- confirm.
        transform(fitted_ids, configurationname)

    print("Self Accuracy: " + str(dtc.score(X_train, y_train)))
    return selector, dtc
def Resampling(train_x, train_y, resampling_method):
    """Resample the training set in place with the named method.

    The labels in ``train_y.data`` are label-encoded, then
    ``train_x.data`` and ``train_y.data`` are replaced by the resampled
    arrays. Nothing is returned.

    Args:
        train_x: object whose ``data`` attribute holds the features.
        train_y: object whose ``data`` attribute holds the labels.
        resampling_method: class name of the sampler. Under-sampling:
            "ClusterCentroids", "CondensedNearestNeighbour",
            "EditedNearestNeighbours", "RepeatedEditedNearestNeighbours",
            "AllKNN", "NearMiss", "NeighbourhoodCleaningRule",
            "RandomUnderSampler", "TomekLinks". Over-sampling:
            "BorderlineSMOTE", "KMeansSMOTE", "RandomOverSampler", "SMOTE".

    Raises:
        ValueError: if ``resampling_method`` is not a supported name. The
            inputs are left untouched in that case.
    """
    # The sampler is chosen before anything is mutated, so an unknown name
    # fails cleanly instead of after train_y has been re-encoded.

    # ---- UNDER-SAMPLING ------ #
    if resampling_method == "ClusterCentroids":
        resample = ClusterCentroids(voting='hard', random_state=42)
    elif resampling_method == "CondensedNearestNeighbour":
        resample = CondensedNearestNeighbour(n_neighbors=7, random_state=42)
    elif resampling_method == "EditedNearestNeighbours":
        resample = EditedNearestNeighbours(n_neighbors=7, kind_sel='mode',
                                           n_jobs=-1)
    elif resampling_method == "RepeatedEditedNearestNeighbours":
        resample = RepeatedEditedNearestNeighbours(n_neighbors=7,
                                                   kind_sel='mode', n_jobs=-1)
    elif resampling_method == "AllKNN":
        resample = AllKNN(n_neighbors=7, kind_sel='mode', allow_minority=True,
                          n_jobs=-1)
    elif resampling_method == "NearMiss":
        resample = NearMiss(n_neighbors=7, n_jobs=-1)
    elif resampling_method == "NeighbourhoodCleaningRule":
        resample = NeighbourhoodCleaningRule(n_neighbors=7, kind_sel='all')
    elif resampling_method == "RandomUnderSampler":
        resample = RandomUnderSampler(random_state=42)
    elif resampling_method == "TomekLinks":
        resample = TomekLinks(n_jobs=-1)
    # ---- OVER-SAMPLING ------ #
    elif resampling_method == "BorderlineSMOTE":
        resample = BorderlineSMOTE(random_state=42, n_jobs=-1)
    elif resampling_method == "KMeansSMOTE":
        resample = KMeansSMOTE(random_state=42)
    elif resampling_method == "RandomOverSampler":
        # This branch used to test for "RandomUnderSampler", which silently
        # replaced the under-sampler with an over-sampler and made
        # "RandomOverSampler" unreachable.
        resample = RandomOverSampler(random_state=42)
    elif resampling_method == "SMOTE":
        resample = SMOTE(random_state=42, n_jobs=-1)
    else:
        raise ValueError(
            "Unknown resampling method: {!r}".format(resampling_method))

    train_y.data = LabelEncoder().fit_transform(train_y.data)

    # Uncomment the next line to display a pie chart of the class
    # distribution before resampling.
    # plotGraphics.piePlot(train_y, "Before Resampling")

    # Transform the dataset.
    train_x.data, train_y.data = resample.fit_resample(train_x.data,
                                                       train_y.data)
def classification_results(train,test): #Derivation of NBDriver using training data
    """Train the voting ensemble on ``train`` and evaluate it on ``test``.

    Arguments:
        train = feature matrix derived from Brown et al.; must contain a
            'Label' column.
        test = feature matrix derived from Martelotto et al.; must contain a
            'Label' column.
    Returns:
        best_model = ensemble model fitted in the last loop iteration
        X_red = dataframe, derived after sampling, that was used to train
            that model
        scores = list of ROC AUC values, one per candidate threshold, from
            the last loop iteration
    """
    # NOTE(review): acc, auc, c, m and s are never used in this function;
    # sen and spe are rebound to scalars inside the loop.
    sen=[];spe=[];acc=[];auc=[];c=[];m=[];s=[]
    # Separate features from the 'Label' target for both datasets.
    train_x=train.drop('Label',axis=1);train_y=train['Label'];
    test_x=test.drop('Label',axis=1);test_y=test['Label'];
    # Undersample the training data with Repeated Edited Nearest Neighbours.
    samp=RepeatedEditedNearestNeighbours(random_state=42)
    X_samp,y_samp=samp.fit_resample(train_x,train_y)
    # Re-attach the original column names to the resampled features.
    X_samp = pd.DataFrame(X_samp, columns = train_x.columns)
    # Experiment with different numbers of top features derived from the
    # tree-based feature extraction method.
    top_n_feats=[30,40,50,60,70]
    X_r=feature_reduction_using_trees(X_samp,y_samp)
    cols=X_r.columns
    for n in top_n_feats:
        print("For top: ",n," features")
        # Keep the first n columns of the reduced feature frame.
        X_red=X_r[cols[0:n]]
        sv=SVC(kernel="linear",probability=True,C=0.01,random_state=42) # chosen from a 5-fold CV based grid search
        kde=KDEClassifier(bandwidth=1.27) # chosen from a 5-fold CV based grid search
        best_model = VotingClassifier(estimators=[('sv', sv), ('kde', kde)], voting='soft',weights=[4, 7]) # best combination of weights selected by a brute-force search (possible weights 1-10) using cross-validation on the training data
        best_model.fit(X_red,y_samp)
        # Probability of the second class for every test sample.
        y_probs = best_model.predict_proba(test_x[X_red.columns])[:,1]
        # Scan candidate thresholds and keep the one with the highest ROC AUC.
        thresholds = arange(0, 1, 0.001)
        scores = [roc_auc_score(test_y, to_labels(y_probs, t)) for t in thresholds]
        ix= argmax(scores)
        # Label 2 above the chosen threshold, label 1 otherwise.
        y_test_predictions = np.where(best_model.predict_proba(test_x[X_red.columns])[:,1] > thresholds[ix], 2, 1)
        print("Thresh: ",thresholds[ix])
        # NOTE(review): sensi and speci are computed but not used below.
        sensi= sensitivity_score(test_y, y_test_predictions, pos_label=2)
        speci=specificity_score(test_y,y_test_predictions,pos_label=2)
        accu=accuracy_score(test_y,y_test_predictions)
        auro=roc_auc_score(test_y,y_test_predictions)
        mcc=metrics.matthews_corrcoef(test_y,y_test_predictions)
        # Unpacking four values assumes a 2x2 (binary) confusion matrix.
        tn, fp, fn, tp = confusion_matrix(test_y, y_test_predictions).ravel()
        ppv=tp/(tp+fp)
        npv=tn/(tn+fn)
        sen=tp/(tp+fn)
        spe=tn/(tn+fp)
        # Aggregate score: sum of PPV, NPV, sensitivity and specificity.
        score=ppv+npv+sen+spe
        # The k-mer size is taken as the length of the first column name.
        print("For kmer size: ",len(train.columns[0]))
        print("for top ",n," features")
        print(list(X_red.columns.values),"\n")
        score_dict={"Sen":sen,"Spe":spe,"PPV":ppv,"NPV":npv,"AUC":auro,"MCC":mcc,"ACC":accu}
        print(score)
        print(score_dict)
        # NOTE(review): df is not used afterwards. Confirm that this line and
        # the y_samp conversion below belong inside the loop; if they do,
        # later iterations fit the model on a one-column DataFrame.
        df=pd.DataFrame(y_test_predictions)
        y_samp = pd.DataFrame(y_samp, columns = ['x'])
    return best_model,X_red,scores
def test_renn_fit_single_class():
    """Check that fitting on a single-class target emits a RuntimeWarning."""
    sampler = RepeatedEditedNearestNeighbours(random_state=RND_SEED)
    # Deliberately invalid target: every sample belongs to class 0.
    single_class_target = np.zeros((X.shape[0], ))
    assert_warns(RuntimeWarning, sampler.fit, X, single_class_target)
def test_renn_sample_wrong_X():
    """Check that sampling data that differs from the fitted data raises."""
    sampler = RepeatedEditedNearestNeighbours(random_state=RND_SEED)
    sampler.fit(X, Y)

    # Data unrelated to what the sampler was fitted on.
    other_X = np.random.random((100, 40))
    other_y = np.array([0] * 50 + [1] * 50)
    assert_raises(RuntimeError, sampler.sample, other_X, other_y)
def test_renn_iter_wrong():
    """Check that an invalid number of iterations raises ValueError."""
    invalid_max_iter = -1
    sampler = RepeatedEditedNearestNeighbours(
        max_iter=invalid_max_iter, random_state=RND_SEED)
    assert_raises(ValueError, sampler.fit_sample, X, Y)
def test_renn_not_good_object():
    """Check that an invalid ``n_neighbors`` object raises ValueError."""
    invalid_neighbours = 'rnd'
    sampler = RepeatedEditedNearestNeighbours(
        n_neighbors=invalid_neighbours,
        random_state=RND_SEED,
        kind_sel='mode')
    assert_raises(ValueError, sampler.fit_sample, X, Y)
def test_renn_init():
    """Check the attribute values set when the object is initialised."""
    sampler = RepeatedEditedNearestNeighbours(random_state=RND_SEED)

    expected_attributes = (
        ('size_ngh', 3),
        ('kind_sel', 'all'),
        ('n_jobs', -1),
        ('random_state', RND_SEED),
    )
    for attribute, expected in expected_attributes:
        assert_equal(getattr(sampler, attribute), expected)
def rep_edited_KNN(X, Y):
    """Flag the samples kept by Repeated Edited Nearest Neighbours.

    Args:
        X: feature matrix handed to ``fit_resample``.
        Y: target labels handed to ``fit_resample``; ``len(Y)`` sets the
            length of the mask.

    Returns:
        ``(True, mask)`` where ``mask`` is an integer array with 1 at every
        index listed in the sampler's ``sample_indices_`` and 0 elsewhere.
    """
    from imblearn.under_sampling import RepeatedEditedNearestNeighbours

    renn = RepeatedEditedNearestNeighbours()
    renn.fit_resample(X, Y)

    mask = np.zeros(len(Y), dtype=int)
    # One vectorized assignment replaces a membership scan of the index
    # array for every sample.
    mask[renn.sample_indices_] = 1
    return True, mask
def load_from_csv(input_dir: str,
                  counts_file: str = "normalized_counts.csv.gz",
                  n_jobs=1,
                  low_expression=0.1) -> (AnnData, AnnData, AnnData):
    """
    Load counts and metadata from CSV files and build ENN/RENN subsets.

    The counts table is read with genes as rows; genes whose row sum divided
    by the number of columns is not above ``low_expression`` are dropped, and
    the table is transposed before use. Samples are then cleaned with
    EditedNearestNeighbours and RepeatedEditedNearestNeighbours, using the
    "Stage" column of the metadata as the class label.

    :param input_dir: directory containing ``counts_file`` and ``meta.csv.gz``
    :param counts_file: file name of the counts table inside ``input_dir``
    :param n_jobs: forwarded to both samplers
    :param low_expression: threshold of the low-expression gene filter
    :return: ``(data, data_enn, data_renn)`` -- the full dataset, the subset
        kept by ENN and the subset kept by RENN
    """
    logger.info("Reading {0}".format(input_dir))

    input_file = os.path.join(input_dir, counts_file)
    mtx = pd.read_csv(input_file, index_col=0)
    meta = pd.read_csv(os.path.join(input_dir, "meta.csv.gz"), index_col=0)
    # NOTE(review): this re-selects meta by its own index; aligning it to the
    # samples of mtx may have been the intent -- confirm.
    meta = meta.loc[meta.index, :]

    logger.info(mtx.shape)

    # Filter low expressed genes (vectorized form of the per-row test).
    expressed = (mtx.sum(axis=1) / mtx.shape[1] > low_expression).values
    mtx = mtx.loc[expressed, :]

    logger.info(mtx.shape)

    mtx = mtx.transpose()
    data = AnnData(mtx, obs=meta)
    data.obs = meta

    logger.info("Perform ENN")
    enn = EditedNearestNeighbours(n_jobs=n_jobs, return_indices=True)
    # Only the indices of the retained samples are needed.
    _, _, idx_enn = enn.fit_resample(mtx, meta["Stage"])
    data_enn = AnnData(mtx.iloc[list(idx_enn), :], meta.iloc[idx_enn, :])
    data_enn.obs = meta.iloc[idx_enn, :]

    logger.info("Perform RENN")
    renn = RepeatedEditedNearestNeighbours(n_jobs=n_jobs, return_indices=True)
    _, _, idx_renn = renn.fit_resample(mtx, meta["Stage"])
    data_renn = AnnData(mtx.iloc[list(idx_renn), :], meta.iloc[idx_renn, :])
    data_renn.obs = meta.iloc[idx_renn, :]

    return data, data_enn, data_renn
def __init__(self, name):
    """Store ``name`` and build the matching resampling strategy.

    ``self.strategie`` stays ``None`` when ``name`` matches none of the
    known strategy names.
    """
    self.strategie = None
    self.name = name

    # Builders are lazy so that only the requested strategy is instantiated.
    builders = (
        ("enn", lambda: EditedNearestNeighbours(
            sampling_strategy='auto', n_neighbors=3, kind_sel='all',
            n_jobs=-1)),
        ("allknn", lambda: AllKNN(
            sampling_strategy='auto', n_neighbors=3, kind_sel='all',
            allow_minority=False, n_jobs=-1)),
        ("renn", lambda: RepeatedEditedNearestNeighbours(
            sampling_strategy='auto', n_neighbors=3, max_iter=100,
            kind_sel='all', n_jobs=-1)),
        ("tomek", lambda: TomekLinks(sampling_strategy='auto', n_jobs=-1)),
        ("smote", lambda: SMOTE(
            sampling_strategy='auto', k_neighbors=5, n_jobs=-1,
            random_state=42)),
        ("bdsmote", lambda: BorderlineSMOTE(random_state=42, n_jobs=-1)),
        ("adasyn", lambda: ADASYN(
            sampling_strategy='auto', n_neighbors=5, n_jobs=-1,
            random_state=42)),
        ("smoteenn", lambda: SMOTEENN(
            sampling_strategy='auto', smote=None, enn=None, n_jobs=-1,
            random_state=42)),
        ("smotetomek", lambda: SMOTETomek(
            sampling_strategy='auto', smote=None, tomek=None, n_jobs=-1,
            random_state=42)),
    )
    for key, build in builders:
        if name == key:
            self.strategie = build()
            break
def test_renn_fit_sample():
    """Check fit_sample output against the arrays stored on disk."""
    sampler = RepeatedEditedNearestNeighbours(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Reference arrays live in the 'data' directory next to this file.
    here = os.path.dirname(os.path.abspath(__file__))
    expected_X = np.load(os.path.join(here, 'data', 'renn_x.npy'))
    expected_y = np.load(os.path.join(here, 'data', 'renn_y.npy'))

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def rep_edited_KNN(X, Y):
    """Flag the samples kept by Repeated Edited Nearest Neighbours.

    Args:
        X: feature matrix handed to ``fit_resample``; ``len(X)`` sets the
            length of the mask.
        Y: target labels handed to ``fit_resample``.

    Returns:
        ``(True, mask)`` where ``mask`` is an integer array with 1 at every
        index listed in the sampler's ``sample_indices_`` and 0 elsewhere.
    """
    from imblearn.under_sampling import RepeatedEditedNearestNeighbours

    renn = RepeatedEditedNearestNeighbours()
    renn.fit_resample(X, Y)

    mask = np.zeros(len(X), dtype=int)
    # One vectorized assignment replaces a membership scan of the index
    # array for every sample.
    mask[renn.sample_indices_] = 1
    return True, mask
def repeated_edited_nearest_neighbours(X, y, visualize=False, pca2d=True,
                                       pca3d=True, tsne=True, pie_evr=True):
    """Clean ``(X, y)`` with RENN and optionally plot the result.

    Args:
        X: feature matrix handed to ``fit_resample``.
        y: target labels handed to ``fit_resample``.
        visualize: when truthy, plot the resampled class histogram and the
            PCA views.
        pca2d: forwarded to ``pca_general`` as ``d2``.
        pca3d: forwarded to ``pca_general`` as ``d3``.
        tsne: accepted for interface compatibility; not used by this
            function.
        pie_evr: forwarded to ``pca_general`` as ``pie_evr``.

    Returns:
        The pair ``(X_res, y_res)`` produced by the sampler.
    """
    renn = RepeatedEditedNearestNeighbours()
    X_res, y_res = renn.fit_resample(X, y)

    # Truthiness test instead of comparing against True with ``==``.
    if visualize:
        hist_over_and_undersampling(y_res)
        pca_general(X_res, y_res, d2=pca2d, d3=pca3d, pie_evr=pie_evr)

    return X_res, y_res
def test_renn_fit():
    """Check the class statistics computed by fit."""
    sampler = RepeatedEditedNearestNeighbours(random_state=RND_SEED)
    sampler.fit(X, Y)

    # Minority/majority labels and the per-class sample counts.
    assert_equal(sampler.min_c_, 0)
    assert_equal(sampler.maj_c_, 1)
    expected_counts = ((0, 500), (1, 4500))
    for label, count in expected_counts:
        assert_equal(sampler.stats_c_[label], count)
def get_under_sample_models():
    """Return the under-sampling models to compare, with their labels.

    Returns:
        ``(models, names)``: two parallel lists holding the sampler
        instances and their display labels.
    """
    # Labels are kept exactly as spelled; callers may depend on them.
    labelled_models = [
        (TomekLinks(), 'TomesLinks'),
        (EditedNearestNeighbours(), 'EditedNearestNeighbors'),
        (RepeatedEditedNearestNeighbours(), 'RENN'),
        (OneSidedSelection(), 'OneSidedSelection'),
        (NeighbourhoodCleaningRule(), 'NCR'),
    ]
    models = [model for model, _ in labelled_models]
    names = [label for _, label in labelled_models]
    return models, names