Example #1
def rep_edited_KNN(X, Y):
    import numpy as np  # needed for the mask below
    from imblearn.under_sampling import RepeatedEditedNearestNeighbours
    renn = RepeatedEditedNearestNeighbours()
    renn.fit_resample(X, Y)
    indexes = renn.sample_indices_
    nobj = len(Y)
    mask = np.zeros(nobj, dtype=int)
    for i in range(nobj):
        if i in indexes:
            mask[i] = 1
    return True, mask
Example #2
def rep_edited_KNN(X, Y):
    import numpy as np  # needed for np.asarray below
    from imblearn.under_sampling import RepeatedEditedNearestNeighbours
    renn = RepeatedEditedNearestNeighbours()
    renn.fit_resample(X, Y)
    indexes = renn.sample_indices_
    mask = []
    for i in range(len(X)):
        if i in indexes:
            mask.append(1)
        else:
            mask.append(0)
    return True, np.asarray(mask)
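Both helpers above rebuild the keep/drop mask with a Python loop whose `i in indexes` test scans the index array on every iteration. A minimal vectorized sketch of the same idea (a hypothetical `renn_keep_mask` helper, assuming `X` and `Y` are array-likes of equal length) is:

import numpy as np
from imblearn.under_sampling import RepeatedEditedNearestNeighbours

def renn_keep_mask(X, Y):
    # Return a 0/1 mask marking which samples RENN keeps.
    renn = RepeatedEditedNearestNeighbours()
    renn.fit_resample(X, Y)
    mask = np.zeros(len(Y), dtype=int)
    mask[renn.sample_indices_] = 1  # mark all kept rows in one shot
    return mask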
def test_renn_fit_resample_with_indices():
    renn = RepeatedEditedNearestNeighbours(return_indices=True)
    X_resampled, y_resampled, idx_under = renn.fit_resample(X, Y)

    X_gt = np.array([[-0.53171468, -0.53735182], [-0.88864036, -0.33782387],
                     [-0.46226554, -0.50481004], [-0.34474418, 0.21969797],
                     [1.02956816, 0.36061601], [1.12202806, 0.33811558],
                     [0.73489726, 0.43915195], [0.50307437, 0.498805],
                     [0.84929742, 0.41042894], [0.62649535, 0.46600596],
                     [0.98382284, 0.37184502], [0.69804044, 0.44810796],
                     [0.04296502, -0.37981873], [0.28294738, -1.00125525],
                     [0.34218094, -0.58781961], [0.2096964, -0.61814058],
                     [1.59068979, -0.96622933], [0.73418199, -0.02222847],
                     [0.79270821, -0.41386668], [1.16606871, -0.25641059],
                     [1.0304995, -0.16955962], [0.48921682, -1.38504507],
                     [-0.03918551, -0.68540745], [0.24991051, -1.00864997],
                     [0.80541964, -0.34465185], [0.1732627, -1.61323172]])
    y_gt = np.array([
        0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2
    ])
    idx_gt = np.array([
        6, 13, 32, 39, 4, 5, 16, 22, 23, 24, 30, 37, 2, 11, 12, 17, 20, 21, 25,
        26, 28, 31, 33, 34, 35, 36
    ])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
def test_renn_fit_resample_mode():
    nn = NearestNeighbors(n_neighbors=4)
    renn = RepeatedEditedNearestNeighbours(n_neighbors=nn, kind_sel='mode')
    X_resampled, y_resampled = renn.fit_resample(X, Y)

    X_gt = np.array([[-0.53171468, -0.53735182], [-0.88864036, -0.33782387],
                     [-0.46226554, -0.50481004], [-0.34474418, 0.21969797],
                     [-0.12840393, 0.66446571], [1.02956816, 0.36061601],
                     [1.12202806, 0.33811558], [-0.35946678, 0.72510189],
                     [2.94290565, -0.13986434], [-1.10146139, 0.91782682],
                     [0.73489726, 0.43915195], [-0.28479268, 0.70459548],
                     [1.84864913, 0.14729596], [0.50307437, 0.498805],
                     [0.84929742, 0.41042894], [0.62649535, 0.46600596],
                     [1.67314371, 0.19231498], [0.98382284, 0.37184502],
                     [0.69804044, 0.44810796], [1.32319756, -0.13181616],
                     [0.04296502, -0.37981873], [0.28294738, -1.00125525],
                     [0.34218094, -0.58781961], [0.2096964, -0.61814058],
                     [1.59068979, -0.96622933], [0.73418199, -0.02222847],
                     [0.79270821, -0.41386668], [1.16606871, -0.25641059],
                     [1.0304995, -0.16955962], [0.48921682, -1.38504507],
                     [-0.03918551, -0.68540745], [0.24991051, -1.00864997],
                     [0.80541964, -0.34465185], [0.1732627, -1.61323172]])
    y_gt = np.array([
        0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2
    ])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
Example #7
def train_decisiontree_with(configurationname,
                            train_data,
                            k,
                            score_function,
                            undersam=False,
                            oversam=False,
                            export=False,
                            **kwargs):
    assert k > 0
    print("Training with configuration " + configurationname)
    X_train, y_train, id_to_a_train = train_data

    max_depth = None if "max_depth" not in kwargs else kwargs["max_depth"]

    dtc = DecisionTreeClassifier(criterion="entropy",
                                 random_state=0,
                                 max_depth=max_depth)

    print("Feature Selection")
    # selector = SelectFpr(score_function)
    selector = SelectKBest(score_function, k=k)
    selector = selector.fit(X_train, y_train)

    X_train = selector.transform(X_train)

    fitted_ids = [i for i in selector.get_support(indices=True)]

    print("Apply Resampling")
    print(Counter(y_train))
    if undersam and not oversam:
        renn = RepeatedEditedNearestNeighbours()
        X_train, y_train = renn.fit_resample(X_train, y_train)
    if oversam and not undersam:
        # feature_indices_array = list(range(len(f_to_id)))
        # smote_nc = SMOTENC(categorical_features=feature_indices_array, random_state=0)
        # X_train, y_train = smote_nc.fit_resample(X_train, y_train)
        sm = SMOTE(random_state=42)
        X_train, y_train = sm.fit_resample(X_train, y_train)
    if oversam and undersam:
        smote_enn = SMOTEENN(random_state=0)
        X_train, y_train = smote_enn.fit_resample(X_train, y_train)
    print(Counter(y_train))

    print("Train Classifier")
    dtc = dtc.fit(X_train, y_train, check_input=True)

    if export:
        print("Exporting tree to graph...")
        export_graphviz(dtc,
                        out_file=DATAP + "/temp/trees/sltree_" +
                        configurationname + ".dot",
                        filled=True)
        transform(fitted_ids, configurationname)

    print("Self Accuracy: " + str(dtc.score(X_train, y_train)))

    return selector, dtc
Example #8
def classification_results(train,test):
    #Derivation of NBDriver using training data 
    """
    Arguments:
        train = feature matrix derived from Brown et al.
        test= feature matrix derived from Martelotto et al.
    Returns:
        best_model = Best ensemble model derived using the training data
        X_red= Dataframe derived after sampling that was used to train the model
        scores= probability based classification scores
    """
    sen=[];spe=[];acc=[];auc=[];c=[];m=[];s=[]
    train_x=train.drop('Label',axis=1);train_y=train['Label'];    
    test_x=test.drop('Label',axis=1);test_y=test['Label'];
    #Neighbourhood-based undersampling (RENN) to reduce the majority class size
    samp=RepeatedEditedNearestNeighbours(random_state=42)
    X_samp,y_samp=samp.fit_resample(train_x,train_y)
    X_samp = pd.DataFrame(X_samp, columns = train_x.columns)
    #Experimenting with different numbers of top features derived from the tree-based feature extraction method 
    top_n_feats=[30,40,50,60,70]
    X_r=feature_reduction_using_trees(X_samp,y_samp) 
    cols=X_r.columns
    for n in top_n_feats:
        print("For top: ",n," features")
        X_red=X_r[cols[0:n]]
        sv=SVC(kernel="linear",probability=True,C=0.01,random_state=42) #chosen from 5foldCV based grid search
        kde=KDEClassifier(bandwidth=1.27) #chosen from 5foldCV based grid search
        best_model = VotingClassifier(estimators=[('sv', sv), ('kde', kde)],
                        voting='soft',weights=[4, 7]) #best combination of weights selected by a brute force search (possible weights 1-10) using a cross-validation approach on the training data  
        
        best_model.fit(X_red,y_samp)
        y_probs = best_model.predict_proba(test_x[X_red.columns])[:,1]
        thresholds = arange(0, 1, 0.001)
        scores = [roc_auc_score(test_y, to_labels(y_probs, t)) for t in thresholds]
        ix= argmax(scores)
        y_test_predictions = np.where(best_model.predict_proba(test_x[X_red.columns])[:,1] > thresholds[ix], 2, 1)
        print("Thresh: ",thresholds[ix])
        sensi= sensitivity_score(test_y, y_test_predictions, pos_label=2)
        speci=specificity_score(test_y,y_test_predictions,pos_label=2)
        accu=accuracy_score(test_y,y_test_predictions)
        auro=roc_auc_score(test_y,y_test_predictions)
        mcc=metrics.matthews_corrcoef(test_y,y_test_predictions)
        tn, fp, fn, tp = confusion_matrix(test_y, y_test_predictions).ravel()
        ppv=tp/(tp+fp)
        npv=tn/(tn+fn)
        sen=tp/(tp+fn)
        spe=tn/(tn+fp)
        score=ppv+npv+sen+spe
        print("For kmer size: ",len(train.columns[0]))
        print("for top ",n," features")
        print(list(X_red.columns.values),"\n")
        score_dict={"Sen":sen,"Spe":spe,"PPV":ppv,"NPV":npv,"AUC":auro,"MCC":mcc,"ACC":accu}
        print(score)
        print(score_dict)
        df=pd.DataFrame(y_test_predictions)
        y_samp = pd.DataFrame(y_samp, columns = ['x'])
    return best_model,X_red,scores
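Example #8 calls a `to_labels` helper that is not defined in the snippet. In threshold-tuning code of this kind it usually just binarizes the positive-class probabilities at a cut-off; a hypothetical sketch, written here for the 1/2 label encoding used above, could be:

import numpy as np

def to_labels(pos_probs, threshold, pos_label=2, neg_label=1):
    # Hypothetical helper: assign the positive label where the predicted
    # probability reaches the threshold, the negative label otherwise.
    return np.where(pos_probs >= threshold, pos_label, neg_label)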
Example #9
def load_from_csv(input_dir: str,
                  counts_file: str = "normalized_counts.csv.gz",
                  n_jobs=1,
                  low_expression=0.1) -> (AnnData, AnnData, AnnData):
    u"""
    load data from csv files
    :param input_dir: directory containing the csv files
    :param counts_file: name of the normalized counts file
    :param n_jobs: number of parallel jobs passed to the samplers
    :param low_expression: threshold used to drop lowly expressed genes
    :return: the full AnnData plus the ENN- and RENN-filtered AnnData objects
    """
    logger.info("Reading {0}".format(input_dir))

    input_file = os.path.join(input_dir, counts_file)

    # if not os.path.exists(input_file):
    #     input_file += ".gz"

    mtx = pd.read_csv(input_file, index_col=0)
    meta = pd.read_csv(os.path.join(input_dir, "meta.csv.gz"), index_col=0)
    meta = meta.loc[meta.index, :]

    logger.info(mtx.shape)
    # filter low expressed genes
    genes_sum = [x / mtx.shape[1] > low_expression for x in mtx.sum(axis=1)]

    mtx = mtx.loc[genes_sum, :]

    logger.info(mtx.shape)
    mtx = mtx.transpose()

    data = AnnData(mtx, obs=meta)
    data.obs = meta

    logger.info("Perform ENN")
    enn = EditedNearestNeighbours(n_jobs=n_jobs, return_indices=True)

    mtx_enn, group_enn, idx_enn = enn.fit_resample(mtx, meta["Stage"])

    data_enn = AnnData(mtx.iloc[list(idx_enn), :], meta.iloc[idx_enn, :])

    data_enn.obs = meta.iloc[idx_enn, :]

    logger.info("Perform RENN")
    renn = RepeatedEditedNearestNeighbours(n_jobs=n_jobs, return_indices=True)

    mtx_renn, group_renn, idx_renn = renn.fit_resample(mtx, meta["Stage"])

    data_renn = AnnData(mtx.iloc[list(idx_renn), :], meta.iloc[idx_renn, :])

    data_renn.obs = meta.iloc[idx_renn, :]

    return data, data_enn, data_renn
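Example #9 above (and #14 below) pass `return_indices=True`, which newer imblearn releases no longer accept. A minimal sketch of the same indexing step with the current API, where the fitted sampler exposes `sample_indices_`, could look like this (the helper name is an assumption):

from imblearn.under_sampling import RepeatedEditedNearestNeighbours

def renn_filter_indices(mtx, labels, n_jobs=1):
    # Run RENN and return the row indices of the samples it keeps,
    # so the caller can subset a DataFrame or AnnData object itself.
    renn = RepeatedEditedNearestNeighbours(n_jobs=n_jobs)
    renn.fit_resample(mtx, labels)
    return renn.sample_indices_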
Example #10
def repeated_edited_nearest_neighbours(X,
                                       y,
                                       visualize=False,
                                       pca2d=True,
                                       pca3d=True,
                                       tsne=True,
                                       pie_evr=True):
    renn = RepeatedEditedNearestNeighbours()
    X_res, y_res = renn.fit_resample(X, y)
    if visualize:
        hist_over_and_undersampling(y_res)
        pca_general(X_res, y_res, d2=pca2d, d3=pca3d, pie_evr=pie_evr)
    return X_res, y_res
Example #11
def train_decisiontree_FPR(configurationname,
                           train_data,
                           score_function,
                           undersam=False,
                           oversam=False,
                           export=False):
    print("Training with configuration " + configurationname)
    X_train, y_train, id_to_a_train = train_data
    dtc = DecisionTreeClassifier(random_state=0)

    print("Feature Selection")
    # selector = SelectFpr(score_function)
    selector = SelectFpr(score_function)
    result = selector.fit(X_train, y_train)
    X_train = selector.transform(X_train)

    fitted_ids = [i for i in result.get_support(indices=True)]

    print("Apply Resampling")
    print(Counter(y_train))
    if undersam and not oversam:
        renn = RepeatedEditedNearestNeighbours()
        X_train, y_train = renn.fit_resample(X_train, y_train)
    if oversam and not undersam:
        # feature_indices_array = list(range(len(f_to_id)))
        # smote_nc = SMOTENC(categorical_features=feature_indices_array, random_state=0)
        # X_train, y_train = smote_nc.fit_resample(X_train, y_train)
        sm = SMOTE(random_state=42)
        X_train, y_train = sm.fit_resample(X_train, y_train)
    if oversam and undersam:
        smote_enn = SMOTEENN(random_state=0)
        X_train, y_train = smote_enn.fit_resample(X_train, y_train)
    print(Counter(y_train))

    print("Train Classifier")
    dtc = dtc.fit(X_train, y_train, check_input=True)

    # if export:
    print("Exporting decision tree image...")
    export_graphviz(dtc,
                    out_file=DATAP + "/temp/trees/sltree_" +
                    configurationname + ".dot",
                    filled=True)
    transform(fitted_ids)

    print("Self Accuracy: " + str(dtc.score(X_train, y_train)))

    return selector, dtc
Example #12
def resample(X, y):
    print('\nOriginal')
    for i in range(len(y.unique())):
        print(str(i) + ': ' + str(class_size(y, i)))
    # under = RandomUnderSampler( sampling_strategy = 'majority' )
    under = RepeatedEditedNearestNeighbours(sampling_strategy='majority')
    X_res, y_res = under.fit_resample(X, np.ravel(y, order='C'))
    over = ADASYN(
        sampling_strategy={
            0: class_size(y_res, 0),
            1: int(class_size(y_res, 0) * 0.7),
            2: int(class_size(y_res, 0) * 0.7)
        })
    # over = ADASYN( sampling_strategy='not majority' )
    X_res, y_res = over.fit_resample(X_res, np.ravel(y_res, order='C'))
    print('\nResampled')
    for i in range(len(y.unique())):
        print(str(i) + ': ' + str(class_size(y_res, i)))
    return X_res, y_res
Example #13
def train_decisiontree_with(configurationname, train_data, k, score_function, undersam=False, oversam=False,
                            export=False):
    assert k > 0
    print("Training with configuration " + configurationname)
    X_train, y_train, id_to_a_train = train_data
    dtc = DecisionTreeClassifier(random_state=0)

    print("Feature Selection")
    # selector = SelectFpr(score_function)
    selector = SelectKBest(score_function, k=k)
    result = selector.fit(X_train, y_train)
    X_train = selector.transform(X_train)

    fitted_ids = [i for i in result.get_support(indices=True)]

    print("Apply Resampling")
    print(Counter(y_train))
    if undersam and not oversam:
        renn = RepeatedEditedNearestNeighbours()
        X_train, y_train = renn.fit_resample(X_train, y_train)
    if oversam and not undersam:
        # feature_indices_array = list(range(len(f_to_id)))
        # smote_nc = SMOTENC(categorical_features=feature_indices_array, random_state=0)
        # X_train, y_train = smote_nc.fit_resample(X_train, y_train)
        sm = SMOTE(random_state=42)
        X_train, y_train = sm.fit_resample(X_train, y_train)
    if oversam and undersam:
        smote_enn = SMOTEENN(random_state=0)
        X_train, y_train = smote_enn.fit_resample(X_train, y_train)
    print(Counter(y_train))

    print("Train Classifier")
    dtc = dtc.fit(X_train, y_train, check_input=True)

    if export:
        export_graphviz(dtc, out_file=DATAP + "/temp/trees/sltree_" + configurationname + ".dot", filled=True)
        transform(fitted_ids, configurationname)

    print("Self Accuracy: " + str(dtc.score(X_train, y_train)))

    return selector, dtc
Example #14
File: main.py Project: LuChenLab/inferCC
def load_from_csv(input_dir: str) -> (AnnData, AnnData, AnnData):
    u"""
    load data from csv files
    :param input_dir: directory containing the csv files
    :return: the full AnnData plus the ENN- and RENN-filtered AnnData objects
    """
    logger.info("read")
    mtx = pd.read_csv(os.path.join(input_dir, "normalized_counts.csv.gz"),
                      index_col=0,
                      engine="c")
    meta = pd.read_csv(os.path.join(input_dir, "meta.csv.gz"),
                       index_col=0,
                       engine="c")
    meta = meta.loc[meta.index, :]

    mtx = mtx.transpose()

    data = AnnData(mtx, obs=meta)
    data.obs = meta

    logger.info("enn")
    enn = EditedNearestNeighbours(n_jobs=10, return_indices=True)

    mtx_enn, group_enn, idx_enn = enn.fit_resample(mtx, meta["Stage"])

    data_enn = AnnData(mtx.iloc[list(idx_enn), :], meta.iloc[idx_enn, :])

    data_enn.obs = meta.iloc[idx_enn, :]

    logger.info("Repeated enn")
    renn = RepeatedEditedNearestNeighbours(n_jobs=10, return_indices=True)

    mtx_renn, group_renn, idx_renn = renn.fit_resample(mtx, meta["Stage"])

    data_renn = AnnData(mtx.iloc[list(idx_renn), :], meta.iloc[idx_renn, :])

    data_renn.obs = meta.iloc[idx_renn, :]

    return data, data_enn, data_renn
Example #15
print(X_data.shape)
print('-------')

# ------- CNN --------
cnn = CondensedNearestNeighbour()
X_cnn, y_cnn = cnn.fit_resample(X_data, y_data)
print(X_cnn.shape)

# ------- ENN --------
enn = EditedNearestNeighbours()
X_enn, y_enn = enn.fit_resample(X_data, y_data)
print(X_enn.shape)

# ------- RENN --------
renn = RepeatedEditedNearestNeighbours()
X_renn, y_renn = renn.fit_resample(X_data, y_data)
print(X_renn.shape)

# ------- Tomek --------
tl = TomekLinks()
X_t, y_t = tl.fit_resample(X_data, y_data)
print(X_t.shape)

# ------- RUS --------
rus = RandomUnderSampler(random_state=42)
X_rus, y_rus = rus.fit_resample(X_data, y_data)
print(X_rus.shape)

print('\n\n')

datasets = [{
Example #16
def resampling_assigner(imb_technique, AA_ova_X_train, AA_ova_y_train,
                        AI_ova_X_train, AI_ova_y_train, AW_ova_X_train,
                        AW_ova_y_train, CC_ova_X_train, CC_ova_y_train,
                        QA_ova_X_train, QA_ova_y_train):
    print(imb_technique)
    if imb_technique == "ADASYN":
        AA_ada, AI_ada, AW_ada, CC_ada, QA_ada = ADASYN(), ADASYN(), ADASYN(
        ), ADASYN(), ADASYN()
        AA_X_res, AA_y_res = AA_ada.fit_resample(AA_ova_X_train,
                                                 AA_ova_y_train)
        AI_X_res, AI_y_res = AI_ada.fit_resample(AI_ova_X_train,
                                                 AI_ova_y_train)
        AW_X_res, AW_y_res = AW_ada.fit_resample(AW_ova_X_train,
                                                 AW_ova_y_train)
        CC_X_res, CC_y_res = CC_ada.fit_resample(CC_ova_X_train,
                                                 CC_ova_y_train)
        QA_X_res, QA_y_res = QA_ada.fit_resample(QA_ova_X_train,
                                                 QA_ova_y_train)
    elif imb_technique == "ALLKNN":
        AA_allknn, AI_allknn, AW_allknn, CC_allknn, QA_allknn = AllKNN(
        ), AllKNN(), AllKNN(), AllKNN(), AllKNN()
        AA_X_res, AA_y_res = AA_allknn.fit_resample(AA_ova_X_train,
                                                    AA_ova_y_train)
        AI_X_res, AI_y_res = AI_allknn.fit_resample(AI_ova_X_train,
                                                    AI_ova_y_train)
        AW_X_res, AW_y_res = AW_allknn.fit_resample(AW_ova_X_train,
                                                    AW_ova_y_train)
        CC_X_res, CC_y_res = CC_allknn.fit_resample(CC_ova_X_train,
                                                    CC_ova_y_train)
        QA_X_res, QA_y_res = QA_allknn.fit_resample(QA_ova_X_train,
                                                    QA_ova_y_train)
    elif imb_technique == "CNN":
        AA_cnn, AI_cnn, AW_cnn, CC_cnn, QA_cnn = CondensedNearestNeighbour(
        ), CondensedNearestNeighbour(), CondensedNearestNeighbour(
        ), CondensedNearestNeighbour(), CondensedNearestNeighbour()
        AA_X_res, AA_y_res = AA_cnn.fit_resample(AA_ova_X_train,
                                                 AA_ova_y_train)
        AI_X_res, AI_y_res = AI_cnn.fit_resample(AI_ova_X_train,
                                                 AI_ova_y_train)
        AW_X_res, AW_y_res = AW_cnn.fit_resample(AW_ova_X_train,
                                                 AW_ova_y_train)
        CC_X_res, CC_y_res = CC_cnn.fit_resample(CC_ova_X_train,
                                                 CC_ova_y_train)
        QA_X_res, QA_y_res = QA_cnn.fit_resample(QA_ova_X_train,
                                                 QA_ova_y_train)
    elif imb_technique == "ENN":
        AA_enn, AI_enn, AW_enn, CC_enn, QA_enn = EditedNearestNeighbours(
        ), EditedNearestNeighbours(), EditedNearestNeighbours(
        ), EditedNearestNeighbours(), EditedNearestNeighbours()
        AA_X_res, AA_y_res = AA_enn.fit_resample(AA_ova_X_train,
                                                 AA_ova_y_train)
        AI_X_res, AI_y_res = AI_enn.fit_resample(AI_ova_X_train,
                                                 AI_ova_y_train)
        AW_X_res, AW_y_res = AW_enn.fit_resample(AW_ova_X_train,
                                                 AW_ova_y_train)
        CC_X_res, CC_y_res = CC_enn.fit_resample(CC_ova_X_train,
                                                 CC_ova_y_train)
        QA_X_res, QA_y_res = QA_enn.fit_resample(QA_ova_X_train,
                                                 QA_ova_y_train)
    elif imb_technique == "IHT":
        AA_iht, AI_iht, AW_iht, CC_iht, QA_iht = InstanceHardnessThreshold(
        ), InstanceHardnessThreshold(), InstanceHardnessThreshold(
        ), InstanceHardnessThreshold(), InstanceHardnessThreshold()
        AA_X_res, AA_y_res = AA_iht.fit_resample(AA_ova_X_train,
                                                 AA_ova_y_train)
        AI_X_res, AI_y_res = AI_iht.fit_resample(AI_ova_X_train,
                                                 AI_ova_y_train)
        AW_X_res, AW_y_res = AW_iht.fit_resample(AW_ova_X_train,
                                                 AW_ova_y_train)
        CC_X_res, CC_y_res = CC_iht.fit_resample(CC_ova_X_train,
                                                 CC_ova_y_train)
        QA_X_res, QA_y_res = QA_iht.fit_resample(QA_ova_X_train,
                                                 QA_ova_y_train)
    elif imb_technique == "NCR":
        AA_ncr, AI_ncr, AW_ncr, CC_ncr, QA_ncr = NeighbourhoodCleaningRule(
        ), NeighbourhoodCleaningRule(), NeighbourhoodCleaningRule(
        ), NeighbourhoodCleaningRule(), NeighbourhoodCleaningRule()
        AA_ova_y_train = [
            0 if i == "Accepted/Assigned" else 1 for i in AA_ova_y_train
        ]
        AA_X_res, AA_y_res = AA_ncr.fit_resample(AA_ova_X_train,
                                                 AA_ova_y_train)
        AI_ova_y_train = [
            0 if i == "Accepted/In Progress" else 1 for i in AI_ova_y_train
        ]
        AI_X_res, AI_y_res = AI_ncr.fit_resample(AI_ova_X_train,
                                                 AI_ova_y_train)
        AW_ova_y_train = [
            0 if i == "Accepted/Wait" else 1 for i in AW_ova_y_train
        ]
        AW_X_res, AW_y_res = AW_ncr.fit_resample(AW_ova_X_train,
                                                 AW_ova_y_train)
        CC_ova_y_train = [
            0 if i == "Completed/Closed" else 1 for i in CC_ova_y_train
        ]
        CC_X_res, CC_y_res = CC_ncr.fit_resample(CC_ova_X_train,
                                                 CC_ova_y_train)
        QA_ova_y_train = [
            0 if i == "Queued/Awaiting Assignment" else 1
            for i in QA_ova_y_train
        ]
        QA_X_res, QA_y_res = QA_ncr.fit_resample(QA_ova_X_train,
                                                 QA_ova_y_train)
    elif imb_technique == "NM":
        AA_nm, AI_nm, AW_nm, CC_nm, QA_nm = NearMiss(), NearMiss(), NearMiss(
        ), NearMiss(), NearMiss()
        AA_X_res, AA_y_res = AA_nm.fit_resample(AA_ova_X_train, AA_ova_y_train)
        AI_X_res, AI_y_res = AI_nm.fit_resample(AI_ova_X_train, AI_ova_y_train)
        AW_X_res, AW_y_res = AW_nm.fit_resample(AW_ova_X_train, AW_ova_y_train)
        CC_X_res, CC_y_res = CC_nm.fit_resample(CC_ova_X_train, CC_ova_y_train)
        QA_X_res, QA_y_res = QA_nm.fit_resample(QA_ova_X_train, QA_ova_y_train)
    elif imb_technique == "OSS":
        AA_oss, AI_oss, AW_oss, CC_oss, QA_oss = OneSidedSelection(
        ), OneSidedSelection(), OneSidedSelection(), OneSidedSelection(
        ), OneSidedSelection()
        AA_X_res, AA_y_res = AA_oss.fit_resample(AA_ova_X_train,
                                                 AA_ova_y_train)
        AI_X_res, AI_y_res = AI_oss.fit_resample(AI_ova_X_train,
                                                 AI_ova_y_train)
        AW_X_res, AW_y_res = AW_oss.fit_resample(AW_ova_X_train,
                                                 AW_ova_y_train)
        CC_X_res, CC_y_res = CC_oss.fit_resample(CC_ova_X_train,
                                                 CC_ova_y_train)
        QA_X_res, QA_y_res = QA_oss.fit_resample(QA_ova_X_train,
                                                 QA_ova_y_train)
    elif imb_technique == "RENN":
        AA_renn, AI_renn, AW_renn, CC_renn, QA_renn = RepeatedEditedNearestNeighbours(
        ), RepeatedEditedNearestNeighbours(), RepeatedEditedNearestNeighbours(
        ), RepeatedEditedNearestNeighbours(), RepeatedEditedNearestNeighbours(
        )
        AA_X_res, AA_y_res = AA_renn.fit_resample(AA_ova_X_train,
                                                  AA_ova_y_train)
        AI_X_res, AI_y_res = AI_renn.fit_resample(AI_ova_X_train,
                                                  AI_ova_y_train)
        AW_X_res, AW_y_res = AW_renn.fit_resample(AW_ova_X_train,
                                                  AW_ova_y_train)
        CC_X_res, CC_y_res = CC_renn.fit_resample(CC_ova_X_train,
                                                  CC_ova_y_train)
        QA_X_res, QA_y_res = QA_renn.fit_resample(QA_ova_X_train,
                                                  QA_ova_y_train)
    elif imb_technique == "SMOTE":
        AA_sm, AI_sm, AW_sm, CC_sm, QA_sm = SMOTE(), SMOTE(), SMOTE(), SMOTE(
        ), SMOTE()
        AA_X_res, AA_y_res = AA_sm.fit_resample(AA_ova_X_train, AA_ova_y_train)
        AI_X_res, AI_y_res = AI_sm.fit_resample(AI_ova_X_train, AI_ova_y_train)
        AW_X_res, AW_y_res = AW_sm.fit_resample(AW_ova_X_train, AW_ova_y_train)
        CC_X_res, CC_y_res = CC_sm.fit_resample(CC_ova_X_train, CC_ova_y_train)
        QA_X_res, QA_y_res = QA_sm.fit_resample(QA_ova_X_train, QA_ova_y_train)
    elif imb_technique == "BSMOTE":
        AA_bsm, AI_bsm, AW_bsm, CC_bsm, QA_bsm = BorderlineSMOTE(
        ), BorderlineSMOTE(), BorderlineSMOTE(), BorderlineSMOTE(
        ), BorderlineSMOTE()
        AA_X_res, AA_y_res = AA_bsm.fit_resample(AA_ova_X_train,
                                                 AA_ova_y_train)
        AI_X_res, AI_y_res = AI_bsm.fit_resample(AI_ova_X_train,
                                                 AI_ova_y_train)
        AW_X_res, AW_y_res = AW_bsm.fit_resample(AW_ova_X_train,
                                                 AW_ova_y_train)
        CC_X_res, CC_y_res = CC_bsm.fit_resample(CC_ova_X_train,
                                                 CC_ova_y_train)
        QA_X_res, QA_y_res = QA_bsm.fit_resample(QA_ova_X_train,
                                                 QA_ova_y_train)
    elif imb_technique == "SMOTEENN":
        AA_smenn, AI_smenn, AW_smenn, CC_smenn, QA_smenn = SMOTEENN(
        ), SMOTEENN(), SMOTEENN(), SMOTEENN(), SMOTEENN()
        AA_X_res, AA_y_res = AA_smenn.fit_resample(AA_ova_X_train,
                                                   AA_ova_y_train)
        AI_X_res, AI_y_res = AI_smenn.fit_resample(AI_ova_X_train,
                                                   AI_ova_y_train)
        AW_X_res, AW_y_res = AW_smenn.fit_resample(AW_ova_X_train,
                                                   AW_ova_y_train)
        CC_X_res, CC_y_res = CC_smenn.fit_resample(CC_ova_X_train,
                                                   CC_ova_y_train)
        QA_X_res, QA_y_res = QA_smenn.fit_resample(QA_ova_X_train,
                                                   QA_ova_y_train)
    elif imb_technique == "SMOTETOMEK":
        AA_smtm, AI_smtm, AW_smtm, CC_smtm, QA_smtm = SMOTETomek(), SMOTETomek(
        ), SMOTETomek(), SMOTETomek(), SMOTETomek()
        AA_X_res, AA_y_res = AA_smtm.fit_resample(AA_ova_X_train,
                                                  AA_ova_y_train)
        AI_X_res, AI_y_res = AI_smtm.fit_resample(AI_ova_X_train,
                                                  AI_ova_y_train)
        AW_X_res, AW_y_res = AW_smtm.fit_resample(AW_ova_X_train,
                                                  AW_ova_y_train)
        CC_X_res, CC_y_res = CC_smtm.fit_resample(CC_ova_X_train,
                                                  CC_ova_y_train)
        QA_X_res, QA_y_res = QA_smtm.fit_resample(QA_ova_X_train,
                                                  QA_ova_y_train)
    elif imb_technique == "TOMEK":
        AA_tm, AI_tm, AW_tm, CC_tm, QA_tm = TomekLinks(), TomekLinks(
        ), TomekLinks(), TomekLinks(), TomekLinks()
        AA_X_res, AA_y_res = AA_tm.fit_resample(AA_ova_X_train, AA_ova_y_train)
        AI_X_res, AI_y_res = AI_tm.fit_resample(AI_ova_X_train, AI_ova_y_train)
        AW_X_res, AW_y_res = AW_tm.fit_resample(AW_ova_X_train, AW_ova_y_train)
        CC_X_res, CC_y_res = CC_tm.fit_resample(CC_ova_X_train, CC_ova_y_train)
        QA_X_res, QA_y_res = QA_tm.fit_resample(QA_ova_X_train, QA_ova_y_train)
    elif imb_technique == "ROS":
        AA_ros, AI_ros, AW_ros, CC_ros, QA_ros = RandomOverSampler(
        ), RandomOverSampler(), RandomOverSampler(), RandomOverSampler(
        ), RandomOverSampler()
        AA_X_res, AA_y_res = AA_ros.fit_resample(AA_ova_X_train,
                                                 AA_ova_y_train)
        AI_X_res, AI_y_res = AI_ros.fit_resample(AI_ova_X_train,
                                                 AI_ova_y_train)
        AW_X_res, AW_y_res = AW_ros.fit_resample(AW_ova_X_train,
                                                 AW_ova_y_train)
        CC_X_res, CC_y_res = CC_ros.fit_resample(CC_ova_X_train,
                                                 CC_ova_y_train)
        QA_X_res, QA_y_res = QA_ros.fit_resample(QA_ova_X_train,
                                                 QA_ova_y_train)
    elif imb_technique == "RUS":
        AA_rus, AI_rus, AW_rus, CC_rus, QA_rus = RandomUnderSampler(
        ), RandomUnderSampler(), RandomUnderSampler(), RandomUnderSampler(
        ), RandomUnderSampler()
        AA_X_res, AA_y_res = AA_rus.fit_resample(AA_ova_X_train,
                                                 AA_ova_y_train)
        AI_X_res, AI_y_res = AI_rus.fit_resample(AI_ova_X_train,
                                                 AI_ova_y_train)
        AW_X_res, AW_y_res = AW_rus.fit_resample(AW_ova_X_train,
                                                 AW_ova_y_train)
        CC_X_res, CC_y_res = CC_rus.fit_resample(CC_ova_X_train,
                                                 CC_ova_y_train)
        QA_X_res, QA_y_res = QA_rus.fit_resample(QA_ova_X_train,
                                                 QA_ova_y_train)
    return AA_X_res, AA_y_res, AI_X_res, AI_y_res, AW_X_res, AW_y_res, CC_X_res, CC_y_res, QA_X_res, QA_y_res
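Example #16 repeats the same fit_resample pattern for every technique and every one-vs-all split. A table-driven sketch of the same idea (the sampler subset and the helper name are assumptions, not part of the original project) removes most of the repetition:

from imblearn.over_sampling import ADASYN, SMOTE, BorderlineSMOTE, RandomOverSampler
from imblearn.under_sampling import (AllKNN, CondensedNearestNeighbour,
                                     EditedNearestNeighbours,
                                     RepeatedEditedNearestNeighbours,
                                     RandomUnderSampler, TomekLinks)
from imblearn.combine import SMOTEENN, SMOTETomek

SAMPLERS = {
    "ADASYN": ADASYN, "ALLKNN": AllKNN, "CNN": CondensedNearestNeighbour,
    "ENN": EditedNearestNeighbours, "RENN": RepeatedEditedNearestNeighbours,
    "SMOTE": SMOTE, "BSMOTE": BorderlineSMOTE, "SMOTEENN": SMOTEENN,
    "SMOTETOMEK": SMOTETomek, "TOMEK": TomekLinks,
    "ROS": RandomOverSampler, "RUS": RandomUnderSampler,
}

def resample_splits(imb_technique, splits):
    # splits: iterable of (X_train, y_train) pairs, one per one-vs-all class.
    # A fresh sampler is created per split, mirroring the per-class instances above.
    sampler_cls = SAMPLERS[imb_technique]
    return [sampler_cls().fit_resample(X, y) for X, y in splits]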
Example #17
def all_imblearn(xx, yy):
    
    imblearnlist = []  
    
    """OVER SAMPLING"""
    
    """Random Over Sampler"""
    ros = RandomOverSampler(random_state=0)
    X_resampled, y_resampled = ros.fit_resample(xx, yy)
    randomOverSampler = [X_resampled, y_resampled, 'random over sampler']
    imblearnlist.append(randomOverSampler)
    
    """SMOTE"""
    X_resampled, y_resampled = SMOTE().fit_resample(xx, yy)
    smote = [X_resampled, y_resampled, 'smote']
    imblearnlist.append(smote)
    
    """SMOTE borderline1"""
    sm = SMOTE(kind='borderline1')
    X_resampled, y_resampled = sm.fit_resample(xx, yy)
    smote = [X_resampled, y_resampled, 'smote borderline1']
    imblearnlist.append(smote)
    
    """SMOTE borderline2"""
    sm = SMOTE(kind='borderline2')
    X_resampled, y_resampled = sm.fit_resample(xx, yy)
    smote = [X_resampled, y_resampled, 'smote borderline2']
    imblearnlist.append(smote)
    
    """SMOTE svm"""
    sm = SMOTE(kind='svm')
    X_resampled, y_resampled = sm.fit_resample(xx, yy)
    smote = [X_resampled, y_resampled, 'smote svm']
    imblearnlist.append(smote)
    
    """SMOTENC"""
    smote_nc = SMOTENC(categorical_features=[0, 2], random_state=0)
    X_resampled, y_resampled = smote_nc.fit_resample(xx, yy)
    smote = [X_resampled, y_resampled, 'smotenc']
    imblearnlist.append(smote)
    
#    """ADASYN"""
#    X_resampled, y_resampled = ADASYN.fit_resample(xx, yy)
#    adasyn = [X_resampled, y_resampled, 'adasyn']
#    imblearnlist.append(adasyn)
#    


    """UNDER SAMPLING"""
    
    """Cluster Centroids"""
    cc = ClusterCentroids(random_state=0)
    X_resampled, y_resampled = cc.fit_resample(xx, yy)
    reSampled = [X_resampled, y_resampled, 'cluster centroids']
    imblearnlist.append(reSampled)

    """Random Over Sampler"""
    rus = RandomUnderSampler(random_state=0)
    X_resampled, y_resampled = rus.fit_resample(xx, yy)
    reSampled = [X_resampled, y_resampled, 'random under sampler']
    imblearnlist.append(reSampled)
    
    """Near Miss 1"""
    nm1 = NearMiss(version=1)
    X_resampled, y_resampled = nm1.fit_resample(xx, yy)
    reSampled = [X_resampled, y_resampled, 'near miss 1']
    imblearnlist.append(reSampled)
    
    """Near Miss 2"""
    nm2 = NearMiss(version=2)
    X_resampled, y_resampled = nm2.fit_resample(xx, yy)
    reSampled = [X_resampled, y_resampled, 'near miss 2']
    imblearnlist.append(reSampled)
    
    """Near Miss 3"""
    nm3 = NearMiss(version=3)
    X_resampled, y_resampled = nm3.fit_resample(xx, yy)
    reSampled = [X_resampled, y_resampled, 'near miss 3']
    imblearnlist.append(reSampled)
    
    """Edited Nearest Neighbours"""
    enn = EditedNearestNeighbours()
    X_resampled, y_resampled = enn.fit_resample(xx, yy)
    reSampled = [X_resampled, y_resampled, 'edited nearest neighbours']
    imblearnlist.append(reSampled)
    
    """Repeated Edited Nearest Neighbours"""
    renn = RepeatedEditedNearestNeighbours()
    X_resampled, y_resampled = renn.fit_resample(xx, yy)
    reSampled = [X_resampled, y_resampled, 'repeated edited nearest neighbours']
    imblearnlist.append(reSampled)
    
    """All KNN"""
    allknn = AllKNN()
    X_resampled, y_resampled = allknn.fit_resample(xx, yy)
    reSampled = [X_resampled, y_resampled, 'allKNN']
    imblearnlist.append(reSampled)
    
    """Condensed Nearest Neighbour"""
    cnn = CondensedNearestNeighbour(random_state=0)
    X_resampled, y_resampled = cnn.fit_resample(xx, yy)
    reSampled = [X_resampled, y_resampled, 'Condensed Nearest Neighbour']
    imblearnlist.append(reSampled)
    
    """One Sided Selection"""
    oss = OneSidedSelection(random_state=0)
    X_resampled, y_resampled = oss.fit_resample(xx, yy)
    reSampled = [X_resampled, y_resampled, 'One Sided Selection']
    imblearnlist.append(reSampled)
    
    """Neighbourhood Cleaning Rule"""
    ncr = NeighbourhoodCleaningRule()
    X_resampled, y_resampled = ncr.fit_resample(xx, yy)
    reSampled = [X_resampled, y_resampled, 'Neighbourhood Cleaning Rule']
    imblearnlist.append(reSampled)


    """OVER AND UNDER SAMPLING"""
    
    """SMOTEENN"""
    smote_enn = SMOTEENN(random_state=0)
    X_resampled, y_resampled = smote_enn.fit_resample(xx, yy)
    reSampled = [X_resampled, y_resampled, 'SMOTEENN']
    imblearnlist.append(reSampled)
    
    """SMOTETomek"""
    smote_tomek = SMOTETomek(random_state=0)
    X_resampled, y_resampled = smote_tomek.fit_resample(xx, yy)
    reSampled = [X_resampled, y_resampled, 'SMOTETomek']
    imblearnlist.append(reSampled)
    
    return imblearnlist
def test_renn_not_good_object():
    nn = 'rnd'
    renn = RepeatedEditedNearestNeighbours(n_neighbors=nn, kind_sel='mode')
    with raises(ValueError):
        renn.fit_resample(X, Y)
def test_renn_iter_wrong():
    max_iter = -1
    renn = RepeatedEditedNearestNeighbours(max_iter=max_iter)
    with pytest.raises(ValueError):
        renn.fit_resample(X, Y)
def test_deprecation_random_state():
    renn = RepeatedEditedNearestNeighbours(random_state=0)
    with warns(DeprecationWarning,
               match="'random_state' is deprecated from 0.4"):
        renn.fit_resample(X, Y)
Example #21
def test_renn_not_good_object():
    nn = "rnd"
    renn = RepeatedEditedNearestNeighbours(n_neighbors=nn, kind_sel="mode")
    with pytest.raises(ValueError):
        renn.fit_resample(X, Y)
Example #22
# Next a nearest neighbors undersampling technique is applied to the majority.

# In[27]:

from imblearn.under_sampling import RepeatedEditedNearestNeighbours

# First with n_neighbors = 25

# In[28]:

enn = RepeatedEditedNearestNeighbours(sampling_strategy='majority',
                                      n_neighbors=25,
                                      n_jobs=3,
                                      random_state=101)
X_resamp2, y_resamp2 = enn.fit_resample(X_train, y_train)

# In[29]:

#number of majority-class (label 0) samples left after resampling
y_resamp2.shape[0] - y_resamp2.sum()

# Now n_neighbors is decreased to 20

# In[30]:

enn = RepeatedEditedNearestNeighbours(sampling_strategy='majority',
                                      n_neighbors=20,
                                      n_jobs=3,
                                      random_state=101)
X_resamp3, y_resamp3 = enn.fit_resample(X_train, y_train)
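The cell above counts the remaining majority samples by arithmetic on 0/1 labels. A short sketch with `collections.Counter` (the variable names follow the notebook cells above) shows the full before/after class distribution regardless of how the labels are encoded:

from collections import Counter

# Class distribution before and after the two RENN runs
print("original      :", Counter(y_train))
print("n_neighbors=25:", Counter(y_resamp2))
print("n_neighbors=20:", Counter(y_resamp3))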
Example #23
# advanced undersampling ======================================================
''' ALLKNN '''
from imblearn.under_sampling import AllKNN, NeighbourhoodCleaningRule
# define undersampling strategy
under_allknn = AllKNN()
# fit and apply the transform
X, y = under_allknn.fit_resample(X, y)
# summarize class distribution
print(Counter(y))
''' RENN '''
from imblearn.under_sampling import CondensedNearestNeighbour, EditedNearestNeighbours, RepeatedEditedNearestNeighbours
# define undersampling strategy
under_renn = RepeatedEditedNearestNeighbours()
# fit and apply the transform
X, y = under_renn.fit_resample(X, y)
# summarize class distribution
print(Counter(y))

# advanced oversampling =======================================================
''' ADASYN '''
from imblearn.over_sampling import ADASYN
# define oversampling strategy
over_ada = ADASYN(random_state=42)
# fit and apply the transform
X, y = over_ada.fit_resample(X, y)
# summarize class distribution
print(Counter(y))
''' SMOTE '''
from imblearn.over_sampling import SMOTE, SMOTENC, BorderlineSMOTE
# define oversampling strategy
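The snippet above breaks off right before the SMOTE call. A minimal sketch of how that oversampling step is typically written (the `random_state` value is an assumption, mirroring the ADASYN call above) would be:

from collections import Counter
from imblearn.over_sampling import SMOTE
# define oversampling strategy
over_sm = SMOTE(random_state=42)
# fit and apply the transform
X, y = over_sm.fit_resample(X, y)
# summarize class distribution
print(Counter(y))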
Example #24
def repeated_edited_nearest_neighbours(X, y):
    renn = RepeatedEditedNearestNeighbours()
    X_res, y_res = renn.fit_resample(X, y)
    return X_res, y_res
def resampling_assigner(imb_technique, AP_ova_X_train, AP_ova_y_train,
                        PM_ova_X_train, PM_ova_y_train, SC_ova_X_train,
                        SC_ova_y_train):
    print(imb_technique)
    if imb_technique == "ADASYN":
        AP_ada, PM_ada, SC_ada = ADASYN(), ADASYN(), ADASYN()
        AP_X_res, AP_y_res = AP_ada.fit_resample(AP_ova_X_train,
                                                 AP_ova_y_train)
        PM_X_res, PM_y_res = PM_ada.fit_resample(PM_ova_X_train,
                                                 PM_ova_y_train)
        SC_X_res, SC_y_res = SC_ada.fit_resample(SC_ova_X_train,
                                                 SC_ova_y_train)
    elif imb_technique == "ALLKNN":
        AP_allknn, PM_allknn, SC_allknn = AllKNN(), AllKNN(), AllKNN()
        AP_X_res, AP_y_res = AP_allknn.fit_resample(AP_ova_X_train,
                                                    AP_ova_y_train)
        PM_X_res, PM_y_res = PM_allknn.fit_resample(PM_ova_X_train,
                                                    PM_ova_y_train)
        SC_X_res, SC_y_res = SC_allknn.fit_resample(SC_ova_X_train,
                                                    SC_ova_y_train)
    elif imb_technique == "CNN":
        AP_cnn, PM_cnn, SC_cnn = CondensedNearestNeighbour(
        ), CondensedNearestNeighbour(), CondensedNearestNeighbour()
        AP_X_res, AP_y_res = AP_cnn.fit_resample(AP_ova_X_train,
                                                 AP_ova_y_train)
        PM_X_res, PM_y_res = PM_cnn.fit_resample(PM_ova_X_train,
                                                 PM_ova_y_train)
        SC_X_res, SC_y_res = SC_cnn.fit_resample(SC_ova_X_train,
                                                 SC_ova_y_train)
    elif imb_technique == "ENN":
        AP_enn, PM_enn, SC_enn = EditedNearestNeighbours(
        ), EditedNearestNeighbours(), EditedNearestNeighbours()
        AP_X_res, AP_y_res = AP_enn.fit_resample(AP_ova_X_train,
                                                 AP_ova_y_train)
        PM_X_res, PM_y_res = PM_enn.fit_resample(PM_ova_X_train,
                                                 PM_ova_y_train)
        SC_X_res, SC_y_res = SC_enn.fit_resample(SC_ova_X_train,
                                                 SC_ova_y_train)
    elif imb_technique == "IHT":
        AP_iht, PM_iht, SC_iht = InstanceHardnessThreshold(
        ), InstanceHardnessThreshold(), InstanceHardnessThreshold()
        AP_X_res, AP_y_res = AP_iht.fit_resample(AP_ova_X_train,
                                                 AP_ova_y_train)
        PM_X_res, PM_y_res = PM_iht.fit_resample(PM_ova_X_train,
                                                 PM_ova_y_train)
        SC_X_res, SC_y_res = SC_iht.fit_resample(SC_ova_X_train,
                                                 SC_ova_y_train)
    elif imb_technique == "NCR":
        AP_ncr, PM_ncr, SC_ncr = NeighbourhoodCleaningRule(
        ), NeighbourhoodCleaningRule(), NeighbourhoodCleaningRule()
        AP_ova_y_train = [
            0 if i == "Add penalty" else 1 for i in AP_ova_y_train
        ]
        AP_X_res, AP_y_res = AP_ncr.fit_resample(AP_ova_X_train,
                                                 AP_ova_y_train)
        PM_ova_y_train = [0 if i == "Payment" else 1 for i in PM_ova_y_train]
        PM_X_res, PM_y_res = PM_ncr.fit_resample(PM_ova_X_train,
                                                 PM_ova_y_train)
        SC_ova_y_train = [
            0 if i == "Send for Credit Collection" else 1
            for i in SC_ova_y_train
        ]
        SC_X_res, SC_y_res = SC_ncr.fit_resample(SC_ova_X_train,
                                                 SC_ova_y_train)
    elif imb_technique == "NM":
        AP_nm, PM_nm, SC_nm = NearMiss(), NearMiss(), NearMiss()
        AP_X_res, AP_y_res = AP_nm.fit_resample(AP_ova_X_train, AP_ova_y_train)
        PM_X_res, PM_y_res = PM_nm.fit_resample(PM_ova_X_train, PM_ova_y_train)
        SC_X_res, SC_y_res = SC_nm.fit_resample(SC_ova_X_train, SC_ova_y_train)
    elif imb_technique == "OSS":
        AP_oss, PM_oss, SC_oss = OneSidedSelection(), OneSidedSelection(
        ), OneSidedSelection()
        AP_X_res, AP_y_res = AP_oss.fit_resample(AP_ova_X_train,
                                                 AP_ova_y_train)
        PM_X_res, PM_y_res = PM_oss.fit_resample(PM_ova_X_train,
                                                 PM_ova_y_train)
        SC_X_res, SC_y_res = SC_oss.fit_resample(SC_ova_X_train,
                                                 SC_ova_y_train)
    elif imb_technique == "RENN":
        AP_renn, PM_renn, SC_renn = RepeatedEditedNearestNeighbours(
        ), RepeatedEditedNearestNeighbours(), RepeatedEditedNearestNeighbours(
        )
        AP_X_res, AP_y_res = AP_renn.fit_resample(AP_ova_X_train,
                                                  AP_ova_y_train)
        PM_X_res, PM_y_res = PM_renn.fit_resample(PM_ova_X_train,
                                                  PM_ova_y_train)
        SC_X_res, SC_y_res = SC_renn.fit_resample(SC_ova_X_train,
                                                  SC_ova_y_train)
    elif imb_technique == "SMOTE":
        AP_sm, PM_sm, SC_sm = SMOTE(), SMOTE(), SMOTE()
        AP_X_res, AP_y_res = AP_sm.fit_resample(AP_ova_X_train, AP_ova_y_train)
        PM_X_res, PM_y_res = PM_sm.fit_resample(PM_ova_X_train, PM_ova_y_train)
        SC_X_res, SC_y_res = SC_sm.fit_resample(SC_ova_X_train, SC_ova_y_train)
    elif imb_technique == "BSMOTE":
        AP_bsm, PM_bsm, SC_bsm = BorderlineSMOTE(), BorderlineSMOTE(
        ), BorderlineSMOTE()
        AP_X_res, AP_y_res = AP_bsm.fit_resample(AP_ova_X_train,
                                                 AP_ova_y_train)
        PM_X_res, PM_y_res = PM_bsm.fit_resample(PM_ova_X_train,
                                                 PM_ova_y_train)
        SC_X_res, SC_y_res = SC_bsm.fit_resample(SC_ova_X_train,
                                                 SC_ova_y_train)
    elif imb_technique == "SMOTEENN":
        AP_smenn, PM_smenn, SC_smenn = SMOTEENN(), SMOTEENN(), SMOTEENN()
        AP_X_res, AP_y_res = AP_smenn.fit_resample(AP_ova_X_train,
                                                   AP_ova_y_train)
        PM_X_res, PM_y_res = PM_smenn.fit_resample(PM_ova_X_train,
                                                   PM_ova_y_train)
        SC_X_res, SC_y_res = SC_smenn.fit_resample(SC_ova_X_train,
                                                   SC_ova_y_train)
    elif imb_technique == "SMOTETOMEK":
        AP_smtm, PM_smtm, SC_smtm = SMOTETomek(), SMOTETomek(), SMOTETomek()
        AP_X_res, AP_y_res = AP_smtm.fit_resample(AP_ova_X_train,
                                                  AP_ova_y_train)
        PM_X_res, PM_y_res = PM_smtm.fit_resample(PM_ova_X_train,
                                                  PM_ova_y_train)
        SC_X_res, SC_y_res = SC_smtm.fit_resample(SC_ova_X_train,
                                                  SC_ova_y_train)
    elif imb_technique == "TOMEK":
        AP_tm, PM_tm, SC_tm = TomekLinks(), TomekLinks(), TomekLinks()
        AP_X_res, AP_y_res = AP_tm.fit_resample(AP_ova_X_train, AP_ova_y_train)
        PM_X_res, PM_y_res = PM_tm.fit_resample(PM_ova_X_train, PM_ova_y_train)
        SC_X_res, SC_y_res = SC_tm.fit_resample(SC_ova_X_train, SC_ova_y_train)
    elif imb_technique == "ROS":
        AP_ros, PM_ros, SC_ros = RandomOverSampler(), RandomOverSampler(
        ), RandomOverSampler()
        AP_X_res, AP_y_res = AP_ros.fit_resample(AP_ova_X_train,
                                                 AP_ova_y_train)
        PM_X_res, PM_y_res = PM_ros.fit_resample(PM_ova_X_train,
                                                 PM_ova_y_train)
        SC_X_res, SC_y_res = SC_ros.fit_resample(SC_ova_X_train,
                                                 SC_ova_y_train)
    elif imb_technique == "RUS":
        AP_rus, PM_rus, SC_rus = RandomUnderSampler(), RandomUnderSampler(
        ), RandomUnderSampler()
        AP_X_res, AP_y_res = AP_rus.fit_resample(AP_ova_X_train,
                                                 AP_ova_y_train)
        PM_X_res, PM_y_res = PM_rus.fit_resample(PM_ova_X_train,
                                                 PM_ova_y_train)
        SC_X_res, SC_y_res = SC_rus.fit_resample(SC_ova_X_train,
                                                 SC_ova_y_train)
    return AP_X_res, AP_y_res, PM_X_res, PM_y_res, SC_X_res, SC_y_res
X_res_vis = pca.transform(X_resampled)
idx_samples_removed = np.setdiff1d(np.arange(X_vis.shape[0]), idx_resampled)
reduction_str = ('Reduced {:.2f}%'.format(
    100 * (1 - float(len(X_resampled)) / len(X))))
print(reduction_str)
c3 = ax2.scatter(X_vis[idx_samples_removed, 0],
                 X_vis[idx_samples_removed, 1],
                 alpha=.2,
                 label='Removed samples',
                 c='g')
plot_resampling(ax2, X_res_vis, y_resampled, 'ENN - ' + reduction_str)

# Apply the RENN
print('RENN')
renn = RepeatedEditedNearestNeighbours(return_indices=True)
X_resampled, y_resampled, idx_resampled = renn.fit_resample(X, y)
X_res_vis = pca.transform(X_resampled)
idx_samples_removed = np.setdiff1d(np.arange(X_vis.shape[0]), idx_resampled)
reduction_str = ('Reduced {:.2f}%'.format(
    100 * (1 - float(len(X_resampled)) / len(X))))
print(reduction_str)
ax3.scatter(X_vis[idx_samples_removed, 0],
            X_vis[idx_samples_removed, 1],
            alpha=.2,
            label='Removed samples',
            c='g')
plot_resampling(ax3, X_res_vis, y_resampled, 'RENN - ' + reduction_str)

# Apply the AllKNN
print('AllKNN')
allknn = AllKNN(return_indices=True)
def test_renn_not_good_object():
    nn = 'rnd'
    renn = RepeatedEditedNearestNeighbours(n_neighbors=nn, kind_sel='mode')
    with pytest.raises(ValueError):
        renn.fit_resample(X, Y)
Example #28
def test_renn_iter_wrong():
    max_iter = -1
    renn = RepeatedEditedNearestNeighbours(max_iter=max_iter)
    with pytest.raises(ValueError):
        renn.fit_resample(X, Y)
def test_deprecation_random_state():
    renn = RepeatedEditedNearestNeighbours(random_state=0)
    with warns(
            DeprecationWarning, match="'random_state' is deprecated from 0.4"):
        renn.fit_resample(X, Y)
Example #30
def test_renn_iter_attribute(max_iter, n_iter):
    renn = RepeatedEditedNearestNeighbours(max_iter=max_iter)
    renn.fit_resample(X, Y)
    assert renn.n_iter_ == n_iter
def test_renn_fit_resample():
    renn = RepeatedEditedNearestNeighbours()
    X_resampled, y_resampled = renn.fit_resample(X, Y)

    X_gt = np.array([
        [-0.53171468, -0.53735182],
        [-0.88864036, -0.33782387],
        [-0.46226554, -0.50481004],
        [-0.34474418, 0.21969797],
        [1.02956816, 0.36061601],
        [1.12202806, 0.33811558],
        [0.73489726, 0.43915195],
        [0.50307437, 0.498805],
        [0.84929742, 0.41042894],
        [0.62649535, 0.46600596],
        [0.98382284, 0.37184502],
        [0.69804044, 0.44810796],
        [0.04296502, -0.37981873],
        [0.28294738, -1.00125525],
        [0.34218094, -0.58781961],
        [0.2096964, -0.61814058],
        [1.59068979, -0.96622933],
        [0.73418199, -0.02222847],
        [0.79270821, -0.41386668],
        [1.16606871, -0.25641059],
        [1.0304995, -0.16955962],
        [0.48921682, -1.38504507],
        [-0.03918551, -0.68540745],
        [0.24991051, -1.00864997],
        [0.80541964, -0.34465185],
        [0.1732627, -1.61323172],
    ])
    y_gt = np.array([
        0,
        0,
        0,
        0,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
        2,
    ])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert 0 < renn.n_iter_ <= renn.max_iter