示例#1
0
def test_validate_estimator_deprecation():
    """Check that the legacy ``n_jobs`` and ``k`` arguments are still accepted.

    A sampler is built once with each legacy keyword; both are expected to
    produce the same resampled data.
    """
    expected_X = np.array([[0.11622591, -0.0317206],
                           [1.25192108, -0.22367336],
                           [0.53366841, -0.30312976],
                           [1.52091956, -0.49283504],
                           [0.88407872, 0.35454207],
                           [1.31301027, -0.92648734],
                           [-0.41635887, -0.38299653],
                           [1.70580611, -0.11219234],
                           [0.29307743, -0.14670439],
                           [0.84976473, -0.15570176],
                           [0.61319159, -0.11571668],
                           [0.66052536, -0.28246517],
                           [-0.28162401, -2.10400981],
                           [0.83680821, 1.72827342],
                           [0.08711622, 0.93259929]])
    expected_y = np.array([0] * 12 + [1] * 3)

    # Same checks for each legacy keyword, in the original order.
    for legacy_kwargs in ({'n_jobs': -1}, {'k': 5}):
        sampler = SMOTEENN(random_state=RND_SEED, **legacy_kwargs)
        X_out, y_out = sampler.fit_sample(X, Y)
        assert_array_almost_equal(X_out, expected_X)
        assert_array_equal(y_out, expected_y)
def test_error_wrong_object():
    """A non-estimator passed as ``smote`` or ``enn`` must raise ValueError."""
    not_an_estimator = 'rnd'

    sampler = SMOTEENN(smote=not_an_estimator, random_state=RND_SEED)
    with raises(ValueError, match="smote needs to be a SMOTE"):
        sampler.fit_sample(X, Y)

    sampler = SMOTEENN(enn=not_an_estimator, random_state=RND_SEED)
    with raises(ValueError, match="enn needs to be an "):
        sampler.fit_sample(X, Y)
示例#3
0
def test_error_wrong_object():
    """Check the error raised when ``smote`` / ``enn`` is not an estimator."""
    # (keyword receiving the bogus value, expected error-message pattern)
    cases = (
        ('smote', "smote needs to be a SMOTE"),
        ('enn', "enn needs to be an "),
    )
    for keyword, pattern in cases:
        smt = SMOTEENN(random_state=RND_SEED, **{keyword: 'rnd'})
        with raises(ValueError, match=pattern):
            smt.fit_sample(X, Y)
示例#4
0
def test_validate_estimator_deprecation():
    """Legacy ``n_jobs`` and ``k`` keywords must give the reference output."""
    expected_X = np.array([[1.52091956, -0.49283504],
                           [0.84976473, -0.15570176],
                           [0.61319159, -0.11571667],
                           [0.66052536, -0.28246518],
                           [-0.28162401, -2.10400981],
                           [0.83680821, 1.72827342],
                           [0.08711622, 0.93259929]])
    expected_y = np.array([0, 0, 0, 0, 1, 1, 1])

    for legacy_kwargs in ({'n_jobs': -1}, {'k': 5}):
        sampler = SMOTEENN(random_state=RND_SEED, **legacy_kwargs)
        X_out, y_out = sampler.fit_sample(X, Y)
        assert_allclose(X_out, expected_X, rtol=R_TOL)
        assert_array_equal(y_out, expected_y)
示例#5
0
def test_sample_regular():
    """Resample with a default SMOTEENN and compare with reference values."""
    sampler = SMOTEENN(random_state=RND_SEED)
    # The sampler is fitted once explicitly, then fitted again by fit_sample.
    sampler.fit(X, Y)
    X_out, y_out = sampler.fit_sample(X, Y)

    expected_X = np.array([[0.11622591, -0.0317206],
                           [1.25192108, -0.22367336],
                           [0.53366841, -0.30312976],
                           [1.52091956, -0.49283504],
                           [0.88407872, 0.35454207],
                           [1.31301027, -0.92648734],
                           [-0.41635887, -0.38299653],
                           [1.70580611, -0.11219234],
                           [0.29307743, -0.14670439],
                           [0.84976473, -0.15570176],
                           [0.61319159, -0.11571668],
                           [0.66052536, -0.28246517],
                           [-0.28162401, -2.10400981],
                           [0.83680821, 1.72827342],
                           [0.08711622, 0.93259929]])
    expected_y = np.array([0] * 12 + [1] * 3)

    assert_array_almost_equal(X_out, expected_X)
    assert_array_equal(y_out, expected_y)
示例#6
0
def test_validate_estimator_init():
    """Pass pre-built SMOTE and ENN objects to SMOTEENN and check the output."""
    # Build the two component estimators that are handed to the combiner.
    over_sampler = SMOTE(random_state=RND_SEED)
    cleaner = EditedNearestNeighbours(random_state=RND_SEED)
    combined = SMOTEENN(smote=over_sampler, enn=cleaner,
                        random_state=RND_SEED)

    X_out, y_out = combined.fit_sample(X, Y)

    expected_X = np.array([[0.11622591, -0.0317206],
                           [1.25192108, -0.22367336],
                           [0.53366841, -0.30312976],
                           [1.52091956, -0.49283504],
                           [0.88407872, 0.35454207],
                           [1.31301027, -0.92648734],
                           [-0.41635887, -0.38299653],
                           [1.70580611, -0.11219234],
                           [0.29307743, -0.14670439],
                           [0.84976473, -0.15570176],
                           [0.61319159, -0.11571668],
                           [0.66052536, -0.28246517],
                           [-0.28162401, -2.10400981],
                           [0.83680821, 1.72827342],
                           [0.08711622, 0.93259929]])
    expected_y = np.array([0] * 12 + [1] * 3)

    assert_array_almost_equal(X_out, expected_X)
    assert_array_equal(y_out, expected_y)
示例#7
0
def resample(X, Y, nb_class):
    """Resample a 5-D sample array with SMOTEENN.

    ``X`` is flattened to 2-D (one row per sample) for the sampler and the
    result is reshaped back to the original trailing dimensions. If the
    number of bins in the label histogram differs from ``nb_class`` the
    inputs are returned untouched.

    :param X: array indexed as ``X.shape[0..4]``.
    :param Y: label array; cast to ``int`` before counting and resampling.
    :param nb_class: expected length of ``np.bincount`` over the labels.
    :return: tuple ``(X_resampled, Y_resampled)``, or ``(X, Y)`` when skipped.
    """
    print("original shape: ", X.shape)
    int_labels = Y.astype(int)
    per_class = np.bincount(int_labels)

    if len(per_class) != nb_class:
        print("there is no samples to interpolate! skip this fold.")
        return X, Y

    print("original dist: ", per_class / float(sum(per_class)))

    sampler = SMOTEENN(random_state=0)
    trailing_dims = (X.shape[1], X.shape[2], X.shape[3], X.shape[4])
    row_width = (trailing_dims[0] * trailing_dims[1]
                 * trailing_dims[2] * trailing_dims[3])
    flat_X = X.reshape((X.shape[0], row_width))
    X_resampled, Y_resampled = sampler.fit_sample(flat_X, int_labels)

    # Restore the per-sample layout that was flattened above.
    X_resampled = X_resampled.reshape(
        (X_resampled.shape[0],) + trailing_dims)
    print("sampled shape: ", X_resampled.shape)

    Y_resampled = Y_resampled.astype(int)
    per_class = np.bincount(Y_resampled)
    print("after SMOTEENN dist: ", per_class / float(sum(per_class)))
    return X_resampled, Y_resampled
示例#8
0
def runtree(data, target):
    """Cross-validate a decision tree trained on SMOTEENN-resampled folds.

    Labels are encoded with ``LabelEncoder``. For every stratified split the
    training part is resampled with SMOTEENN, a fresh clone of the tree is
    fitted on it, and predictions for the untouched test part are collected.
    Per-fold predictions and targets are handed to ``print_scores`` and the
    ROC AUC over all test folds is printed.

    :param data: feature array, indexed with the fold index arrays.
    :param target: raw labels, encoded before splitting.
    """
    lb = preprocessing.LabelEncoder()
    lb.fit(target)
    target1 = lb.transform(target)
    sm = SMOTEENN()
    clf = tree.DecisionTreeClassifier()
    folds = [3]
    print("------------ TREE ------------")

    for fold in folds:
        # ``random_state`` is only meaningful together with ``shuffle=True``;
        # recent scikit-learn raises ValueError when it is set on an
        # unshuffled splitter. The splits were never shuffled, so dropping
        # the argument keeps the exact same folds.
        skf = StratifiedKFold(n_splits=fold)
        test_target = []
        test_predict = []
        test_proba = []
        test_proba_target = []
        for train_index, test_index in skf.split(data, target1):
            clf_ = clone(clf)
            # Resample the training part only; the test part stays as-is.
            X_resampled, y_resampled = sm.fit_sample(data[train_index],
                                                     target1[train_index])
            clf_.fit(X_resampled, y_resampled)
            test_predict.append(clf_.predict(data[test_index]))
            test_target.append(target1[test_index])
            test_proba_target.extend(target1[test_index])
            # Column 1 of predict_proba is used as the score for ROC AUC.
            test_proba.extend(clf_.predict_proba(data[test_index])[:, 1])

        print_scores(test_predict, test_target)
        print(roc_auc_score(y_true=test_proba_target, y_score=test_proba))
示例#9
0
def smot2(train_x, train_y, feature_columns):
    """Rebalance a training set with SMOTEENN and return pandas objects.

    :param train_x: training features passed to the sampler.
    :param train_y: training labels passed to the sampler.
    :param feature_columns: column names for the rebuilt feature DataFrame.
    :return: tuple ``(train_x, train_y)`` as a DataFrame and a Series built
        from the resampled data.
    """
    from imblearn.combine import SMOTEENN
    from imblearn.over_sampling import SMOTE

    print('\nOriginal dataset shape {}'.format(Counter(train_y)))

    # NOTE(review): ``ratio`` and ``n_neighbors`` are keywords of older
    # imbalanced-learn releases - confirm against the installed version.
    sm = SMOTEENN(ratio='minority',
                  n_jobs=3,
                  random_state=42,
                  n_neighbors=50,
                  smote=SMOTE())

    log.traceLogInfo("\nFIT DE SMOT2 ...equilibrage")
    X_res, y_res = sm.fit_sample(train_x, train_y)

    print('\nResampled dataset shape {}'.format(Counter(y_res)))
    # Rebuild pandas objects from the resampled data.
    train_x = pd.DataFrame(X_res, columns=feature_columns)
    train_y = pd.Series(y_res)

    return train_x, train_y
示例#10
0
def smpote_test():
    """Plot one KPI series before and after SMOTEENN resampling.

    Reads ``D:\\kpi\\1.hdf``, keeps the rows of the first KPI ID, shows the
    original series in green, then the resampled series (sorted by
    timestamp) in red.
    """
    # Load the data set and keep only the first KPI.
    all_kpis = pd.read_hdf('D:\\kpi\\1.hdf')
    first_kpi = all_kpis['KPI ID'].values[0]
    truth = all_kpis[all_kpis["KPI ID"] == first_kpi]

    y = truth['label']
    X = truth.drop(columns=['label', 'KPI ID'])

    X_resampled, y_resampled = SMOTEENN().fit_sample(X, y)
    resampled_frame = pd.DataFrame(X_resampled, columns=['timestamp', 'value'])
    resampled_labels = pd.DataFrame(y_resampled, columns=['label'])

    # Original series.
    plt.plot(np.array(X['timestamp']),
             np.array(X['value']),
             color='green',
             label='training accuracy')
    plt.legend()  # show the legend
    plt.show()

    # Resampled series, ordered by time.
    resampled_frame = resampled_frame.join(resampled_labels)
    resampled_frame = resampled_frame.sort_values(by="timestamp",
                                                  ascending=True)

    plt.plot(np.array(resampled_frame['timestamp']),
             np.array(resampled_frame['value']),
             color='red',
             label='training accuracy')
    plt.legend()  # show the legend
    plt.show()
示例#11
0
def resampling(X_train, y_train):
    """Resample the training data with SMOTEENN, printing class counts.

    :return: tuple ``(X_train, y_train)`` after resampling.
    """
    from imblearn.combine import SMOTEENN

    sampler = SMOTEENN()
    print('dataset shape {}'.format(Counter(y_train)))
    X_balanced, y_balanced = sampler.fit_sample(X_train, y_train)
    print('Resampled dataset shape {}'.format(Counter(y_balanced)))
    return X_balanced, y_balanced
示例#12
0
def test_sample_regular_half():
    """Test sample function with regular SMOTE and a ratio of 0.8."""
    sampler = SMOTEENN(ratio=0.8, random_state=RND_SEED)
    # The sampler is fitted once explicitly, then fitted again by fit_sample.
    sampler.fit(X, Y)
    X_out, y_out = sampler.fit_sample(X, Y)

    expected_X = np.array([[0.11622591, -0.0317206],
                           [1.25192108, -0.22367336],
                           [0.53366841, -0.30312976],
                           [1.52091956, -0.49283504],
                           [0.88407872, 0.35454207],
                           [1.31301027, -0.92648734],
                           [-0.41635887, -0.38299653],
                           [1.70580611, -0.11219234],
                           [0.36784496, -0.1953161],
                           [-0.28162401, -2.10400981],
                           [0.83680821, 1.72827342],
                           [0.08711622, 0.93259929]])
    expected_y = np.array([0] * 9 + [1] * 3)

    assert_array_almost_equal(X_out, expected_X)
    assert_array_equal(y_out, expected_y)
def balanced_train(data, features):
    """Resample ``data[features]`` against ``data['label']`` with SMOTEENN.

    :param data: object indexable by ``features`` and by ``'label'``.
    :param features: selector for the feature columns.
    :return: tuple ``(X_resampled, y_resampled)``.
    """
    from imblearn.combine import SMOTEENN

    feature_part = data[features]
    label_part = data['label']
    sampler = SMOTEENN(random_state=42)
    X_balanced, y_balanced = sampler.fit_sample(feature_part, label_part)
    return X_balanced, y_balanced
示例#14
0
def balance(x, y, randomstate=None, **kwargs):
    """Resample ``(x, y)`` with SMOTEENN and print class counts.

    :param randomstate: forwarded as the sampler's ``random_state``.
    :param kwargs: must contain ``'neighbors'`` (forwarded as
        ``n_neighbors``); a missing key raises ``KeyError``.
    :return: tuple ``(rx, ry)`` of resampled features and labels.
    """
    sampler = SMOTEENN(random_state=randomstate,
                       n_jobs=3,
                       n_neighbors=kwargs['neighbors'])
    print('dataset shape {}'.format(Counter(y)))
    print('Resampling...')
    resampled_x, resampled_y = sampler.fit_sample(x, y)
    print('Resampled dataset shape {}'.format(Counter(resampled_y)))
    return resampled_x, resampled_y
示例#15
0
文件: sampling.py 项目: brunnurs/PA1
def SMOTEENN_oversampling(x, y):
    """Resample ``(x, y)`` with a seeded SMOTEENN and print class counts.

    :return: tuple ``(x_sampled, y_sampled)``.
    """
    print('Original dataset shape {}'.format(Counter(y)))

    sampler = SMOTEENN(random_state=42)
    balanced_x, balanced_y = sampler.fit_sample(x, y)

    print('With SMOTEENN sampled dataset shape {}'.format(Counter(balanced_y)))
    return balanced_x, balanced_y
示例#16
0
def over_sampling(data):
    """Resample ``data`` with SMOTEENN and write the result to CSV.

    The ``aid`` and ``uid`` columns are dropped, ``label`` is used as the
    target, and the resampled features plus label are written to
    ``./data/train_all_after_overSamlping.csv`` without the index.

    :param data: DataFrame with ``aid``, ``uid`` and ``label`` columns.
    """
    data = data.drop(['aid', 'uid'], axis=1)
    y = data['label']
    X = data.drop('label', axis=1)
    sme = SMOTEENN()
    X_res, y_res = sme.fit_sample(X, y)
    # The sampler may return plain NumPy arrays, which ``pd.concat`` rejects.
    # Wrapping both outputs keeps the column names and works for arrays and
    # pandas objects alike.
    features_res = pd.DataFrame(X_res, columns=X.columns)
    labels_res = pd.Series(y_res, name='label')
    data_res = pd.concat([features_res, labels_res], axis=1)
    data_res.to_csv('./data/train_all_after_overSamlping.csv', index=False)
示例#17
0
def balance_train_data(data):
    """Resample a ``(features, labels)`` pair with SMOTEENN, timing the run.

    :param data: two-item iterable ``(features, labels)``.
    :return: tuple ``(features, labels)`` after resampling.
    """
    print("Start balancing...")
    raw_features, raw_labels = data

    started_at = time.time()
    sampler = SMOTEENN(random_state=42)
    features, labels = sampler.fit_sample(raw_features, raw_labels)
    print("Balanced dataset:", sorted(Counter(labels).items()))
    print("Balancing time:", time.time() - started_at)
    return (features, labels)
示例#18
0
def test_sample_regular_half():
    """Resample with ``ratio=0.8`` and compare with the reference output."""
    sampler = SMOTEENN(ratio=0.8, random_state=RND_SEED)
    X_out, y_out = sampler.fit_sample(X, Y)

    expected_X = np.array([[1.52091956, -0.49283504],
                           [-0.28162401, -2.10400981],
                           [0.83680821, 1.72827342],
                           [0.08711622, 0.93259929]])
    expected_y = np.array([0, 1, 1, 1])

    assert_allclose(X_out, expected_X)
    assert_array_equal(y_out, expected_y)
def SMOTE_ENN_method(sm, combined, Cols, nn, ks):
    """Split the data, resample the training part and score a classifier.

    :param sm: SMOTE object handed to SMOTEENN as ``smote``.
    :param combined: first argument of ``train_test_split``.
    :param Cols: second argument of ``train_test_split``.
    :param nn: ``n_neighbors`` for the EditedNearestNeighbours step.
    :param ks: ``kind_sel`` for the EditedNearestNeighbours step.
    """
    # NOTE(review): scikit-learn's train_test_split returns
    # (X_train, X_test, y_train, y_test); this unpack order is only right if
    # ``train_test_split`` here is a project helper - confirm.
    X_train, y_train, X_test, y_test = train_test_split(combined, Cols)

    cleaner = EditedNearestNeighbours(n_neighbors=nn, kind_sel=ks)
    sampler = SMOTEENN(random_state=33, smote=sm, enn=cleaner)
    X_balanced, y_balanced = sampler.fit_sample(X_train, y_train)

    classifier_and_metrics(X_balanced, y_balanced, X_test, y_test)
示例#20
0
def test_sample_regular():
    """Resample with a default SMOTEENN and compare with reference values."""
    sampler = SMOTEENN(random_state=RND_SEED)
    X_out, y_out = sampler.fit_sample(X, Y)

    expected_X = np.array([[1.52091956, -0.49283504],
                           [0.84976473, -0.15570176],
                           [0.61319159, -0.11571667],
                           [0.66052536, -0.28246518],
                           [-0.28162401, -2.10400981],
                           [0.83680821, 1.72827342],
                           [0.08711622, 0.93259929]])
    expected_y = np.array([0, 0, 0, 0, 1, 1, 1])

    assert_allclose(X_out, expected_X, rtol=R_TOL)
    assert_array_equal(y_out, expected_y)
def test_sample_regular_half():
    """Resample with a per-class ``ratio`` dict and check the output."""
    per_class_targets = {0: 10, 1: 12}
    sampler = SMOTEENN(ratio=per_class_targets, random_state=RND_SEED)
    X_out, y_out = sampler.fit_sample(X, Y)

    expected_X = np.array([[1.52091956, -0.49283504],
                           [-0.28162401, -2.10400981],
                           [0.83680821, 1.72827342],
                           [0.08711622, 0.93259929]])
    expected_y = np.array([0, 1, 1, 1])

    assert_allclose(X_out, expected_X)
    assert_array_equal(y_out, expected_y)
示例#22
0
def over_under_sampling(data):
    """Resample a DataFrame with SMOTEENN and rebuild it with its columns.

    All columns except the last are used as features and the ``'Tumor'``
    column as the label.

    :param data: DataFrame containing a ``'Tumor'`` column.
    :return: new DataFrame holding the resampled features and ``'Tumor'``.
    """
    column_names = data.columns[:-1]
    sampler = SMOTEENN(ratio='auto')
    # ``Series.as_matrix()`` was removed in pandas 1.0; ``.values`` yields
    # the same ndarray and exists in old and new pandas alike.
    features, label = sampler.fit_sample(data[column_names],
                                         data['Tumor'].values)

    data = pd.DataFrame(features)
    data.columns = column_names
    data['Tumor'] = label

    logger.info(data)
    return data
示例#23
0
def test_validate_estimator_init():
    """Pass pre-built SMOTE and ENN objects to SMOTEENN and check the output."""
    over_sampler = SMOTE(random_state=RND_SEED)
    cleaner = EditedNearestNeighbours(random_state=RND_SEED, ratio='all')
    combined = SMOTEENN(smote=over_sampler, enn=cleaner,
                        random_state=RND_SEED)
    X_out, y_out = combined.fit_sample(X, Y)

    expected_X = np.array([[1.52091956, -0.49283504],
                           [0.84976473, -0.15570176],
                           [0.61319159, -0.11571667],
                           [0.66052536, -0.28246518],
                           [-0.28162401, -2.10400981],
                           [0.83680821, 1.72827342],
                           [0.08711622, 0.93259929]])
    expected_y = np.array([0, 0, 0, 0, 1, 1, 1])

    assert_allclose(X_out, expected_X, rtol=R_TOL)
    assert_array_equal(y_out, expected_y)
示例#24
0
    def fit(self, x, y):
        """Fit a base estimator on SMOTEENN-resampled data and persist it.

        The fitted estimator is appended to ``self.estimators_`` and every
        estimator in that list is then dumped with joblib.

        :return: ``self``.
        """
        # Resample with SMOTEENN before fitting.
        resampled_x, resampled_y = SMOTEENN().fit_sample(x, y)
        fitted = self._fit_base_estimator(resampled_x, resampled_y)
        self.estimators_.append(fitted)

        # NOTE(review): the file name does not depend on the estimator, so
        # each dump overwrites the previous one - confirm this is intended.
        for estimator in self.estimators_:
            target_path = ("model/card/SMOTEENN_" + self.model_name +
                           "_" + str(self.cnt) + "_model.pkl")
            joblib.dump(estimator, target_path)
        return self
示例#25
0
 def get_data(self):
     """
     Load features and labels from ``shuffled_processed_data.csv``.

     Reads at most ``self.nrows`` rows from the file in ``self.data_dir``,
     takes columns 1-6 as features and column 7 as labels, and resamples
     them with SMOTEENN when ``self.SMOTENN`` is truthy.

     :return: tuple ``(X, y)``.
     """
     csv_path = os.path.join(self.data_dir, 'shuffled_processed_data.csv')
     with open(csv_path, 'r') as handle:
         frame = pd.read_csv(handle, nrows=self.nrows)

     features = frame.iloc[:, 1:7]
     labels = frame.iloc[:, 7]
     if not self.SMOTENN:
         return (features, labels)

     sampler = SMOTEENN(random_state=0)
     features, labels = sampler.fit_sample(features, labels)
     return (features, labels)
    def resample(x, y, sampling_type=None):
        """Optionally resample ``(x, y)`` and print class counts.

        :param sampling_type: ``"smoteenn"`` applies SMOTEENN, ``"enn"``
            applies EditedNearestNeighbours; any other value leaves the
            data unchanged.
        :return: tuple ``(x_out, y_out)``.
        """
        if sampling_type == "smoteenn":
            x_out, y_out = SMOTEENN(random_state=1).fit_sample(x, y)
        elif sampling_type == "enn":
            cleaner = EditedNearestNeighbours(random_state=1)
            x_out, y_out = cleaner.fit_sample(x, y)
        else:
            x_out, y_out = x, y

        print("Before resampling:", sorted(Counter(y).items()))
        print("After resampling:", sorted(Counter(y_out).items()))
        return x_out, y_out
示例#27
0
def smote_en_resampling(data_X, data_y, k_neighbors=5):
    """Over- and under-sample with SMOTE followed by ENN.

    :param k_neighbors: neighbour count used for both the SMOTE and the
        EditedNearestNeighbours step.
    :return: tuple ``(resamp_X, resamp_y)``.
    """
    over_sampler = SMOTE(sampling_strategy='minority',
                         k_neighbors=k_neighbors,
                         n_jobs=8)
    cleaner = EditedNearestNeighbours(n_neighbors=k_neighbors, n_jobs=8)
    combined = SMOTEENN(sampling_strategy="minority",
                        smote=over_sampler,
                        enn=cleaner,
                        n_jobs=8)

    balanced_X, balanced_y = combined.fit_sample(data_X, data_y)
    return balanced_X, balanced_y
示例#28
0
	def imbalanceProcess(self, X, y):
		'''
		Handle class imbalance by resampling with SMOTEENN.
		Args:
			X: feature samples to process
			y: label samples to process
		Returns:
			X: feature samples after resampling
			y: label samples after resampling
		'''
		resampler = SMOTEENN()
		balanced_X, balanced_y = resampler.fit_sample(X, y)
		return balanced_X, balanced_y
示例#29
0
def data_smot():
    """Resample the module-level ``test_data`` and write it to CSV.

    Columns from index 3 onward are the features and column 2 is the label.
    The counts of labels equal to 1 and 0 are printed, and the resampled
    features with the label appended as last column are written to
    ``data/new_data_test.csv``.
    """
    x_res, y_res = SMOTEENN().fit_sample(test_data[:, 3:], test_data[:, 2])
    print(len(y_res[y_res == 1]))
    print(len(y_res[y_res == 0]))

    # Append the labels as a trailing column.
    label_column = np.reshape(y_res, [-1, 1])
    stacked = np.hstack((x_res, label_column))

    header = list(data.columns[3:])
    header.append(data.columns[2])

    frame = pd.DataFrame(data=stacked, columns=header)
    frame.to_csv("data/new_data_test.csv", index=False)
    print("over")
示例#30
0
def smoteenn(X_train, y_train):
    """Resample 3-D training data with SMOTEENN.

    ``X_train`` is flattened to one row per sample, resampled, and reshaped
    back to ``(-1, n_levels, n_variables)``.

    :param X_train: array indexed as ``X_train.shape[0..2]``.
    :param y_train: labels, one per sample.
    :return: tuple ``(X_train, y_train)`` after resampling.
    """
    sampler = SMOTEENN(random_state=42)

    n_samples = X_train.shape[0]
    n_levels = X_train.shape[1]
    n_variables = X_train.shape[2]

    X_flat = X_train.reshape((n_samples, -1), order='F')
    X_flat, y_train = sampler.fit_sample(X_flat, y_train)
    # The flattening above uses Fortran index order, so the inverse reshape
    # must use it too; the previous default (C) order scrambled the
    # level/variable axes of every sample. That mismatch is the likely cause
    # of the original "DOES NOT WORK CORRECTLY" remark.
    X_train = np.reshape(X_flat, (-1, n_levels, n_variables), order='F')

    return X_train, y_train
def test_validate_estimator_default():
    """A SMOTEENN built with defaults must give the reference output."""
    sampler = SMOTEENN(random_state=RND_SEED)
    X_out, y_out = sampler.fit_sample(X, Y)

    expected_X = np.array([[1.52091956, -0.49283504],
                           [0.84976473, -0.15570176],
                           [0.61319159, -0.11571667],
                           [0.66052536, -0.28246518],
                           [-0.28162401, -2.10400981],
                           [0.83680821, 1.72827342],
                           [0.08711622, 0.93259929]])
    expected_y = np.array([0, 0, 0, 0, 1, 1, 1])

    assert_allclose(X_out, expected_X, rtol=R_TOL)
    assert_array_equal(y_out, expected_y)
def create_synthetic_balanced_data_set(args, data_set, selected_class, ratio='auto'):
    """
    Creates a balanced data set by resampling one class with SMOTEENN.

    Note: ``data_set`` is modified in place (its class balance is changed and
    its ``x`` / ``y`` are converted to lists) before the merge.

    :param args: Program arguments; ``args["max_sequence_length"]`` is used for padding.
    :param data_set: The data set to balance.
    :param selected_class: The class to be balanced with synthetic samples.
    :param ratio: Ratio passed to SMOTEENN.
    :return: A data set with the selected class balanced using synthetic samples.
    """
    from imblearn.combine import SMOTEENN

    num_classes = data_set.get_num_classes()
    seq_copy = data_set.to_one_vs_k(selected_class)

    # X must be padded and y must be binarized to work with the SMOTE implementation.
    padded_x = pad_sequences(seq_copy.x,
                             maxlen=args["max_sequence_length"],
                             padding="post",
                             truncating="post",
                             dtype="float32")

    binary_y = np.argmax(seq_copy.y, axis=-1)

    sm = SMOTEENN(n_jobs=4, ratio=ratio)
    new_x, new_y = sm.fit_sample(padded_x, binary_y)

    # Transform the data back to the application format.
    synthetic_data_set = data_set.__class__(new_x, new_y)
    synthetic_data_set.to_categorical(num_classes)
    synthetic_data_set = synthetic_data_set.single_class_data_set(0)
    synthetic_data_set.y = np.ones((len(synthetic_data_set.x), 1)) * selected_class
    synthetic_data_set.to_categorical(num_classes)

    # ``map`` returns a one-shot iterator on Python 3, while a real list is
    # wanted here (as ``tolist()`` gives for ``y``); ``list(...)`` produces
    # the same list of rows on Python 2 and 3.
    synthetic_data_set.x = list(synthetic_data_set.x)
    synthetic_data_set.y = synthetic_data_set.y.tolist()

    # Remove the samples used to generate synthetic samples.
    balance = data_set.get_class_balance()
    balance[selected_class] = 0
    data_set.set_class_balance(balance)

    data_set.x = data_set.x.tolist()
    data_set.y = data_set.y.tolist()

    # Merge sets.
    return_set = data_set.merged(synthetic_data_set)
    return_set.x = np.asarray(return_set.x)
    return_set.y = np.asarray(return_set.y)

    return return_set
示例#33
0
    def use_OSSSMOTEENN(self):
        """Apply OneSidedSelection then SMOTEENN to the data at ``self.path``.

        Shows a bar chart of label counts for the original data, after
        OneSidedSelection and after SMOTEENN, writes the final data to a CSV
        next to the input (no index, no header, 4-decimal floats), and plots
        a 2-component PCA projection of the result.

        :return: path of the CSV that was written.
        """
        X,y = preparation(self.path)
        # Label distribution of the original data.
        dy = pd.DataFrame(y)
        dy.value_counts().plot(kind='bar',title='Count(label)')
        plt.show()
        # Step 1: under-sample the majority class with One-Sided Selection.
        oss = OneSidedSelection(random_state = 42,n_jobs=-1,sampling_strategy="majority")
        X_res,y_res = oss.fit_sample(X,y)

        dy_res = pd.DataFrame(y_res)
        dy_res.value_counts().plot(kind='bar',title='Count(label)')
        plt.show()
        # Step 2: SMOTE + ENN on the output of step 1.
        sme = SMOTEENN(random_state=42,n_jobs=-1)
        X_sme, y_sme = sme.fit_sample(X_res, y_res)

        # Label distribution after SMOTEENN.

        dy_sme = pd.DataFrame(y_sme)
        dy_sme.value_counts().plot(kind='bar',title='Count(label)')
        plt.show()

        # Write the resampled features and labels to CSV.
        # NOTE(review): pd.concat needs X_sme to be a pandas object - confirm
        # what the sampler returns for this input.

        df=pd.concat([X_sme,pd.DataFrame(y_sme)],axis=1)

        df.to_csv(self.path.replace('.csv','_OSSSMOTEENN_Final_Test.csv') ,index = None,header=None,float_format='%.4f')

        # Original author's note: the first line of data will be deleted.


        # Project onto two principal components and plot.
        pca = PCA(n_components=2)
        X_sme = pca.fit_transform(X_sme)
        plot_2d_space(X_sme,y_sme, 'SMOTE + ENN')

        return self.path.replace('.csv','_OSSSMOTEENN_Final_Test.csv')




# if __name__ == '__main__':
#     path ="++Final_Test++_pre.csv"
#     #draw_bar(path)
#     mhi = My_handle_imbalance(path)
#     mhi.use_OSSSMOTEENN()
#
#     #use_SMOTETomek(path)
#     #draw_origin(path)
示例#34
0
def test_sample_regular():
    """Compare SMOTEENN output with reference arrays stored on disk."""
    sampler = SMOTEENN(random_state=RND_SEED)
    # The sampler is fitted once explicitly, then fitted again by fit_sample.
    sampler.fit(X, Y)
    X_out, y_out = sampler.fit_sample(X, Y)

    # Reference arrays live in the ``data`` directory next to this file.
    here = os.path.dirname(os.path.abspath(__file__))
    expected_X = np.load(os.path.join(here, 'data', 'smote_enn_reg_x.npy'))
    expected_y = np.load(os.path.join(here, 'data', 'smote_enn_reg_y.npy'))

    assert_array_equal(X_out, expected_X)
    assert_array_equal(y_out, expected_y)
def test_sample_regular_pass_smote_enn():
    """Pass explicit SMOTE and ENN objects and check the resampled output."""
    over_sampler = SMOTE(ratio='auto', random_state=RND_SEED)
    cleaner = EditedNearestNeighbours(ratio='all', random_state=RND_SEED)
    combined = SMOTEENN(smote=over_sampler,
                        enn=cleaner,
                        random_state=RND_SEED)
    X_out, y_out = combined.fit_sample(X, Y)

    expected_X = np.array([[1.52091956, -0.49283504],
                           [0.84976473, -0.15570176],
                           [0.61319159, -0.11571667],
                           [0.66052536, -0.28246518],
                           [-0.28162401, -2.10400981],
                           [0.83680821, 1.72827342],
                           [0.08711622, 0.93259929]])
    expected_y = np.array([0, 0, 0, 0, 1, 1, 1])

    assert_allclose(X_out, expected_X, rtol=R_TOL)
    assert_array_equal(y_out, expected_y)
 def SMOTE(self, bug_rate, X, Y):
     """
     Combine over- and under-sampling using SMOTE and
      Edited Nearest Neighbours.
      Processes the original data set with this combined sampler.
     :param bug_rate: value passed to SMOTEENN as ``ratio``
     :param X: the data set without the label column
     :param Y: the label information
     :return: nothing; the resampled X with Y appended as the last column
      is stored in ``self.new_list_SMOTE``.
     """
     from collections import Counter
     from imblearn.combine import SMOTEENN
     import numpy as np

     sampler = SMOTEENN(ratio=bug_rate)
     features, labels = sampler.fit_sample(X, Y)
     self.new_list_SMOTE = np.column_stack((features, labels))
示例#37
0
class Undersampler:
    """Select a resampler by ``kind`` and apply it to ``(data, target)``.

    :param kind: member of ``Undersampling`` naming the resampler.
    :param data: feature samples.
    :param target: labels; must have the same length as ``data``.
    :param verbose: when true, print the chosen method; also forwarded to
        the resampler.
    :param ratio: forwarded to the resampler.
    :raises ValueError: if ``kind`` is not a supported type.
    """

    def __init__(self, kind, data, target, verbose=False, ratio='auto'):

        assert len(data) == len(target)
        self.data = data
        self.target = target

        if kind in [Undersampling.ClusterCentroids]:
            if verbose:
                print('> CLUSTER CENTROIDS')

            # Resample with Cluster Centroids.
            self.undersampler = ClusterCentroids(verbose=verbose, ratio=ratio)
        elif kind in [Undersampling.SMOTEENN]:
            if verbose:
                print('> SMOTEENN')

            # Resample with SMOTEENN.
            self.undersampler = SMOTEENN(verbose=verbose, ratio=ratio)
        else:
            # Raising a plain string is itself a TypeError; raise a real
            # exception, and tolerate ``kind`` values that have no ``.name``.
            raise ValueError("Nonexistent undersampling type: "
                             + getattr(kind, 'name', repr(kind)))

    def balance(self):
        """Fit the selected resampler and return its ``fit_sample`` result."""
        return self.undersampler.fit_sample(self.data, self.target)
示例#38
0
        return (data[i - 1] + data[i])/2

# Script setup: timing, search budget, and the resampled data set used below.
start = time()
n_iter = 100          ## Number of evaluations (SMAC)
n_validations = 7     ## Number of Monte-Carlo Cross-Validations for each model's accuracy evaluated

## Dataset 11

# NOTE(review): ``urllib.urlopen`` exists only on Python 2; on Python 3 the
# equivalent is ``urllib.request.urlopen``.
url11 = "https://archive.ics.uci.edu/ml/machine-learning-databases/tic-mld/ticdata2000.txt"
dataset11 = np.genfromtxt(urllib.urlopen(url11))

# Columns 0-84 are the features, column 85 is the target.
X = dataset11[:,0:85]
Y = dataset11[:,85]

# Resample the whole data set with SMOTEENN (unseeded) at import time.
sm = SMOTEENN()
X, Y = sm.fit_sample(X, Y)

# We fit the MLP with the hyperparameters given and return the model's median accuracy from 7 trials
def mlp(number_layers, number_neurons_1, number_neurons_2, number_neurons_3, number_neurons_4, dropout_rate):

	layers = []
	number_neurons = []

	number_neurons.append(number_neurons_1)
	number_neurons.append(number_neurons_2)
	number_neurons.append(number_neurons_3)
	number_neurons.append(number_neurons_4)

	for i in np.arange(number_layers):
		layers.append(Layer("Sigmoid", units=number_neurons[i], dropout = dropout_rate))
示例#39
0
from imblearn.combine import SMOTEENN

# Generate an imbalanced two-class dataset (class weights 0.1 / 0.9)
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=5000, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform x to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply SMOTE + ENN
sm = SMOTEENN()
X_resampled, y_resampled = sm.fit_sample(X, y)
# Project the resampled data with the PCA fitted on the original data
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)

# Left panel: the original data, one scatter per class
ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[2], linewidth=0.15)
ax1.set_title('Original set')

# Right panel: the resampled data
ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5, edgecolor=almost_black,
            facecolor=palette[0], linewidth=0.15)
ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
示例#40
0
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


# Split features and target: every column except 'state' is a feature.
X, y = data.loc[:,data.columns != 'state'].values, data.loc[:,data.columns == 'state'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Resample the training split only with SMOTEENN.
sme = SMOTEENN(random_state=42)
os_X,os_y = sme.fit_sample(X_train,y_train)

# Fit QDA on the resampled data and predict on the untouched test split.
# NOTE(review): ``store_covariances`` is the spelling of older scikit-learn
# releases - confirm against the installed version.
clf_QDA = QuadraticDiscriminantAnalysis(store_covariances=True)
clf_QDA.fit(os_X, os_y)
y_true, y_pred = y_test, clf_QDA.predict(X_test)

# F1 score, recall and precision. ``print(...)`` with one argument behaves
# the same on Python 2 and 3; the old print statements were a SyntaxError on
# Python 3. Recall is computed once and reused.
print("F1_score : %.4g" % metrics.f1_score(y_true, y_pred))
recall = metrics.recall_score(y_true, y_pred)
print("Recall : %.4g" % recall)
print("Precision : %.4g" % metrics.precision_score(y_true, y_pred))

# Compute confusion matrix
cnf_matrix = confusion_matrix(y_test,y_pred)
np.set_printoptions(precision=2)