def test_fit_sample_auto():
    """Test the fit and sample routine with auto ratio."""

    # Define the ratio parameter
    ratio = 'auto'

    # Create the sampling object
    ee = EasyEnsemble(
        ratio=ratio, random_state=RND_SEED, return_indices=True, n_subsets=3)

    # Get the different subset
    X_resampled, y_resampled, idx_under = ee.fit_sample(X, Y)

    X_gt = np.array([[[0.85117925, 1.0185556], [-0.58539673, 0.62515052],
                      [1.35269503, 0.44812421], [0.5220963, 0.11349303],
                      [1.10915364, 0.05718352], [0.22039505, 0.26469445]],
                     [[0.85117925, 1.0185556], [-0.58539673, 0.62515052],
                      [-1.23195149, 0.15427291], [-2.10724436, 0.70263997],
                      [0.22039505, 0.26469445], [1.10915364, 0.05718352]],
                     [[0.85117925, 1.0185556], [-0.58539673, 0.62515052],
                      [-1.23195149, 0.15427291], [0.5220963, 0.11349303],
                      [1.10915364, 0.05718352], [0.59091459, 0.40692742]]])
    y_gt = np.array([[0, 0, 1, 1, 2, 2], [0, 0, 1, 1, 2, 2],
                     [0, 0, 1, 1, 2, 2]])
    idx_gt = np.array([[5, 9, 4, 0, 2, 3], [5, 9, 8, 6, 3, 2],
                       [5, 9, 8, 0, 2, 1]])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
Example #2
def test_fit_resample_half():
    # Define the sampling_strategy parameter
    sampling_strategy = {0: 2, 1: 3, 2: 3}

    # Create the sampling object
    ee = EasyEnsemble(
        sampling_strategy=sampling_strategy,
        random_state=RND_SEED,
        n_subsets=3)

    # Get the different subset
    X_resampled, y_resampled = ee.fit_resample(X, Y)

    X_gt = np.array([[[-0.58539673, 0.62515052], [0.85117925, 1.0185556],
                      [1.35269503, 0.44812421], [-1.23195149, 0.15427291],
                      [0.5220963, 0.11349303], [1.10915364, 0.05718352],
                      [0.59091459, 0.40692742], [0.22039505, 0.26469445]],
                     [[0.85117925, 1.0185556], [-0.58539673, 0.62515052],
                      [1.35269503, 0.44812421], [-2.10724436, 0.70263997],
                      [-1.23195149, 0.15427291], [0.59091459, 0.40692742],
                      [0.22039505, 0.26469445], [1.10915364, 0.05718352]],
                     [[0.85117925, 1.0185556], [-0.58539673, 0.62515052],
                      [-1.23195149, 0.15427291], [0.5220963, 0.11349303],
                      [1.35269503, 0.44812421], [1.10915364, 0.05718352],
                      [0.59091459, 0.40692742], [0.22039505, 0.26469445]]])
    y_gt = np.array([[0, 0, 1, 1, 1, 2, 2, 2], [0, 0, 1, 1, 1, 2, 2, 2],
                     [0, 0, 1, 1, 1, 2, 2, 2]])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
Example #3
def buildModel(clf, X, y, cv_nums=10, is_random=False):
    # Whether to randomize: draw random seeds, otherwise use fixed seeds
    if is_random:
        random_lst = list(np.random.randint(0, 1000, 4))
    else:
        random_lst = [0] * 4

    print('---------- Results of each class-imbalance handling method: mean f1 over ' + str(cv_nums) + '-fold cross-validation ----------')
    # No resampling: predict using the original dataset
    print('Original dataset: ', np.mean(cross_val_score(clf, X, y, scoring='f1', cv=cv_nums)))

    ros = RandomOverSampler(random_state=random_lst[0])
    X_oversampled, y_oversampled = ros.fit_sample(X, y)
    # print(sorted(Counter(y_oversampled).items()))
    print('Oversampling: ', np.mean(cross_val_score(clf, X_oversampled, y_oversampled, scoring='f1', cv=cv_nums)))

    cc = ClusterCentroids(random_state=random_lst[1])
    X_undersampled, y_undersampled = cc.fit_sample(X, y)
    #print(sorted(Counter(y_undersampled).items()))
    print('Undersampling: ', np.mean(cross_val_score(clf, X_undersampled, y_undersampled, scoring='f1', cv=cv_nums)))

    sm = SMOTE(random_state=random_lst[2])
    X_smote, y_smote = sm.fit_sample(X, y)
    #print(sorted(Counter(y_smote).items()))
    print('SMOTE: ', np.mean(cross_val_score(clf, X_smote, y_smote, scoring='f1', cv=cv_nums)))

    # Split the majority class into several subsets for different learners, so each
    # learner effectively sees an under-sampled set while, globally, no important
    # information is lost. E.g. split the negative class into 10 parts and keep the
    # single positive part: train 10 learners, each on one negative part plus the shared positives.
    ee = EasyEnsemble(random_state=random_lst[3], n_subsets=10)
    X_ee, y_ee = ee.fit_sample(X, y)
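The function above stops after generating the EasyEnsemble subsets. A minimal sketch of the per-subset training the comment describes, written as a hypothetical helper (clone and cross_val_score assumed; not part of the original snippet):

import numpy as np
from sklearn.base import clone
from sklearn.model_selection import cross_val_score

def easy_ensemble_f1(clf, X_ee, y_ee, cv_nums=10):
    # Fit one clone of clf per EasyEnsemble subset and average the
    # cross-validated f1 scores across the subsets.
    scores = [np.mean(cross_val_score(clone(clf), X_sub, y_sub,
                                      scoring='f1', cv=cv_nums))
              for X_sub, y_sub in zip(X_ee, y_ee)]
    return float(np.mean(scores))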
Example #4
def test_fit_sample_auto():
    """Test the fit and sample routine with auto ratio."""

    # Define the ratio parameter
    ratio = 'auto'

    # Create the sampling object
    ee = EasyEnsemble(ratio=ratio,
                      random_state=RND_SEED,
                      return_indices=True,
                      n_subsets=3)

    # Get the different subset
    X_resampled, y_resampled, idx_under = ee.fit_sample(X, Y)

    X_gt = np.array([[[0.85117925, 1.0185556], [-0.58539673, 0.62515052],
                      [1.35269503, 0.44812421], [0.5220963, 0.11349303],
                      [1.10915364, 0.05718352], [0.22039505, 0.26469445]],
                     [[0.85117925, 1.0185556], [-0.58539673, 0.62515052],
                      [-1.23195149, 0.15427291], [-2.10724436, 0.70263997],
                      [0.22039505, 0.26469445], [1.10915364, 0.05718352]],
                     [[0.85117925, 1.0185556], [-0.58539673, 0.62515052],
                      [-1.23195149, 0.15427291], [0.5220963, 0.11349303],
                      [1.10915364, 0.05718352], [0.59091459, 0.40692742]]])
    y_gt = np.array([[0, 0, 1, 1, 2, 2], [0, 0, 1, 1, 2, 2],
                     [0, 0, 1, 1, 2, 2]])
    idx_gt = np.array([[5, 9, 4, 0, 2, 3], [5, 9, 8, 6, 3, 2],
                       [5, 9, 8, 0, 2, 1]])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
Example #5
def ensemble_train(X,y, working_dir,n, name, svm=True):
    ees = EasyEnsemble(random_state=557, n_subsets=n)
    X_res, y_res = ees.fit_sample(X,y)
   

    try:
        # Force retraining: this raise makes the cached-model load below unreachable.
        raise Exception('Retrain')
        with open(working_dir + "/" + name + '.pkl', 'rb') as f1:
            clf = pickle.load(f1)
    except Exception:
        # scores = cross_val_score(clf, X, y, cv=4, scoring="roc_auc")
        # print("Name %s ROC_AUC: %0.2f (+/- %0.2f)" % (name, scores.mean(), scores.std() * 2))
        clf = []
        for i in range(len(X_res)):
            print(Counter(y_res[i]))
            if(svm):
                clfi = SVC(kernel="linear", probability=True)
            else:
                clfi = AdaBoostClassifier(n_estimators=20)
            #clfi=AdaBoostClassifier()
            clfi.fit(X_res[i], y_res[i])
            clf.append(clfi)
            scores = cross_val_score(clfi, X_res[i], y_res[i], cv=4, scoring="roc_auc")
            print("Name %s ROC_AUC: %0.2f (+/- %0.2f)" % (name, scores.mean(), scores.std() * 2))
        with open(working_dir + "/" + name + '.pkl', 'wb') as f1:
            pickle.dump(clf, f1)  
    return clf
def test_fit_sample_half():
    # Define the sampling_strategy parameter
    sampling_strategy = {0: 2, 1: 3, 2: 3}

    # Create the sampling object
    ee = EasyEnsemble(sampling_strategy=sampling_strategy,
                      random_state=RND_SEED,
                      n_subsets=3)

    # Get the different subset
    X_resampled, y_resampled = ee.fit_sample(X, Y)

    X_gt = np.array([[[-0.58539673, 0.62515052], [0.85117925, 1.0185556],
                      [1.35269503, 0.44812421], [-1.23195149, 0.15427291],
                      [0.5220963, 0.11349303], [1.10915364, 0.05718352],
                      [0.59091459, 0.40692742], [0.22039505, 0.26469445]],
                     [[0.85117925, 1.0185556], [-0.58539673, 0.62515052],
                      [1.35269503, 0.44812421], [-2.10724436, 0.70263997],
                      [-1.23195149, 0.15427291], [0.59091459, 0.40692742],
                      [0.22039505, 0.26469445], [1.10915364, 0.05718352]],
                     [[0.85117925, 1.0185556], [-0.58539673, 0.62515052],
                      [-1.23195149, 0.15427291], [0.5220963, 0.11349303],
                      [1.35269503, 0.44812421], [1.10915364, 0.05718352],
                      [0.59091459, 0.40692742], [0.22039505, 0.26469445]]])
    y_gt = np.array([[0, 0, 1, 1, 1, 2, 2, 2], [0, 0, 1, 1, 1, 2, 2, 2],
                     [0, 0, 1, 1, 1, 2, 2, 2]])
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
Example #7
    def __init__(self, base_model, n_subsets):
        self.base_model = base_model
        self.n_subsets = n_subsets
        self.easy_ensemble = EasyEnsemble('auto',
                                          random_state=RAND_SEED,
                                          n_subsets=n_subsets)
        self.trained_based_models = []
Example #8
def ezensemble(X_train, y_train):
    a = list(X_train)
    ee = EasyEnsemble(random_state=0, n_subsets=10)
    ee.fit(X_train, y_train)
    X_resampled, y_resampled = ee.fit_sample(X_train, y_train)
    # Keep only the second of the ten generated subsets, as DataFrames
    X_resampled = pd.DataFrame(X_resampled[1], columns=a)
    y_resampled = pd.DataFrame(y_resampled[1], columns=['Target'])
    return X_resampled, y_resampled
def easy_ensemble(train_set, train_label):
    ee = EasyEnsemble(ratio='auto',
                      return_indices=True,
                      random_state=None,
                      replacement=False,
                      n_subsets=easy_ensemble_num)
    X_resampled, y_resampled, idx_resampled = ee.fit_sample(
        train_set, train_label)
    return X_resampled, y_resampled
def test_random_state_none():
    # Define the ratio parameter
    ratio = 'auto'

    # Create the sampling object
    ee = EasyEnsemble(ratio=ratio, random_state=None)

    # Get the different subset
    X_resampled, y_resampled = ee.fit_sample(X, Y)
def test_random_state_none():
    # Define the sampling_strategy parameter
    sampling_strategy = 'auto'

    # Create the sampling object
    ee = EasyEnsemble(sampling_strategy=sampling_strategy, random_state=None)

    # Get the different subset
    X_resampled, y_resampled = ee.fit_sample(X, Y)
Example #12
def test_random_state_none():
    # Define the sampling_strategy parameter
    sampling_strategy = 'auto'

    # Create the sampling object
    ee = EasyEnsemble(sampling_strategy=sampling_strategy, random_state=None)

    # Get the different subset
    X_resampled, y_resampled = ee.fit_resample(X, Y)
Example #13
def test_sample_wrong_X():
    """Test either if an error is raised when X is different at fitting
    and sampling"""

    # Create the object
    ee = EasyEnsemble(random_state=RND_SEED)
    ee.fit(X, Y)
    assert_raises(RuntimeError, ee.sample, np.random.random((100, 40)),
                  np.array([0] * 50 + [1] * 50))
Example #15
def EasySample(data):
    x = data.iloc[:, 0:2]
    y = data.iloc[:, -2]
    # Use the EasyEnsemble ensemble method to handle the imbalanced samples
    model_EasyEnsemble = EasyEnsemble()  # build the EasyEnsemble model object
    x_EasyEnsemble_resampled, y_EasyEnsemble_resampled = model_EasyEnsemble.fit_sample(
        x, y)  # fit the data and apply the ensemble resampling
    print(x_EasyEnsemble_resampled.shape)  # shape of the resampled x sample set
    print(y_EasyEnsemble_resampled.shape)  # shape of the resampled y label set
def test_random_state_none():
    """Test that the processing is going throw with random state being None."""

    # Define the ratio parameter
    ratio = 0.5

    # Create the sampling object
    ee = EasyEnsemble(ratio=ratio, random_state=None)

    # Get the different subset
    X_resampled, y_resampled = ee.fit_sample(X, Y)
Example #17
def test_random_state_none():
    """Test that the processing is going throw with random state being None."""

    # Define the ratio parameter
    ratio = 'auto'

    # Create the sampling object
    ee = EasyEnsemble(ratio=ratio, random_state=None)

    # Get the different subset
    X_resampled, y_resampled = ee.fit_sample(X, Y)
Example #18
    def fit(self, train_x, train_y):
        self._estimators = []
        ee = EasyEnsemble(replacement=True, n_subsets=self._no_of_estimators)
        X_res, y_res = ee.fit_sample(train_x, train_y)

        for i in range(self._no_of_estimators):
            X, y = X_res[i, :, :], y_res[i, :]

            estimator = clone(self._base_classifier)
            estimator.fit(X, y)

            self._estimators.append(estimator)

        return self
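Only fit is shown above. A hypothetical majority-vote helper to pair with the stored per-subset estimators (names assumed, scipy assumed available; not from the original class):

import numpy as np
from scipy import stats

def ensemble_predict(estimators, X):
    # Stack each per-subset estimator's predictions and return the
    # per-sample majority vote.
    all_preds = np.asarray([est.predict(X) for est in estimators])
    return np.asarray(stats.mode(all_preds, axis=0)[0]).ravel()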
Example #19
def easy_ensemble_classifier(clf, x_train, y_train, x_test, nsubs, repl):
    ee = EasyEnsemble(n_subsets=nsubs, replacement=repl)  # Create EasyEnsemble object
    X_train_res, y_train_res = ee.fit_sample(x_train, y_train)  # re-sample the data
    clfs = []
    i = 0
    preds_ = np.zeros([1, np.shape(x_test)[0]])

    # Iterate through sub-samples (note: the same clf object is re-fit on each subset):
    for xtrain in X_train_res:
        clfs += [clf]
        clfs[i].fit(xtrain, y_train_res[i])
        preds_ = np.add(preds_, clfs[i].predict(x_test))
        i += 1

    return np.divide(preds_, nsubs)
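For binary 0/1 labels the array returned above is the average vote for class 1 across the subsets. A hypothetical usage sketch on toy data (all names below are assumptions, not from the original snippet):

from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier

X_all, y_all = make_classification(n_samples=200, weights=[0.9, 0.1],
                                   random_state=0)
x_tr, x_te = X_all[:150], X_all[150:]
y_tr, y_te = y_all[:150], y_all[150:]

votes = easy_ensemble_classifier(DecisionTreeClassifier(), x_tr, y_tr,
                                 x_te, nsubs=5, repl=False)
# Threshold the averaged votes at 0.5 to obtain hard class labels.
y_pred = (votes >= 0.5).astype(int).ravel()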
Example #20
def get_downsampling_data(train_pth="data/train_data.npy",
                          val_pth="data/val_data.npy",
                          test_pth="data/test_data.npy"):
    train_data = np.load(train_pth)[:, :-1]
    train_flag = np.load(train_pth)[:, -1]
    ee = EasyEnsemble(random_state=0, n_subsets=10)
    train_data, train_flag = ee.fit_sample(train_data, train_flag)
    train_flag = np.array(train_flag, dtype=int)
    val_data = np.load(val_pth)[:, :-1]
    val_flag = np.load(val_pth)[:, -1]
    val_flag = np.array(val_flag, dtype=int)
    test_data = np.load(test_pth)[:, :-1]
    test_flag = np.load(test_pth)[:, -1]
    test_flag = np.array(test_flag, dtype=int)
    return train_data, train_flag, val_data, val_flag, test_data, test_flag
Example #21
def test_multiclass_fit_sample():
    """Test fit sample method with multiclass target"""

    # Make y to be multiclass
    y = Y.copy()
    y[0:1000] = 2

    # Resample the data
    ee = EasyEnsemble(random_state=RND_SEED)
    X_resampled, y_resampled = ee.fit_sample(X, y)

    # Check the size of y
    count_y_res = Counter(y_resampled[0])
    assert_equal(count_y_res[0], 400)
    assert_equal(count_y_res[1], 400)
    assert_equal(count_y_res[2], 400)
def test_ee_fit():
    """Test the fitting method"""

    # Define the parameter for the under-sampling
    ratio = 'auto'

    # Create the object
    ee = EasyEnsemble(ratio=ratio, random_state=RND_SEED)
    # Fit the data
    ee.fit(X, Y)

    # Check that the data information has been computed
    assert_equal(ee.min_c_, 0)
    assert_equal(ee.maj_c_, 1)
    assert_equal(ee.stats_c_[0], 500)
    assert_equal(ee.stats_c_[1], 4500)
def test_multiclass_fit_sample():
    """Test fit sample method with multiclass target"""

    # Make y to be multiclass
    y = Y.copy()
    y[0:1000] = 2

    # Resample the data
    ee = EasyEnsemble(random_state=RND_SEED)
    X_resampled, y_resampled = ee.fit_sample(X, y)

    # Check the size of y
    count_y_res = Counter(y_resampled[0])
    assert_equal(count_y_res[0], 400)
    assert_equal(count_y_res[1], 400)
    assert_equal(count_y_res[2], 400)
Example #24
def test_ee_fit():
    """Test the fitting method"""

    # Define the parameter for the under-sampling
    ratio = 'auto'

    # Create the object
    ee = EasyEnsemble(ratio=ratio, random_state=RND_SEED)
    # Fit the data
    ee.fit(X, Y)

    # Check that the data information has been computed
    assert_equal(ee.min_c_, 0)
    assert_equal(ee.maj_c_, 1)
    assert_equal(ee.stats_c_[0], 500)
    assert_equal(ee.stats_c_[1], 4500)
Example #25
def Balance_classes(X_train, y_train, Sampling_Function):
    if Sampling_Function == 'RandomUnderSampler':
        us = RandomUnderSampler(ratio=0.5, random_state=1)
    elif Sampling_Function == 'NearMiss1':
        us = NearMiss(ratio=0.5, random_state=1, version=1, size_ngh=3)
    elif Sampling_Function == 'NearMiss2':
        us = NearMiss(ratio=0.5, random_state=1, version=2, size_ngh=3)
    elif Sampling_Function == 'NearMiss3':
        us = NearMiss(ratio=0.5, random_state=1, version=3, ver3_samp_ngh=3)
    elif Sampling_Function == 'CondensedNearestNeighbour':
        us = CondensedNearestNeighbour(random_state=1)
    elif Sampling_Function == 'EditedNearestNeighbours':
        us = EditedNearestNeighbours(random_state=1, size_ngh=5)
    elif Sampling_Function == 'RepeatedEditedNearestNeighbours':
        us = RepeatedEditedNearestNeighbours(random_state=1, size_ngh=5)
    elif Sampling_Function == 'TomekLinks':
        us = TomekLinks(random_state=1)
    elif Sampling_Function == 'RandomOverSampler':
        us = RandomOverSampler(ratio=0.5, random_state=1)
    elif Sampling_Function == 'SMOTE':
        us = SMOTE(ratio=0.5, k=5, random_state=1)
    elif Sampling_Function == 'SMOTETomek':
        us = SMOTETomek(ratio=0.5, k=5, random_state=1)
    elif Sampling_Function == 'SMOTEENN':
        us = SMOTEENN(ratio=0.5, k=5, random_state=1, size_ngh=5)
    elif Sampling_Function == 'EasyEnsemble':
        us = EasyEnsemble()
    elif Sampling_Function == 'BalanceCascade_rf':
        us = BalanceCascade(classifier='random-forest', random_state=1)
    elif Sampling_Function == 'BalanceCascade_svm':
        us = BalanceCascade(classifier='linear-svm', random_state=1)

    X_train_res, y_train_res = us.fit_sample(X_train, y_train)

    return X_train_res, y_train_res
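A hypothetical call to Balance_classes on toy data (names below are assumptions); note that the 'EasyEnsemble' and 'BalanceCascade' options return one resampled set per subset (3-D arrays) rather than a single set:

from sklearn.datasets import make_classification

X_tr, y_tr = make_classification(n_samples=300, weights=[0.9, 0.1],
                                 random_state=1)
X_bal, y_bal = Balance_classes(X_tr, y_tr, 'RandomUnderSampler')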
Example #26
    def __init__(self,  window_size=6, training_ratio=.7, seq="sequence", pos="label"):
        self.training_ratio = training_ratio  # Float value representing % of data used for training
        self.features = []
        self.labels = []
        self.words = []
        self.window_size = window_size
        self.supervised_classifiers = {"forest": RandomForestClassifier(n_jobs=4),
                                       "mlp_adam": MLPClassifier(),
                                       "svc": svm.SVC(verbose=1),
                                       "xgb": XGBClassifier(max_delta_step=5),
                                       "bagging": BaggingClassifier(), "one_class_svm": OneClassSVM(kernel="rbf")
                                       }

        self.imbalance_functions = {"easy_ensemble": EasyEnsemble(), "SMOTEENN": SMOTEENN(),
                                    "SMOTETomek": SMOTETomek(), "ADASYN": ADASYN(),
                                    "random_under_sample": RandomUnderSampler(), "ncl": NeighbourhoodCleaningRule(),
                                    "near_miss": NearMiss(), "pass": -1}
        self.seq = seq
        self.pos = pos
        self.random_data = 0
        self.test_results = 0
        self.vecs = {"sequence": sequence_vector, "chemical": chemical_vector, "binary": binary_vector, "w2v": "w2v"}
        self.vector = 0
        self.features_labels = {}
        self.test_cv = 0
        self.benchmark_mcc = 0
        self.mcc_scorer = make_scorer(matthews_corrcoef)
Example #27
def easy_ensemble_classifier(clf, x_train, y_train, x_test, nsubs, repl):
    ee = EasyEnsemble(n_subsets=nsubs,
                      replacement=repl)  # Create EasyEnsemble object
    X_train_res, y_train_res = ee.fit_sample(x_train,
                                             y_train)  # re-sample the data
    clfs = []
    i = 0
    preds_ = np.zeros([1, np.shape(x_test)[0]])

    # Iterate through sub-samples:
    for xtrain in X_train_res:
        clfs += [clf]
        clfs[i].fit(xtrain, y_train_res[i])
        preds_ = np.add(preds_, clfs[i].predict(x_test))
        i += 1

    return np.divide(preds_, nsubs)
Example #28
def test_fit_sample_half():
    """Test the fit and sample routine with 0.5 ratio."""

    # Define the ratio parameter
    ratio = 0.5

    # Create the sampling object
    ee = EasyEnsemble(ratio=ratio, random_state=RND_SEED)

    # Get the different subset
    X_resampled, y_resampled = ee.fit_sample(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'ee_x_05.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'ee_y_05.npy'))
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
Example #30
def test_continuous_error():
    """Test either if an error is raised when the target are continuous
    type"""

    # continuous case
    y = np.linspace(0, 1, 10)
    ee = EasyEnsemble(random_state=RND_SEED)
    assert_warns(UserWarning, ee.fit, X, y)
Example #31
def test_ee_init():
    # Define a ratio
    ratio = 1.
    ee = EasyEnsemble(ratio=ratio, random_state=RND_SEED)

    assert_equal(ee.ratio, ratio)
    assert_equal(ee.replacement, False)
    assert_equal(ee.n_subsets, 10)
    assert_equal(ee.random_state, RND_SEED)
Example #32
    def get_batch(self, tokenized_samples, labels):
        e = EasyEnsemble(random_state=0, n_subsets=1)
        e.fit(tokenized_samples, labels)
        X_resampled, y_resampled = e.sample(tokenized_samples, labels)

        X = X_resampled[0]
        y = y_resampled[0]

        targets = np.zeros(shape=(len(X), self._num_labels))
        samples = np.zeros(shape=(len(X), self._max_document_length))

        for sample_ix, sample in enumerate(X):
            label = y[sample_ix]

            targets[sample_ix, label] = 1
            samples[sample_ix, :sample.shape[0]] = \
                sample[:self._max_document_length]
        return samples, targets
Example #33
def EnsembleSample(X, Y, method='EasyEnsemble', random_state=42):
    if X.size == len(X):
        X = X.reshape(-1, 1)
    if method == 'EasyEnsemble':
        sampler = EasyEnsemble(ratio='auto',
                               random_state=random_state,
                               replacement=False,
                               n_subsets=10)
    elif method == 'BalanceCascade':
        sampler = BalanceCascade(ratio='auto',
                                 random_state=random_state,
                                 n_max_subset=None,
                                 classifier=None,
                                 estimator=None)
    X_resampled, Y_resampled = sampler.fit_sample(X, Y)
    # Combined resampling + classifier: the approach below returns a fitted classifier rather than resampled data
    #    BalancedBaggingClassifier(base_estimator=None, n_estimators=10, max_samples=1.0, max_features=1.0, bootstrap=True, bootstrap_features=False, oob_score=False, warm_start=False, ratio='auto', replacement=False, n_jobs=1, random_state=None, verbose=0)
    return X_resampled, Y_resampled
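A minimal sketch of the combined sampling-plus-classifier approach mentioned in the comment above, using BalancedBaggingClassifier with the old imblearn signature quoted there (toy data and names are assumptions, not from the original snippet):

from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier

X_raw, Y_raw = make_classification(n_samples=500, weights=[0.9, 0.1],
                                   random_state=42)
# The bagging classifier resamples internally, so it is fit on the raw imbalanced data.
bbc = BalancedBaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                n_estimators=10, ratio='auto',
                                replacement=False, random_state=42)
bbc.fit(X_raw, Y_raw)
Y_pred = bbc.predict(X_raw)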
Example #34
def test_ee_init():
    # Define a ratio
    ratio = 1.
    ee = EasyEnsemble(ratio=ratio, random_state=RND_SEED)

    assert ee.ratio == ratio
    assert ee.replacement is False
    assert ee.n_subsets == 10
    assert ee.random_state == RND_SEED
Example #35
def test_ee_fit_invalid_ratio():
    """Test either if an error is raised when the balancing ratio to fit is
    smaller than the one of the data"""

    # Create the object
    ratio = 1. / 10000.
    ee = EasyEnsemble(ratio=ratio, random_state=RND_SEED)
    # Fit the data
    assert_raises(RuntimeError, ee.fit, X, Y)
def test_ee_init():
    # Define a sampling_strategy
    sampling_strategy = 1.
    ee = EasyEnsemble(sampling_strategy=sampling_strategy,
                      random_state=RND_SEED)

    assert ee.sampling_strategy == sampling_strategy
    assert ee.replacement is False
    assert ee.n_subsets == 10
    assert ee.random_state == RND_SEED
Example #37
def test_fit_sample_auto():
    """Test the fit and sample routine with auto ratio."""

    # Define the ratio parameter
    ratio = 'auto'

    # Create the sampling object
    ee = EasyEnsemble(ratio=ratio, random_state=RND_SEED, return_indices=True)

    # Get the different subset
    X_resampled, y_resampled, idx_under = ee.fit_sample(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'ee_x.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'ee_y.npy'))
    idx_gt = np.load(os.path.join(currdir, 'data', 'ee_idx.npy'))
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
Example #38
def resample_data(train_feature, train_class, count_sampleset):

    multiplier = {
        0: 1.0,
        1: 0.1,
        2: 0.1,
        3: 1.0,
        4: 1.0,
        5: 0.1,
        6: 1.0,
        7: 0.5,
        8: 0.1
    }
    target_stats = collections.Counter(train_class)
    for key, value in target_stats.items():
        target_stats[key] = int(value * multiplier[key])

    ee = EasyEnsemble(ratio=target_stats, n_subsets=count_sampleset)
    return ee.fit_sample(train_feature, train_class)
Example #39
def test_sample_wt_fit():
    """Test either if an error is raised when sample is called before
    fitting"""

    # Define the parameter for the under-sampling
    ratio = 'auto'

    # Create the object
    ee = EasyEnsemble(ratio=ratio, random_state=RND_SEED)
    assert_raises(RuntimeError, ee.sample, X, Y)
Example #40
def test_ee_init():
    """Test the initialisation of the object"""

    # Define a ratio
    ratio = 1.
    ee = EasyEnsemble(ratio=ratio, random_state=RND_SEED)

    assert_equal(ee.ratio, ratio)
    assert_equal(ee.replacement, False)
    assert_equal(ee.n_subsets, 10)
    assert_equal(ee.random_state, RND_SEED)
def test_fit_sample_auto():
    """Test the fit and sample routine with auto ratio."""

    # Define the ratio parameter
    ratio = 'auto'

    # Create the sampling object
    ee = EasyEnsemble(ratio=ratio, random_state=RND_SEED,
                      return_indices=True)

    # Get the different subset
    X_resampled, y_resampled, idx_under = ee.fit_sample(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'ee_x.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'ee_y.npy'))
    idx_gt = np.load(os.path.join(currdir, 'data', 'ee_idx.npy'))
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
    assert_array_equal(idx_under, idx_gt)
print(__doc__)

# Generate the dataset
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.3, 0.7],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=100, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform x to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply Easy Ensemble
ee = EasyEnsemble(n_subsets=3)
X_resampled, y_resampled = ee.fit_sample(X, y)
X_res_vis = []
for X_res in X_resampled:
    X_res_vis.append(pca.transform(X_res))

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)

ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5)
ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5)
ax1.set_title('Original set')

ax2.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5)
for iy, e in enumerate(X_res_vis):
    ax2.scatter(e[y_resampled[iy] == 1, 0], e[y_resampled[iy] == 1, 1],