Python ClusterCentroids.ClusterCentroids示例，imblearn.under_sampling.ClusterCentroids.ClusterCentroids Python示例

示例#1

0

显示文件

文件： ubHelper.py 项目： RacleRay/RaychSnippts

def unbalance_helper(X_train,
                     X_test,
                     y_train,
                     y_test,
                     imbalance_method='under_sampling'):
    """Rebalance the train and test splits with an imbalanced-learn sampler.

    Args:
        X_train: training features.
        X_test: test features.
        y_train: training labels.
        y_test: test labels.
        imbalance_method (str, optional): 'over_sampling' (SMOTETomek) or
            'under_sampling' (ClusterCentroids). Any other value returns the
            inputs untouched. Defaults to 'under_sampling'.

    Returns:
        tuple: ``(X_train, y_train, X_test, y_test)``. Note that this order
        differs from the order of the arguments.
    """
    # Choose how the class imbalance is handled: over-sampling or
    # under-sampling.
    if imbalance_method == 'over_sampling':
        print("Use SMOTETomek deal with unbalance data ")
        # Generate new samples by interpolation.
        X_train, y_train = SMOTETomek().fit_resample(X_train, y_train)
        # The test split must be derived from the test data; resampling the
        # training data here would leak training samples into evaluation.
        X_test, y_test = SMOTETomek().fit_resample(X_test, y_test)
    elif imbalance_method == 'under_sampling':
        print("Use ClusterCentroids deal with unbalance data ")
        X_train, y_train = ClusterCentroids(random_state=0).fit_resample(
            X_train, y_train)
        X_test, y_test = ClusterCentroids(random_state=0).fit_resample(
            X_test, y_test)

    return X_train, y_train, X_test, y_test

示例#2

0

显示文件

def test_fit_resample_check_voting():
    """Check the fitted ``voting_`` attribute for plain and CSR input."""
    # X passed as-is is expected to resolve to soft voting.
    plain_sampler = ClusterCentroids(random_state=RND_SEED)
    plain_sampler.fit_resample(X, Y)
    assert plain_sampler.voting_ == 'soft'

    # The same data wrapped in a CSR matrix is expected to resolve to hard
    # voting.
    csr_sampler = ClusterCentroids(random_state=RND_SEED)
    X_csr = sparse.csr_matrix(X)
    csr_sampler.fit_resample(X_csr, Y)
    assert csr_sampler.voting_ == 'hard'

示例#3

0

显示文件

def test_fit_sample_error():
    """Check the ValueError raised for a bad estimator and a bad voting."""
    # A string is not an acceptable estimator.
    bad_estimator_sampler = ClusterCentroids(
        ratio='auto', random_state=RND_SEED, estimator='rnd')
    with raises(ValueError, match="has to be a KMeans clustering"):
        bad_estimator_sampler.fit_sample(X, Y)

    # An unrecognised voting name is rejected as well.
    bad_voting_sampler = ClusterCentroids(
        ratio='auto', voting='unknown', random_state=RND_SEED)
    with raises(ValueError, match="needs to be one of"):
        bad_voting_sampler.fit_sample(X, Y)

示例#4

0

显示文件

文件： test_cluster_centroids.py 项目： scikit-learn-contrib/imbalanced-learn

def test_cluster_centroids_error_estimator():
    """Check the errors raised for estimators lacking the clustering API."""

    # A classifier passed as `estimator` must be rejected with a ValueError.
    missing_api_msg = (
        "`estimator` should be a clustering estimator exposing a parameter "
        "`n_clusters` and a fitted parameter `cluster_centers_`.")
    with pytest.raises(ValueError, match=missing_api_msg):
        sampler = ClusterCentroids(estimator=LogisticRegression())
        sampler.fit_resample(X, Y)

    # The custom clusterer must be rejected with a RuntimeError.
    missing_centers_msg = (
        "`estimator` should be a clustering estimator exposing a fitted parameter "
        "`cluster_centers_`.")
    with pytest.raises(RuntimeError, match=missing_centers_msg):
        sampler = ClusterCentroids(estimator=_CustomClusterer())
        sampler.fit_resample(X, Y)

示例#5

0

显示文件

文件： test_cluster_centroids.py 项目： scikit-learn-contrib/imbalanced-learn

def test_cluster_centroids_hard_target_class():
    """Hard voting must only pick samples belonging to the targeted class.

    Non-regression test for:
    https://github.com/scikit-learn-contrib/imbalanced-learn/issues/738
    """
    X, y = make_classification(
        n_samples=1000,
        n_features=2,
        n_informative=1,
        n_redundant=0,
        n_repeated=0,
        n_clusters_per_class=1,
        weights=[0.3, 0.7],
        class_sep=0.01,
        random_state=0,
    )

    sampler = ClusterCentroids(voting="hard", random_state=0)
    X_res, y_res = sampler.fit_resample(X, y)

    # Original samples labelled 0 and resampled samples labelled 1.
    minority_samples = X[np.flatnonzero(y == 0)]
    kept_majority_samples = X_res[np.flatnonzero(y_res == 1)]

    # Count every resampled class-1 point that coincides with a class-0 point.
    leaked = 0
    for kept_sample in kept_majority_samples:
        for minority_sample in minority_samples:
            if np.all(np.isclose(kept_sample, minority_sample)):
                leaked += 1
    assert leaked == 0

示例#6

0

显示文件

def under_sample(X, y, sampler="RandomUnderSampler"):
    """Under-sample ``(X, y)`` with the sampler selected by name.

    Every sampler is built with its default parameters. The class counts
    before and after resampling are printed.

    :param X: feature values
    :param y: labels
    :param sampler: key of the sampler to use; an unknown key raises KeyError
    :return: the resampled ``(X, y)`` pair
    """
    # Names of all the samplers, handy for iterating over every one of them.
    samplers_list = [
        'RandomUnderSampler',
        'ClusterCentroids',
        'NearMiss',
        'InstanceHardnessThreshold',
        'CondensedNearestNeighbour',
        'EditedNearestNeighbours',
        'RepeatedEditedNearestNeighbours',
        'AllKNN',
        'NeighbourhoodCleaningRule',
        'OneSidedSelection',
    ]
    print(samplers_list)

    # Name -> default-configured instance; the caller picks one by name.
    available = {
        "RandomUnderSampler": RandomUnderSampler(),
        "ClusterCentroids": ClusterCentroids(),
        "NearMiss": NearMiss(),
        "InstanceHardnessThreshold": InstanceHardnessThreshold(),
        "CondensedNearestNeighbour": CondensedNearestNeighbour(),
        "EditedNearestNeighbours": EditedNearestNeighbours(),
        "RepeatedEditedNearestNeighbours": RepeatedEditedNearestNeighbours(),
        "AllKNN": AllKNN(),
        "NeighbourhoodCleaningRule": NeighbourhoodCleaningRule(),
        "OneSidedSelection": OneSidedSelection(),
    }
    chosen = available[sampler]

    # Class distribution before resampling.
    counts_before = sorted(Counter(y).items())
    print("before", counts_before)

    X_resampled, y_resampled = chosen.fit_resample(X, y)

    # Class distribution after resampling.
    counts_after = sorted(Counter(y_resampled).items())
    print("after", counts_after)

    print('===' * 4, 'under_sample finished')

    return X_resampled, y_resampled

示例#7

0

显示文件

文件： sampling.py 项目： refactoring-ai/Machine-Learning

def perform_balancing(x, y, strategy=None):
    """
    Balance the x, y dataset with the requested resampling strategy.

    :param x: feature values; its ``columns`` are reused for the result
    :param y: labels
    :param strategy: one of 'random', 'oversampling', 'cluster_centroids'
        or 'nearmiss'; falls back to BALANCE_DATASET_STRATEGY when None
    :return: a balanced x (as a DataFrame), y
    :raises ValueError: when the strategy is not one of the known names
    """
    chosen = BALANCE_DATASET_STRATEGY if strategy is None else strategy

    # Strategy name -> factory; only the selected sampler is instantiated.
    # more info on under-sampling:
    # https://imbalanced-learn.readthedocs.io/en/stable/under_sampling.html
    factories = {
        'random': lambda: RandomUnderSampler(random_state=SEED),
        'oversampling': lambda: RandomOverSampler(random_state=SEED),
        'cluster_centroids': lambda: ClusterCentroids(random_state=SEED,
                                                      n_jobs=CORE_COUNT),
        'nearmiss': lambda: NearMiss(version=1, n_jobs=CORE_COUNT),
    }
    if chosen not in factories:
        raise ValueError("algorithm not found")
    sampler = factories[chosen]()

    resampled_x, resampled_y = sampler.fit_resample(x, y)
    # Rebuild the frame so the original column names are kept.
    return pd.DataFrame(resampled_x, columns=x.columns), resampled_y

示例#8

0

显示文件

文件： test_cluster_centroids.py 项目： tasrif60/Software_inventory

def test_fit_resample_half():
    """A dict strategy of 3 + 6 samples must yield 9 resampled rows."""
    sampler = ClusterCentroids(sampling_strategy={0: 3, 1: 6},
                               random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_resample(X, Y)
    expected_rows = 9
    assert X_resampled.shape == (expected_rows, 2)
    assert y_resampled.shape == (expected_rows, )

示例#9

0

显示文件

文件： test_cluster_centroids.py 项目： tasrif60/Software_inventory

def test_fit_resample_auto():
    """The "auto" strategy must yield 6 resampled rows on the fixture."""
    sampler = ClusterCentroids(sampling_strategy="auto",
                               random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_resample(X, Y)
    expected_rows = 6
    assert X_resampled.shape == (expected_rows, 2)
    assert y_resampled.shape == (expected_rows, )

示例#10

0

显示文件

文件： resampling_methods.py 项目： punkie/resampler_new

class ResamplingAlgorithms(Enum):
    """Registry of resampling algorithms.

    Each member's value is a ``(display name, sampler instance)`` pair.
    """

    # --- over-sampling and combined methods ---
    RO = ("Random Over-sampling", RandomOverSampler(random_state=1))
    SMOTE = ("Smote", SMOTE(random_state=1))
    ADASYN = ("ADASYN", ADASYN(random_state=1))
    SMOTE_TL = ('SMOTE+TL', SMOTETomek(random_state=1))
    SMOTE_ENN = ('SMOTE+ENN', SMOTEENN(random_state=1))
    SMOTE_BOOST = ("SMOTEBoost", smote_boost.SMOTEBoost())
    # --- under-sampling methods ---
    RU = ("Random Under-sampling", RandomUnderSampler(random_state=1))
    CLUSTERCENTROIDS = ("ClusterCentroids", ClusterCentroids(random_state=1))
    TOMEK_LINKS = ("TomekLinks", TomekLinks())
    NM1 = ("NM1", NearMiss(version=1))
    NM2 = ("NM2", NearMiss(version=2))
    NM3 = ("NM3", NearMiss(version=3))
    CNN = ("CNN", CondensedNearestNeighbour(random_state=1))
    OSS = ("OneSidedSelection", OneSidedSelection(random_state=1))
    ENN = ('ENN', EditedNearestNeighbours())
    NCL = ('NCL', NeighbourhoodCleaningRule())
    IHT = ('IHT', InstanceHardnessThreshold(random_state=1))
    RENN = ('RENN', RepeatedEditedNearestNeighbours())
    AllKNN = ('AllKNN', AllKNN())

    @classmethod
    def get_algorithm_by_name(cls, name):
        """Return the member whose display name equals ``name``.

        Falls back to ``ResamplingAlgorithms.RO`` when nothing matches.
        """
        for algorithm in ResamplingAlgorithms:
            if algorithm.value[0] == name:
                return algorithm
        return ResamplingAlgorithms.RO

示例#11

0

显示文件

def test_cluster_centroids_n_jobs():
    """Passing ``n_jobs`` must emit exactly one deprecation FutureWarning."""
    sampler = ClusterCentroids(n_jobs=1)
    with pytest.warns(FutureWarning) as caught:
        sampler.fit_resample(X, Y)
    assert len(caught) == 1
    warning_text = caught[0].message.args[0]
    assert "'n_jobs' was deprecated" in warning_text

示例#12

0

显示文件

文件： Utils.py 项目： Lipairui/Deal_with_Imbalance

def under_sampling(X, y, method):
    """Under-sample ``(X, y)`` with the imbalanced-learn sampler named ``method``.

    Args:
        X: feature values.
        y: labels.
        method (str): one of 'ClusterCentroids', 'RandomUnderSampler',
            'NearMiss', 'EditedNearestNeighbours',
            'RepeatedEditedNearestNeighbours', 'AllKNN',
            'NeighbourhoodCleaningRule' or 'OneSidedSelection'.

    Returns:
        tuple: ``(X_resampled, y_resampled)``.

    Raises:
        ValueError: if ``method`` is not one of the supported names.
    """
    if method == 'ClusterCentroids':
        model = ClusterCentroids()
    elif method == 'RandomUnderSampler':
        model = RandomUnderSampler()
    elif method == 'NearMiss':
        model = NearMiss()
    elif method == 'EditedNearestNeighbours':
        model = EditedNearestNeighbours()
    elif method == 'RepeatedEditedNearestNeighbours':
        model = RepeatedEditedNearestNeighbours()
    elif method == 'AllKNN':
        model = AllKNN()
    elif method == 'NeighbourhoodCleaningRule':
        model = NeighbourhoodCleaningRule()
    elif method == 'OneSidedSelection':
        model = OneSidedSelection()
    else:
        # Fail with a clear message instead of an UnboundLocalError at the
        # return statement.
        raise ValueError(
            'Unknown under-sampling method: {!r}'.format(method))

    X_resampled, y_resampled = model.fit_resample(X, y)
    return X_resampled, y_resampled

示例#13

0

显示文件

文件： test_cluster_centroids.py 项目： weimengdong/imbalanced-learn

def test_fit_sample_wrong_object():
    """A string passed as ``estimator`` must raise a ValueError."""
    sampler = ClusterCentroids(ratio='auto',
                               random_state=RND_SEED,
                               estimator='rnd')
    expected_message = "has to be a KMeans clustering"
    assert_raises_regex(ValueError, expected_message,
                        sampler.fit_sample, X, Y)

示例#14

0

显示文件

文件： test_generator.py 项目： wenqin2017/imbalanced-learn

def test_balanced_batch_generator_function_no_return_indices():
    """Using ClusterCentroids as the batch sampler must raise a ValueError."""
    with pytest.raises(ValueError, match='needs to return the indices'):
        sampler = ClusterCentroids()
        balanced_batch_generator(
            X, y, sampler=sampler, batch_size=10, random_state=42)

示例#15

0

显示文件

def buildModel(clf, X, y, cv_nums=10, is_random=False):
    """Print the mean cross-validated F1 of ``clf`` under several resamplers.

    The classifier is scored on the raw data, then on data resampled with
    RandomOverSampler, ClusterCentroids and SMOTE. An EasyEnsemble resample
    is computed last.

    Args:
        clf: estimator passed to ``cross_val_score``.
        X: feature values.
        y: labels.
        cv_nums (int, optional): number of cross-validation folds.
        is_random (bool, optional): when truthy, draw a random seed for each
            sampler; otherwise every sampler uses seed 0.
    """
    # One seed per sampler. Testing truthiness (instead of `== True` /
    # `== False`) guarantees `random_lst` is always bound, e.g. for None.
    if is_random:
        random_lst = list(np.random.randint(0, 1000, 4))
    else:
        random_lst = [0] * 4

    print('----------各种类别不平衡处理方法结果, 为' + str(cv_nums) + '折交叉验证的f1均值----------')
    # Baseline: predict on the original dataset without any resampling.
    print('原始数据集: ', np.mean(cross_val_score(clf, X, y, scoring='f1', cv=cv_nums)))

    ros = RandomOverSampler(random_state=random_lst[0])
    X_oversampled, y_oversampled = ros.fit_sample(X, y)
    # print(sorted(Counter(y_oversampled).items()))
    print('过采样: ', np.mean(cross_val_score(clf, X_oversampled, y_oversampled, scoring='f1', cv=cv_nums)))

    cc = ClusterCentroids(random_state=random_lst[1])
    X_undersampled, y_undersampled = cc.fit_sample(X, y)
    #print(sorted(Counter(y_undersampled).items()))
    print('欠采样: ', np.mean(cross_val_score(clf, X_undersampled, y_undersampled, scoring='f1', cv=cv_nums)))

    sm = SMOTE(random_state=random_lst[2])
    X_smote, y_smote = sm.fit_sample(X, y)
    #print(sorted(Counter(y_smote).items()))
    print('SMOTE: ', np.mean(cross_val_score(clf, X_smote, y_smote, scoring='f1', cv=cv_nums)))

    # Split the majority class into several subsets, one per learner, so each
    # learner sees under-sampled data while no important information is lost
    # globally. E.g. with the negative class split into 10 parts and a single
    # positive part, 10 learners are trained, each on one negative part plus
    # the shared positive part.
    # NOTE(review): X_ee / y_ee are not used in the visible code -- confirm
    # whether a scoring step is missing here.
    ee = EasyEnsemble(random_state=random_lst[3], n_subsets=10)
    X_ee, y_ee = ee.fit_sample(X, y)

示例#16

0

显示文件

def test_continuous_error():
    """Check that fitting on a continuous target emits a UserWarning."""

    # Ten evenly spaced values in [0, 1] serve as the continuous target.
    continuous_target = np.linspace(0, 1, 10)
    sampler = ClusterCentroids(random_state=RND_SEED)
    assert_warns(UserWarning, sampler.fit, X, continuous_target)

示例#17

0

显示文件

def test_init():
    """Check that the constructor stores the given ratio unchanged."""

    expected_ratio = 1.
    sampler = ClusterCentroids(ratio=expected_ratio, random_state=RND_SEED)

    assert_equal(sampler.ratio, expected_ratio)

示例#18

0

显示文件

文件： model.py 项目： Quiescent/opensafely-sandpit

def fix_imbalance(X, y):
    """Resample features X and labels y with a default ClusterCentroids.

    Balancing matters because when one label is over-represented, always
    guessing that label already scores well.

    Returns whatever ``ClusterCentroids().fit_resample(X, y)`` returns.
    """
    return ClusterCentroids().fit_resample(X, y)

示例#19

0

显示文件

文件： dataPreparation.py 项目： ceciliacal/MOBD_project

def Resampling(train_x, train_y, resampling_method):
    """Label-encode ``train_y.data`` and resample the training set in place.

    Args:
        train_x: object whose ``data`` attribute holds the feature values;
            it is overwritten with the resampled features.
        train_y: object whose ``data`` attribute holds the labels; it is
            label-encoded and then overwritten with the resampled labels.
        resampling_method (str): name of the imbalanced-learn sampler to use
            (see the branches below).

    Raises:
        ValueError: if ``resampling_method`` is not a supported name. The
            check runs before ``train_y`` is modified.
    """
    # ---- UNDER-SAMPLING ------ #
    if resampling_method == "ClusterCentroids":
        resample = ClusterCentroids(voting='hard', random_state=42)
    elif resampling_method == "CondensedNearestNeighbour":
        resample = CondensedNearestNeighbour(n_neighbors=7, random_state=42)
    elif resampling_method == "EditedNearestNeighbours":
        resample = EditedNearestNeighbours(n_neighbors=7,
                                           kind_sel='mode',
                                           n_jobs=-1)
    elif resampling_method == "RepeatedEditedNearestNeighbours":
        resample = RepeatedEditedNearestNeighbours(n_neighbors=7,
                                                   kind_sel='mode',
                                                   n_jobs=-1)
    elif resampling_method == "AllKNN":
        resample = AllKNN(n_neighbors=7,
                          kind_sel='mode',
                          allow_minority=True,
                          n_jobs=-1)
    elif resampling_method == "NearMiss":
        resample = NearMiss(n_neighbors=7, n_jobs=-1)
    elif resampling_method == "NeighbourhoodCleaningRule":
        resample = NeighbourhoodCleaningRule(n_neighbors=7, kind_sel='all')
    elif resampling_method == "RandomUnderSampler":
        resample = RandomUnderSampler(random_state=42)
    elif resampling_method == "TomekLinks":
        resample = TomekLinks(n_jobs=-1)
    # ---- OVER-SAMPLING ------ #
    elif resampling_method == "BorderlineSMOTE":
        resample = BorderlineSMOTE(random_state=42, n_jobs=-1)
    elif resampling_method == "KMeansSMOTE":
        resample = KMeansSMOTE(random_state=42)
    elif resampling_method == "RandomOverSampler":
        # This branch used to test "RandomUnderSampler" again, which silently
        # replaced the under-sampler and made "RandomOverSampler" unusable.
        resample = RandomOverSampler(random_state=42)
    elif resampling_method == "SMOTE":
        resample = SMOTE(random_state=42, n_jobs=-1)
    else:
        raise ValueError(
            "Unknown resampling method: {!r}".format(resampling_method))

    train_y.data = LabelEncoder().fit_transform(train_y.data)

    # Uncomment the next line to show the pie chart of the class
    # distribution before resampling.
    #plotGraphics.piePlot(train_y, "Before Resampling")

    # transform the dataset
    train_x.data, train_y.data = resample.fit_resample(train_x.data,
                                                       train_y.data)

示例#20

0

显示文件

def test_cc_fit_invalid_ratio():
    """Check that fitting with a very small ratio raises a RuntimeError."""

    tiny_ratio = 1. / 10000.
    sampler = ClusterCentroids(ratio=tiny_ratio, random_state=RND_SEED)
    # Fitting is expected to fail for this ratio.
    assert_raises(RuntimeError, sampler.fit, X, Y)

示例#21

0

显示文件

文件： Class_Sampling.py 项目： abhiglobalistic/hams_ml

    def perform_Under_ClusterCentroids(self):
        """Under-sample ``self.X`` / ``self.y`` with ClusterCentroids (seed 0).

        Returns the resampled ``(X, y)`` pair.
        """
        print('Under sampling with ClusterCentroids, preserves imformation')

        sampler = ClusterCentroids(random_state=0)
        X_resampled, y_resampled = sampler.fit_resample(self.X, self.y)
        return X_resampled, y_resampled

示例#22

0

显示文件

文件： model_param_svm_linear.py 项目： alexanderwendt/sklearn_ml_toolbox

    def use_parameters(self, X_train, selected_features):
        """
        Build the default parameter grid (scalers, samplers, feature columns,
        C values and a linear kernel).

        :param X_train: training features; only inspected for missing values
            to decide whether imputer strategies are added to the grid
        :param selected_features: candidate values for the ``feat__cols`` entry
        :return: a single-element list holding the parameter-grid dict
        """

        test_scaler = [
            StandardScaler(),
            RobustScaler(),
            QuantileTransformer(),
            Normalizer()
        ]
        test_sampling = [
            modelutil.Nosampler(),
            ClusterCentroids(),
            RandomUnderSampler(),
            # NearMiss(version=1),
            # EditedNearestNeighbours(),
            # AllKNN(),
            # CondensedNearestNeighbour(random_state=0),
            # InstanceHardnessThreshold(random_state=0,
            #                          estimator=LogisticRegression(solver='lbfgs', multi_class='auto')),
            RandomOverSampler(random_state=0),
            SMOTE(),
            BorderlineSMOTE(),
            SMOTEENN(),
            SMOTETomek(),
            ADASYN()
        ]
        test_C_linear = [1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2]

        parameters = [{
            'scaler': test_scaler,
            'sampling': test_sampling,
            'feat__cols': selected_features,
            'model__C': test_C_linear,  # default C=1
            'model__kernel': ['linear']
        }]

        # If no missing values, only one imputer strategy shall be used
        if X_train.isna().sum().sum() > 0:
            # `parameters` is a list wrapping one grid dict, so the new key
            # must go into that dict; indexing the list with a string raised
            # a TypeError.
            parameters[0]['imputer__strategy'] = [
                'mean', 'median', 'most_frequent'
            ]
            print("Missing values used. Test different imputer strategies")
        else:
            print("No missing values. No imputer necessary")

            print("Selected Parameters: ", parameters)
        print("Parameters defined in the input: ", parameters)

        return parameters

示例#23

0

显示文件

def test_sample_wrong_X():
    """Check that sampling data other than the fitted X raises RuntimeError."""

    sampler = ClusterCentroids(random_state=RND_SEED)
    sampler.fit(X, Y)

    # Random (100, 40) features with 50 labels of each class.
    other_X = np.random.random((100, 40))
    other_y = np.array([0] * 50 + [1] * 50)
    assert_raises(RuntimeError, sampler.sample, other_X, other_y)

示例#24

0

显示文件

文件： models.py 项目： MiniBee/dl

    def unbanlance_helper(self,
                          imbalance_method='under_sampling',
                          search_method='grid'):
        """Rebalance the data, fit ``self.model``, log the scores and save it.

        ``imbalance_method`` selects SMOTE ('over_sampling') or
        ClusterCentroids ('under_sampling'); any other value skips resampling
        and leaves the saved model name as None. ``search_method`` is not
        used by this method.
        """
        logger.info('get all feature ... ')
        splits = self.feature_engineer()
        self.x_train, self.x_test, self.y_train, self.y_test = splits

        model_name = None
        if imbalance_method == 'over_sampling':
            logger.info('Use SMOTE deal with unbalance data ... ')
            self.x_train, self.y_train = SMOTE().fit_resample(
                self.x_train, self.y_train)
            self.x_test, self.y_test = SMOTE().fit_resample(
                self.x_test, self.y_test)
            model_name = 'lgb_over_sampling'
        elif imbalance_method == 'under_sampling':
            logger.info('User ClusterCentroids deal with unbalance data ... ')
            self.x_train, self.y_train = ClusterCentroids(
                random_state=11).fit_resample(self.x_train, self.y_train)
            self.x_test, self.y_test = ClusterCentroids(
                random_state=11).fit_resample(self.x_test, self.y_test)
            model_name = 'lgb_under_sampling'

        logger.info('search best param ... ')
        # Fixed hyper-parameters; no search is actually performed here.
        fixed_params = {'num_leaves': 3, 'max_depth': 5}
        self.model = self.model.set_params(**fixed_params)

        logger.info('fit model ... ')
        self.model.fit(self.x_train, self.y_train)
        test_predict_label = self.model.predict(self.x_test)
        train_predict_label = self.model.predict(self.x_train)
        per, acc, recall, f1 = get_score(self.y_train, self.y_test,
                                         train_predict_label,
                                         test_predict_label)

        # Log train accuracy, then test accuracy, recall and F1-score.
        score_lines = (
            ('Train accuracy %s', per),
            ('test accuracy %s', acc),
            ('test recall %s', recall),
            ('test F1_score %s', f1),
        )
        for template, score in score_lines:
            logger.info(template % score)
        self.save(model_name)

示例#25

0

显示文件

def test_sample_wt_fit():
    """Check that calling sample before fit raises a RuntimeError."""

    # The sampler is built with the 'auto' ratio and deliberately not fitted.
    unfitted_sampler = ClusterCentroids(ratio='auto', random_state=RND_SEED)
    assert_raises(RuntimeError, unfitted_sampler.sample, X, Y)

示例#26

0

显示文件

文件： test_cluster_centroids.py 项目： hhlisme/imbalanced-learn

def test_multiclass_fit_sample():
    """With a third class added, each class must end up with 2 samples."""
    y = Y.copy()
    # Relabel two samples to create a third class.
    for position in (5, 6):
        y[position] = 2

    sampler = ClusterCentroids(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, y)

    class_counts = Counter(y_resampled)
    for label in (0, 1, 2):
        assert class_counts[label] == 2

示例#27

0

显示文件

文件： test_cluster_centroids.py 项目： hhlisme/imbalanced-learn

def test_fit_sample_auto():
    """The 'auto' ratio must reproduce the stored ground-truth samples."""
    sampler = ClusterCentroids(ratio='auto', random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    expected_X = np.array([
        [0.92923648, 0.76103773],
        [0.47104475, 0.44386323],
        [0.13347175, 0.12167502],
        [0.06738818, -0.529627],
        [0.17901516, 0.69860992],
        [0.094035, -2.55298982],
    ])
    expected_y = np.array([0, 0, 0, 1, 1, 1])

    assert_allclose(X_resampled, expected_X, rtol=R_TOL)
    assert_array_equal(y_resampled, expected_y)

示例#28

0

显示文件

def undersample(X, y):
    """Under-sample ``(X, y)`` with ClusterCentroids (seed 12).

    When an input was a pandas DataFrame/Series, the matching output is
    wrapped in the same pandas type again (X keeps its column names).
    """
    rX, rY = ClusterCentroids(random_state=12).fit_resample(X, y)

    # Restore the pandas container of the features.
    if isinstance(X, pd.DataFrame):
        rX = pd.DataFrame(data=rX, columns=X.columns)
    elif isinstance(X, pd.Series):
        rX = pd.Series(data=rX)

    # Restore the pandas container of the labels.
    rY = pd.Series(data=rY) if isinstance(y, pd.Series) else rY
    return rX, rY

示例#29

0

显示文件

文件： under_sampling.py 项目： yemode2k/studio

def _under_sampling(table, label_col, sampling_strategy='not majority', seed=None, estimator='KMeans',
                    n_clusters=8, voting='auto', n_jobs=1):
    """Under-sample ``table`` with ClusterCentroids and return the new table.

    Object-typed feature columns and an object-typed label are label-encoded
    before resampling and decoded again afterwards.

    Args:
        table: input DataFrame containing the features and the label column.
        label_col: name of the label column.
        sampling_strategy: forwarded to ``ClusterCentroids``.
        seed: random state for ``ClusterCentroids`` and the KMeans estimator.
        estimator: 'KMeans' (the legacy spelling 'Kmeans' is also accepted)
            to cluster with an explicit KMeans; any other value lets
            ``ClusterCentroids`` pick its default estimator.
        n_clusters: cluster count given to the explicit KMeans estimator.
        voting: forwarded to ``ClusterCentroids``.
        n_jobs: forwarded to ``ClusterCentroids``.

    Returns:
        dict: ``{'out_table': DataFrame}`` holding the resampled features
        joined with the resampled label column.
    """
    # Separate features and label
    features = table.drop([label_col], axis=1)
    y = table[label_col]

    if sklearn_utils.multiclass.type_of_target(y) == 'continuous':
        raise_error('0718', 'label_col')

    # Filter out categorical columns in features
    categorical_cols = [col for col in features.columns if features[col].dtypes == 'object']

    # Encode each categorical column with its OWN encoder. A single shared
    # encoder only remembers the classes of its last fit, so decoding the
    # columns afterwards would use the wrong mapping.
    feature_encoders = {}
    for cate_col in categorical_cols:
        col_encoder = preprocessing.LabelEncoder()
        features[cate_col] = col_encoder.fit_transform(features[cate_col])
        feature_encoders[cate_col] = col_encoder

    # Transform label column with object type (dedicated encoder)
    label_is_object = (y.dtypes == 'object')
    label_encoder = preprocessing.LabelEncoder()
    if label_is_object:
        y_encoder = label_encoder.fit_transform(y)
    else:
        y_encoder = y

    # The default value is 'KMeans'; the original check only matched the
    # misspelled 'Kmeans', so the default never selected this branch.
    if estimator in ('KMeans', 'Kmeans'):
        estimator_model = KMeans(n_clusters=n_clusters, random_state=seed)
    else:
        estimator_model = None

    # Process under sampling
    sm = ClusterCentroids(sampling_strategy=sampling_strategy, random_state=seed,
                          estimator=estimator_model, voting=voting, n_jobs=n_jobs)

    X_res, y_res = sm.fit_resample(features, y_encoder)

    # Invert to original data
    if label_is_object:
        y_decoder = label_encoder.inverse_transform(y_res)
    else:
        y_decoder = y_res

    df = pd.DataFrame(data=X_res, columns=features.columns)

    for cate_col in categorical_cols:
        df[cate_col] = feature_encoders[cate_col].inverse_transform(df[cate_col].astype('int32'))

    df1 = pd.DataFrame(data=y_decoder, columns=[label_col])

    # Output result
    out_table = df.join(df1)

    return {'out_table': out_table}

示例#30

0

显示文件

    def resample(self, X, y, by, random_state=None, visualize=False):
        '''
        Re-sample (X, y) with the method named by `by`.

        by: String
            The method used to perform re-sampling
            currently support: ['RUS', 'CNN', 'ENN', 'NCR', 'Tomek', 'ALLKNN', 'OSS',
                'NM', 'CC', 'SMOTE', 'ADASYN', 'BorderSMOTE', 'SMOTEENN', 'SMOTETomek',
                'ORG']
            'ORG' returns the data unchanged.
        random_state:
            Forwarded to the constructor of the chosen sampler.
        visualize: Boolean
            When true, scatter-plot columns 0 and 1 of the result, coloured
            by label.
        '''
        if by == 'ORG':
            # Original data: no sampler involved.
            X_train, y_train = X, y
        else:
            # Short code -> sampler class.
            sampler_classes = {
                'RUS': RandomUnderSampler,
                'CNN': CondensedNearestNeighbour,
                'ENN': EditedNearestNeighbours,
                'NCR': NeighbourhoodCleaningRule,
                'Tomek': TomekLinks,
                'ALLKNN': AllKNN,
                'OSS': OneSidedSelection,
                'NM': NearMiss,
                'CC': ClusterCentroids,
                'SMOTE': SMOTE,
                'ADASYN': ADASYN,
                'BorderSMOTE': BorderlineSMOTE,
                'SMOTEENN': SMOTEENN,
                'SMOTETomek': SMOTETomek,
            }
            if by not in sampler_classes:
                raise Error('Unexpected \'by\' type {}'.format(by))
            sampler = sampler_classes[by](random_state=random_state)
            X_train, y_train = sampler.fit_resample(X, y)

        if visualize:
            frame = pd.DataFrame(X_train)
            frame['label'] = y_train
            frame.plot.scatter(x=0,
                               y=1,
                               c='label',
                               s=3,
                               colormap='coolwarm',
                               title='{} training set'.format(by))
        return X_train, y_train