예제 #1
0
def unbalance_helper(X_train,
                     X_test,
                     y_train,
                     y_test,
                     imbalance_method='under_sampling'):
    """Resample the train and test splits to counter class imbalance.

    Args:
        X_train: training features.
        X_test: test features.
        y_train: training labels.
        y_test: test labels.
        imbalance_method (str, optional): 'over_sampling' (SMOTETomek) or
            'under_sampling' (ClusterCentroids). Any other value returns the
            inputs unchanged. Defaults to 'under_sampling'.

    Returns:
        tuple: (X_train, y_train, X_test, y_test). Note that this order
        differs from the parameter order.
    """
    # NOTE(review): the test split is resampled as well; confirm this is
    # wanted, since it changes the class distribution used for evaluation.
    if imbalance_method == 'over_sampling':
        print("Use SMOTETomek deal with unbalance data ")
        # Synthesise new minority samples by interpolation, then clean up
        # Tomek links.
        X_train, y_train = SMOTETomek().fit_resample(X_train, y_train)
        # Resample the test split from the test data itself; it used to be
        # overwritten with a resampled copy of the training data.
        X_test, y_test = SMOTETomek().fit_resample(X_test, y_test)
    elif imbalance_method == 'under_sampling':
        print("Use ClusterCentroids deal with unbalance data ")
        X_train, y_train = ClusterCentroids(random_state=0).fit_resample(
            X_train, y_train)
        X_test, y_test = ClusterCentroids(random_state=0).fit_resample(
            X_test, y_test)

    return X_train, y_train, X_test, y_test
예제 #2
0
def test_validate_estimator_deprecation():
    """Check that SMOTETomek still resamples correctly with the old keywords."""

    expected_X = np.array([
        [0.20622591, 0.0582794],
        [0.68481731, 0.51935141],
        [1.34192108, -0.13367336],
        [0.62366841, -0.21312976],
        [1.61091956, -0.40283504],
        [-0.37162401, -2.19400981],
        [0.74680821, 1.63827342],
        [0.61472253, -0.82309052],
        [0.19893132, -0.47761769],
        [0.97407872, 0.44454207],
        [1.40301027, -0.83648734],
        [-1.20515198, -1.02689695],
        [-0.23374509, 0.18370049],
        [-0.32635887, -0.29299653],
        [-0.00288378, 0.84259929],
        [1.79580611, -0.02219234],
        [0.38307743, -0.05670439],
        [0.93976473, -0.06570176],
        [0.70319159, -0.02571668],
        [0.75052536, -0.19246517],
    ])
    expected_y = np.array(
        [0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0])

    # Both keyword variants must yield exactly the same resampled data.
    for extra_kwargs in ({'n_jobs': -1}, {'k': 5}):
        sampler = SMOTETomek(random_state=RND_SEED, **extra_kwargs)
        X_resampled, y_resampled = sampler.fit_sample(X, Y)
        assert_array_almost_equal(X_resampled, expected_X)
        assert_array_equal(y_resampled, expected_y)
예제 #3
0
def test_validate_estimator_deprecation():
    """Check the resampled output for both the ``n_jobs`` and ``k`` keywords."""
    expected_X = np.array([
        [0.68481731, 0.51935141], [1.34192108, -0.13367336],
        [0.62366841, -0.21312976], [1.61091956, -0.40283504],
        [-0.37162401, -2.19400981], [0.74680821, 1.63827342],
        [0.61472253, -0.82309052], [0.19893132, -0.47761769],
        [1.40301027, -0.83648734], [-1.20515198, -1.02689695],
        [-0.23374509, 0.18370049], [-0.00288378, 0.84259929],
        [1.79580611, -0.02219234], [0.38307743, -0.05670439],
        [0.70319159, -0.02571667], [0.75052536, -0.19246518],
    ])
    expected_y = np.array([1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0])

    # The two constructor variants are expected to give identical results.
    for extra_kwargs in ({'n_jobs': -1}, {'k': 5}):
        sampler = SMOTETomek(random_state=RND_SEED, **extra_kwargs)
        X_resampled, y_resampled = sampler.fit_sample(X, Y)
        assert_allclose(X_resampled, expected_X, rtol=R_TOL)
        assert_array_equal(y_resampled, expected_y)
def data_balance(train_df, test_df=None, ngram=(1, 1)):
    """Vectorise the text, balance the training set and compute TF features.

    ``train_df`` (and ``test_df`` when given) are read through their ``text``
    and ``label`` attributes. The training counts are resampled with
    SMOTETomek. The test counts are vectorised with the vocabulary fitted on
    the training text and are returned *without* resampling; a resampled
    copy of the test set is only used to print the class counts.

    Returns:
        ``(X_train_counts, X_train_tf, y_train_resampled)`` when ``test_df``
        is None, otherwise those three followed by
        ``(X_test_counts, X_test_tf, y_test)``.
    """
    from collections import Counter

    vectorizer = CountVectorizer(ngram_range=ngram)
    train_labels = train_df.label
    train_counts = vectorizer.fit_transform(train_df.text)

    train_counts, train_labels_res = SMOTETomek().fit_sample(
        train_counts, train_labels)
    print(f'original  data set count{Counter(train_labels)}')
    print(f'new balanced data set count{Counter(train_labels_res)}')
    train_tf = TfidfTransformer(use_idf=False).fit(train_counts).transform(
        train_counts)

    # Without a test frame there is nothing more to do.
    if test_df is None:
        return train_counts, train_tf, train_labels_res

    test_labels = test_df.label
    test_counts = vectorizer.transform(test_df.text)
    # The resampled test labels are only used for the report printed below.
    _, test_labels_res = SMOTETomek().fit_sample(test_counts, test_labels)
    test_tf = TfidfTransformer(use_idf=False).fit(test_counts).transform(
        test_counts)

    print(f'original ts ds count{Counter(test_labels)}')
    print(f'new st ds count{Counter(test_labels_res)}')
    return (train_counts, train_tf, train_labels_res,
            test_counts, test_tf, test_labels)
예제 #5
0
def test_error_wrong_object():
    """A non-estimator ``smote``/``tomek`` argument must raise at fit time."""
    cases = (
        ('smote', "smote needs to be a SMOTE"),
        ('tomek', "tomek needs to be a TomekLinks"),
    )
    for param_name, message in cases:
        # 'rnd' is a plain string, i.e. deliberately not a sampler object.
        sampler = SMOTETomek(random_state=RND_SEED, **{param_name: 'rnd'})
        with raises(ValueError, match=message):
            sampler.fit_sample(X, Y)
예제 #6
0
def test_error_wrong_object():
    """A non-estimator ``smote``/``tomek`` argument must raise at fit time."""
    cases = (
        ('smote', "smote needs to be a SMOTE"),
        ('tomek', "tomek needs to be a TomekLinks"),
    )
    for param_name, message in cases:
        # 'rnd' is a plain string, i.e. deliberately not a sampler object.
        sampler = SMOTETomek(random_state=RND_SEED, **{param_name: 'rnd'})
        assert_raises_regex(ValueError, message, sampler.fit_sample, X, Y)
예제 #7
0
def test_error_wrong_object():
    """Check that fitting fails when wrong objects are given at
    initialization."""

    # A plain string stands in for an invalid SMOTE / Tomek object.
    for param_name in ('smote', 'tomek'):
        sampler = SMOTETomek(random_state=RND_SEED, **{param_name: 'rnd'})
        assert_raises(ValueError, sampler.fit, X, Y)
def test_multiclass_error():
    """Check that fitting warns when the target is not of binary type."""

    non_binary_targets = (
        np.linspace(0, 1, 5000),                         # continuous case
        np.array([0] * 2000 + [1] * 2000 + [2] * 1000),  # multiclass case
    )
    for y in non_binary_targets:
        sampler = SMOTETomek(random_state=RND_SEED)
        assert_warns(UserWarning, sampler.fit, X, y)
예제 #9
0
def test_parallelisation():
    """Check that ``n_jobs`` is forwarded to the inner SMOTE and Tomek objects."""
    # Without an explicit value the job count stays None everywhere.
    default_sampler = SMOTETomek(random_state=RND_SEED)
    default_sampler._validate_estimator()
    assert default_sampler.n_jobs is None
    assert default_sampler.smote_.n_jobs is None
    assert default_sampler.tomek_.n_jobs is None

    # An explicit value is copied onto both inner estimators.
    parallel_sampler = SMOTETomek(random_state=RND_SEED, n_jobs=8)
    parallel_sampler._validate_estimator()
    assert parallel_sampler.n_jobs == 8
    assert parallel_sampler.smote_.n_jobs == 8
    assert parallel_sampler.tomek_.n_jobs == 8
예제 #10
0
def _resample_and_save(sampler, switch, non_switch, switch_path,
                       non_switch_path):
    """Balance the two classes with *sampler* and save each class to a file.

    ``switch`` rows are labelled 0 and ``non_switch`` rows 1 before
    resampling; afterwards the rows are split by label again and written to
    ``switch_path`` / ``non_switch_path`` with ``np.save``.
    """
    X = np.concatenate((switch, non_switch))
    y = np.concatenate(
        (np.zeros(switch.shape[0]), np.ones(non_switch.shape[0])))
    X_res, y_res = sampler.fit_resample(X, y)

    # A boolean mask replaces the per-row Python loop.
    is_switch = y_res == 0
    np.save(switch_path, X_res[is_switch])
    np.save(non_switch_path, X_res[~is_switch])


def resample():
    """Resample the stored switch / non-switch arrays with SMOTETomek.

    Loads the four ``data/*_w_64_f_20.npy`` arrays, balances the train and the
    test pair independently and writes the results next to the inputs with a
    ``_samp`` suffix. Returns None.
    """
    test_switch = np.load('data/test_switch_w_64_f_20.npy')
    test_non_switch = np.load('data/test_non_switch_w_64_f_20.npy')
    train_switch = np.load('data/train_switch_w_64_f_20.npy')
    train_non_switch = np.load('data/train_non_switch_w_64_f_20.npy')

    resample_train = SMOTETomek(sampling_strategy='all',
                                smote=SMOTE(n_jobs=4),
                                tomek=TomekLinks(n_jobs=4))
    # This sampler used to be bound to the misspelled name ``resampe_test``,
    # so the test half failed with a NameError.
    resample_test = SMOTETomek(sampling_strategy='all',
                               smote=SMOTE(n_jobs=4),
                               tomek=TomekLinks(n_jobs=4))

    print('Beginning train resample...')
    _resample_and_save(resample_train, train_switch, train_non_switch,
                       'data/train_switch_w_64_f_20_samp.npy',
                       'data/train_non_switch_w_64_f_20_samp.npy')

    print('Beginning test resample...')
    _resample_and_save(resample_test, test_switch, test_non_switch,
                       'data/test_switch_w_64_f_20_samp.npy',
                       'data/test_non_switch_w_64_f_20_samp.npy')
    def balanceData(self, method: str = "mixsampling") -> None:

        """
        Function -> balanceData
        Select the resampler used to balance the data classes

        Parameters
        ---------------------------------------------------------------------------
            method => mixsampling (SMOTETomek), undersampling (NearMiss)
                      or oversampling (RandomOverSampler)

        Return
        ---------------------------------------------------------------------------
            None => Modify self.balanceObj

        Raises
        ---------------------------------------------------------------------------
            NameError => method is not one of the three supported names
        """

        # Reject unknown names up front so nothing is imported for them.
        if method not in ("mixsampling", "undersampling", "oversampling"):
            raise NameError(f"{method} method not defined")

        # imblearn is imported lazily, only for the sampler actually requested.
        if method == "mixsampling":
            from imblearn.combine import SMOTETomek
            sampler = SMOTETomek(sampling_strategy='auto')
        elif method == "undersampling":
            from imblearn.under_sampling import NearMiss
            sampler = NearMiss(sampling_strategy="auto", n_neighbors=3, version=2)
        else:
            from imblearn.over_sampling import RandomOverSampler
            sampler = RandomOverSampler(sampling_strategy="auto")

        self.balanceObj = sampler
예제 #12
0
def getXY(graphs):
    '''
    Build class-balanced X/Y from the graphs and standardise X.
    :param graphs: graphs obtained from getGraph; each entry is read through
                   its 'x' and 'target' keys
    :return: standardised resampled X, resampled Y
    '''
    X = np.array([graphs[key]['x'] for key in graphs]).astype('float64')
    Y = np.array([graphs[key]['target'] for key in graphs])

    # Combined over- and under-sampling, see
    # https://blog.csdn.net/kizgel/article/details/78553009
    # (RandomUnderSampler(random_state=0) was tried here as an alternative.)
    X_resampled, y_resampled = SMOTETomek(random_state=0).fit_sample(X, Y)
    logger.info(sorted(Counter(y_resampled).items()))

    # Standardise every column separately: (X - mean) / std, so that each
    # feature is centred on 0 with unit variance.
    scaler = preprocessing.StandardScaler().fit(X_resampled)
    return scaler.transform(X_resampled), y_resampled
예제 #13
0
    def __turnBalanced(df):
        """Balances a unbalanced training set.

        The last column of ``df`` is treated as the target. Classes with too
        few rows (fewer than 5% of the rows when there are at least 10
        classes, otherwise fewer than 10 rows) and rows containing NaN are
        dropped before the remaining data is resampled with SMOTETomek.

        Parameters
        ----------
        df : DataFrame
            Training set; features first, target in the last column.

        Returns
        -------
        df_features: DataFrame
            Balanced features.
        df_target: Series
            Balanced target.
        """
        target_col = df.columns[-1]
        class_counts = df[target_col].value_counts()
        if len(class_counts) >= 10:
            limit = len(df) * 0.05
        else:
            limit = 10
        rare_classes = class_counts[class_counts < limit].index.tolist()

        df = df[~df[target_col].isin(rare_classes)]
        df = df.dropna()
        # drop=True keeps the old row index out of the frame; previously it
        # was inserted as an 'index' column and handed to SMOTE as a feature,
        # distorting the neighbour distances, and only removed afterwards.
        df = df.reset_index(drop=True)

        features = df.iloc[:, :-1]
        smt = SMOTETomek()
        X_smt, y_smt = smt.fit_sample(features, df[target_col])

        # Re-attach the column names in case the sampler returned bare arrays.
        df_features = pd.DataFrame(X_smt, columns=features.columns)
        df_target = pd.Series(y_smt, name=target_col)
        return df_features, df_target
def test_smote_sample_wt_fit():
    """Check that calling ``sample`` on an unfitted SMOTETomek raises a
    RuntimeError."""

    # Freshly created, i.e. never fitted.
    unfitted_sampler = SMOTETomek(random_state=RND_SEED)
    assert_raises(RuntimeError, unfitted_sampler.sample, X, Y)
예제 #15
0
def get_smotetomek(X_trn, y_trn, seed=int(623 * 4413)):
    """
    Return ``(X, y)`` resampled with a SMOTETomek seeded by ``seed``.
    """
    sampler = SMOTETomek(random_state=seed)
    X_resampled, y_resampled = sampler.fit_resample(X_trn, y_trn)
    return X_resampled, y_resampled
예제 #16
0
def getUnderAndOverSamplers():
    """Return the combined over/under samplers keyed by their class name."""
    # Both samplers share the same configuration.
    shared_options = {'sampling_strategy': 0.5, 'n_jobs': -1}
    return {
        'SMOTEENN': SMOTEENN(**shared_options),
        'SMOTETomek': SMOTETomek(**shared_options),
    }
예제 #17
0
class ResamplingAlgorithms(Enum):
    """Available resampling algorithms.

    Every member's value is a ``(display name, sampler instance)`` pair.
    """

    # Over-sampling and combined methods
    RO = ("Random Over-sampling", RandomOverSampler(random_state=1))
    SMOTE = ("Smote", SMOTE(random_state=1))
    ADASYN = ("ADASYN", ADASYN(random_state=1))
    SMOTE_TL = ("SMOTE+TL", SMOTETomek(random_state=1))
    SMOTE_ENN = ("SMOTE+ENN", SMOTEENN(random_state=1))
    SMOTE_BOOST = ("SMOTEBoost", smote_boost.SMOTEBoost())

    # Under-sampling methods
    RU = ("Random Under-sampling", RandomUnderSampler(random_state=1))
    CLUSTERCENTROIDS = ("ClusterCentroids", ClusterCentroids(random_state=1))
    TOMEK_LINKS = ("TomekLinks", TomekLinks())
    NM1 = ("NM1", NearMiss(version=1))
    NM2 = ("NM2", NearMiss(version=2))
    NM3 = ("NM3", NearMiss(version=3))
    CNN = ("CNN", CondensedNearestNeighbour(random_state=1))
    OSS = ("OneSidedSelection", OneSidedSelection(random_state=1))
    ENN = ("ENN", EditedNearestNeighbours())
    NCL = ("NCL", NeighbourhoodCleaningRule())
    IHT = ("IHT", InstanceHardnessThreshold(random_state=1))
    RENN = ("RENN", RepeatedEditedNearestNeighbours())
    AllKNN = ("AllKNN", AllKNN())

    @classmethod
    def get_algorithm_by_name(cls, name):
        """Return the member whose display name equals ``name``.

        Falls back to ``RO`` when no member matches.
        """
        matches = (algo for algo in cls if algo.value[0] == name)
        return next(matches, cls.RO)
예제 #18
0
def load_data(batch_size=128, smote=False, num_samples=-1):
    """Build the train / validation DataLoaders from the MIT-BIH CSV files.

    Columns 0..186 of each CSV are used as features and column 187 as the
    integer label. The training rows are shuffled once after loading.

    Args:
        batch_size: batch size of both loaders.
        smote: when true, the training data is resampled with SMOTETomek and
            then given a trailing axis.
        num_samples: when positive, both datasets are sliced to
            ``[:num_samples]``.

    Returns:
        ``(train_loader, val_loader)``; only the train loader shuffles.
    """
    label_col = 187
    feature_cols = list(range(187))

    train_frame = pd.read_csv("input/mitbih_train.csv",
                              header=None).sample(frac=1)
    test_frame = pd.read_csv("input/mitbih_test.csv", header=None)

    X = np.array(train_frame[feature_cols].values)
    Y = np.array(train_frame[label_col].values).astype(int)

    X_test = np.array(test_frame[feature_cols].values)[..., np.newaxis]
    Y_test = np.array(test_frame[label_col].values).astype(int)

    # NOTE(review): without smote the training features keep two dimensions
    # while the test features always get a trailing axis -- confirm that
    # CustomDataset / the model expect this.
    if smote:
        # SMOTETomek as data augmentation for the training split only.
        X, Y = SMOTETomek().fit_resample(X, Y)
        X = X[..., np.newaxis]

    train_dataset = CustomDataset(X, Y)
    val_dataset = CustomDataset(X_test, Y_test)
    if num_samples > 0:
        # NOTE(review): relies on CustomDataset supporting slice indexing.
        train_dataset = train_dataset[:num_samples]
        val_dataset = val_dataset[:num_samples]

    train_loader = DataLoader(train_dataset, batch_size=batch_size,
                              shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)
    return train_loader, val_loader
예제 #19
0
    def __init__(self, window_size=6, training_ratio=.7, seq="sequence", pos="label"):
        """Store the configuration and set up the classifier/sampler registries.

        window_size, seq and pos are stored unchanged on the instance.
        training_ratio is the fraction of the data used for training.
        """
        # Configuration taken straight from the arguments.
        self.window_size = window_size
        self.training_ratio = training_ratio  # Float: share of data used for training
        self.seq = seq
        self.pos = pos

        # Containers, empty at construction time.
        self.features = []
        self.labels = []
        self.words = []
        self.features_labels = {}

        # Classifiers selectable by name.
        self.supervised_classifiers = {
            "forest": RandomForestClassifier(n_jobs=4),
            "mlp_adam": MLPClassifier(),
            "svc": svm.SVC(verbose=1),
            "xgb": XGBClassifier(max_delta_step=5),
            "bagging": BaggingClassifier(),
            "one_class_svm": OneClassSVM(kernel="rbf"),
        }

        # Imbalance handlers selectable by name; "pass" maps to -1 instead of
        # a sampler object.
        self.imbalance_functions = {
            "easy_ensemble": EasyEnsemble(),
            "SMOTEENN": SMOTEENN(),
            "SMOTETomek": SMOTETomek(),
            "ADASYN": ADASYN(),
            "random_under_sample": RandomUnderSampler(),
            "ncl": NeighbourhoodCleaningRule(),
            "near_miss": NearMiss(),
            "pass": -1,
        }

        # Vectorisers selectable by name; "w2v" maps to the plain string "w2v".
        self.vecs = {
            "sequence": sequence_vector,
            "chemical": chemical_vector,
            "binary": binary_vector,
            "w2v": "w2v",
        }
        self.mcc_scorer = make_scorer(matthews_corrcoef)

        # Attributes initialised to 0 here.
        self.random_data = 0
        self.test_results = 0
        self.vector = 0
        self.test_cv = 0
        self.benchmark_mcc = 0
def get_train_test(X, y, oversample=False, undersample=False, over_sampling=None, test_size=0.20, n=8):
    '''
      --------------------------------------------------------------------------
       Utilizes sklearn train and split function to split the dataset
       this functions is used to facilitate testing different oversampling,
       undersampling ratios, test sizes and train sizes.
       --------------------------------------------------------------------------

              *  X,y are the paramters for x= features y=label
              *  If oversample is True the X_train, y_train gets resampled
                 utilizing SMOTETomek (random_state=42)
              *  If undersample is True the X_train, y_train gets undersampled
                 utilizing NearMiss (version 2, 2 neighbours); when both flags
                 are set, over-sampling runs first
              *  over_sampling and n are accepted for backward compatibility
                 and are currently not used
              *  test_size sets the size of the test set

              Only the training split is ever resampled; the test split is
              returned exactly as produced by train_test_split.

              Returns X_train, X_test, y_train, y_test

        --------------------------------------------------------------------------
       '''
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42)

    if oversample:
        over = SMOTETomek(random_state=42)
        X_train, y_train = over.fit_resample(X_train, y_train)

    # Under-sampling is now independent of `oversample`; it used to be nested
    # inside the over-sampling branch and referenced an undefined name.
    if undersample:
        # `n_neighbors` is the NearMiss parameter used by version 2;
        # `n_neighbors_ver2` does not exist and raised a TypeError.
        under = NearMiss(version=2, n_neighbors=2)
        X_train, y_train = under.fit_resample(X_train, y_train)

    return X_train, X_test, y_train, y_test
예제 #21
0
def outer_cv_loop(Xdata,
                  Ydata,
                  clf,
                  parameters=[],
                  n_splits=10,
                  test_size=0.25):
    """Score ``clf`` over stratified shuffle splits.

    For every split the train and test rows are imputed separately with
    SoftImpute. When the mean of the training labels is more than 0.2 away
    from 0.5, the training fold is rebalanced with SMOTETomek. ``clf`` is then
    fitted and scored with ROC AUC on the test rows.

    ``parameters`` is not used by this function.

    Returns:
        ``(rocscores, importances)``. ``rocscores`` holds one entry per split
        (NaN when the test labels or the predictions have zero variance);
        ``importances`` holds ``clf.feature_importances_`` for every split
        that was actually fitted.
    """
    pred = numpy.zeros(len(Ydata))  # overwritten by every fitted split
    rocscores = []
    importances = []
    splitter = StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size)

    for train_idx, test_idx in splitter.split(Xdata, Ydata):
        # A fold with a single class in the test rows cannot be scored.
        if numpy.var(Ydata[test_idx]) == 0:
            # NOTE(review): `varname` is not defined in this function;
            # presumably a module-level global -- confirm.
            print('zero variance', varname)
            rocscores.append(numpy.nan)
            continue

        Ytrain = Ydata[train_idx]
        Xtrain = fancyimpute.SoftImpute(verbose=False).complete(
            Xdata[train_idx, :])
        Xtest = fancyimpute.SoftImpute(verbose=False).complete(
            Xdata[test_idx, :])

        # Rebalance clearly skewed training folds.
        if numpy.abs(numpy.mean(Ytrain) - 0.5) > 0.2:
            Xtrain, Ytrain = SMOTETomek().fit_sample(Xtrain.copy(),
                                                     Ydata[train_idx])

        clf.fit(Xtrain, Ytrain)
        pred = clf.predict(Xtest)
        # Constant predictions are recorded as NaN instead of being scored.
        fold_score = (roc_auc_score(Ydata[test_idx], pred)
                      if numpy.var(pred) > 0 else numpy.nan)
        rocscores.append(fold_score)
        importances.append(clf.feature_importances_)

    return rocscores, importances
예제 #22
0
def run():
    """Create and pickle SMOTETomek-balanced 10-fold splits.

    For every organism file ``results/<org>.processed_data.tsv`` and every
    link type, the table returned by ``transformBinaryLinkTable`` is split
    with StratifiedKFold. Only the training part of each fold is resampled;
    ``[X_train, y_train, X_test, y_test]`` is stored through ``save_object``
    as ``<org>_<linktype>_to_SK<fold>.pkl`` with folds numbered from 1.
    """
    organisms = ["athal", "scere", "dmel", "eugra", "potra"]
    link_types = ["reaction", "binding", "regulation", "catalysis"]

    for org in organisms:
        path = f"results/{org}.processed_data.tsv"
        print(org)
        print("Reading file")
        df = pd.read_csv(path, sep="\t", keep_default_na=True)
        print(df.shape)

        for linktype in link_types:
            print(linktype)
            link_table = transformBinaryLinkTable(linktype, df)
            print(link_table.shape)
            # Everything after the first column is a feature; "Link" is the
            # label.
            X = np.asarray(link_table.iloc[:, 1:])
            y = np.asarray(link_table["Link"])

            print("Starting Cross-Validation with TPOT")
            folds = StratifiedKFold(n_splits=10).split(X, y)
            for fold_no, (train_index, test_index) in enumerate(folds,
                                                                start=1):
                X_test, y_test = X[test_index], y[test_index]
                # The fold number doubles as the sampler's random seed.
                sampler = SMOTETomek(random_state=fold_no, n_jobs=-1)
                X_train, y_train = sampler.fit_resample(X[train_index],
                                                        y[train_index])
                save_object([X_train, y_train, X_test, y_test],
                            f"{org}_{linktype}_to_SK{fold_no}.pkl")
예제 #23
0
def load_data_mi(batch_size=128, smote=False):
    """Build the train / validation DataLoaders from the PTBDB CSV files.

    The abnormal and normal files are concatenated and split 80/20,
    stratified on column 187 (the integer label) with ``random_state=1``.
    Columns 0..186 are the features.

    Args:
        batch_size: batch size of both loaders.
        smote: when true, the training data is resampled with SMOTETomek and
            then given a trailing axis.

    Returns:
        ``(train_loader, val_loader)``; only the train loader shuffles.
    """
    label_col = 187
    feature_cols = list(range(187))

    abnormal = pd.read_csv("input/ptbdb_abnormal.csv", header=None)
    normal = pd.read_csv("input/ptbdb_normal.csv", header=None)
    full_frame = pd.concat([abnormal, normal], ignore_index=True)
    train_frame, test_frame = train_test_split(full_frame,
                                               test_size=0.2,
                                               random_state=1,
                                               stratify=full_frame[label_col])

    X = np.array(train_frame[feature_cols].values)
    Y = np.array(train_frame[label_col].values).astype(int)

    X_test = np.array(test_frame[feature_cols].values)[..., np.newaxis]
    Y_test = np.array(test_frame[label_col].values).astype(int)

    # NOTE(review): without smote the training features keep two dimensions
    # while the test features always get a trailing axis -- confirm that
    # CustomDataset / the model expect this.
    if smote:
        # SMOTETomek as data augmentation for the training split only.
        X, Y = SMOTETomek().fit_resample(X, Y)
        X = X[..., np.newaxis]

    train_loader = DataLoader(CustomDataset(X, Y),
                              batch_size=batch_size,
                              shuffle=True)
    val_loader = DataLoader(CustomDataset(X_test, Y_test),
                            batch_size=batch_size)
    return train_loader, val_loader
예제 #24
0
def imbalance_hander(XTrain, yTrain):
    """Return ``(X, y)`` resampled with SMOTETomek.

    The sampler uses ``random_state=42`` and an inner SMOTE with
    ``k_neighbors=4``. Any exception raised while resampling propagates to
    the caller unchanged.
    """
    # The former ``try/except Exception as e: raise e`` wrapper only re-raised
    # the same exception, so it was removed.
    sampler = SMOTETomek(random_state=42, smote=SMOTE(k_neighbors=4))
    X_smt, y_smt = sampler.fit_resample(XTrain, yTrain)
    return X_smt, y_smt
예제 #25
0
def trainKNN(traindata: Tuple[np.ndarray, np.ndarray]) -> KNeighborsClassifier:
    """Train a K-nearest-neighbours classifier on class-balanced data.

    Parameters
    ----------

    traindata : Tuple[np.ndarray, np.ndarray]
        ``(XTrain, YTrain)`` pair used for training.
        NOTE(review): the resampled labels are accessed through ``.values``,
        so the labels presumably have to be a pandas object -- confirm.

    Returns
    -------

    knn : KNeighborsClassifier
        Classifier fitted on the SMOTETomek-resampled data.
    """

    features, labels = traindata

    # Balance the classes by resampling the minority class only.
    balancer = SMOTETomek(random_state=49, sampling_strategy='minority')
    features_bal, labels_bal = balancer.fit_sample(features, labels)

    # 13 neighbours was the best setting found; n_jobs=-1 uses all cores.
    knn = KNeighborsClassifier(n_neighbors=13, n_jobs=-1)
    flat_labels = labels_bal.values.ravel()
    knn.fit(features_bal, flat_labels)

    return knn
예제 #26
0
def Balance_classes(X_train, y_train, Sampling_Function):
    """Resample ``(X_train, y_train)`` with the sampler named by
    ``Sampling_Function``.

    Args:
        X_train: training features.
        y_train: training labels.
        Sampling_Function (str): one of 'RandomUnderSampler', 'NearMiss1',
            'NearMiss2', 'NearMiss3', 'CondensedNearestNeighbour',
            'EditedNearestNeighbours', 'RepeatedEditedNearestNeighbours',
            'TomekLinks', 'RandomOverSampler', 'SMOTE', 'SMOTETomek',
            'SMOTEENN', 'EasyEnsemble', 'BalanceCascade_rf',
            'BalanceCascade_svm'.

    Returns:
        tuple: ``(X_train_res, y_train_res)`` as returned by the sampler's
        ``fit_sample``.

    Raises:
        ValueError: if ``Sampling_Function`` is not a known name (this used
            to surface as an UnboundLocalError).
    """

    def _repeated_enn():
        # This name used to build a plain EditedNearestNeighbours (copy-paste
        # slip). Imported locally because the repeated variant may not be
        # imported at module level.
        from imblearn.under_sampling import RepeatedEditedNearestNeighbours
        return RepeatedEditedNearestNeighbours(random_state=1, size_ngh=5)

    # Factories, so that only the requested sampler is ever constructed.
    factories = {
        'RandomUnderSampler':
            lambda: RandomUnderSampler(ratio=0.5, random_state=1),
        'NearMiss1':
            lambda: NearMiss(ratio=0.5, random_state=1, version=1, size_ngh=3),
        'NearMiss2':
            lambda: NearMiss(ratio=0.5, random_state=1, version=2, size_ngh=3),
        'NearMiss3':
            lambda: NearMiss(ratio=0.5, random_state=1, version=3,
                             ver3_samp_ngh=3),
        'CondensedNearestNeighbour':
            lambda: CondensedNearestNeighbour(random_state=1),
        'EditedNearestNeighbours':
            lambda: EditedNearestNeighbours(random_state=1, size_ngh=5),
        'RepeatedEditedNearestNeighbours': _repeated_enn,
        'TomekLinks':
            lambda: TomekLinks(random_state=1),
        'RandomOverSampler':
            lambda: RandomOverSampler(ratio=0.5, random_state=1),
        'SMOTE':
            lambda: SMOTE(ratio=0.5, k=5, random_state=1),
        'SMOTETomek':
            lambda: SMOTETomek(ratio=0.5, k=5, random_state=1),
        'SMOTEENN':
            lambda: SMOTEENN(ratio=0.5, k=5, random_state=1, size_ngh=5),
        'EasyEnsemble':
            lambda: EasyEnsemble(),
        'BalanceCascade_rf':
            lambda: BalanceCascade(classifier='random-forest', random_state=1),
        'BalanceCascade_svm':
            lambda: BalanceCascade(classifier='linear-svm', random_state=1),
    }

    if Sampling_Function not in factories:
        raise ValueError(
            'Unknown Sampling_Function: {!r}'.format(Sampling_Function))

    us = factories[Sampling_Function]()
    X_train_res, y_train_res = us.fit_sample(X_train, y_train)

    return X_train_res, y_train_res
예제 #27
0
def SMOTE_methods(df_train, target, method):
    '''Resample a training frame with one of several SMOTE variants.

    The features (every column except ``target``) are scaled with
    MinMaxScaler before resampling, so the returned features are normalised.

    :param df_train: DataFrame holding the features and the target column
    :param target: name of the target column in ``df_train``
    :param method: 'regular', 'borderline1', 'borderline2', 'svm', 'Tomek'
                   (SMOTETomek) or 'ENN' (SMOTEENN)
    :return: DataFrame with the original feature columns plus the resampled
             labels in a column that is always named 'target'
    :raises ValueError: if ``method`` is not one of the names above
    '''
    # Factories, so that only the requested sampler is constructed.
    sampler_factories = {
        'regular': lambda: SMOTE(kind='regular'),
        'borderline1': lambda: SMOTE(kind='borderline1'),
        'borderline2': lambda: SMOTE(kind='borderline2'),
        # The 'svm' branch used to reference the misspelled name `SMOET`.
        'svm': lambda: SMOTE(kind='svm'),
        # 'Tomek' / 'ENN' used to call the sampler *instance* (`sm()`), which
        # raised a TypeError.
        'Tomek': lambda: SMOTETomek(),
        'ENN': lambda: SMOTEENN(),
    }
    # Validate first so that a bad method name fails before any work is done.
    if method not in sampler_factories:
        raise ValueError('输入方法有误')

    scaler = MinMaxScaler()
    X = df_train.drop([target], axis=1)
    y = df_train[target]
    X_normalized = scaler.fit_transform(X)

    X_res, y_res = sampler_factories[method]().fit_sample(X_normalized, y)

    df_final = pd.DataFrame(X_res, columns=X.columns)
    df_final['target'] = y_res
    return df_final
def test_validate_estimator_init():
    """Check the resampled output when SMOTE and Tomek objects are passed in
    at initialization."""

    # Explicit SMOTE and TomekLinks objects, all seeded identically.
    sampler = SMOTETomek(smote=SMOTE(random_state=RND_SEED),
                         tomek=TomekLinks(random_state=RND_SEED),
                         random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    expected_X = np.array([
        [0.20622591, 0.0582794],
        [0.68481731, 0.51935141],
        [1.34192108, -0.13367336],
        [0.62366841, -0.21312976],
        [1.61091956, -0.40283504],
        [-0.37162401, -2.19400981],
        [0.74680821, 1.63827342],
        [0.61472253, -0.82309052],
        [0.19893132, -0.47761769],
        [0.97407872, 0.44454207],
        [1.40301027, -0.83648734],
        [-1.20515198, -1.02689695],
        [-0.23374509, 0.18370049],
        [-0.32635887, -0.29299653],
        [-0.00288378, 0.84259929],
        [1.79580611, -0.02219234],
        [0.38307743, -0.05670439],
        [0.93976473, -0.06570176],
        [0.70319159, -0.02571668],
        [0.75052536, -0.19246517],
    ])
    expected_y = np.array(
        [0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0])

    assert_allclose(X_resampled, expected_X, rtol=R_TOL)
    assert_array_equal(y_resampled, expected_y)
예제 #29
0
def get_smote(feature, label):
    """Resample with SMOTETomek (``random_state=42``) against class imbalance.

    Prints the sorted class counts before and after resampling and returns
    ``(feature_resampled, label_resampled)``.
    """
    counts_before = sorted(Counter(label).items())
    print(f"Raw Data: {counts_before}")

    sampler = SMOTETomek(random_state=42)
    feature_resampled, label_resampled = sampler.fit_sample(feature, label)

    counts_after = sorted(Counter(label_resampled).items())
    print(f"Resampled: {counts_after}")
    return feature_resampled, label_resampled
예제 #30
0
def trainSVM(traindata: Tuple[np.ndarray, np.ndarray]) -> svm.SVC:
    """Train an SVM classifier on class-balanced data.

    Parameters
    ----------
    traindata : Tuple[np.ndarray, np.ndarray]
        ``(XTrain, YTrain)`` pair used for training.
        NOTE(review): the resampled labels are accessed through ``.values``,
        so the labels presumably have to be a pandas object -- confirm.


    Returns
    -------
    svm_model : svm.SVC
        Classifier fitted on the SMOTETomek-resampled data.
    """

    features, labels = traindata

    # Balance the classes by resampling the minority class only.
    balancer = SMOTETomek(random_state=49, sampling_strategy='minority')
    features_bal, labels_bal = balancer.fit_sample(features, labels)

    # Hyperparameters found using grid search and 3-fold validation.
    svm_options = {
        'class_weight': 'balanced',
        'C': 1,
        'gamma': 0.001,
        'kernel': 'linear',
    }
    svm_model = svm.SVC(**svm_options)
    flat_labels = labels_bal.values.ravel()
    svm_model.fit(features_bal, flat_labels)

    return svm_model