def unbalance_helper(X_train, X_test, y_train, y_test, imbalance_method='under_sampling'):
    """Resample the train and test splits to reduce class imbalance.

    Args:
        X_train: training features.
        X_test: test features.
        y_train: training labels.
        y_test: test labels.
        imbalance_method (str, optional): 'over_sampling' (SMOTETomek) or
            'under_sampling' (ClusterCentroids). Any other value leaves the
            data untouched. Defaults to 'under_sampling'.

    Returns:
        tuple: ``(X_train, y_train, X_test, y_test)``. Note that this order
        differs from the parameter order.
    """
    if imbalance_method == 'over_sampling':
        print("Use SMOTETomek deal with unbalance data ")
        # Interpolate to generate new samples.
        X_train, y_train = SMOTETomek().fit_resample(X_train, y_train)
        # Resample the test split from the test data itself; the previous
        # code fed the (already resampled) training data in here.
        X_test, y_test = SMOTETomek().fit_resample(X_test, y_test)
    elif imbalance_method == 'under_sampling':
        print("Use ClusterCentroids deal with unbalance data ")
        X_train, y_train = ClusterCentroids(random_state=0).fit_resample(
            X_train, y_train)
        X_test, y_test = ClusterCentroids(random_state=0).fit_resample(
            X_test, y_test)
    return X_train, y_train, X_test, y_test
def test_validate_estimator_deprecation():
    """Check that the old-style constructor parameters are still processed."""
    expected_X = np.array([
        [0.20622591, 0.0582794],
        [0.68481731, 0.51935141],
        [1.34192108, -0.13367336],
        [0.62366841, -0.21312976],
        [1.61091956, -0.40283504],
        [-0.37162401, -2.19400981],
        [0.74680821, 1.63827342],
        [0.61472253, -0.82309052],
        [0.19893132, -0.47761769],
        [0.97407872, 0.44454207],
        [1.40301027, -0.83648734],
        [-1.20515198, -1.02689695],
        [-0.23374509, 0.18370049],
        [-0.32635887, -0.29299653],
        [-0.00288378, 0.84259929],
        [1.79580611, -0.02219234],
        [0.38307743, -0.05670439],
        [0.93976473, -0.06570176],
        [0.70319159, -0.02571668],
        [0.75052536, -0.19246517],
    ])
    expected_y = np.array(
        [0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0])

    # Both constructor variants are expected to give the same resampling.
    for extra_kwargs in ({'n_jobs': -1}, {'k': 5}):
        sampler = SMOTETomek(random_state=RND_SEED, **extra_kwargs)
        X_out, y_out = sampler.fit_sample(X, Y)
        assert_array_almost_equal(X_out, expected_X)
        assert_array_equal(y_out, expected_y)
def test_validate_estimator_deprecation():
    """Check both old-style constructor variants against the same ground truth."""
    expected_X = np.array([
        [0.68481731, 0.51935141],
        [1.34192108, -0.13367336],
        [0.62366841, -0.21312976],
        [1.61091956, -0.40283504],
        [-0.37162401, -2.19400981],
        [0.74680821, 1.63827342],
        [0.61472253, -0.82309052],
        [0.19893132, -0.47761769],
        [1.40301027, -0.83648734],
        [-1.20515198, -1.02689695],
        [-0.23374509, 0.18370049],
        [-0.00288378, 0.84259929],
        [1.79580611, -0.02219234],
        [0.38307743, -0.05670439],
        [0.70319159, -0.02571667],
        [0.75052536, -0.19246518],
    ])
    expected_y = np.array([1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0])

    for extra_kwargs in ({'n_jobs': -1}, {'k': 5}):
        sampler = SMOTETomek(random_state=RND_SEED, **extra_kwargs)
        X_out, y_out = sampler.fit_sample(X, Y)
        assert_allclose(X_out, expected_X, rtol=R_TOL)
        assert_array_equal(y_out, expected_y)
def data_balance(train_df, test_df=None, ngram=(1, 1)):
    """Vectorise text, balance the training classes and term-frequency scale.

    ``train_df`` (and ``test_df`` when given) are read through their ``text``
    and ``label`` attributes. The count vectoriser is fitted on the training
    text only.

    Returns ``(X_train_counts, X_train_tf, y_train_resampled)`` when
    ``test_df`` is None, otherwise those three values followed by
    ``(X_test_counts, X_test_tf, y_test)``.
    """
    from collections import Counter

    vectorizer = CountVectorizer(ngram_range=ngram)
    train_labels = train_df.label
    train_counts = vectorizer.fit_transform(train_df.text)

    train_counts, train_labels_res = SMOTETomek().fit_sample(
        train_counts, train_labels)
    print(f'original data set count{Counter(train_labels)}')
    print(f'new balanced data set count{Counter(train_labels_res)}')

    train_tf = TfidfTransformer(use_idf=False).fit(train_counts).transform(
        train_counts)

    if test_df is None:
        return train_counts, train_tf, train_labels_res

    test_labels = test_df.label
    test_counts = vectorizer.transform(test_df.text)
    # The resampled test set is only used for the class-count printout below;
    # the returned test matrices and labels are the un-resampled ones.
    _, test_labels_res = SMOTETomek().fit_sample(test_counts, test_labels)
    test_tf = TfidfTransformer(use_idf=False).fit(test_counts).transform(
        test_counts)
    print(f'original ts ds count{Counter(test_labels)}')
    print(f'new st ds count{Counter(test_labels_res)}')
    return (train_counts, train_tf, train_labels_res,
            test_counts, test_tf, test_labels)
def test_error_wrong_object():
    """A non-estimator passed as ``smote`` or ``tomek`` must raise ValueError."""
    bogus = 'rnd'
    expectations = (
        ('smote', "smote needs to be a SMOTE"),
        ('tomek', "tomek needs to be a TomekLinks"),
    )
    for param_name, message in expectations:
        smt = SMOTETomek(random_state=RND_SEED, **{param_name: bogus})
        with raises(ValueError, match=message):
            smt.fit_sample(X, Y)
def test_error_wrong_object():
    """A non-estimator passed as ``smote`` or ``tomek`` must raise ValueError."""
    bogus = 'rnd'
    expectations = (
        ('smote', "smote needs to be a SMOTE"),
        ('tomek', "tomek needs to be a TomekLinks"),
    )
    for param_name, message in expectations:
        smt = SMOTETomek(random_state=RND_SEED, **{param_name: bogus})
        assert_raises_regex(ValueError, message, smt.fit_sample, X, Y)
def test_error_wrong_object():
    """Check that fitting fails when a wrong object is given at construction."""
    # A plain string stands in for an invalid SMOTE / TomekLinks estimator.
    bogus = 'rnd'
    for param_name in ('smote', 'tomek'):
        smt = SMOTETomek(random_state=RND_SEED, **{param_name: bogus})
        assert_raises(ValueError, smt.fit, X, Y)
def test_multiclass_error():
    """Check that a UserWarning is issued when the target is not binary."""
    non_binary_targets = (
        np.linspace(0, 1, 5000),                           # continuous case
        np.array([0] * 2000 + [1] * 2000 + [2] * 1000),    # multiclass case
    )
    for y in non_binary_targets:
        sm = SMOTETomek(random_state=RND_SEED)
        assert_warns(UserWarning, sm.fit, X, y)
def test_parallelisation():
    """Check that ``n_jobs`` is propagated to the inner SMOTE and Tomek objects."""

    def validated(**kwargs):
        sampler = SMOTETomek(random_state=RND_SEED, **kwargs)
        sampler._validate_estimator()
        return sampler

    # Default: no job count is set anywhere.
    default_smt = validated()
    assert default_smt.n_jobs is None
    assert default_smt.smote_.n_jobs is None
    assert default_smt.tomek_.n_jobs is None

    # Explicit job count reaches both inner estimators.
    parallel_smt = validated(n_jobs=8)
    assert parallel_smt.n_jobs == 8
    assert parallel_smt.smote_.n_jobs == 8
    assert parallel_smt.tomek_.n_jobs == 8
def _resample_and_save(sampler, switch, non_switch, switch_path, non_switch_path):
    """Resample one switch / non-switch pair and save each class to its own file."""
    X = np.concatenate((switch, non_switch))
    # Label convention: 0 = switch, 1 = non-switch.
    y = np.concatenate(
        (np.zeros(switch.shape[0]), np.ones(non_switch.shape[0])))
    X_res, y_res = sampler.fit_resample(X, y)
    is_switch = y_res == 0
    np.save(switch_path, X_res[is_switch])
    np.save(non_switch_path, X_res[~is_switch])


def resample():
    """Resample the stored train and test arrays with SMOTETomek and save them.

    Reads the four ``data/*_w_64_f_20.npy`` files, resamples the train pair
    and the test pair independently, and writes the results to the matching
    ``*_samp.npy`` files. Returns None.
    """
    # Load everything up front so a missing file fails before any work is done.
    test_switch = np.load('data/test_switch_w_64_f_20.npy')
    test_non_switch = np.load('data/test_non_switch_w_64_f_20.npy')
    train_switch = np.load('data/train_switch_w_64_f_20.npy')
    train_non_switch = np.load('data/train_non_switch_w_64_f_20.npy')

    resample_train = SMOTETomek(sampling_strategy='all',
                                smote=SMOTE(n_jobs=4),
                                tomek=TomekLinks(n_jobs=4))
    # Previously bound to the misspelt name `resampe_test`, which made the
    # test resampling below fail with a NameError.
    resample_test = SMOTETomek(sampling_strategy='all',
                               smote=SMOTE(n_jobs=4),
                               tomek=TomekLinks(n_jobs=4))

    print('Beginning train resample...')
    _resample_and_save(resample_train, train_switch, train_non_switch,
                       'data/train_switch_w_64_f_20_samp.npy',
                       'data/train_non_switch_w_64_f_20_samp.npy')

    print('Beginning test resample...')
    _resample_and_save(resample_test, test_switch, test_non_switch,
                       'data/test_switch_w_64_f_20_samp.npy',
                       'data/test_non_switch_w_64_f_20_samp.npy')
    return
def balanceData(self, method: str = "mixsampling") -> None:
    """
    Function -> balanceData
    Select the class-balancing object for the requested method

    Parameters
    ---------------------------------------------------------------------------
        method => mixsampling, undersampling or oversampling

    Return
    ---------------------------------------------------------------------------
        None => Modify self.balanceObj

    Raises NameError when ``method`` is not one of the three names above.
    """
    if method not in ("mixsampling", "undersampling", "oversampling"):
        raise NameError(f"{method} method not defined")

    # Each sampler is imported only when its method is actually requested.
    if method == "mixsampling":
        from imblearn.combine import SMOTETomek
        balancer = SMOTETomek(sampling_strategy='auto')
    elif method == "undersampling":
        from imblearn.under_sampling import NearMiss
        balancer = NearMiss(sampling_strategy="auto", n_neighbors=3, version=2)
    else:
        from imblearn.over_sampling import RandomOverSampler
        balancer = RandomOverSampler(sampling_strategy="auto")

    self.balanceObj = balancer
def getXY(graphs):
    '''
    Build the class-balanced X / Y arrays and standardise X.

    :param graphs: mapping produced by getGraph; each value provides an 'x'
                   feature entry and a 'target' label
    :return: (standardised resampled X, resampled Y)
    '''
    features = np.array([graphs[key]['x'] for key in graphs]).astype('float64')
    targets = np.array([graphs[key]['target'] for key in graphs])

    # Combined over- and under-sampling.
    # https://blog.csdn.net/kizgel/article/details/78553009
    sampler = SMOTETomek(random_state=0)
    X_resampled, y_resampled = sampler.fit_sample(features, targets)
    logger.info(sorted(Counter(y_resampled).items()))

    # Standardise column by column: (X - mean) / std, so every feature is
    # centred on 0 with unit variance.
    scaler = preprocessing.StandardScaler().fit(X_resampled)
    return scaler.transform(X_resampled), y_resampled
def __turnBalanced(df):
    """Balances a unbalanced training set.

    Classes with fewer rows than a limit are dropped first (5% of the rows
    when there are at least 10 classes, otherwise 10 rows), rows containing
    NaN are removed, and the remainder is resampled with SMOTETomek.

    Parameters
    ----------
    df : DataFrame
        Training set. The last column is treated as the target.

    Returns
    -------
    df_features: DataFrame
        Balanced features.
    df_target: DataFrame
        Balanced target.
    """
    target_col = df.columns[-1]
    class_counts = df[target_col].value_counts()

    limit = len(df) * 0.05 if len(class_counts) >= 10 else 10
    rare_classes = class_counts[class_counts < limit].index.tolist()

    df = df[~df[target_col].isin(rare_classes)]
    # drop=True: the old row index must not become a column, otherwise it is
    # handed to SMOTE as if it were a real feature.
    df = df.dropna().reset_index(drop=True)

    feature_cols = df.columns[:-1]
    smt = SMOTETomek()
    X_smt, y_smt = smt.fit_sample(df[feature_cols], df[target_col])

    balanced = pd.concat(
        [pd.DataFrame(X_smt, columns=feature_cols),
         pd.Series(y_smt, name=target_col)],
        axis=1, sort=False)
    df_features = balanced.iloc[:, :-1]
    df_target = balanced[balanced.columns[-1]]
    return df_features, df_target
def test_smote_sample_wt_fit():
    """Check that calling ``sample`` before ``fit`` raises RuntimeError."""
    unfitted = SMOTETomek(random_state=RND_SEED)
    assert_raises(RuntimeError, unfitted.sample, X, Y)
def get_smotetomek(X_trn, y_trn, seed=int(623 * 4413)):
    """
    Resample the training data with a seeded SMOTETomek and return (X, y).
    """
    resampler = SMOTETomek(random_state=seed)
    X_balanced, y_balanced = resampler.fit_resample(X_trn, y_trn)
    return X_balanced, y_balanced
def getUnderAndOverSamplers():
    """Return the combined over/under samplers keyed by name."""
    smote_enn = SMOTEENN(sampling_strategy=0.5, n_jobs=-1)
    smote_tomek = SMOTETomek(sampling_strategy=0.5, n_jobs=-1)
    return {'SMOTEENN': smote_enn, 'SMOTETomek': smote_tomek}
class ResamplingAlgorithms(Enum):
    """Resampling algorithms; each value is a ``(display name, sampler)`` pair."""

    # Over-sampling
    RO = ("Random Over-sampling", RandomOverSampler(random_state=1))
    SMOTE = ("Smote", SMOTE(random_state=1))
    ADASYN = ("ADASYN", ADASYN(random_state=1))
    # Combined / boosted
    SMOTE_TL = ("SMOTE+TL", SMOTETomek(random_state=1))
    SMOTE_ENN = ("SMOTE+ENN", SMOTEENN(random_state=1))
    SMOTE_BOOST = ("SMOTEBoost", smote_boost.SMOTEBoost())
    # Under-sampling
    RU = ("Random Under-sampling", RandomUnderSampler(random_state=1))
    CLUSTERCENTROIDS = ("ClusterCentroids", ClusterCentroids(random_state=1))
    TOMEK_LINKS = ("TomekLinks", TomekLinks())
    NM1 = ("NM1", NearMiss(version=1))
    NM2 = ("NM2", NearMiss(version=2))
    NM3 = ("NM3", NearMiss(version=3))
    CNN = ("CNN", CondensedNearestNeighbour(random_state=1))
    OSS = ("OneSidedSelection", OneSidedSelection(random_state=1))
    ENN = ("ENN", EditedNearestNeighbours())
    NCL = ("NCL", NeighbourhoodCleaningRule())
    IHT = ("IHT", InstanceHardnessThreshold(random_state=1))
    RENN = ("RENN", RepeatedEditedNearestNeighbours())
    AllKNN = ("AllKNN", AllKNN())

    @classmethod
    def get_algorithm_by_name(cls, name):
        """Return the member whose display name equals ``name``, else ``RO``."""
        for algorithm in cls:
            if algorithm.value[0] == name:
                return algorithm
        return cls.RO
def load_data(batch_size=128, smote=False, num_samples=-1):
    """Build train / validation loaders from the MIT-BIH CSV files.

    Columns 0..186 are read as the features and column 187 as the integer
    label. With ``smote`` set, the training data is resampled with
    SMOTETomek. A positive ``num_samples`` slices both datasets to
    ``[:num_samples]``.
    """
    feature_cols = list(range(187))
    label_col = 187

    train_frame = pd.read_csv("input/mitbih_train.csv", header=None)
    train_frame = train_frame.sample(frac=1)  # shuffle the rows
    test_frame = pd.read_csv("input/mitbih_test.csv", header=None)

    Y = np.array(train_frame[label_col].values).astype(int)
    X = np.array(train_frame[feature_cols].values)
    Y_test = np.array(test_frame[label_col].values).astype(int)
    X_test = np.array(test_frame[feature_cols].values)[..., np.newaxis]

    # Smote for data augmentation (done before the trailing axis is added).
    if smote:
        X, Y = SMOTETomek().fit_resample(X, Y)
    X = X[..., np.newaxis]

    train_dataset = CustomDataset(X, Y)
    val_dataset = CustomDataset(X_test, Y_test)
    if num_samples > 0:
        train_dataset = train_dataset[:num_samples]
        val_dataset = val_dataset[:num_samples]

    train_loader = DataLoader(train_dataset, batch_size=batch_size,
                              shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)
    return train_loader, val_loader
def __init__(self, window_size=6, training_ratio=.7, seq="sequence", pos="label"):
    """Store the configuration and build the classifier / sampler registries."""
    # Configuration passed in by the caller.
    self.window_size = window_size
    self.training_ratio = training_ratio  # Float value representing % of data used for training
    self.seq = seq
    self.pos = pos

    # Data containers, filled in later.
    self.features = []
    self.labels = []
    self.words = []
    self.features_labels = {}

    # Placeholders for results.
    self.random_data = 0
    self.test_results = 0
    self.vector = 0
    self.test_cv = 0
    self.benchmark_mcc = 0

    # Registries keyed by name.
    self.supervised_classifiers = {
        "forest": RandomForestClassifier(n_jobs=4),
        "mlp_adam": MLPClassifier(),
        "svc": svm.SVC(verbose=1),
        "xgb": XGBClassifier(max_delta_step=5),
        "bagging": BaggingClassifier(),
        "one_class_svm": OneClassSVM(kernel="rbf"),
    }
    self.imbalance_functions = {
        "easy_ensemble": EasyEnsemble(),
        "SMOTEENN": SMOTEENN(),
        "SMOTETomek": SMOTETomek(),
        "ADASYN": ADASYN(),
        "random_under_sample": RandomUnderSampler(),
        "ncl": NeighbourhoodCleaningRule(),
        "near_miss": NearMiss(),
        "pass": -1,
    }
    self.vecs = {
        "sequence": sequence_vector,
        "chemical": chemical_vector,
        "binary": binary_vector,
        "w2v": "w2v",
    }
    self.mcc_scorer = make_scorer(matthews_corrcoef)
def get_train_test(X, y, oversample=False, undersample=False, over_sampling=None, test_size=0.20, n=8):
    '''
    --------------------------------------------------------------------------
    Split the dataset with sklearn's train_test_split and optionally resample
    the training part. Only the training split is ever resampled.
    --------------------------------------------------------------------------
    * X, y are the features and the labels
    * If oversample is True, X_train / y_train are resampled with SMOTETomek
    * If undersample is True, X_train / y_train are then under-sampled with
      NearMiss (version 2)
    * test_size sets the size of the test set
    * over_sampling and n are accepted for backward compatibility but are
      not used
    --------------------------------------------------------------------------
    Returns X_train, X_test, y_train, y_test
    '''
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42)

    if oversample:
        over = SMOTETomek(random_state=42)
        X_train, y_train = over.fit_resample(X_train, y_train)

    if undersample:
        # The sampler used to be assigned to `undersample` itself while the
        # code below called the undefined name `under` (NameError).
        # NearMiss has no `n_neighbors_ver2` argument; `n_neighbors` is the
        # neighbour count used by versions 1 and 2.
        under = NearMiss(version=2, n_neighbors=2)
        X_train, y_train = under.fit_resample(X_train, y_train)

    return X_train, X_test, y_train, y_test
def outer_cv_loop(Xdata, Ydata, clf, parameters=None, n_splits=10, test_size=0.25):
    """Score ``clf`` over stratified shuffle splits.

    For every split the train and test features are imputed separately with
    SoftImpute, the training data is rebalanced with SMOTETomek when the
    mean of its labels is more than 0.2 away from 0.5, and the classifier is
    fitted and scored with ROC AUC on its predictions.

    ``parameters`` is accepted for backward compatibility but is not used.

    Returns
    -------
    rocscores : list
        One entry per split; NaN when the test labels or the predictions
        have zero variance.
    importances : list
        ``clf.feature_importances_`` for every split that was fitted.
    """
    rocscores = []
    importances = []
    kf = StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size)

    for train, test in kf.split(Xdata, Ydata):
        if numpy.var(Ydata[test]) == 0:
            # The old message also printed `varname`, which is not defined
            # in this function.
            print('zero variance')
            rocscores.append(numpy.nan)
            continue

        Ytrain = Ydata[train]
        Xtrain = fancyimpute.SoftImpute(verbose=False).complete(
            Xdata[train, :])
        Xtest = fancyimpute.SoftImpute(verbose=False).complete(Xdata[test, :])

        if numpy.abs(numpy.mean(Ytrain) - 0.5) > 0.2:
            smt = SMOTETomek()
            Xtrain, Ytrain = smt.fit_sample(Xtrain.copy(), Ytrain)

        clf.fit(Xtrain, Ytrain)
        pred = clf.predict(Xtest)
        # filter out bad folds
        if numpy.var(pred) > 0:
            rocscores.append(roc_auc_score(Ydata[test], pred))
        else:
            rocscores.append(numpy.nan)
        importances.append(clf.feature_importances_)

    return rocscores, importances
def run():
    """Write SMOTETomek-balanced 10-fold splits for every organism / link type.

    For each organism file and each link type, the binary link table is split
    with StratifiedKFold; the training part of every fold is resampled and the
    fold is saved as ``[X_train, y_train, X_test, y_test]``.
    """
    organisms = ["athal", "scere", "dmel", "eugra", "potra"]
    link_types = ["reaction", "binding", "regulation", "catalysis"]

    for org in organisms:
        data_path = f"results/{org}.processed_data.tsv"
        print(org)
        print("Reading file")
        df = pd.read_csv(data_path, sep="\t", keep_default_na=True)
        print(df.shape)

        for linktype in link_types:
            print(linktype)
            link_table = transformBinaryLinkTable(linktype, df)
            print(link_table.shape)
            X = np.asarray(link_table.iloc[:, 1:])
            y = np.asarray(link_table["Link"])

            print("Starting Cross-Validation with TPOT")
            skf = StratifiedKFold(n_splits=10)
            # Folds are numbered from 1; the number also seeds the resampler.
            for fold, (train_index, test_index) in enumerate(skf.split(X, y), start=1):
                X_test, y_test = X[test_index], y[test_index]
                smt = SMOTETomek(random_state=fold, n_jobs=-1)
                X_train, y_train = smt.fit_resample(X[train_index],
                                                    y[train_index])
                save_object([X_train, y_train, X_test, y_test],
                            f"{org}_{linktype}_to_SK{fold}.pkl")
def load_data_mi(batch_size=128, smote=False):
    """Build train / validation loaders from the PTBDB CSV files.

    The abnormal and normal files are concatenated and split 80/20,
    stratified on column 187 (the label). Columns 0..186 are the features.
    With ``smote`` set, the training data is resampled with SMOTETomek.
    """
    feature_cols = list(range(187))
    label_col = 187

    abnormal = pd.read_csv("input/ptbdb_abnormal.csv", header=None)
    normal = pd.read_csv("input/ptbdb_normal.csv", header=None)
    combined = pd.concat([abnormal, normal], ignore_index=True)
    train_frame, test_frame = train_test_split(
        combined, test_size=0.2, random_state=1, stratify=combined[label_col])

    Y = np.array(train_frame[label_col].values).astype(int)
    X = np.array(train_frame[feature_cols].values)
    Y_test = np.array(test_frame[label_col].values).astype(int)
    X_test = np.array(test_frame[feature_cols].values)[..., np.newaxis]

    # Smote for data augmentation (done before the trailing axis is added).
    if smote:
        X, Y = SMOTETomek().fit_resample(X, Y)
    X = X[..., np.newaxis]

    train_loader = DataLoader(CustomDataset(X, Y), batch_size=batch_size,
                              shuffle=True)
    val_loader = DataLoader(CustomDataset(X_test, Y_test),
                            batch_size=batch_size)
    return train_loader, val_loader
def imbalance_hander(XTrain, yTrain):
    """Resample the training data with SMOTETomek (SMOTE with 4 neighbours).

    Returns the resampled ``(X, y)`` pair. Any exception raised by the
    sampler propagates to the caller unchanged.
    """
    # The former `try/except Exception as e: raise e` wrapper was a no-op:
    # it re-raised the very same exception.
    sampler = SMOTETomek(random_state=42, smote=SMOTE(k_neighbors=4))
    X_smt, y_smt = sampler.fit_resample(XTrain, yTrain)
    return X_smt, y_smt
def trainKNN(traindata: Tuple[np.ndarray, np.ndarray]) -> KNeighborsClassifier:
    """Function returns a trained KNeighborsClassifier instance.

    Parameters
    ----------
    traindata : Tuple[np.ndarray, np.ndarray]
        Tuple of XTrain, and YTrain data to train the KNN with.

    Returns
    -------
    knn : KNeighborsClassifier
        Trained K nearest neighbors classifier.
    """
    Xtrain, Ytrain = traindata

    # Use SMOTETomek sampling to balance the classes.
    sample = SMOTETomek(random_state=49, sampling_strategy='minority')
    # fit_resample replaces fit_sample, which newer imbalanced-learn
    # releases no longer provide.
    Xtrain_sample, Ytrain_sample = sample.fit_resample(Xtrain, Ytrain)

    # Train KNN
    # maximum at 13 neighbors
    # n_jobs=-1 to utilize all cores
    knn = KNeighborsClassifier(n_neighbors=13, n_jobs=-1)
    # np.asarray accepts both the documented ndarray labels and pandas
    # objects, whereas `.values` only exists on the latter.
    knn.fit(Xtrain_sample, np.asarray(Ytrain_sample).ravel())

    return knn
def Balance_classes(X_train, y_train, Sampling_Function):
    """Resample the training data with the sampler named by ``Sampling_Function``.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, passed to the sampler's ``fit_sample``.
    Sampling_Function : str
        One of the keys of the dispatch table below.

    Returns
    -------
    tuple
        ``(X_train_res, y_train_res)`` as returned by the sampler.

    Raises
    ------
    ValueError
        If ``Sampling_Function`` is not a known sampler name (this used to
        surface as an UnboundLocalError).
    """
    # Factories rather than instances, so only the requested sampler is built.
    factories = {
        'RandomUnderSampler':
            lambda: RandomUnderSampler(ratio=0.5, random_state=1),
        'NearMiss1':
            lambda: NearMiss(ratio=0.5, random_state=1, version=1, size_ngh=3),
        'NearMiss2':
            lambda: NearMiss(ratio=0.5, random_state=1, version=2, size_ngh=3),
        'NearMiss3':
            lambda: NearMiss(ratio=0.5, random_state=1, version=3,
                             ver3_samp_ngh=3),
        'CondensedNearestNeighbour':
            lambda: CondensedNearestNeighbour(random_state=1),
        'EditedNearestNeighbours':
            lambda: EditedNearestNeighbours(random_state=1, size_ngh=5),
        # Previously this name silently built a plain EditedNearestNeighbours.
        'RepeatedEditedNearestNeighbours':
            lambda: RepeatedEditedNearestNeighbours(random_state=1, size_ngh=5),
        'TomekLinks':
            lambda: TomekLinks(random_state=1),
        'RandomOverSampler':
            lambda: RandomOverSampler(ratio=0.5, random_state=1),
        'SMOTE':
            lambda: SMOTE(ratio=0.5, k=5, random_state=1),
        'SMOTETomek':
            lambda: SMOTETomek(ratio=0.5, k=5, random_state=1),
        'SMOTEENN':
            lambda: SMOTEENN(ratio=0.5, k=5, random_state=1, size_ngh=5),
        'EasyEnsemble':
            lambda: EasyEnsemble(),
        'BalanceCascade_rf':
            lambda: BalanceCascade(classifier='random-forest', random_state=1),
        'BalanceCascade_svm':
            lambda: BalanceCascade(classifier='linear-svm', random_state=1),
    }

    try:
        make_sampler = factories[Sampling_Function]
    except KeyError:
        raise ValueError(
            "Unknown Sampling_Function: {!r}".format(Sampling_Function))

    us = make_sampler()
    X_train_res, y_train_res = us.fit_sample(X_train, y_train)
    return X_train_res, y_train_res
def SMOTE_methods(df_train, target, method):
    '''Resample ``df_train`` with the chosen SMOTE variant.

    The features are normalised with MinMaxScaler before resampling, so the
    output data is normalised as well.

    Parameters
    ----------
    df_train : DataFrame holding the features and the ``target`` column.
    target : name of the label column in ``df_train``.
    method : 'regular', 'borderline1', 'borderline2' or 'svm' (plain SMOTE
        with that ``kind``), 'Tomek' (SMOTETomek) or 'ENN' (SMOTEENN).

    Returns
    -------
    DataFrame with the original feature columns plus a 'target' column.

    Raises
    ------
    ValueError
        If ``method`` is not one of the names above. The check now runs
        before any data is touched.
    '''
    # Pick the sampler first so that an invalid method fails fast.
    if method in ('regular', 'borderline1', 'borderline2', 'svm'):
        # The 'svm' branch used to reference the misspelt name `SMOET`.
        sampler = SMOTE(kind=method)
    elif method == 'Tomek':
        # The sampler instance used to be called like a function (`sm()`),
        # which raises TypeError.
        sampler = SMOTETomek()
    elif method == 'ENN':
        sampler = SMOTEENN()
    else:
        raise ValueError('输入方法有误')

    X = df_train.drop([target], axis=1)
    y = df_train[target]
    X_normalized = MinMaxScaler().fit_transform(X)

    X_res, y_res = sampler.fit_sample(X_normalized, y)

    df_final = pd.DataFrame(X_res, columns=X.columns)
    # NOTE(review): the label column is always named 'target', whatever the
    # `target` argument was - confirm that callers rely on this.
    df_final['target'] = y_res
    return df_final
def test_validate_estimator_init():
    """Check the resampling when SMOTE and Tomek objects are passed in."""
    expected_X = np.array([
        [0.20622591, 0.0582794],
        [0.68481731, 0.51935141],
        [1.34192108, -0.13367336],
        [0.62366841, -0.21312976],
        [1.61091956, -0.40283504],
        [-0.37162401, -2.19400981],
        [0.74680821, 1.63827342],
        [0.61472253, -0.82309052],
        [0.19893132, -0.47761769],
        [0.97407872, 0.44454207],
        [1.40301027, -0.83648734],
        [-1.20515198, -1.02689695],
        [-0.23374509, 0.18370049],
        [-0.32635887, -0.29299653],
        [-0.00288378, 0.84259929],
        [1.79580611, -0.02219234],
        [0.38307743, -0.05670439],
        [0.93976473, -0.06570176],
        [0.70319159, -0.02571668],
        [0.75052536, -0.19246517],
    ])
    expected_y = np.array(
        [0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0])

    # Build the inner estimators explicitly and hand them to SMOTETomek.
    inner_smote = SMOTE(random_state=RND_SEED)
    inner_tomek = TomekLinks(random_state=RND_SEED)
    sampler = SMOTETomek(smote=inner_smote, tomek=inner_tomek,
                         random_state=RND_SEED)
    X_out, y_out = sampler.fit_sample(X, Y)

    assert_allclose(X_out, expected_X, rtol=R_TOL)
    assert_array_equal(y_out, expected_y)
def get_smote(feature, label):
    """Resample with SMOTE + Tomek links to counter class imbalance.

    Prints the sorted class counts before and after resampling and returns
    the resampled ``(feature, label)`` pair.
    """
    print(f"Raw Data: {sorted(Counter(label).items())}")
    sampler = SMOTETomek(random_state=42)
    resampled_feature, resampled_label = sampler.fit_sample(feature, label)
    print(f"Resampled: {sorted(Counter(resampled_label).items())}")
    return resampled_feature, resampled_label
def trainSVM(traindata: Tuple[np.ndarray, np.ndarray]) -> svm.SVC:
    """Function returns a trained svm.SVC instance

    Parameters
    ----------
    traindata : Tuple[np.ndarray, np.ndarray]
        Tuple of XTrain, and YTrain data to train the SVM with.

    Returns
    -------
    svm_model : svm.SVC
        Trained SVM classifier.
    """
    Xtrain, Ytrain = traindata

    # Balance the classes with SMOTETomek before fitting.
    sample = SMOTETomek(random_state=49, sampling_strategy='minority')
    # fit_resample replaces fit_sample, which newer imbalanced-learn
    # releases no longer provide.
    Xtrain_sample, Ytrain_sample = sample.fit_resample(Xtrain, Ytrain)

    # hyperparameters found using grid search and 3 fold validation
    svm_model = svm.SVC(class_weight='balanced', C=1, gamma=0.001,
                        kernel='linear')
    # np.asarray accepts both the documented ndarray labels and pandas
    # objects, whereas `.values` only exists on the latter.
    svm_model.fit(Xtrain_sample, np.asarray(Ytrain_sample).ravel())

    return svm_model