def __init__(self,
             smooth_iteration=25,
             training_iteration=50,
             spreading_factor=0.83,
             FD=0.1,
             learning_rate=0.3,
             smooth_learning_factor=0.8):
    """Store the hyperparameters and create the GSOM and G-SMOTE objects.

    Every argument is kept verbatim as an attribute of the same name.
    The GSOM is created from ``spreading_factor``, ``FD``,
    ``learning_rate`` and ``smooth_learning_factor``; the Geometric SMOTE
    oversampler is created with fixed settings.

    NOTE(review): the second positional GSOM argument (55) and
    ``max_radius=4`` are hard-coded; their meaning is defined by the GSOM
    class, which is not visible from this block.
    """
    # Iteration counts are only stored here.
    self.training_iteration = training_iteration
    self.smooth_iteration = smooth_iteration

    # Values forwarded to the GSOM constructor below.
    self.spreading_factor = spreading_factor
    self.FD = FD
    self.learning_rate = learning_rate
    self.smooth_learning_factor = smooth_learning_factor

    gsom_options = dict(
        max_radius=4,
        FD=FD,
        learning_rate=learning_rate,
        smooth_learning_factor=smooth_learning_factor,
    )
    self.gsom = GSOM(spreading_factor, 55, **gsom_options)

    # The oversampler configuration does not depend on any argument.
    self.gsmote = GeometricSMOTE(
        random_state=1,
        truncation_factor=1.0,
        deformation_factor=0,
        k_neighbors=5,
        sampling_rate=0.3,
    )
def OverSample(X, y):
    """Oversample ``(X, y)`` with Geometric SMOTE and report class counts.

    The class distribution is printed before and after resampling.

    Returns
    -------
    tuple
        ``(X_res, y_res)`` as returned by ``GeometricSMOTE.fit_resample``.
    """
    print('Original dataset shape %s' % Counter(y))

    # Fixed seed so repeated calls generate the same synthetic samples.
    sampler = GeometricSMOTE(random_state=1)
    resampled = sampler.fit_resample(X, y)
    X_balanced, y_balanced = resampled

    print('Resampled dataset shape %s' % Counter(y_balanced))
    return X_balanced, y_balanced
def __init__(self,
             truncation_factor=1.0,
             deformation_factor=0.0,
             k_neighbors=1,
             sampling_rate=0.3,
             n_estimators=100,
             learning_rate=0.01,
             max_depth=3):
    """Store the hyperparameters and create the two wrapped components.

    The first four arguments configure the Geometric SMOTE oversampler,
    the last three configure the gradient boosting classifier. Every
    argument is also kept verbatim as an attribute of the same name.
    """
    # Oversampler hyperparameters.
    self.truncation_factor = truncation_factor
    self.deformation_factor = deformation_factor
    self.k_neighbors = k_neighbors
    self.sampling_rate = sampling_rate

    # Classifier hyperparameters.
    self.n_estimators = n_estimators
    self.learning_rate = learning_rate
    self.max_depth = max_depth

    boosting_options = {
        'n_estimators': n_estimators,
        'learning_rate': learning_rate,
        'max_depth': max_depth,
    }
    # Named "regressor" for historical reasons; it is a classifier.
    self.regressor = GradientBoostingClassifier(**boosting_options)

    oversampling_options = {
        'random_state': 1,
        'truncation_factor': truncation_factor,
        'deformation_factor': deformation_factor,
        'k_neighbors': k_neighbors,
        'sampling_rate': sampling_rate,
    }
    self.gsmote = GeometricSMOTE(**oversampling_options)
def generate_oversamplers(factor):
    """Build the ``(name, oversampler, param_grid)`` entries of an experiment.

    When ``factor`` is ``None`` only the benchmark entry (no sampler at
    all) is returned. Otherwise every oversampler is wrapped in an
    ``UnderOverSampler`` that receives ``factor``, and the hyperparameter
    grids address the wrapped object through the ``oversampler__`` prefix.
    """
    if factor is None:
        return [('BENCHMARK METHOD', None, {})]

    def wrapped(oversampler):
        # Every entry shares the same undersampling factor.
        return UnderOverSampler(oversampler=oversampler, factor=factor)

    entries = []
    entries.append(('NO OVERSAMPLING', wrapped(None), {}))
    entries.append(('RANDOM OVERSAMPLING', wrapped(RandomOverSampler()), {}))
    entries.append((
        'SMOTE',
        wrapped(SMOTE()),
        {'oversampler__k_neighbors': [3, 5]},
    ))
    entries.append((
        'BORDERLINE SMOTE',
        wrapped(BorderlineSMOTE()),
        {'oversampler__k_neighbors': [3, 5]},
    ))
    entries.append((
        'G-SMOTE',
        wrapped(GeometricSMOTE()),
        {
            'oversampler__k_neighbors': [3, 5],
            'oversampler__selection_strategy': [
                'combined', 'minority', 'majority'
            ],
            'oversampler__truncation_factor': [
                -1.0, -0.5, .0, 0.25, 0.5, 0.75, 1.0
            ],
            'oversampler__deformation_factor': [
                .0, 0.2, 0.4, 0.5, 0.6, 0.8, 1.0
            ],
        },
    ))
    return entries
class MeanClassifier(BaseEstimator, ClassifierMixin):
    """GSOM classifier trained on Geometric-SMOTE oversampled data.

    ``fit`` oversamples the training set, trains a GSOM on it and labels
    the map nodes; ``predict`` delegates to the trained map.

    Parameters
    ----------
    smooth_iteration : int, default=25
        Passed as the third argument of ``GSOM.fit``.
    training_iteration : int, default=50
        Passed as the second argument of ``GSOM.fit``.
    spreading_factor : float, default=0.83
        First positional argument of the GSOM constructor.
    FD : float, default=0.1
        Forwarded to the GSOM constructor as ``FD``.
    learning_rate : float, default=0.3
        Forwarded to the GSOM constructor.
    smooth_learning_factor : float, default=0.8
        Forwarded to the GSOM constructor.
    """

    def __init__(self, smooth_iteration=25, training_iteration=50,
                 spreading_factor=0.83, FD=0.1, learning_rate=0.3,
                 smooth_learning_factor=0.8):
        """Store the hyperparameters and create the initial components."""
        self.smooth_iteration = smooth_iteration
        self.spreading_factor = spreading_factor
        self.training_iteration = training_iteration
        self.FD = FD
        self.learning_rate = learning_rate
        self.smooth_learning_factor = smooth_learning_factor
        # Kept so that ``gsom`` / ``gsmote`` exist right after construction,
        # as they always have.
        self._build_components()

    def _build_components(self):
        """Create a fresh GSOM and oversampler from the current attributes."""
        # NOTE(review): 55 and max_radius=4 are hard-coded; their meaning is
        # defined by the GSOM class, which is not visible here.
        self.gsom = GSOM(self.spreading_factor, 55, max_radius=4,
                         FD=self.FD, learning_rate=self.learning_rate,
                         smooth_learning_factor=self.smooth_learning_factor)
        self.gsmote = GeometricSMOTE(random_state=1, truncation_factor=1.0,
                                     deformation_factor=0, k_neighbors=5,
                                     sampling_rate=0.3)

    def fit(self, X, y):
        """Oversample ``(X, y)``, train the GSOM and label its nodes.

        Returns
        -------
        self
        """
        # Rebuild the components first: hyperparameters changed through
        # ``set_params`` after construction would otherwise never reach the
        # GSOM, and a second ``fit`` would keep training the previous map.
        self._build_components()

        X_train, y_train = self.gsmote.fit_resample(X, y)

        # Two identical columns ("Name", "label") holding the targets.
        # Stacking the string header on top makes NumPy coerce numeric
        # targets to strings; that representation is preserved on purpose
        # because it is what the GSOM labelling has always received.
        header = ["Name", "label"]
        table = np.vstack((header, np.column_stack([y_train, y_train])))
        frame = pd.DataFrame(table[1:, :], columns=table[0, :])

        self.gsom.fit(X_train, self.training_iteration, self.smooth_iteration)
        self.gsom.labelling_gsom(X_train, frame, "Name", "label")
        self.gsom.finalize_gsom_label()
        return self

    def predict(self, X):
        """Return ``self.gsom.predict_values(X)``."""
        return self.gsom.predict_values(X)
class MeanClassifier(BaseEstimator, ClassifierMixin):
    """Gradient boosting classifier trained on G-SMOTE oversampled data.

    Parameters
    ----------
    truncation_factor, deformation_factor, k_neighbors, sampling_rate
        Forwarded to the ``GeometricSMOTE`` constructor.
    n_estimators, learning_rate, max_depth
        Forwarded to the ``GradientBoostingClassifier`` constructor.
    """

    def __init__(self, truncation_factor=1.0, deformation_factor=0.0,
                 k_neighbors=1, sampling_rate=0.3, n_estimators=100,
                 learning_rate=0.01, max_depth=3):
        """Store the hyperparameters and create the initial components."""
        self.truncation_factor = truncation_factor
        self.deformation_factor = deformation_factor
        self.k_neighbors = k_neighbors
        self.sampling_rate = sampling_rate
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        # Kept so that ``regressor`` / ``gsmote`` exist right after
        # construction, as they always have.
        self._build_components()

    def _build_components(self):
        """Create a fresh classifier and oversampler from the attributes."""
        # Named "regressor" for historical reasons; it is a classifier.
        self.regressor = GradientBoostingClassifier(
            n_estimators=self.n_estimators,
            learning_rate=self.learning_rate,
            max_depth=self.max_depth)
        self.gsmote = GeometricSMOTE(
            random_state=1,
            truncation_factor=self.truncation_factor,
            deformation_factor=self.deformation_factor,
            k_neighbors=self.k_neighbors,
            sampling_rate=self.sampling_rate)

    def fit(self, X, y):
        """Oversample ``(X, y)`` and fit the classifier on the result.

        Returns
        -------
        self
        """
        # Rebuild the components first. A grid search applies candidate
        # values with ``set_params`` on an already constructed clone, which
        # only updates the attributes; without this step every candidate
        # would be fitted with the components created in ``__init__``.
        self._build_components()

        # Trace of the hyperparameters actually used for this fit.
        print(self.max_depth, self.learning_rate, self.n_estimators,
              self.sampling_rate, self.k_neighbors, self.deformation_factor,
              self.truncation_factor)

        X_train, y_train = self.gsmote.fit_resample(X, y)
        self.regressor.fit(X_train, y_train)
        return self

    def predict(self, y):
        """Predict labels for the feature matrix passed as ``y``.

        The parameter name is kept for backward compatibility even though
        it receives the samples to classify, not target values.
        """
        return self.regressor.predict(y)
def model_fit(X_train, y_train, X_test, y_test):
    """Fit three logistic-regression pipelines and tabulate their scores.

    The pipelines differ only in their first step: no resampling, SMOTE,
    or Geometric SMOTE (both with ``k_neighbors=1``). Each one is fitted
    on the training data and scored on the test data.

    Returns
    -------
    pandas.DataFrame
        One row per pipeline name, with a single ``"score"`` column.
    """
    def logistic():
        # A separate classifier instance for every pipeline.
        return LogisticRegression(solver="liblinear")

    pipelines = {}
    pipelines["no_oversampling"] = Pipeline([("none", None), ("lr", logistic())])
    pipelines["smote"] = Pipeline(
        [("smote", SMOTE(k_neighbors=1)), ("lr", logistic())]
    )
    pipelines["gsmote"] = Pipeline(
        [("gsmote", GeometricSMOTE(k_neighbors=1)), ("lr", logistic())]
    )

    scores = {}
    for label in pipelines:
        pipeline = pipelines[label]
        pipeline.fit(X_train, y_train)
        scores[label] = pipeline.score(X_test, y_test)

    return pd.DataFrame(data=scores.values(), index=scores.keys(),
                        columns=["score"])
plt.ylim([0.9, 1.0]) ############################################################################### # Low Imbalance Ratio or high Samples to Features Ratio ############################################################################### ############################################################################### # When :math:`\text{IR} = \frac{\text{\# majority samples}}{\text{\# minority # samples}}` (Imbalance Ratio) is low or :math:`\text{SFR} = \frac{\text{\# # samples}}{\text{\# features}}` (Samples to Features Ratio) is high then the # minority selection strategy and higher absolute values of the truncation and # deformation factors dominate as optimal hyperparameters. X, y = generate_imbalanced_data([0.3, 0.7], 2000, 6, 4) gsmote_gbc = make_pipeline(GeometricSMOTE(random_state=RANDOM_STATE), DecisionTreeClassifier(random_state=RANDOM_STATE)) scoring_name = 'Geometric Mean Score' validation_curve_info = generate_validation_curve_info(gsmote_gbc, X, y, range(1, 8), "geometricsmote__k_neighbors", SCORER) plot_validation_curve(validation_curve_info, scoring_name, 'K Neighbors') validation_curve_info = generate_validation_curve_info(gsmote_gbc, X, y, np.linspace(-1.0, 1.0, 9), "geometricsmote__truncation_factor", SCORER) plot_validation_curve(validation_curve_info, scoring_name, 'Truncation Factor') validation_curve_info = generate_validation_curve_info(gsmote_gbc, X, y, np.linspace(0.0, 1.0, 5), "geometricsmote__deformation_factor", SCORER) plot_validation_curve(validation_curve_info, scoring_name, 'Deformation Factor') validation_curve_info = generate_validation_curve_info(gsmote_gbc, X, y, ['minority', 'majority', 'combined'], "geometricsmote__selection_strategy", SCORER) plot_validation_curve(validation_curve_info, scoring_name, 'Selection Strategy')
def runSMOTEvariationsGen(self, folder):
    """
    Write an oversampled copy of every training fold for each SMOTE variant.

    For each dataset and each of the 5 folds, the file
    ``<folder>/<dataset>/<fold>/<dataset>_train.csv`` is read (comma
    separated, last column is the target), resampled with SMOTE,
    Borderline-SMOTE 1 and 2, SVM-SMOTE and Geometric SMOTE, and every
    result is written next to it as a header-less CSV without index.

    ``datasets`` is not a parameter: it is resolved from the enclosing
    scope, which is not visible from this method.

    :param folder: cross-validation folders.
    :return: None
    """
    # (console label, output file suffix, sampler), in processing order.
    # All samplers are driven through ``fit_resample``: ``fit_sample`` is
    # the deprecated alias that recent imbalanced-learn releases removed,
    # and the Geometric SMOTE step already relied on ``fit_resample``.
    variants = [
        ("SMOTE...", "_SMOTE.csv", SMOTE()),
        ("Borderline1...", "_Borderline1.csv",
         BorderlineSMOTE(kind='borderline-1')),
        ("Borderline2...", "_Borderline2.csv",
         BorderlineSMOTE(kind='borderline-2')),
        ("SMOTE SVM...", "_smoteSVM.csv", SVMSMOTE()),
        ("GEOMETRIC SMOTE...", "_Geometric_SMOTE.csv",
         GeometricSMOTE(n_jobs=-1)),
    ]

    for dataset in datasets:  # binary and multiclass datasets
        for fold in range(5):
            fold_dir = os.path.join(folder, dataset, str(fold))
            train = np.genfromtxt(
                os.path.join(fold_dir, ''.join([dataset, "_train.csv"])),
                delimiter=',')
            # Features are every column but the last; the last is the target.
            X = train[:, 0:train.shape[1] - 1]
            Y = train[:, train.shape[1] - 1]

            for label, suffix, sampler in variants:
                print(label + dataset)
                X_res, y_res = sampler.fit_resample(X, Y)
                # Column vector so the target can be appended to X_res.
                y_res = y_res.reshape(len(y_res), 1)
                newtrain = pd.DataFrame(np.hstack([X_res, y_res]))
                newtrain.to_csv(
                    os.path.join(fold_dir, ''.join([dataset, suffix])),
                    header=False, index=False)
def parse_input_zoo_data(filename, header='infer'):
    """Load a dataset, hold out a test split and oversample the rest.

    The file is read through ``pp.preProcess``, 20% of the samples are
    held out with a fixed seed, and only the remaining samples are
    oversampled with Geometric SMOTE.

    Parameters
    ----------
    filename
        Passed unchanged to ``pp.preProcess``.
    header
        Accepted for backward compatibility; not used by this function.

    Returns
    -------
    tuple
        ``(input_database, labels, classes, X_test, y_test)`` where
        ``input_database`` is ``{0: X_train}`` and ``labels`` / ``classes``
        are two separate lists holding the oversampled targets.
    """
    oversampler = GeometricSMOTE(random_state=1)

    features, targets = pp.preProcess(filename)
    X_t, X_test, y_t, y_test = train_test_split(
        features, targets, test_size=0.2, random_state=0)

    # The held-out split is returned untouched; only the training part
    # goes through the oversampler.
    X_train, y_train = oversampler.fit_resample(X_t, y_t)

    labels = y_train.tolist()
    classes = y_train.tolist()
    return {0: X_train}, labels, classes, X_test, y_test
############################################################################### # Truncation factor #.............................................................................. # # The hyperparameter ``truncation_factor`` determines the degree of truncation # that is applied on the initial geometric area. Selecting the values of # geometric hyperparameters as `truncation_factor=0.0`, # ``deformation_factor=0.0`` and ``selection_strategy='minority'``, the data # generation area in 2D corresponds to a circle with center as one of the two # minority class samples and radius equal to the distance between them. In the # multi-dimensional case the corresponding area is a hypersphere. When # truncation factor is increased, the hypersphere is truncated and for # ``truncation_factor=1.0`` becomes a half-hypersphere. Negative values of # ``truncation_factor`` have a similar effect but on the opposite direction. gsmote = GeometricSMOTE(k_neighbors=1, deformation_factor=0.0, selection_strategy='minority', random_state=RANDOM_STATE) truncation_factors = np.array([0.0, 0.2, 0.4, 0.6, 0.8, 1.0]) n_subplots = [2, 3] plot_hyperparameters(gsmote, X, y, 'truncation_factor', truncation_factors, n_subplots) plot_hyperparameters(gsmote, X, y, 'truncation_factor', -truncation_factors, n_subplots) ############################################################################### # Deformation factor #.............................................................................. # # When the ``deformation_factor`` is increased, the data generation area deforms # to an ellipsis and for ``deformation_factor=1.0`` becomes a line segment. gsmote = GeometricSMOTE(k_neighbors=1, truncation_factor=0.0, selection_strategy='minority', random_state=RANDOM_STATE) deformation_factors = np.array([0.0, 0.2, 0.4, 0.6, 0.8, 1.0])
# # Below we use the Random Forest Classifier to predict the forest type of each # patch of forest. Two experiments are ran: One using only the classifier and # another that creates a pipeline of Geometric SMOTE and the classifier. A # classification report is printed for both experiments. splitted_data = train_test_split(X, y, test_size=0.95, random_state=RANDOM_STATE, shuffle=True) clf = RandomForestClassifier(bootstrap=True, n_estimators=10, random_state=RANDOM_STATE) ovs_clf = make_pipeline(GeometricSMOTE(random_state=RANDOM_STATE), clf) print_classification_report(clf, *splitted_data) print_classification_report(ovs_clf, *splitted_data) ############################################################################### # Indian Pines ############################################################################### ############################################################################### # This hyperspectral data set has 220 spectral bands and 20 m spatial resolution. # The imagery was collected on 12 June 1992 and represents a 2.9 by 2.9 km area # in Tippecanoe County, Indiana, USA. The area is agricultural and eight classes # as land-use types are presented: alfalfa, corn, grass, hay, oats, soybeans, # trees, and wheat. The Indian Pines data set has been used for testing and # comparing algorithms. The number of samples varies greatly among the classes,
def _check_estimators(self, X, y):
    """Create the oversampler, clusterer and distributor for this instance.

    Sets ``oversampler_``, ``clusterer_`` and ``distributor_`` and returns
    ``self``. ``som_estimator`` may be ``None`` (default SOM), an ``int``
    (number of clusters), a ``float`` in [0.0, 1.0] (clusters relative to
    the number of samples in ``X``) or a ``SOM`` instance, which is cloned.
    ``y`` is not used.

    Raises
    ------
    ImportError
        If ``som-learn`` or ``geometric-smote`` is not installed.
    TypeError
        If ``som_estimator`` has an unsupported type.
    """
    # Optional dependencies are imported only when this method runs.
    try:
        from somlearn import SOM
    except ImportError:
        raise ImportError(
            'GeometricSOMO class requires the package `som-learn` to be installed.'
        )
    try:
        from gsmote import GeometricSMOTE
    except ImportError:
        raise ImportError(
            'GeometricSOMO class requires the package `geometric-smote` to '
            'be installed.')

    # Oversampler: mirrors the corresponding attributes of this object.
    self.oversampler_ = GeometricSMOTE(
        sampling_strategy=self.sampling_strategy,
        k_neighbors=self.k_neighbors,
        truncation_factor=self.truncation_factor,
        deformation_factor=self.deformation_factor,
        selection_strategy=self.selection_strategy,
        random_state=self.random_state_,
        n_jobs=self.n_jobs,
    )

    def square_som(n_clusters):
        # Square grid whose side is the rounded square root of the
        # requested number of clusters.
        side = round(sqrt(n_clusters))
        return SOM(n_columns=side, n_rows=side,
                   random_state=self.random_state_)

    # Clusterer: depends on the kind of value given as `som_estimator`.
    spec = self.som_estimator
    if spec is None:
        self.clusterer_ = SOM(random_state=self.random_state_)
    elif isinstance(spec, int):
        check_scalar(spec, 'som_estimator', int, min_val=1)
        self.clusterer_ = square_som(spec)
    elif isinstance(spec, float):
        check_scalar(spec, 'som_estimator', float, min_val=0.0, max_val=1.0)
        # Maps 0.0 to one cluster and 1.0 to one cluster per sample.
        self.clusterer_ = square_som((X.shape[0] - 1) * spec + 1)
    elif isinstance(spec, SOM):
        self.clusterer_ = clone(spec)
    else:
        raise TypeError('Parameter `som_estimator` should be '
                        'either `None` or the number of clusters '
                        'or a float in the [0.0, 1.0] range equal to'
                        ' the number of clusters over the number of '
                        'samples or an instance of the `SOM` class.')

    # Distributor.
    self.distributor_ = DensityDistributor(
        filtering_threshold=self.imbalance_ratio_threshold,
        distances_exponent=self.distances_exponent,
        distribution_ratio=self.distribution_ratio,
    )
    return self
def load_best_classifier(self, X, y): scores = [] for _, classifier in self.classifiers_: scores.append(self.evaluation_metric_(classifier, X, y)) self.classifier_ = self.classifiers_[np.argmax(scores)][-1] return self def predict(self, X): return self.classifier_.predict(X) CONFIG = { 'oversamplers': [ #('NONE', None, {}), ('G-SMOTE', ClusterOverSampler(GeometricSMOTE(), n_jobs=1), {}) # 'oversampler__k_neighbors': [3, 5], # 'oversampler__selection_strategy': ['combined', 'minority', 'majority'], # 'oversampler__truncation_factor': [-1.0, .0, 1.0], # 'oversampler__deformation_factor': [.0, 0.5, 1.0] # }) ], 'classifiers': [ ('LR', LogisticRegression(multi_class='multinomial', solver='sag', penalty='none', max_iter=1e4), {}), ('KNN', KNeighborsClassifier(), { 'n_neighbors': [3] }), #, 5, 8]}),
############################################################################### # Low Imbalance Ratio or high Samples to Features Ratio ############################################################################### ############################################################################### # When :math:`\text{IR} = \frac{\text{\# majority samples}}{\text{\# minority # samples}}` (Imbalance Ratio) is low or :math:`\text{SFR} = \frac{\text{\# # samples}}{\text{\# features}}` (Samples to Features Ratio) is high then the # minority selection strategy and higher absolute values of the truncation and # deformation factors dominate as optimal hyperparameters. X, y = generate_imbalanced_data([0.3, 0.7], 2000, 6, 4) gsmote_gbc = make_pipeline( GeometricSMOTE(random_state=RANDOM_STATE), DecisionTreeClassifier(random_state=RANDOM_STATE), ) scoring_name = 'Geometric Mean Score' validation_curve_info = generate_validation_curve_info( gsmote_gbc, X, y, range(1, 8), "geometricsmote__k_neighbors", SCORER) plot_validation_curve(validation_curve_info, scoring_name, 'K Neighbors') validation_curve_info = generate_validation_curve_info( gsmote_gbc, X, y, np.linspace(-1.0, 1.0, 9), "geometricsmote__truncation_factor", SCORER,
'learning_rate': [0.01], 'max_depth': [3] }] gs = GridSearchCV(MeanClassifier(), parameters) gs.fit(X, y) params = gs.best_params_ print(params) #find performance X_t, X_test, y_t, y_test = train_test_split(X, y, test_size=0.2, random_state=0) gsmote = GeometricSMOTE(random_state=1, truncation_factor=params["truncation_factor"], deformation_factor=params["deformation_factor"], k_neighbors=params["k_neighbors"], sampling_rate=params["sampling_rate"]) X_train, y_train = gsmote.fit_resample(X_t, y_t) # Fitting Gradient boosting gbc = GradientBoostingClassifier(n_estimators=params["n_estimators"], learning_rate=params["learning_rate"], max_depth=params["max_depth"]) gbc.fit(X_train, y_train) # Predicting the Test set results y_predict = gbc.predict(X_test) y_pred = np.where(y_predict.astype(int) > 0.5, 1, 0) evaluate("Gradient Boosting", y_test, y_pred)
X_largest, y_largest, X_smallest, y_smallest = X2, y2, X1, y1 intersecting_vals = np.in1d(X_largest, X_smallest).reshape(X_largest.shape) disjoin_indexes = np.where(~np.all(intersecting_vals, axis=1))[0] return X_largest.iloc[disjoin_indexes], y_largest.iloc[disjoin_indexes] for strategy in ["combined", "majority", "minority"]: X_gsmote_final = np.empty(shape=(0, X_train.shape[-1])) y_gsmote_final = np.empty(shape=(0)) for d in [0, 0.5, 1]: for t in [-1, 0, 1]: gsmote_sampling = GeometricSMOTE( k_neighbors=1, deformation_factor=d, truncation_factor=t, n_jobs=-1, selection_strategy=strategy, ).fit_resample(X_train, y_train) X_gsmote, _ = get_disjoin(X_train, y_train, gsmote_sampling[0], gsmote_sampling[1]) X_gsmote_final = np.append(X_gsmote_final, X_gsmote, axis=0) y_gsmote_final = np.append(y_gsmote_final, np.array([f"t={t}, d={d}"] * X_gsmote.shape[0]), axis=0) plot_mnist_samples( pd.DataFrame(X_gsmote_final), pd.Series(y_gsmote_final), f"Generated Using G-SMOTE: {strategy}", )
"""Geometric mean score with macro average.""" return geometric_mean_score(y_true, y_pred, average='macro') SCORERS['geometric_mean_score_macro'] = make_scorer(geometric_mean_score_macro) CONFIG = { 'oversamplers': [('NONE', None, {}), ('ROS', RandomOverSampler(), {}), ('SMOTE', SMOTE(), { 'k_neighbors': [3, 5] }), ('B-SMOTE', BorderlineSMOTE(), { 'k_neighbors': [3, 5] }), ('ADASYN', ADASYN(), { 'n_neighbors': [2, 3] }), ('G-SMOTE', GeometricSMOTE(), { 'k_neighbors': [3, 5], 'selection_strategy': ['combined', 'minority', 'majority'], 'truncation_factor': [-1.0, -0.5, .0, 0.25, 0.5, 0.75, 1.0], 'deformation_factor': [.0, 0.2, 0.4, 0.5, 0.6, 0.8, 1.0] })], 'classifiers': [ ('CONSTANT CLASSIFIER', DummyClassifier(strategy='constant', constant=0), {}), ('LR', LogisticRegression(solver='liblinear', multi_class='auto'), {}), ('KNN', KNeighborsClassifier(), { 'n_neighbors': [3, 5] }), ('DT', DecisionTreeClassifier(), { 'max_depth': [3, 6] }), ('GBC', GradientBoostingClassifier(), {
def runSMOTEvariationsGen(self, folder):
    """
    Write an oversampled copy of every training fold for each SMOTE variant.

    For each dataset and each of the 5 folds, the file
    ``<folder>/<dataset>/<fold>/<dataset>_train.csv`` is read (comma
    separated, last column is the target) and handed to ``dtosmoter``
    once per oversampler: SMOTE, Borderline-SMOTE 1 and 2, SVM-SMOTE and
    Geometric SMOTE. Every result is written next to the input as a
    header-less CSV without index.

    ``datasets`` is not a parameter: it is resolved from the enclosing
    scope, which is not visible from this method.

    :param folder: cross-validation folders.
    :return: None
    """
    # (console label, output file suffix, oversampler), in processing order.
    variants = (
        ("SMOTE...", "_SMOTE.csv", SMOTE()),
        ("Borderline1...", "_Borderline1.csv",
         BorderlineSMOTE(kind='borderline-1')),
        ("Borderline2...", "_Borderline2.csv",
         BorderlineSMOTE(kind='borderline-2')),
        ("SMOTE SVM...", "_smoteSVM.csv", SVMSMOTE()),
        ("GEOMETRIC SMOTE...", "_Geometric_SMOTE.csv",
         GeometricSMOTE(n_jobs=-1)),
    )

    for dataset in datasets:
        for fold in range(5):
            fold_dir = os.path.join(folder, dataset, str(fold))
            train = np.genfromtxt(
                os.path.join(fold_dir, ''.join([dataset, "_train.csv"])),
                delimiter=',')
            n_columns = train.shape[1]
            X = train[:, 0:n_columns - 1]
            Y = train[:, n_columns - 1]
            # Column vector so the target can be appended to the features.
            Y = Y.reshape(len(Y), 1)

            for label, suffix, oversampler in variants:
                print(label + dataset)
                # A fresh frame per oversampler, so no call sees data
                # touched by a previous one.
                frame = pd.DataFrame(np.hstack([X, Y]))
                frame.columns = frame.columns.astype(str)
                target_column = list(frame.columns)[-1]
                resampled = dtosmoter(
                    data=frame,
                    y=target_column,
                    oversampler=oversampler
                )
                resampled.to_csv(
                    os.path.join(fold_dir, ''.join([dataset, suffix])),
                    header=False, index=False)