예제 #1
0
 def __init__(self,
              smooth_iteration=25,
              training_iteration=50,
              spreading_factor=0.83,
              FD=0.1,
              learning_rate=0.3,
              smooth_learning_factor=0.8):
     """
     Record the hyperparameters and create the GSOM and the oversampler.

     Each argument is stored unchanged on the instance under its own name.
     ``spreading_factor``, ``FD``, ``learning_rate`` and
     ``smooth_learning_factor`` are forwarded to ``GSOM``; the two iteration
     counts are only stored here. The ``GeometricSMOTE`` oversampler is
     created with fixed settings that do not depend on the arguments.
     """
     # Iteration counts are only stored; they are not passed to GSOM here.
     self.training_iteration = training_iteration
     self.smooth_iteration = smooth_iteration

     self.spreading_factor = spreading_factor
     self.FD = FD
     self.learning_rate = learning_rate
     self.smooth_learning_factor = smooth_learning_factor

     # NOTE(review): 55 is GSOM's second positional argument -- presumably
     # the input dimensionality; confirm against the GSOM signature.
     gsom_options = dict(max_radius=4,
                         FD=self.FD,
                         learning_rate=self.learning_rate,
                         smooth_learning_factor=self.smooth_learning_factor)
     self.gsom = GSOM(self.spreading_factor, 55, **gsom_options)

     oversampler_options = dict(random_state=1,
                                truncation_factor=1.0,
                                deformation_factor=0,
                                k_neighbors=5,
                                sampling_rate=0.3)
     self.gsmote = GeometricSMOTE(**oversampler_options)
예제 #2
0
def OverSample(X, y):
    """Oversample ``(X, y)`` with Geometric SMOTE and report class counts.

    The class distribution of ``y`` is printed before resampling and the
    distribution of the resampled targets is printed afterwards. The
    oversampler is seeded with ``random_state=1``.

    Returns the resampled ``(X_res, y_res)`` pair.
    """
    counts_before = Counter(y)
    print('Original dataset shape %s' % counts_before)

    oversampler = GeometricSMOTE(random_state=1)
    X_res, y_res = oversampler.fit_resample(X, y)

    counts_after = Counter(y_res)
    print('Resampled dataset shape %s' % counts_after)

    return X_res, y_res
예제 #3
0
파일: grid_search.py 프로젝트: xzsl/pygsom
 def __init__(self,
              truncation_factor=1.0,
              deformation_factor=0.0,
              k_neighbors=1,
              sampling_rate=0.3,
              n_estimators=100,
              learning_rate=0.01,
              max_depth=3):
     """
     Record the hyperparameters and create the classifier and oversampler.

     Each argument is stored unchanged on the instance under its own name.
     ``n_estimators``, ``learning_rate`` and ``max_depth`` configure the
     ``GradientBoostingClassifier`` kept in ``self.regressor``; the other
     four configure the ``GeometricSMOTE`` kept in ``self.gsmote``, which
     is seeded with ``random_state=1``.
     """
     # Oversampler hyperparameters.
     self.truncation_factor = truncation_factor
     self.deformation_factor = deformation_factor
     self.k_neighbors = k_neighbors
     self.sampling_rate = sampling_rate

     # Classifier hyperparameters.
     self.n_estimators = n_estimators
     self.learning_rate = learning_rate
     self.max_depth = max_depth

     boosting_options = dict(n_estimators=self.n_estimators,
                             learning_rate=self.learning_rate,
                             max_depth=self.max_depth)
     self.regressor = GradientBoostingClassifier(**boosting_options)

     oversampler_options = dict(random_state=1,
                                truncation_factor=self.truncation_factor,
                                deformation_factor=self.deformation_factor,
                                k_neighbors=self.k_neighbors,
                                sampling_rate=self.sampling_rate)
     self.gsmote = GeometricSMOTE(**oversampler_options)
예제 #4
0
def generate_oversamplers(factor):
    """Build the ``(name, oversampler, parameter grid)`` triples to compare.

    With ``factor=None`` a single benchmark entry without any oversampler is
    returned. Otherwise each oversampler is wrapped in an ``UnderOverSampler``
    configured with ``factor``, and the grid keys address the wrapped
    oversampler through the ``oversampler__`` prefix.
    """
    if factor is None:
        return [('BENCHMARK METHOD', None, {})]

    def wrap(oversampler):
        # Every entry shares the same wrapper and the same factor.
        return UnderOverSampler(oversampler=oversampler, factor=factor)

    no_oversampling = ('NO OVERSAMPLING', wrap(None), {})
    random_oversampling = ('RANDOM OVERSAMPLING', wrap(RandomOverSampler()), {})
    smote = ('SMOTE', wrap(SMOTE()), {'oversampler__k_neighbors': [3, 5]})
    borderline_smote = (
        'BORDERLINE SMOTE',
        wrap(BorderlineSMOTE()),
        {'oversampler__k_neighbors': [3, 5]},
    )
    geometric_smote = (
        'G-SMOTE',
        wrap(GeometricSMOTE()),
        {
            'oversampler__k_neighbors': [3, 5],
            'oversampler__selection_strategy':
            ['combined', 'minority', 'majority'],
            'oversampler__truncation_factor':
            [-1.0, -0.5, .0, 0.25, 0.5, 0.75, 1.0],
            'oversampler__deformation_factor':
            [.0, 0.2, 0.4, 0.5, 0.6, 0.8, 1.0],
        },
    )
    return [
        no_oversampling,
        random_oversampling,
        smote,
        borderline_smote,
        geometric_smote,
    ]
예제 #5
0
class MeanClassifier(BaseEstimator, ClassifierMixin):
    """GSOM-based classifier trained on Geometric-SMOTE-resampled data.

    ``fit`` oversamples the training data with ``GeometricSMOTE``, trains a
    ``GSOM`` on the result and labels it; ``predict`` delegates to the
    trained ``GSOM``.

    The ``gsom`` and ``gsmote`` attributes exist right after construction
    and are rebuilt at the start of every ``fit`` call. Rebuilding matters
    because ``set_params`` (used by ``GridSearchCV`` on cloned estimators)
    only updates the plain attributes; without it the map would always be
    trained with the values seen by ``__init__``.
    """

    def __init__(self,
                 smooth_iteration=25,
                 training_iteration=50,
                 spreading_factor=0.83,
                 FD=0.1,
                 learning_rate=0.3,
                 smooth_learning_factor=0.8):
        """
        Store the hyperparameters and create the initial components.

        Each argument is stored unchanged on the instance under its own
        name, as scikit-learn's ``get_params``/``set_params`` expect.
        """
        self.smooth_iteration = smooth_iteration
        self.spreading_factor = spreading_factor
        self.training_iteration = training_iteration
        self.FD = FD
        self.learning_rate = learning_rate
        self.smooth_learning_factor = smooth_learning_factor
        # Kept so ``gsom``/``gsmote`` are available right after construction.
        self._build_components()

    def _build_components(self):
        """Create ``gsom`` and ``gsmote`` from the current parameter values."""
        # NOTE(review): 55 is GSOM's second positional argument -- presumably
        # the input dimensionality; confirm against the GSOM signature.
        self.gsom = GSOM(self.spreading_factor,
                         55,
                         max_radius=4,
                         FD=self.FD,
                         learning_rate=self.learning_rate,
                         smooth_learning_factor=self.smooth_learning_factor)
        self.gsmote = GeometricSMOTE(random_state=1,
                                     truncation_factor=1.0,
                                     deformation_factor=0,
                                     k_neighbors=5,
                                     sampling_rate=0.3)

    def fit(self, X, y):
        """Oversample ``(X, y)``, then train and label the GSOM.

        Returns ``self``.
        """
        # Pick up values applied through ``set_params`` after construction
        # and start from an untrained map on every fit.
        self._build_components()

        X_train, y_train = self.gsmote.fit_resample(X, y)

        # The labelling step takes a frame with a "Name" and a "label"
        # column; the resampled targets fill both. Stacking the header row on
        # top makes NumPy coerce the array to one common dtype -- kept as-is
        # because the GSOM labelling may rely on that representation.
        y1 = np.copy(y_train)
        y = np.column_stack([y1, y_train])
        labels = ["Name", "label"]
        y = np.vstack((labels, y))
        frame = pd.DataFrame(y[1:, :], columns=y[0, :])

        self.gsom.fit(X_train, self.training_iteration, self.smooth_iteration)
        self.gsom.labelling_gsom(X_train, frame, "Name", "label")
        self.gsom.finalize_gsom_label()
        return self

    def predict(self, X):
        """Return the GSOM's predicted values for ``X``."""
        return self.gsom.predict_values(X)
예제 #6
0
class MeanClassifier(BaseEstimator, ClassifierMixin):
    """Gradient boosting classifier trained on G-SMOTE-resampled data.

    ``fit`` oversamples the training data with ``GeometricSMOTE`` and fits a
    ``GradientBoostingClassifier`` on the result; ``predict`` delegates to
    that classifier.

    The ``regressor`` and ``gsmote`` attributes exist right after
    construction and are rebuilt at the start of every ``fit`` call.
    Rebuilding matters because ``set_params`` (used by ``GridSearchCV`` on
    cloned estimators) only updates the plain attributes; without it every
    grid point would train the same default-configured models.
    """

    def __init__(self,
                 truncation_factor=1.0,
                 deformation_factor=0.0,
                 k_neighbors=1,
                 sampling_rate=0.3,
                 n_estimators=100,
                 learning_rate=0.01,
                 max_depth=3):
        """
        Store the hyperparameters and create the initial components.

        Each argument is stored unchanged on the instance under its own
        name, as scikit-learn's ``get_params``/``set_params`` expect.
        """
        self.truncation_factor = truncation_factor
        self.deformation_factor = deformation_factor
        self.k_neighbors = k_neighbors
        self.sampling_rate = sampling_rate
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        # Kept so ``regressor``/``gsmote`` are available right after
        # construction.
        self._build_components()

    def _build_components(self):
        """Create ``regressor`` and ``gsmote`` from the current parameters."""
        self.regressor = GradientBoostingClassifier(
            n_estimators=self.n_estimators,
            learning_rate=self.learning_rate,
            max_depth=self.max_depth)
        self.gsmote = GeometricSMOTE(
            random_state=1,
            truncation_factor=self.truncation_factor,
            deformation_factor=self.deformation_factor,
            k_neighbors=self.k_neighbors,
            sampling_rate=self.sampling_rate)

    def fit(self, X, y):
        """Oversample ``(X, y)`` and fit the gradient boosting classifier.

        The current hyperparameters are printed first as a progress trace.

        Returns ``self``.
        """
        # Pick up values applied through ``set_params`` after construction.
        self._build_components()

        print(self.max_depth, self.learning_rate, self.n_estimators,
              self.sampling_rate, self.k_neighbors, self.deformation_factor,
              self.truncation_factor)
        X_train, y_train = self.gsmote.fit_resample(X, y)
        self.regressor.fit(X_train, y_train)
        return self

    def predict(self, y):
        """Return the classifier's predictions.

        The parameter is named ``y`` for backward compatibility, but it holds
        the samples to predict and is passed unchanged to
        ``self.regressor.predict``.
        """
        return self.regressor.predict(y)
def model_fit(X_train, y_train, X_test, y_test):
    """Fit three logistic-regression pipelines and tabulate their scores.

    The pipelines differ only in their first step: no oversampling, ``SMOTE``
    with ``k_neighbors=1``, or ``GeometricSMOTE`` with ``k_neighbors=1``.
    Each one is fitted on ``(X_train, y_train)`` and scored with its
    ``score`` method on ``(X_test, y_test)``.

    Returns a ``DataFrame`` indexed by pipeline name with a single
    ``"score"`` column.
    """
    classifier_dict = {
        "no_oversampling":
        Pipeline([("none", None),
                  ("lr", LogisticRegression(solver="liblinear"))]),
        "smote":
        Pipeline([
            ("smote", SMOTE(k_neighbors=1)),
            ("lr", LogisticRegression(solver="liblinear")),
        ]),
        "gsmote":
        Pipeline([
            ("gsmote", GeometricSMOTE(k_neighbors=1)),
            ("lr", LogisticRegression(solver="liblinear")),
        ]),
    }
    results = {}
    for name, estimator in classifier_dict.items():
        estimator.fit(X_train, y_train)
        results[name] = estimator.score(X_test, y_test)
    # The statement that used to follow this return could never run and has
    # been removed.
    return pd.DataFrame(data=results.values(),
                        index=results.keys(),
                        columns=["score"])


###############################################################################
# Low Imbalance Ratio or high Samples to Features Ratio
###############################################################################

###############################################################################
# When :math:`\text{IR} = \frac{\text{\# majority samples}}{\text{\# minority
# samples}}` (Imbalance Ratio) is low or :math:`\text{SFR} = \frac{\text{\#
# samples}}{\text{\# features}}` (Samples to Features Ratio) is high then the
# minority selection strategy and higher absolute values of the truncation and
# deformation factors dominate as optimal hyperparameters.

X, y = generate_imbalanced_data([0.3, 0.7], 2000, 6, 4)
# Despite the ``gbc`` suffix, the final pipeline step is a decision tree.
gsmote_gbc = make_pipeline(
    GeometricSMOTE(random_state=RANDOM_STATE),
    DecisionTreeClassifier(random_state=RANDOM_STATE),
)

scoring_name = 'Geometric Mean Score'

# One validation curve per G-SMOTE hyperparameter, plotted in this order:
# (values to sweep, pipeline parameter name, x-axis label).
for param_range, param_name, param_label in [
    (range(1, 8), "geometricsmote__k_neighbors", 'K Neighbors'),
    (np.linspace(-1.0, 1.0, 9), "geometricsmote__truncation_factor",
     'Truncation Factor'),
    (np.linspace(0.0, 1.0, 5), "geometricsmote__deformation_factor",
     'Deformation Factor'),
    (['minority', 'majority', 'combined'],
     "geometricsmote__selection_strategy", 'Selection Strategy'),
]:
    validation_curve_info = generate_validation_curve_info(
        gsmote_gbc, X, y, param_range, param_name, SCORER)
    plot_validation_curve(validation_curve_info, scoring_name, param_label)
예제 #9
0
    def runSMOTEvariationsGen(self, folder):
        """
        Write an oversampled copy of every training fold for each SMOTE variant.

        For every dataset name and every fold 0-4 the file
        ``<folder>/<dataset>/<fold>/<dataset>_train.csv`` is loaded; its last
        column is the target and the remaining columns are the features. Each
        oversampler (SMOTE, Borderline-1, Borderline-2, SVM-SMOTE, Geometric
        SMOTE) resamples the fold and the result is written next to the input
        as a header-less, index-less CSV with the target as last column.

        ``datasets`` is not a parameter: it is looked up as a global name
        (presumably defined at module level -- confirm against the module).

        :param folder:   cross-validation folders.
        :return: None
        """
        # (progress label, output-file suffix, oversampler), in run order.
        # All samplers go through ``fit_resample``; the older ``fit_sample``
        # alias is no longer available in current imbalanced-learn releases.
        variants = [
            ("SMOTE...", "_SMOTE.csv", SMOTE()),
            ("Borderline1...", "_Borderline1.csv",
             BorderlineSMOTE(kind='borderline-1')),
            ("Borderline2...", "_Borderline2.csv",
             BorderlineSMOTE(kind='borderline-2')),
            ("SMOTE SVM...", "_smoteSVM.csv", SVMSMOTE()),
            ("GEOMETRIC SMOTE...", "_Geometric_SMOTE.csv",
             GeometricSMOTE(n_jobs=-1)),
        ]

        for dataset in datasets:  # binary and multiclass
            for fold in range(5):
                fold_dir = os.path.join(folder, dataset, str(fold))
                train = np.genfromtxt(
                    os.path.join(fold_dir, dataset + "_train.csv"),
                    delimiter=',')
                X = train[:, :-1]
                Y = train[:, -1]

                for label, suffix, oversampler in variants:
                    print(label + dataset)
                    X_res, y_res = oversampler.fit_resample(X, Y)
                    # Column vector so the target can be appended to X_res.
                    y_res = y_res.reshape(len(y_res), 1)
                    newtrain = pd.DataFrame(np.hstack([X_res, y_res]))
                    newtrain.to_csv(os.path.join(fold_dir, dataset + suffix),
                                    header=False,
                                    index=False)
예제 #10
0
    def parse_input_zoo_data(filename, header='infer'):
        """Load a dataset, hold out a test split and oversample the rest.

        ``pp.preProcess(filename)`` supplies the features and targets. 20% of
        the samples are held out (``random_state=0``) and the remaining 80%
        are oversampled with ``GeometricSMOTE(random_state=1)``.

        ``header`` is accepted but not used.

        Returns ``(input_database, labels, classes, X_test, y_test)``, where
        ``input_database`` maps ``0`` to the oversampled training features
        and ``labels`` and ``classes`` are two separate list copies of the
        oversampled training targets.
        """
        oversampler = GeometricSMOTE(random_state=1)

        features, targets = pp.preProcess(filename)
        X_t, X_test, y_t, y_test = train_test_split(
            features, targets, test_size=0.2, random_state=0)

        # Only the training portion is oversampled; the test split is
        # returned untouched.
        X_train, y_train = oversampler.fit_resample(X_t, y_t)

        classes = y_train.tolist()
        labels = y_train.tolist()
        return {0: X_train}, labels, classes, X_test, y_test
###############################################################################
# Truncation factor
#..............................................................................
#
# The hyperparameter ``truncation_factor`` determines the degree of truncation
# that is applied on the initial geometric area. Selecting the values of
# geometric hyperparameters as ``truncation_factor=0.0``,
# ``deformation_factor=0.0`` and ``selection_strategy='minority'``, the data
# generation area in 2D corresponds to a circle with center as one of the two
# minority class samples and radius equal to the distance between them. In the
# multi-dimensional case the corresponding area is a hypersphere. When
# truncation factor is increased, the hypersphere is truncated and for
# ``truncation_factor=1.0`` becomes a half-hypersphere. Negative values of
# ``truncation_factor`` have a similar effect but in the opposite direction.

gsmote = GeometricSMOTE(
    k_neighbors=1,
    deformation_factor=0.0,
    selection_strategy='minority',
    random_state=RANDOM_STATE,
)
truncation_factors = np.array([0.0, 0.2, 0.4, 0.6, 0.8, 1.0])
n_subplots = [2, 3]
# First the non-negative factors, then the same magnitudes negated.
for factors in (truncation_factors, -truncation_factors):
    plot_hyperparameters(gsmote, X, y, 'truncation_factor', factors,
                         n_subplots)

###############################################################################
# Deformation factor
#..............................................................................
#
# When the ``deformation_factor`` is increased, the data generation area deforms
# to an ellipse and for ``deformation_factor=1.0`` becomes a line segment.

# Truncation is switched off (0.0) so that only the deformation factor changes
# between the oversamplers compared below.
gsmote = GeometricSMOTE(k_neighbors=1, truncation_factor=0.0,
                        selection_strategy='minority', random_state=RANDOM_STATE)
# Deformation factors to sweep, from 0.0 up to 1.0 in steps of 0.2.
deformation_factors = np.array([0.0, 0.2, 0.4, 0.6, 0.8, 1.0])
#
# Below we use the Random Forest Classifier to predict the forest type of each
# patch of forest. Two experiments are run: one using only the classifier and
# another that creates a pipeline of Geometric SMOTE and the classifier. A
# classification report is printed for both experiments.

# 95% of the samples are held out for testing, so only 5% are used for
# fitting.
splitted_data = train_test_split(X,
                                 y,
                                 test_size=0.95,
                                 random_state=RANDOM_STATE,
                                 shuffle=True)

clf = RandomForestClassifier(bootstrap=True,
                             n_estimators=10,
                             random_state=RANDOM_STATE)
# The pipeline wraps the very same ``clf`` object behind a G-SMOTE step.
ovs_clf = make_pipeline(GeometricSMOTE(random_state=RANDOM_STATE), clf)

# Both reports use the same split: classifier alone first, then G-SMOTE
# followed by the classifier.
print_classification_report(clf, *splitted_data)
print_classification_report(ovs_clf, *splitted_data)

###############################################################################
# Indian Pines
###############################################################################

###############################################################################
# This hyperspectral data set has 220 spectral bands and 20 m spatial resolution.
# The imagery was collected on 12 June 1992 and represents a 2.9 by 2.9 km area
# in Tippecanoe County, Indiana, USA. The area is agricultural and eight classes
# as land-use types are presented: alfalfa, corn, grass, hay, oats, soybeans,
# trees, and wheat. The Indian Pines data set has been used for testing and
# comparing algorithms. The number of samples varies greatly among the classes,
예제 #13
0
    def _check_estimators(self, X, y):
        """Create the oversampler, the SOM clusterer and the distributor.

        ``som-learn`` and ``geometric-smote`` are imported lazily; if either
        is missing an ``ImportError`` naming the package is raised.

        ``self.som_estimator`` selects the clusterer:

        * ``None`` -- a default ``SOM``;
        * ``int`` -- validated as >= 1; a square ``SOM`` whose side is the
          rounded square root of that value;
        * ``float`` -- validated within [0.0, 1.0]; a square ``SOM`` whose
          side is the rounded square root of
          ``(n_samples - 1) * som_estimator + 1``;
        * ``SOM`` instance -- a clone of it.

        Anything else raises ``TypeError``. Sets ``oversampler_``,
        ``clusterer_`` and ``distributor_`` and returns ``self``. ``y`` is
        not used.
        """

        # Optional dependencies: fail with an actionable message if missing.
        try:
            from somlearn import SOM
        except ImportError:
            raise ImportError(
                'GeometricSOMO class requires the package `som-learn` to be installed.'
            )
        try:
            from gsmote import GeometricSMOTE
        except ImportError:
            raise ImportError(
                'GeometricSOMO class requires the package `geometric-smote` to '
                'be installed.')

        # Oversampler
        oversampler_options = dict(
            sampling_strategy=self.sampling_strategy,
            k_neighbors=self.k_neighbors,
            truncation_factor=self.truncation_factor,
            deformation_factor=self.deformation_factor,
            selection_strategy=self.selection_strategy,
            random_state=self.random_state_,
            n_jobs=self.n_jobs,
        )
        self.oversampler_ = GeometricSMOTE(**oversampler_options)

        # Clusterer: the int and float forms both lead to a square grid whose
        # side is computed first and used below.
        grid_side = None
        if self.som_estimator is None:
            clusterer = SOM(random_state=self.random_state_)
        elif isinstance(self.som_estimator, int):
            check_scalar(self.som_estimator, 'som_estimator', int, min_val=1)
            grid_side = round(sqrt(self.som_estimator))
        elif isinstance(self.som_estimator, float):
            check_scalar(self.som_estimator,
                         'som_estimator',
                         float,
                         min_val=0.0,
                         max_val=1.0)
            n_samples = X.shape[0]
            grid_side = round(sqrt((n_samples - 1) * self.som_estimator + 1))
        elif isinstance(self.som_estimator, SOM):
            clusterer = clone(self.som_estimator)
        else:
            raise TypeError('Parameter `som_estimator` should be '
                            'either `None` or the number of clusters '
                            'or a float in the [0.0, 1.0] range equal to'
                            ' the number of clusters over the number of '
                            'samples or an instance of the `SOM` class.')

        if grid_side is not None:
            clusterer = SOM(n_columns=grid_side,
                            n_rows=grid_side,
                            random_state=self.random_state_)
        self.clusterer_ = clusterer

        # Distributor
        distributor_options = dict(
            filtering_threshold=self.imbalance_ratio_threshold,
            distances_exponent=self.distances_exponent,
            distribution_ratio=self.distribution_ratio,
        )
        self.distributor_ = DensityDistributor(**distributor_options)

        return self
예제 #14
0
    def load_best_classifier(self, X, y):
        """Select the highest-scoring classifier for ``(X, y)``.

        Every ``(name, classifier)`` pair in ``self.classifiers_`` is scored
        with ``self.evaluation_metric_(classifier, X, y)``. The classifier
        with the largest score (the first one on ties) is stored in
        ``self.classifier_``.

        Returns ``self``.
        """
        scores = [
            self.evaluation_metric_(classifier, X, y)
            for _, classifier in self.classifiers_
        ]
        best_position = np.argmax(scores)
        self.classifier_ = self.classifiers_[best_position][-1]
        return self

    def predict(self, X):
        """Return the predictions of the selected classifier for ``X``."""
        selected = self.classifier_
        return selected.predict(X)


CONFIG = {
    'oversamplers': [
        #('NONE', None, {}),
        ('G-SMOTE', ClusterOverSampler(GeometricSMOTE(), n_jobs=1), {})
        #    'oversampler__k_neighbors': [3, 5],
        #    'oversampler__selection_strategy': ['combined', 'minority', 'majority'],
        #    'oversampler__truncation_factor': [-1.0, .0, 1.0],
        #    'oversampler__deformation_factor': [.0, 0.5, 1.0]
        #    })
    ],
    'classifiers': [
        ('LR',
         LogisticRegression(multi_class='multinomial',
                            solver='sag',
                            penalty='none',
                            max_iter=1e4), {}),
        ('KNN', KNeighborsClassifier(), {
            'n_neighbors': [3]
        }),  #, 5, 8]}),

###############################################################################
# Low Imbalance Ratio or high Samples to Features Ratio
###############################################################################

###############################################################################
# When :math:`\text{IR} = \frac{\text{\# majority samples}}{\text{\# minority
# samples}}` (Imbalance Ratio) is low or :math:`\text{SFR} = \frac{\text{\#
# samples}}{\text{\# features}}` (Samples to Features Ratio) is high then the
# minority selection strategy and higher absolute values of the truncation and
# deformation factors dominate as optimal hyperparameters.

# NOTE(review): the arguments are presumably class weights, sample count,
# feature count and informative-feature count -- confirm against
# generate_imbalanced_data.
X, y = generate_imbalanced_data([0.3, 0.7], 2000, 6, 4)
# Despite the ``gbc`` suffix, the final pipeline step is a decision tree.
gsmote_gbc = make_pipeline(
    GeometricSMOTE(random_state=RANDOM_STATE),
    DecisionTreeClassifier(random_state=RANDOM_STATE),
)

scoring_name = 'Geometric Mean Score'
# Sweep the oversampler's ``k_neighbors`` from 1 to 7; the ``geometricsmote__``
# prefix addresses the G-SMOTE step inside the pipeline.
validation_curve_info = generate_validation_curve_info(
    gsmote_gbc, X, y, range(1, 8), "geometricsmote__k_neighbors", SCORER)
plot_validation_curve(validation_curve_info, scoring_name, 'K Neighbors')

validation_curve_info = generate_validation_curve_info(
    gsmote_gbc,
    X,
    y,
    np.linspace(-1.0, 1.0, 9),
    "geometricsmote__truncation_factor",
    SCORER,
예제 #16
0
파일: grid_search.py 프로젝트: xzsl/pygsom
    'learning_rate': [0.01],
    'max_depth': [3]
}]
# Search the hyperparameter grid and report the best combination found.
gs = GridSearchCV(MeanClassifier(), parameters)
gs.fit(X, y)

params = gs.best_params_
print(params)

# Measure performance on a held-out 20% split using the best parameters.
X_t, X_test, y_t, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)

# Only the training portion is oversampled.
gsmote = GeometricSMOTE(
    random_state=1,
    **{
        key: params[key]
        for key in ("truncation_factor", "deformation_factor", "k_neighbors",
                    "sampling_rate")
    })
X_train, y_train = gsmote.fit_resample(X_t, y_t)

# Fitting Gradient boosting
gbc = GradientBoostingClassifier(
    **{
        key: params[key]
        for key in ("n_estimators", "learning_rate", "max_depth")
    })
gbc.fit(X_train, y_train)

# Predicting the Test set results
y_predict = gbc.predict(X_test)
# Predictions are truncated to int and mapped to 1 where positive, else 0.
y_pred = np.where(y_predict.astype(int) > 0.5, 1, 0)

evaluate("Gradient Boosting", y_test, y_pred)
        X_largest, y_largest, X_smallest, y_smallest = X2, y2, X1, y1

    intersecting_vals = np.in1d(X_largest, X_smallest).reshape(X_largest.shape)
    disjoin_indexes = np.where(~np.all(intersecting_vals, axis=1))[0]
    return X_largest.iloc[disjoin_indexes], y_largest.iloc[disjoin_indexes]


# For every selection strategy, oversample with each of the nine
# (deformation, truncation) combinations and plot the rows kept by
# ``get_disjoin`` in one figure per strategy.
for strategy in ["combined", "majority", "minority"]:
    # Accumulators: one row per kept sample and one "t=..., d=..." tag per row.
    X_gsmote_final = np.empty(shape=(0, X_train.shape[-1]))
    y_gsmote_final = np.empty(shape=(0))
    for d in [0, 0.5, 1]:
        for t in [-1, 0, 1]:
            # ``gsmote_sampling`` is the (X, y) pair returned by fit_resample.
            gsmote_sampling = GeometricSMOTE(
                k_neighbors=1,
                deformation_factor=d,
                truncation_factor=t,
                n_jobs=-1,
                selection_strategy=strategy,
            ).fit_resample(X_train, y_train)
            # NOTE(review): presumably keeps only the synthetic rows, i.e.
            # those of the resampled set not found in the training set --
            # confirm against get_disjoin.
            X_gsmote, _ = get_disjoin(X_train, y_train, gsmote_sampling[0],
                                      gsmote_sampling[1])
            X_gsmote_final = np.append(X_gsmote_final, X_gsmote, axis=0)
            # Tag each kept row with the factors that produced it.
            y_gsmote_final = np.append(y_gsmote_final,
                                       np.array([f"t={t}, d={d}"] *
                                                X_gsmote.shape[0]),
                                       axis=0)
    plot_mnist_samples(
        pd.DataFrame(X_gsmote_final),
        pd.Series(y_gsmote_final),
        f"Generated Using G-SMOTE: {strategy}",
    )
예제 #18
0
    """Geometric mean score with macro average."""
    return geometric_mean_score(y_true, y_pred, average='macro')


# Register the macro-averaged geometric mean as a scorer under a string key.
SCORERS['geometric_mean_score_macro'] = make_scorer(geometric_mean_score_macro)
CONFIG = {
    'oversamplers':
    [('NONE', None, {}), ('ROS', RandomOverSampler(), {}),
     ('SMOTE', SMOTE(), {
         'k_neighbors': [3, 5]
     }), ('B-SMOTE', BorderlineSMOTE(), {
         'k_neighbors': [3, 5]
     }), ('ADASYN', ADASYN(), {
         'n_neighbors': [2, 3]
     }),
     ('G-SMOTE', GeometricSMOTE(), {
         'k_neighbors': [3, 5],
         'selection_strategy': ['combined', 'minority', 'majority'],
         'truncation_factor': [-1.0, -0.5, .0, 0.25, 0.5, 0.75, 1.0],
         'deformation_factor': [.0, 0.2, 0.4, 0.5, 0.6, 0.8, 1.0]
     })],
    'classifiers': [
        ('CONSTANT CLASSIFIER', DummyClassifier(strategy='constant',
                                                constant=0), {}),
        ('LR', LogisticRegression(solver='liblinear', multi_class='auto'), {}),
        ('KNN', KNeighborsClassifier(), {
            'n_neighbors': [3, 5]
        }), ('DT', DecisionTreeClassifier(), {
            'max_depth': [3, 6]
        }),
        ('GBC', GradientBoostingClassifier(), {
예제 #19
0
	def runSMOTEvariationsGen(self, folder):
		"""
		Write an oversampled copy of every training fold for each SMOTE variant.

		For every dataset name and every fold 0-4 the file
		``<folder>/<dataset>/<fold>/<dataset>_train.csv`` is loaded; its last
		column is the target and the remaining columns are the features. For
		each oversampler (SMOTE, Borderline-1, Borderline-2, SVM-SMOTE,
		Geometric SMOTE) the fold is passed to ``dtosmoter`` as a frame with
		string column names, and the result is written next to the input as a
		header-less, index-less CSV.

		``datasets`` is not a parameter: it is looked up as a global name
		(presumably defined at module level -- confirm against the module).

		:param folder:   cross-validation folders.
		:return:
		"""
		# (progress label, output-file suffix, oversampler), in run order.
		variants = (
			("SMOTE...", "_SMOTE.csv", SMOTE()),
			("Borderline1...", "_Borderline1.csv", BorderlineSMOTE(kind='borderline-1')),
			("Borderline2...", "_Borderline2.csv", BorderlineSMOTE(kind='borderline-2')),
			("SMOTE SVM...", "_smoteSVM.csv", SVMSMOTE()),
			("GEOMETRIC SMOTE...", "_Geometric_SMOTE.csv", GeometricSMOTE(n_jobs=-1)),
		)

		for dataset in datasets:
			for fold in range(5):
				fold_dir = os.path.join(folder, dataset, str(fold))
				train = np.genfromtxt(os.path.join(fold_dir, ''.join([dataset, "_train.csv"])), delimiter=',')
				last_column = train.shape[1] - 1
				X = train[:, 0:last_column]
				# Column vector so the target can be appended to X.
				Y = train[:, last_column].reshape(len(train), 1)

				for label, suffix, oversampler in variants:
					print(label + dataset)
					# A fresh frame per variant, exactly as each stanza built its own.
					data_r = pd.DataFrame(np.hstack([X, Y]))
					data_r.columns = data_r.columns.astype(str)
					# The target is the last column of the frame.
					y_name = list(data_r.columns)[-1]

					dtoregression = dtosmoter(
						data=data_r,
						y=y_name,
						oversampler=oversampler
					)
					dtoregression.to_csv(os.path.join(fold_dir, ''.join([dataset, suffix])),
						header=False, index=False)