Example #1
def generate_oversamplers(oversamplers_names):
    "Generate oversamplers."
    oversamplers = [
        ('NO OVERSAMPLING', None, {}),
        ('RANDOM OVERSAMPLING', RandomOverSampler(), {}),
        ('SMOTE', SMOTE(), {
            'k_neighbors': [3, 5]
        }), ('BORDERLINE SMOTE', BorderlineSMOTE(), {
            'k_neighbors': [3, 5]
        }), ('ADASYN', ADASYN(), {
            'n_neighbors': [2, 3]
        }),
        ('G-SMOTE', GeometricSMOTE(), {
            'k_neighbors': [3, 5],
            'selection_strategy': ['combined', 'minority', 'majority'],
            'truncation_factor': [-1.0, -0.5, .0, 0.25, 0.5, 0.75, 1.0],
            'deformation_factor': [.0, 0.2, 0.4, 0.5, 0.6, 0.8, 1.0]
        }),
        ('K-MEANS RANDOM OVERSAMPLING',
         RandomOverSampler(clusterer=KMeans(),
                           distributor=DensityDistributor()),
         {
             'k_neighbors': [3,
                             5],
             'clusterer__n_clusters':
             [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
             'distributor__distances_exponent': [0, 1, 2, 5],
             'distributor__filtering_threshold': [0.0, 0.5, 1.0, 2.0]
         }),
        ('K-MEANS SMOTE',
         SMOTE(clusterer=KMeans(), distributor=DensityDistributor()), {
             'k_neighbors': [3, 5],
             'clusterer__n_clusters':
             [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
             'distributor__distances_exponent': [0, 1, 2, 5],
             'distributor__filtering_threshold': [0.0, 0.5, 1.0, 2.0]
         }),
        ('K-MEANS BORDERLINE SMOTE',
         BorderlineSMOTE(clusterer=KMeans(), distributor=DensityDistributor()),
         {
             'k_neighbors': [3, 5],
             'clusterer__n_clusters':
             [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
             'distributor__distances_exponent': [0, 1, 2, 5],
             'distributor__filtering_threshold': [0.0, 0.5, 1.0, 2.0]
         }),
        ('K-MEANS G-SMOTE',
         GeometricSMOTE(clusterer=KMeans(), distributor=DensityDistributor()),
         {
             'k_neighbors':
             [3, 5],
             'selection_strategy': ['combined', 'minority', 'majority'],
             'truncation_factor': [-1.0, -0.5, .0, 0.25, 0.5, 0.75, 1.0],
             'deformation_factor': [.0, 0.2, 0.4, 0.5, 0.6, 0.8, 1.0],
             'clusterer__n_clusters':
             [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
             'distributor__distances_exponent': [0, 1, 2, 5],
             'distributor__filtering_threshold': [0.0, 0.5, 1.0, 2.0]
         }),
        ('SOMO', SMOTE(clusterer=SOM(), distributor=DensityDistributor()), {
            'k_neighbors': [3, 5],
            'clusterer__n_clusters':
            [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
            'distributor__distances_exponent': [0, 1, 2, 5],
            'distributor__filtering_threshold': [0.0, 0.5, 1.0, 2.0],
            'distributor__distribution_ratio': [0.0, 0.25, 0.5, 0.75, 1.0]
        }),
        ('G-SOMO',
         GeometricSMOTE(clusterer=SOM(), distributor=DensityDistributor()), {
             'k_neighbors': [3, 5],
             'selection_strategy': ['combined', 'minority', 'majority'],
             'truncation_factor': [-1.0, -0.5, .0, 0.25, 0.5, 0.75, 1.0],
             'deformation_factor': [.0, 0.2, 0.4, 0.5, 0.6, 0.8, 1.0],
             'clusterer__n_clusters':
             [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
             'distributor__distances_exponent': [0, 1, 2, 5],
             'distributor__filtering_threshold': [0.0, 0.5, 1.0, 2.0],
             'distributor__distribution_ratio': [0.0, 0.25, 0.5, 0.75, 1.0]
         })
    ]
    if oversamplers_names in ('basic', 'scaled', 'undersampled'):
        oversamplers = select_pipelines(
            oversamplers, ('NO OVERSAMPLING', 'RANDOM OVERSAMPLING', 'SMOTE',
                           'BORDERLINE SMOTE', 'ADASYN', 'G-SMOTE'))
    if oversamplers_names == 'scaled':
        oversamplers = append_transformer(MinMaxScaler(), oversamplers)
    elif oversamplers_names == 'undersampled':
        oversamplers = set_sampling_strategy(
            lambda y: generate_sampling_strategy(y, 1 / 3), oversamplers)
        oversamplers = append_transformer(
            RandomUnderSampler(
                sampling_strategy=lambda y: generate_sampling_strategy(y, 3)),
            oversamplers)

    return oversamplers
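
# Aside (not part of the original function): each entry above is a
# (name, oversampler, hyperparameter grid) triple, so a single configuration can be
# expanded with scikit-learn's ParameterGrid. SMOTE is assumed to come from
# sklearnext.over_sampling, as in the other examples.
from sklearn.model_selection import ParameterGrid
from sklearnext.over_sampling import SMOTE

name, oversampler, param_grid = 'SMOTE', SMOTE(), {'k_neighbors': [3, 5]}
for params in ParameterGrid(param_grid):
    print(name, oversampler.set_params(**params))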
Example #2
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
# Assumption: a sampler-aware Pipeline (imbalanced-learn's) is needed to chain SMOTE
# with a classifier.
from imblearn.pipeline import Pipeline
from sklearnext.cluster import KMeans
from sklearnext.over_sampling import SMOTE
from sklearnext.over_sampling.base import DensityDistributor
from sklearnext.tools import report_model_search_results

# Load data
X, y = make_classification(n_informative=15,
                           n_clusters_per_class=3,
                           weights=[0.9, 0.1])

# Define estimators
estimators = [('GBC', GradientBoostingClassifier()),
              ('SMOTE+GBC',
               Pipeline([('smote', SMOTE()),
                         ('gbc', GradientBoostingClassifier())])),
              ('KMeanSMOTE+GBC',
               Pipeline([('smote',
                          SMOTE(clusterer=KMeans(n_init=1),
                                distributor=DensityDistributor())),
                         ('gbc', GradientBoostingClassifier())]))]

# Define parameters grid
param_grids = [{
    'SMOTE+GBC__smote__k_neighbors': [2, 3, 4, 5],
    'SMOTE+GBC__gbc__max_depth': [2, 4]
}, {
    'KMeanSMOTE+GBC__smote__k_neighbors': [2, 3, 4, 5],
    'KMeanSMOTE+GBC__smote__clusterer__n_clusters':
    [0.1, 0.3, 0.5, 0.7, 0.9, 1.0],
    'KMeanSMOTE+GBC__smote__distributor__filtering_threshold':
    [0.8, 1.0, 1.2, 1.5],
    'KMeanSMOTE+GBC__gbc__max_depth': [2, 4]
}]
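
# Aside (not part of the original snippet): one pipeline from `estimators` can be tuned
# with plain scikit-learn GridSearchCV by dropping the leading estimator-name prefix
# from the parameter names; the scoring and cv values below are arbitrary choices.
from sklearn.model_selection import GridSearchCV

pipeline = dict(estimators)['SMOTE+GBC']
param_grid = {'smote__k_neighbors': [2, 3, 4, 5], 'gbc__max_depth': [2, 4]}
search = GridSearchCV(pipeline, param_grid, scoring='f1', cv=3).fit(X, y)
print(search.best_params_)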
Example #3
from os.path import dirname, join

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearnext.cluster import KMeans
from sklearnext.tools import evaluate_binary_imbalanced_experiments, read_csv_dir, summarize_binary_datasets
from sklearnext.over_sampling import RandomOverSampler, DensityDistributor

# Paths
datasets_path = join(dirname(__file__), '..', '..', 'data', 'binary-numerical-imbalanced')
results_path = join(dirname(__file__), '..', '..', 'data', 'results', 'kmeans-oversampling', 'random-oversampler')

# Oversamplers and classifiers
oversamplers = [
    ('NO OVERSAMPLING', None),
    ('RANDOM OVERSAMPLING', RandomOverSampler(random_state=0)),
    ('K-MEANS RANDOM OVERSAMPLING',
     RandomOverSampler(clusterer=KMeans(random_state=1, n_init=1),
                       distributor=DensityDistributor(),
                       random_state=0), {
        'clusterer__n_clusters': [0.0, 0.25, 0.5, 0.75, 1.0],
        'distributor__distances_exponent': [0, 1, 2],
        'distributor__filtering_threshold': [0.5, 1.0]
        }
    )
]
classifiers = [
    ('LR', LogisticRegression()),
    ('KNN', KNeighborsClassifier(), {'n_neighbors': [3, 5]}),
    ('DT', DecisionTreeClassifier(random_state=2), {'max_depth': [3, 6]}),
    ('GBC', GradientBoostingClassifier(random_state=3), {'max_depth': [3, 6], 'n_estimators': [50, 100]})
]

# Load datasets
imbalanced_datasets = read_csv_dir(datasets_path)
Example #4
from collections import OrderedDict
from itertools import product

import pytest
import numpy as np

from sklearnext.cluster import KMeans

from ...over_sampling import RandomOverSampler, SMOTE, GeometricSMOTE
from ...cluster import SOM
from ...utils.validation import _TrivialOversampler
from ..distribution import DensityDistributor

X = np.array(list(product(range(5), range(4))))
y = np.array([0] * 10 + [1] * 6 + [2] * 4)
LABELS = np.array([0, 1, 1, 1, 0, 2, 2, 2, 0, 2, 2, 2, 0, 3, 3, 3, 0, 3, 3, 3])
NEIGHBORS = [(0, 1), (0, 2), (0, 3), (1, 2), (2, 3)]
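# Note on the fixtures above: X is a 5x4 grid of 20 points; y marks 10 samples as the
# majority class (0) and 6 and 4 samples as two minority classes; LABELS and NEIGHBORS
# appear to be a per-sample cluster assignment and the pairs of adjacent clusters used
# elsewhere in this test module.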


@pytest.mark.parametrize('clusterer', [None, KMeans(), SOM()])
def test_fit(clusterer):
    """Test the fit method of the extended base oversampler."""
    oversampler = _TrivialOversampler(clusterer=clusterer).fit(X, y)
    assert oversampler.sampling_strategy_ == OrderedDict({1: 4, 2: 6})


@pytest.mark.parametrize('clusterer', [None, KMeans(), SOM()])
def test_fit_resample(clusterer):
    """Test the fit and resample method of the extended base oversampler."""
    oversampler = _TrivialOversampler(clusterer=clusterer)
    X_resampled, y_resampled = oversampler.fit_resample(X, y)
    assert hasattr(oversampler, 'distributor_')
    assert hasattr(oversampler.distributor_, 'intra_distribution_')
    assert hasattr(oversampler.distributor_, 'inter_distribution_')
    if isinstance(clusterer, SOM):
        # Assumption: with a SOM-based clusterer, resampling should still return at
        # least the original samples.
        assert len(X_resampled) >= len(X)
Example #5
from os.path import dirname, join

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearnext.cluster import KMeans
from sklearnext.tools import evaluate_binary_imbalanced_experiments, read_csv_dir, summarize_binary_datasets
from sklearnext.over_sampling import SMOTE, DensityDistributor

# Paths
datasets_path = join(dirname(__file__), '..', '..', 'data',
                     'binary-numerical-imbalanced')
results_path = join(dirname(__file__), '..', '..', 'data', 'results',
                    'kmeans-oversampling', 'smote')

# Oversamplers and classifiers
oversamplers = [('NO OVERSAMPLING', None),
                ('SMOTE', SMOTE(random_state=0), {
                    'k_neighbors': [3, 5]
                }),
                ('K-MEANS SMOTE',
                 SMOTE(clusterer=KMeans(random_state=1, n_init=1),
                       distributor=DensityDistributor(),
                       random_state=0), {
                           'k_neighbors': [3, 5],
                           'clusterer__n_clusters':
                           [0.0, 0.25, 0.5, 0.75, 1.0],
                           'distributor__distances_exponent': [0, 1, 2],
                           'distributor__filtering_threshold': [0.5, 1.0]
                       })]
classifiers = [('LR', LogisticRegression()),
               ('KNN', KNeighborsClassifier(), {
                   'n_neighbors': [3, 5]
               }),
               ('DT', DecisionTreeClassifier(random_state=2), {
                   'max_depth': [3, 6]
               }),
Example #6
from os.path import dirname, join

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearnext.cluster import KMeans
from sklearnext.tools import evaluate_binary_imbalanced_experiments, read_csv_dir, summarize_binary_datasets
from sklearnext.over_sampling import SMOTE, DensityDistributor

# Paths
datasets_path = join(dirname(__file__), '..', '..', 'data', 'binary-numerical-imbalanced')
results_path = join(dirname(__file__), '..', '..', 'data', 'results', 'kmeans-oversampling', 'borderline-smote')

# Oversamplers and classifiers
oversamplers = [
    ('NO OVERSAMPLING', None),
    ('BORDERLINE-SMOTE', SMOTE(random_state=0, kind='borderline1'), {'k_neighbors': [3, 5]}),
    ('K-MEANS BORDERLINE-SMOTE',
     SMOTE(clusterer=KMeans(random_state=1, n_init=1),
           distributor=DensityDistributor(),
           random_state=0, kind='borderline1'), {
        'k_neighbors': [3, 5],
        'clusterer__n_clusters': [0.0, 0.25, 0.5, 0.75, 1.0],
        'distributor__distances_exponent': [0, 1, 2],
        'distributor__filtering_threshold': [0.5, 1.0]
        }
    ) 
]
classifiers = [
    ('LR', LogisticRegression()),
    ('KNN', KNeighborsClassifier(), {'n_neighbors': [3, 5]}),
    ('DT', DecisionTreeClassifier(random_state=2), {'max_depth': [3, 6]}),
    ('GBC', GradientBoostingClassifier(random_state=3), {'max_depth': [3, 6], 'n_estimators': [50, 100]})
]

# Load datasets
Example #7
from os.path import dirname, join

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearnext.cluster import KMeans
from sklearnext.over_sampling import GeometricSMOTE, DensityDistributor

# Paths
datasets_path = join(dirname(__file__), '..', '..', 'data', 'binary-numerical-imbalanced')
results_path = join(dirname(__file__), '..', '..', 'data', 'results', 'kmeans-oversampling', 'gsmote')

# Oversamplers and classifiers
oversamplers = [
    ('NO OVERSAMPLING', None),
    ('G-SMOTE', GeometricSMOTE(random_state=0), {
        'k_neighbors': [3, 5],
        'truncation_factor': [-1.0, 0.0, 1.0],
        'deformation_factor': [0.0, 0.5, 1.0]
        }
    ),
    ('K-MEANS G-SMOTE',
     GeometricSMOTE(clusterer=KMeans(random_state=1, n_init=1),
                    distributor=DensityDistributor(),
                    random_state=0), {
        'k_neighbors': [3, 5],
        'truncation_factor': [-1.0, 0.0, 1.0],
        'deformation_factor': [0.0, 0.5, 1.0],
        'clusterer__n_clusters': [0.0, 0.25, 0.5, 0.75, 1.0],
        'distributor__distances_exponent': [0, 1, 2],
        'distributor__filtering_threshold': [0.5, 1.0]
        }
    ) 
]
classifiers = [
    ('LR', LogisticRegression()),
    ('KNN', KNeighborsClassifier(), {'n_neighbors': [3, 5]}),
    ('DT', DecisionTreeClassifier(random_state=2), {'max_depth': [3, 6]}),
    ('GBC', GradientBoostingClassifier(random_state=3), {'max_depth': [3, 6], 'n_estimators': [50, 100]})
]
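
# Aside (not part of the original script): a minimal sketch of how one oversampler and
# one classifier from the lists above could be chained and scored on a synthetic
# dataset. The sampler-aware Pipeline from imbalanced-learn and the scoring/cv values
# are assumptions; the surrounding scripts use sklearnext's evaluation tools instead.
from imblearn.pipeline import Pipeline
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score

X, y = make_classification(weights=[0.9, 0.1], random_state=0)
pipeline = Pipeline([('gsmote', GeometricSMOTE(random_state=0)),
                     ('gbc', GradientBoostingClassifier(random_state=3))])
print(cross_val_score(pipeline, X, y, scoring='f1', cv=3).mean())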