from sklearn.neighbors.classification import KNeighborsClassifier from sklearn.datasets import make_classification from sklearnext.tools import BinaryExperiment from sklearnext.over_sampling import SMOTE, GeometricSMOTE # Generate datasets datasets = [ ('A', make_classification(random_state=1, weights=[0.80, 0.20], n_features=10)), ('B', make_classification(random_state=1, weights=[0.85, 0.15], n_features=10)), ('C', make_classification(random_state=1, weights=[0.90, 0.10], n_features=10)) ] # Oversamplers and classifiers oversamplers = [ ('NO OVERSAMPLING', None), ('SMOTE', SMOTE(random_state=0), {'k_neighbors':[3, 4]}), ('G-SMOTE', GeometricSMOTE(random_state=0), { 'k_neighbors':[3, 4], 'deformation_factor': [0.25, 0.50, 0.75], 'truncation_factor': [-0.5, 0.0, 0.5] } ) ] classifiers = [ ('DT', DecisionTreeClassifier(), {'max_depth': [3, 4, 5]}), ('KNN', KNeighborsClassifier(), {'n_neighbors':[3, 5]}), ] # Define experiment experiment = BinaryExperiment( name='example',
from sklearn.neighbors.classification import KNeighborsClassifier from sklearn.ensemble import GradientBoostingClassifier from sklearnext.over_sampling import RandomOverSampler, SMOTE, GeometricSMOTE, DensityDistributor from sklearnext.cluster import SOM from sklearnext.tools import evaluate_binary_imbalanced_experiments, read_csv_dir, summarize_binary_datasets # Paths datasets_path = join(dirname(__file__), '..', '..', 'data', 'binary-numerical-imbalanced') results_path = join(dirname(__file__), '..', '..', 'data', 'results', 'gsomo') # Oversamplers and classifiers oversamplers = [ ('NO OVERSAMPLING', None), ('RANDOM OVERSAMPLING', RandomOverSampler(random_state=0)), ('SMOTE', SMOTE(random_state=1), { 'k_neighbors': [3, 5] }), ('G-SOMO', GeometricSMOTE(clusterer=SOM(), distributor=DensityDistributor(distances_exponent=2, filtering_threshold=1.0), random_state=3), { 'k_neighbors': [3, 5], 'truncation_factor': [-1.0, 0.0, 0.25, 1.0], 'deformation_factor': [0.0, 0.5, 1.0], 'clusterer__n_clusters': [0.2, 0.5], 'distributor__distribution_ratio': [0.75, 1.0] }) ] classifiers = [('LR', LogisticRegression()),
from imblearn.pipeline import Pipeline from sklearnext.cluster import KMeans from sklearnext.model_selection import ModelSearchCV from sklearnext.over_sampling import SMOTE from sklearnext.over_sampling.base import DensityDistributor from sklearnext.tools import report_model_search_results # Load data X, y = make_classification(n_informative=15, n_clusters_per_class=3, weights=[0.9, 0.1]) # Define estimators estimators = [('GBC', GradientBoostingClassifier()), ('SMOTE+GBC', Pipeline([('smote', SMOTE()), ('gbc', GradientBoostingClassifier())])), ('KMeanSMOTE+GBC', Pipeline([('smote', SMOTE(clusterer=KMeans(n_init=1), distributor=DensityDistributor())), ('gbc', GradientBoostingClassifier())]))] # Define parameters grid param_grids = [{ 'SMOTE+GBC__smote__k_neighbors': [2, 3, 4, 5], 'SMOTE+GBC__gbc__max_depth': [2, 4] }, { 'KMeanSMOTE+GBC__smote__k_neighbors': [2, 3, 4, 5], 'KMeanSMOTE+GBC__smote__clusterer__n_clusters': [0.1, 0.3, 0.5, 0.7, 0.9, 1.0],
def generate_oversamplers(oversamplers_names):
    """Build the oversampler configurations for an experiment preset.

    Every configuration is a ``(name, oversampler, param_grid)`` triple:
    ``oversampler`` is ``None`` for the no-oversampling entry and
    ``param_grid`` maps parameter names to lists of candidate values.

    Parameters
    ----------
    oversamplers_names
        Preset selector, compared against the strings ``'basic'``,
        ``'scaled'`` and ``'undersampled'``. For those three values the list
        and the six non-clustered oversampler names are handed to
        ``select_pipelines`` (presumably a filter by name -- confirm against
        its definition). ``'scaled'`` then goes through ``append_transformer``
        with a ``MinMaxScaler``; ``'undersampled'`` goes through
        ``set_sampling_strategy`` and then ``append_transformer`` with a
        ``RandomUnderSampler``. Any other value returns the full list as is.

    Returns
    -------
    The full list of triples, or whatever the helpers named above return for
    the three reduced presets.
    """

    def merge_grids(*partial_grids):
        # Combine partial grids left to right into one brand-new dict.
        combined = {}
        for partial in partial_grids:
            combined.update(partial)
        return combined

    # The factories below return fresh objects on every call, so no two
    # parameter grids ever share a dict or a list.
    def neighbors_grid():
        return {'k_neighbors': [3, 5]}

    def geometric_grid():
        return {
            'k_neighbors': [3, 5],
            'selection_strategy': ['combined', 'minority', 'majority'],
            'truncation_factor': [-1.0, -0.5, 0.0, 0.25, 0.5, 0.75, 1.0],
            'deformation_factor': [0.0, 0.2, 0.4, 0.5, 0.6, 0.8, 1.0],
        }

    def clustering_grid():
        return {
            'clusterer__n_clusters': [
                0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0,
            ],
            'distributor__distances_exponent': [0, 1, 2, 5],
            'distributor__filtering_threshold': [0.0, 0.5, 1.0, 2.0],
        }

    def distribution_ratio_grid():
        return {
            'distributor__distribution_ratio': [0.0, 0.25, 0.5, 0.75, 1.0],
        }

    configurations = [
        # Plain oversamplers.
        ('NO OVERSAMPLING', None, {}),
        ('RANDOM OVERSAMPLING', RandomOverSampler(), {}),
        ('SMOTE', SMOTE(), neighbors_grid()),
        ('BORDERLINE SMOTE', BorderlineSMOTE(), neighbors_grid()),
        ('ADASYN', ADASYN(), {'n_neighbors': [2, 3]}),
        ('G-SMOTE', GeometricSMOTE(), geometric_grid()),
        # KMeans-clustered variants.
        # NOTE(review): the grid of the clustered random oversampler also
        # lists 'k_neighbors'; kept as-is -- confirm RandomOverSampler
        # accepts that parameter.
        (
            'K-MEANS RANDOM OVERSAMPLING',
            RandomOverSampler(
                clusterer=KMeans(), distributor=DensityDistributor()
            ),
            merge_grids(neighbors_grid(), clustering_grid()),
        ),
        (
            'K-MEANS SMOTE',
            SMOTE(clusterer=KMeans(), distributor=DensityDistributor()),
            merge_grids(neighbors_grid(), clustering_grid()),
        ),
        (
            'K-MEANS BORDERLINE SMOTE',
            BorderlineSMOTE(
                clusterer=KMeans(), distributor=DensityDistributor()
            ),
            merge_grids(neighbors_grid(), clustering_grid()),
        ),
        (
            'K-MEANS G-SMOTE',
            GeometricSMOTE(
                clusterer=KMeans(), distributor=DensityDistributor()
            ),
            merge_grids(geometric_grid(), clustering_grid()),
        ),
        # SOM-clustered variants; these also tune the distribution ratio.
        (
            'SOMO',
            SMOTE(clusterer=SOM(), distributor=DensityDistributor()),
            merge_grids(
                neighbors_grid(), clustering_grid(), distribution_ratio_grid()
            ),
        ),
        (
            'G-SOMO',
            GeometricSMOTE(clusterer=SOM(), distributor=DensityDistributor()),
            merge_grids(
                geometric_grid(), clustering_grid(), distribution_ratio_grid()
            ),
        ),
    ]

    # Any selector outside the three reduced presets gets the full list.
    if oversamplers_names not in ('basic', 'scaled', 'undersampled'):
        return configurations

    configurations = select_pipelines(
        configurations,
        (
            'NO OVERSAMPLING',
            'RANDOM OVERSAMPLING',
            'SMOTE',
            'BORDERLINE SMOTE',
            'ADASYN',
            'G-SMOTE',
        ),
    )

    if oversamplers_names == 'scaled':
        return append_transformer(MinMaxScaler(), configurations)

    if oversamplers_names == 'undersampled':
        # The sampling strategy is set on the oversamplers first; only then
        # is the undersampler built and appended.
        configurations = set_sampling_strategy(
            lambda y: generate_sampling_strategy(y, 1 / 3), configurations
        )
        return append_transformer(
            RandomUnderSampler(
                sampling_strategy=lambda y: generate_sampling_strategy(y, 3)
            ),
            configurations,
        )

    # 'basic': the selected subset without further changes.
    return configurations
from sklearn.tree import DecisionTreeClassifier from sklearn.neighbors.classification import KNeighborsClassifier from sklearn.ensemble import GradientBoostingClassifier from sklearnext.cluster import KMeans from sklearnext.tools import evaluate_binary_imbalanced_experiments, read_csv_dir, summarize_binary_datasets from sklearnext.over_sampling import SMOTE, DensityDistributor # Paths datasets_path = join(dirname(__file__), '..', '..', 'data', 'binary-numerical-imbalanced') results_path = join(dirname(__file__), '..', '..', 'data', 'results', 'kmeans-oversampling', 'smote') # Oversamplers and classifiers oversamplers = [('NO OVERSAMPLING', None), ('SMOTE', SMOTE(random_state=0), { 'k_neighbors': [3, 5] }), ('K-MEANS SMOTE', SMOTE(clusterer=KMeans(random_state=1, n_init=1), distributor=DensityDistributor(), random_state=0), { 'k_neighbors': [3, 5], 'clusterer__n_clusters': [0.0, 0.25, 0.5, 0.75, 1.0], 'distributor__distances_exponent': [0, 1, 2], 'distributor__filtering_threshold': [0.5, 1.0] })] classifiers = [('LR', LogisticRegression()), ('KNN', KNeighborsClassifier(), { 'n_neighbors': [3, 5]
from sklearn.neighbors.classification import KNeighborsClassifier from sklearn.ensemble import GradientBoostingClassifier from sklearnext.cluster import KMeans, SOM, AgglomerativeClustering, Birch, SpectralClustering from sklearnext.tools import evaluate_binary_imbalanced_experiments, read_csv_dir, summarize_binary_datasets from sklearnext.over_sampling import SMOTE, DensityDistributor # Paths datasets_path = join(dirname(__file__), '..', '..', 'data', 'binary-numerical-imbalanced') results_path = join(dirname(__file__), '..', '..', 'data', 'results', 'clustering-smote') # Oversamplers and classifiers oversamplers = [ ('NO OVERSAMPLING', None), ('SMOTE', SMOTE(random_state=0), { 'k_neighbors': [3, 4, 5] }), ('K-MEANS SMOTE', SMOTE(clusterer=KMeans(random_state=1), distributor=DensityDistributor(), random_state=0), { 'k_neighbors': [3, 4, 5], 'clusterer__n_clusters': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0], 'distributor__distances_exponent': [0, 1, 2, 5], 'distributor__filtering_threshold': [0.0, 0.5, 1.0, 2.0] }), ('SOMO', SMOTE(clusterer=SOM(), distributor=DensityDistributor(), random_state=0), {
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
# Public import path: the private module ``sklearn.neighbors.classification``
# was deprecated in scikit-learn 0.22 and removed in 0.24.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearnext.cluster import KMeans
from sklearnext.tools import evaluate_binary_imbalanced_experiments, read_csv_dir, summarize_binary_datasets
from sklearnext.over_sampling import SMOTE, DensityDistributor

# Paths
# NOTE(review): ``join`` and ``dirname`` are not imported in the visible part
# of this script; presumably ``from os.path import join, dirname`` appears
# earlier -- confirm.
datasets_path = join(dirname(__file__), '..', '..', 'data', 'binary-numerical-imbalanced')
results_path = join(dirname(__file__), '..', '..', 'data', 'results', 'kmeans-oversampling', 'borderline-smote')

# Oversamplers and classifiers
# Each entry is (name, estimator) or (name, estimator, param_grid); the grid
# maps parameter names to the candidate values to try.
oversamplers = [
    ('NO OVERSAMPLING', None),
    ('BORDERLINE-SMOTE', SMOTE(random_state=0, kind='borderline1'), {'k_neighbors': [3, 5]}),
    (
        'K-MEANS BORDERLINE-SMOTE',
        SMOTE(
            clusterer=KMeans(random_state=1, n_init=1),
            distributor=DensityDistributor(),
            random_state=0,
            kind='borderline1',
        ),
        {
            'k_neighbors': [3, 5],
            'clusterer__n_clusters': [0.0, 0.25, 0.5, 0.75, 1.0],
            'distributor__distances_exponent': [0, 1, 2],
            'distributor__filtering_threshold': [0.5, 1.0],
        },
    ),
]
classifiers = [
    ('LR', LogisticRegression()),
    ('KNN', KNeighborsClassifier(), {'n_neighbors': [3, 5]}),
    ('DT', DecisionTreeClassifier(random_state=2), {'max_depth': [3, 6]}),
    ('GBC', GradientBoostingClassifier(random_state=3), {'max_depth': [3, 6], 'n_estimators': [50, 100]}),
]