Example #1
def smote(target_df, target_outcome, grouping):
    """Balance target_df on target_outcome with k-means SMOTE.

    The grouping column is expected to hold identifiers of the form
    'p_<number>'; the prefix is stripped so the column can be treated as
    numeric during resampling and is restored before returning.
    """
    y = target_df[target_outcome]
    target_df.drop(target_outcome, axis=1, inplace=True)

    # Strip the 'p_' prefix so the grouping column is numeric for resampling.
    target_df[grouping] = [
        float((x.partition('_')[2])) for x in target_df[grouping]
    ]
    #target_df.drop(grouping, axis=1, inplace=True)
    X = target_df

    target_columns = target_df.columns
    #target_columns = target_columns.insert(0, grouping)
    target_columns = target_columns.insert(len(target_columns), target_outcome)
    kmeans_smote = KMeansSMOTE(kmeans_args={'n_clusters': 5},
                               smote_args={'k_neighbors': 10})
    X_resampled, y_resampled = kmeans_smote.fit_sample(X, y)
    X_resampled = pd.DataFrame(X_resampled)
    y_resampled = pd.DataFrame(y_resampled)
    frames = [X_resampled, y_resampled]

    total_df = pd.concat(frames, axis=1)
    total_df.columns = target_columns
    # Restore the 'p_' prefix on the grouping column.
    total_df[grouping] = ['p_' + str(x) for x in total_df[grouping]]

    return total_df
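A minimal usage sketch for the helper above. The DataFrame, its column names, and the class balance are illustrative assumptions rather than data from the original project; note that the helper modifies the DataFrame it is given in place.

import numpy as np
import pandas as pd

# Hypothetical data: 'participant' carries the 'p_<number>' prefix that the
# helper strips and restores; 'label' is the imbalanced outcome to balance.
rng = np.random.RandomState(0)
demo_df = pd.DataFrame({
    'participant': ['p_{}'.format(i % 40) for i in range(200)],
    'feature_1': rng.normal(size=200),
    'feature_2': rng.normal(size=200),
    'label': [1 if i < 40 else 0 for i in range(200)],
})
# Depending on the data, k-means SMOTE may warn and fall back to plain SMOTE.
balanced_df = smote(demo_df, target_outcome='label', grouping='participant')
print(balanced_df['label'].value_counts())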
Example #2
def test_random_oversampling_limit_case(plot=False):
    """Execute k-means SMOTE with parameters equivalent to random oversampling"""
    kmeans_smote = KMeansSMOTE(
        random_state=RND_SEED,
        imbalance_ratio_threshold=float('Inf'),
        kmeans_args={
            'n_clusters': 1
        },
        smote_args={
            'k_neighbors': 0
        }
    )
    random_oversampler = RandomOverSampler(random_state=RND_SEED)
    X_resampled, y_resampled = kmeans_smote.fit_sample(X, Y)
    X_resampled_random_oversampler, y_resampled_random_oversampler = random_oversampler.fit_sample(
        X, Y)

    if plot:
        plot_resampled(X_resampled, y_resampled,
                       'random_oversampling_limit_case_test_kmeans_smote')
        plot_resampled(X_resampled_random_oversampler, y_resampled_random_oversampler,
                       'random_oversampling_limit_case_test_random_oversampling')

    assert_array_equal(X_resampled, X_resampled_random_oversampler)
    assert_array_equal(y_resampled, y_resampled_random_oversampler)
Example #3
def test_smote_fallback(plot=False):
    """Assert that regular SMOTE is applied if no minority clusters are found."""
    kmeans_smote = KMeansSMOTE(
        random_state=RND_SEED,
        kmeans_args={
            'n_clusters': 1
        }
    )
    smote = SMOTE(random_state=RND_SEED)
    with warnings.catch_warnings(record=True) as w:
        X_resampled, y_resampled = kmeans_smote.fit_sample(X, Y)

        assert len(w) == 1
        assert "No minority clusters found" in str(w[0].message)
        assert "Performing regular SMOTE" in str(w[0].message)
        assert issubclass(w[0].category, UserWarning)

        X_resampled_smote, y_resampled_smote = smote.fit_sample(X, Y)

        if plot:
            plot_resampled(X_resampled, y_resampled,
                           'smote_fallback_test_kmeans_smote')
            plot_resampled(X_resampled_smote, y_resampled_smote,
                           'smote_fallback_test_smote')

        assert_array_equal(X_resampled, X_resampled_smote)
        assert_array_equal(y_resampled, y_resampled_smote)
Example #4
def test_smoke(plot=False):
    """Execute k-means SMOTE with default parameters"""
    kmeans_smote = KMeansSMOTE(random_state=RND_SEED)
    X_resampled, y_resampled = kmeans_smote.fit_sample(X, Y)

    assert (np.unique(y_resampled, return_counts=True)[1] == np.unique(
        Y_EXPECTED, return_counts=True)[1]).all()
    assert (X_resampled.shape == X_SHAPE_EXPECTED)
    if plot:
        plot_resampled(X, X_resampled, Y, y_resampled, 'smoke_test')
Example #5
def test_backwards_compatibility(plot=False):
    """Test if deprecated parameter ratio can still be used without error"""
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=DeprecationWarning)
        kmeans_smote = KMeansSMOTE(random_state=RND_SEED, ratio={0: Y.sum()})
        X_resampled, y_resampled = kmeans_smote.fit_sample(X, Y)
    assert (np.unique(y_resampled, return_counts=True)[1] == np.unique(
        Y_EXPECTED, return_counts=True)[1]).all()
    assert (X_resampled.shape == X_SHAPE_EXPECTED)
    if plot:
        plot_resampled(X, X_resampled, Y, y_resampled, 'smoke_test')
Example #6
def test_smoke_multiclass(plot=False):
    """Execute k-means SMOTE with default parameters for multi-class dataset"""
    kmeans_smote = KMeansSMOTE(random_state=RND_SEED)
    X_resampled, y_resampled = kmeans_smote.fit_sample(X_MULTICLASS,
                                                       Y_MULTICLASS)

    assert (np.unique(y_resampled, return_counts=True)[1] == np.unique(
        Y_MULTICLASS_EXPECTED, return_counts=True)[1]).all()
    assert (X_resampled.shape == X_MULTICLASS_SHAPE_EXPECTED)
    if plot:
        plot_resampled(X_MULTICLASS, X_resampled, Y_MULTICLASS, y_resampled,
                       'smoke_multiclass_test')
Example #7
def test_multiclass(plot=False):
    """Execute k-means SMOTE for multi-class dataset with user-defined n_clusters"""
    kmeans_smote = KMeansSMOTE(random_state=RND_SEED,
                               kmeans_args={'n_clusters': 10})
    X_resampled, y_resampled = kmeans_smote.fit_sample(X_MULTICLASS,
                                                       Y_MULTICLASS)

    assert (np.unique(y_resampled, return_counts=True)[1] == np.unique(
        Y_MULTICLASS_EXPECTED, return_counts=True)[1]).all()
    assert (X_resampled.shape == X_MULTICLASS_SHAPE_EXPECTED)
    if plot:
        plot_resampled(X_MULTICLASS, X_resampled, Y_MULTICLASS, y_resampled,
                       'multiclass_test')
Example #8
def test_documentation_example():
    """Test basic code example shown in documentation"""
    from imblearn.datasets import fetch_datasets

    datasets = fetch_datasets(filter_data=['oil'])
    X, y = datasets['oil']['data'], datasets['oil']['target']

    labels, counts = np.unique(y, return_counts=True)
    assert counts[0] > counts[1]

    kmeans_smote = KMeansSMOTE(kmeans_args={'n_clusters': 100},
                               smote_args={'k_neighbors': 10})
    X_resampled, y_resampled = kmeans_smote.fit_sample(X, y)

    labels, counts = np.unique(y_resampled, return_counts=True)
    assert counts[0] == counts[1]
Example #9
def test_smote_limit_case(plot=False):
    """Execute k-means SMOTE with parameters equivalent to SMOTE"""
    kmeans_smote = KMeansSMOTE(random_state=RND_SEED,
                               imbalance_ratio_threshold=float('Inf'),
                               kmeans_args={'n_clusters': 1})
    smote = SMOTE(random_state=RND_SEED)
    X_resampled, y_resampled = kmeans_smote.fit_sample(X, Y)
    X_resampled_smote, y_resampled_smote = smote.fit_sample(X, Y)

    if plot:
        plot_resampled(X, X_resampled, Y, y_resampled,
                       'smote_limit_case_test_kmeans_smote')
        plot_resampled(X, X_resampled_smote, Y, y_resampled_smote,
                       'smote_limit_case_test_smote')

    assert_array_equal(X_resampled, X_resampled_smote)
    assert_array_equal(y_resampled, y_resampled_smote)
Example #10
def test_multiclass_irt_dict(plot=False):
    """
    Execute k-means SMOTE for multi-class dataset with
    different imbalance ratio thresholds per class.
    """
    kmeans_smote = KMeansSMOTE(random_state=RND_SEED,
                               kmeans_args={'n_clusters': 10},
                               imbalance_ratio_threshold={
                                   1: 1,
                                   2: np.inf
                               })
    X_resampled, y_resampled = kmeans_smote.fit_sample(X_MULTICLASS,
                                                       Y_MULTICLASS)

    assert (np.unique(y_resampled, return_counts=True)[1] == np.unique(
        Y_MULTICLASS_EXPECTED, return_counts=True)[1]).all()
    assert (X_resampled.shape == X_MULTICLASS_SHAPE_EXPECTED)
    if plot:
        plot_resampled(X_MULTICLASS, X_resampled, Y_MULTICLASS, y_resampled,
                       'multiclass_test')
Example #11
def test_smote_limit_case_multiclass(plot=False):
    """Execute k-means SMOTE with parameters equivalent to SMOTE"""
    kmeans_smote = KMeansSMOTE(random_state=RND_SEED,
                               imbalance_ratio_threshold=float('Inf'),
                               kmeans_args={'n_clusters': 1},
                               smote_args={'k_neighbors': 3})
    smote = SMOTE(random_state=RND_SEED, k_neighbors=3)
    X_resampled, y_resampled = kmeans_smote.fit_sample(X_MULTICLASS,
                                                       Y_MULTICLASS)
    X_resampled_smote, y_resampled_smote = smote.fit_sample(
        X_MULTICLASS, Y_MULTICLASS)

    if plot:
        plot_resampled(X_MULTICLASS, X_resampled, Y_MULTICLASS, y_resampled,
                       'smote_limit_case_multiclass_test_kmeans_smote')
        plot_resampled(X_MULTICLASS, X_resampled_smote, Y_MULTICLASS,
                       y_resampled_smote,
                       'smote_limit_case_multiclass_test_smote')

    assert_array_equal(X_resampled, X_resampled_smote)
    assert_array_equal(y_resampled, y_resampled_smote)
Example #12
def split_dataset(X_1,
                  y_1,
                  X_0,
                  y_0,
                  test_size=0.2,
                  random_seed=0,
                  smote=True):
    """Split each class separately into train/test portions, combine and
    shuffle the splits, and optionally balance the training data with
    k-means SMOTE before returning train/test arrays."""
    if smote:
        dataX_1, testX_1, datay_1, testy_1 = train_test_split(
            X_1, y_1, test_size=0.16, random_state=random_seed)
        dataX_0, testX_0, datay_0, testy_0 = train_test_split(
            X_0, y_0, test_size=0.082, random_state=random_seed)
        test_X, test_y = shuffled(array_joint(testX_1, testX_0),
                                  array_joint(testy_1, testy_0),
                                  random_seed=random_seed)
        # print(len(testy_1), len(testy_0), len(testy_1) + len(testy_0))
        addX, addy = shuffled(array_joint(dataX_1, dataX_0),
                              array_joint(datay_1, datay_0),
                              random_seed=random_seed)
        kmeans_smote = KMeansSMOTE(kmeans_args={'n_clusters': 2},
                                   smote_args={'k_neighbors': 2},
                                   random_state=random_seed)
        X_resampled, y_resampled = kmeans_smote.fit_sample(addX, addy)
        # datay_1 = np.ones(len(smote_dataX_1), dtype=np.int16)
        train_X, train_y = shuffled(X_resampled,
                                    y_resampled,
                                    random_seed=random_seed)
    else:
        dataX_1, testX_1, datay_1, testy_1 = train_test_split(
            X_1, y_1, test_size=0.2, random_state=random_seed)
        dataX_0, testX_0, datay_0, testy_0 = train_test_split(
            X_0, y_0, test_size=0.067, random_state=random_seed)
        test_X, test_y = shuffled(array_joint(testX_1, testX_0),
                                  array_joint(testy_1, testy_0),
                                  random_seed=random_seed)
        train_X, train_y = shuffled(array_joint(dataX_1, dataX_0),
                                    array_joint(datay_1, datay_0),
                                    random_seed=random_seed)

    return train_X, test_X, train_y, test_y
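split_dataset relies on two project helpers, array_joint and shuffled, that are not included in this excerpt. A minimal sketch of what they plausibly do, assuming array_joint concatenates two sample arrays and shuffled permutes X and y together with a fixed seed:

import numpy as np

def array_joint(a, b):
    """Concatenate two sample arrays along the first axis."""
    return np.concatenate([np.asarray(a), np.asarray(b)], axis=0)

def shuffled(X, y, random_seed=0):
    """Shuffle features and labels with the same random permutation."""
    rng = np.random.RandomState(random_seed)
    idx = rng.permutation(len(np.asarray(y)))
    return np.asarray(X)[idx], np.asarray(y)[idx]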
Example #13
def main():
    """Run the oversampling benchmark experiment and save its results."""
    experiment_config = {
        'comment': 'Keel run',
        'experiment_repetitions': 1,
        'n_splits': 2,
        # Keep the random seed within numpy's valid range [0, 2**32 - 1].
        'random_seed': int(os.urandom(1)[0] / 255 * (2**32 - 1)),
    }

    classifiers = [('LR', LogisticRegression())]
    oversampling_methods = [
        ('None', None),
        ('RandomOverSampler', RandomOverSampler()),
        ('SMOTE', SMOTE(), [{
            'k_neighbors': [3]
        }]),
        (
            'KMeansSMOTE',
            KMeansSMOTE(),
            [{
                'imbalance_ratio_threshold': [1],
                'density_power': [None],  # None corresponds to n_features
                'smote_args': [
                    {
                        'k_neighbors': 5
                    },
                ],
                'kmeans_args': [{
                    'n_clusters': 2
                }],
                'use_minibatch_kmeans': [True],
                'n_jobs': [-1]
            }])
    ]

    datasets = read_csv_dir(cfg['dataset_dir'])
    experiment = BinaryExperiment(
        datasets,
        classifiers,
        oversampling_methods,
        n_jobs=-1,
        experiment_repetitions=experiment_config['experiment_repetitions'],
        random_state=experiment_config['random_seed'],
        n_splits=experiment_config['n_splits'],
        scoring=[
            'geometric_mean_score', 'average_precision', 'roc_auc', 'f1', 'fp',
            'fn', 'tp', 'tn'
        ])

    with warnings.catch_warnings():
        warnings.filterwarnings(action='ignore',
                                message=r'Adapting smote_args\.k_neighbors')
        experiment.run()

    path = cfg['results_dir']
    if 'session_id' not in globals():
        session_id = (datetime.utcnow() +
                      timedelta(hours=2, minutes=0)).strftime("%Y-%m-%d %Hh%M")

    os.makedirs('{}/{}'.format(path, session_id))

    experiment.save('{}/{}/experiment.p'.format(path, session_id))

    # stringify oversampling methods
    experiment_config['oversampling_methods'] = re.sub(
        '\\n *', ' ', str(oversampling_methods))
    # save experiment config
    pd.Series(experiment_config).to_csv('{}/{}/experiment_config.csv'.format(
        path, session_id))
Example #14
from kmeans_smote import KMeansSMOTE
from TestDataGeneration.data_generation import DataGenerator
from experiment import Experiment

if __name__ == "__main__":
    total_number = 10000
    ratio = 0.9
    dg1 = DataGenerator(total_number=total_number, ratio=ratio)
    data_train, label_train = dg1.generate()
    dg2 = DataGenerator(total_number=total_number, ratio=ratio)
    data_test, label_test = dg2.generate()
    Ratio = "minority"
    kmeans_args = {"n_clusters": 20}
    imbalance_ratio_threshold = 20
    smote_args = {"k_neighbors": 10}
    kmeans_smote = KMeansSMOTE(
        ratio=Ratio,
        kmeans_args=kmeans_args,
        smote_args=smote_args,
        imbalance_ratio_threshold=imbalance_ratio_threshold)
    X_resampled, Y_resampled = kmeans_smote.fit_sample(data_train, label_train)
    exp = Experiment(data=X_resampled, label=Y_resampled)
    n_neighbors = [1, 3, 5, 7, 9, 11, 13, 15, 17]
    for neighbor in n_neighbors:
        true_posi, false_posi, true_neg, false_neg = exp.get_confusion_matrix(
            data=data_test, label=label_test, n_neighbors=neighbor)
        print("true_posi:", true_posi, "false_posi:", false_posi, "true_neg:",
              true_neg, "false_neg:", false_neg)
Example #15
# Encode the target labels as integers (label encoding).
from sklearn.preprocessing import LabelEncoder
ly = LabelEncoder()
y = ly.fit_transform(y)

import numpy as np
#from imblearn.datasets import fetch_datasets
from kmeans_smote import KMeansSMOTE

for label, count in zip(*np.unique(y, return_counts=True)):
    print('Class {} has {} instances'.format(label, count))

kmeans_smote = KMeansSMOTE(sampling_strategy='minority',
                           kmeans_args={'n_clusters': 100},
                           smote_args={'k_neighbors': 10})
X_resampled, y_resampled = kmeans_smote.fit_sample(X, y)

for label, count in zip(*np.unique(y_resampled, return_counts=True)):
    print('Class {} has {} instances after oversampling'.format(label, count))

# Split into training and test sets.
# Since we have a very small dataset, we will train our model with all available data.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X_resampled,
                                                    y_resampled,
                                                    test_size=0.2)
plot_before_after_oversampling(
    dataset_a.iloc[:, 0:2],
    dataset_a.iloc[:, 2],
    ('SMOTE', SMOTE()),
    'A'
)

# <markdowncell>
# ## Oversampling with k-means SMOTE
# <codecell>
np.random.seed(1)
plot_before_after_oversampling(
    dataset_a.iloc[:, 0:2],
    dataset_a.iloc[:, 2],
    ('k-means SMOTE', KMeansSMOTE(
        kmeans_args={'n_clusters': 6},
        use_minibatch_kmeans=False
    )),
    'A',
    additional_text_after_oversampling='k = 6'
)
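# <markdowncell>
# The plot_before_after_oversampling helper used above is not part of this
# excerpt. The sketch below is only an assumption of what it plausibly does
# (fit the given oversampler and scatter-plot the two feature columns before
# and after resampling), not the project's actual implementation.
# <codecell>
import matplotlib.pyplot as plt


def plot_before_after_oversampling(X, y, named_oversampler, dataset_name,
                                   additional_text_after_oversampling=''):
    name, oversampler = named_oversampler
    X_resampled, y_resampled = oversampler.fit_sample(X, y)
    _, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(10, 4))
    ax_before.scatter(X.iloc[:, 0], X.iloc[:, 1], c=y, s=10)
    ax_before.set_title('Dataset {}: before oversampling'.format(dataset_name))
    ax_after.scatter(X_resampled[:, 0], X_resampled[:, 1], c=y_resampled, s=10)
    ax_after.set_title('{} {}'.format(name, additional_text_after_oversampling))
    plt.show()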


# <markdowncell>
# # Dataset B
# <codecell>
dataset_b = pd.read_csv(os.path.join(cfg['dataset_dir'], 'b.csv'), header=None)

# <markdowncell>
# ## Oversampling with SMOTE
# <codecell>
np.random.seed(1)