def smote(target_df, target_outcome, grouping):
    """Oversample ``target_df`` with k-means SMOTE and return the resampled frame.

    ``target_df`` itself is left untouched: the outcome column is removed and
    the grouping column is converted on a copy, so the caller can keep using
    the frame after the call.

    Parameters
    ----------
    target_df : pandas.DataFrame
        Frame holding the feature columns, the ``grouping`` column and the
        ``target_outcome`` column.
    target_outcome : str
        Name of the label column handed to the sampler as ``y``.
    grouping : str
        Name of a column of strings containing an underscore. The text after
        the first ``'_'`` must be parseable by ``float``; it is fed to the
        sampler as a numeric feature.

    Returns
    -------
    pandas.DataFrame
        The resampled rows with the feature columns in their original order
        followed by ``target_outcome``. The ``grouping`` column is rebuilt as
        ``'p_' + str(value)``, so whatever prefix the input carried is
        replaced by ``'p'``.
    """
    labels = target_df[target_outcome]

    # ``drop`` without ``inplace`` returns a new frame; all later edits happen
    # on that copy rather than on the caller's object.
    features = target_df.drop(target_outcome, axis=1)
    features[grouping] = [
        float(value.partition('_')[2]) for value in features[grouping]
    ]

    # Column layout of the result: every feature column, then the outcome.
    result_columns = features.columns.insert(len(features.columns), target_outcome)

    sampler = KMeansSMOTE(kmeans_args={'n_clusters': 5},
                          smote_args={'k_neighbors': 10})
    X_resampled, y_resampled = sampler.fit_sample(features, labels)

    total_df = pd.concat(
        [pd.DataFrame(X_resampled), pd.DataFrame(y_resampled)], axis=1)
    total_df.columns = result_columns

    # NOTE(review): the id is rendered with str(), so a float id comes back
    # as e.g. 'p_3.0', and synthetic rows presumably carry interpolated ids
    # because the column is treated as an ordinary feature - confirm that
    # this is what downstream code expects.
    total_df[grouping] = ['p_' + str(value) for value in total_df[grouping]]
    return total_df
def test_random_oversampling_limit_case(plot=False):
    """Execute k-means SMOTE with parameters equivalent to random oversampling.

    With a single cluster, an infinite imbalance ratio threshold and
    ``k_neighbors`` set to 0, the result is asserted to be identical to the
    output of ``RandomOverSampler`` seeded with the same ``RND_SEED``.

    :param plot: when true, both resampled datasets are passed to
        ``plot_resampled`` before the comparison.
    """
    kmeans_smote = KMeansSMOTE(
        random_state=RND_SEED,
        imbalance_ratio_threshold=float('Inf'),
        kmeans_args={'n_clusters': 1},
        smote_args={'k_neighbors': 0},
    )
    random_oversampler = RandomOverSampler(random_state=RND_SEED)

    X_resampled, y_resampled = kmeans_smote.fit_sample(X, Y)
    (X_resampled_random_oversampler,
     y_resampled_random_oversampler) = random_oversampler.fit_sample(X, Y)

    if plot:
        # plot_resampled is called as (X_original, X_resampled, y_original,
        # y_resampled, name) by the other tests in this module; the original
        # and resampled data are passed here in the same way.
        plot_resampled(X, X_resampled, Y, y_resampled,
                       'random_oversampling_limit_case_test_kmeans_smote')
        plot_resampled(X, X_resampled_random_oversampler,
                       Y, y_resampled_random_oversampler,
                       'random_oversampling_limit_case_test_random_oversampling')

    assert_array_equal(X_resampled, X_resampled_random_oversampler)
    assert_array_equal(y_resampled, y_resampled_random_oversampler)
def test_smote_fallback(plot=False):
    """Assert that regular SMOTE is applied if no minority clusters are found.

    The sampler is run with a single cluster while warnings are recorded.
    Exactly one ``UserWarning`` announcing the fallback must be raised, and
    the output must equal that of plain ``SMOTE`` with the same seed.

    :param plot: when true, both resampled datasets are passed to
        ``plot_resampled`` before the comparison.
    """
    kmeans_smote = KMeansSMOTE(
        random_state=RND_SEED,
        kmeans_args={'n_clusters': 1},
    )
    smote = SMOTE(random_state=RND_SEED)

    with warnings.catch_warnings(record=True) as w:
        X_resampled, y_resampled = kmeans_smote.fit_sample(X, Y)

        assert len(w) == 1
        fallback_message = str(w[0].message)
        assert "No minority clusters found" in fallback_message
        assert "Performing regular SMOTE" in fallback_message
        assert issubclass(w[0].category, UserWarning)

        X_resampled_smote, y_resampled_smote = smote.fit_sample(X, Y)

        if plot:
            # Same (X_original, X_resampled, y_original, y_resampled, name)
            # call shape as the other tests in this module.
            plot_resampled(X, X_resampled, Y, y_resampled,
                           'smote_fallback_test_kmeans_smote')
            plot_resampled(X, X_resampled_smote, Y, y_resampled_smote,
                           'smote_fallback_test_smote')

        assert_array_equal(X_resampled, X_resampled_smote)
        assert_array_equal(y_resampled, y_resampled_smote)
def test_smoke(plot=False):
    """Execute k-means SMOTE with default parameters.

    Checks that the per-class sample counts of the resampled labels match
    those of ``Y_EXPECTED`` and that the resampled feature matrix has shape
    ``X_SHAPE_EXPECTED``.

    :param plot: when true, the original and resampled data are passed to
        ``plot_resampled`` after the assertions.
    """
    oversampler = KMeansSMOTE(random_state=RND_SEED)
    X_resampled, y_resampled = oversampler.fit_sample(X, Y)

    # Only the class counts are compared, not the label values themselves.
    _, actual_counts = np.unique(y_resampled, return_counts=True)
    _, expected_counts = np.unique(Y_EXPECTED, return_counts=True)
    assert (actual_counts == expected_counts).all()
    assert X_resampled.shape == X_SHAPE_EXPECTED

    if plot:
        plot_resampled(X, X_resampled, Y, y_resampled, 'smoke_test')
def test_backwards_compatibility(plot=False):
    """Test if deprecated parameter ratio can still be used without error.

    ``DeprecationWarning`` is silenced while the sampler is built and run
    with ``ratio={0: Y.sum()}``. The outcome is then checked against the same
    expectations as the default-parameter smoke test: class counts equal to
    those of ``Y_EXPECTED`` and feature shape ``X_SHAPE_EXPECTED``.

    :param plot: when true, the original and resampled data are passed to
        ``plot_resampled`` after the assertions.
    """
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=DeprecationWarning)
        kmeans_smote = KMeansSMOTE(random_state=RND_SEED, ratio={0: Y.sum()})
        X_resampled, y_resampled = kmeans_smote.fit_sample(X, Y)

    _, actual_counts = np.unique(y_resampled, return_counts=True)
    _, expected_counts = np.unique(Y_EXPECTED, return_counts=True)
    assert (actual_counts == expected_counts).all()
    assert X_resampled.shape == X_SHAPE_EXPECTED

    if plot:
        # Use a label of its own; 'smoke_test' is already taken by
        # test_smoke, so the two plots could not be told apart.
        plot_resampled(X, X_resampled, Y, y_resampled,
                       'backwards_compatibility_test')
def test_smoke_multiclass(plot=False):
    """Execute k-means SMOTE with default parameters for multi-class dataset.

    The per-class counts of the resampled labels must match those of
    ``Y_MULTICLASS_EXPECTED`` and the resampled features must have shape
    ``X_MULTICLASS_SHAPE_EXPECTED``.

    :param plot: when true, the original and resampled data are passed to
        ``plot_resampled`` after the assertions.
    """
    oversampler = KMeansSMOTE(random_state=RND_SEED)
    X_resampled, y_resampled = oversampler.fit_sample(X_MULTICLASS, Y_MULTICLASS)

    _, actual_counts = np.unique(y_resampled, return_counts=True)
    _, expected_counts = np.unique(Y_MULTICLASS_EXPECTED, return_counts=True)
    assert (actual_counts == expected_counts).all()
    assert X_resampled.shape == X_MULTICLASS_SHAPE_EXPECTED

    if plot:
        plot_resampled(X_MULTICLASS, X_resampled, Y_MULTICLASS, y_resampled,
                       'smoke_multiclass_test')
def test_multiclass(plot=False):
    """Execute k-means SMOTE for multi-class dataset with user-defined n_clusters.

    Runs the sampler with ``n_clusters=10`` and checks the resulting class
    counts against ``Y_MULTICLASS_EXPECTED`` and the feature shape against
    ``X_MULTICLASS_SHAPE_EXPECTED``.

    :param plot: when true, the original and resampled data are passed to
        ``plot_resampled`` after the assertions.
    """
    clustering_options = {'n_clusters': 10}
    oversampler = KMeansSMOTE(random_state=RND_SEED,
                              kmeans_args=clustering_options)
    X_resampled, y_resampled = oversampler.fit_sample(X_MULTICLASS, Y_MULTICLASS)

    _, actual_counts = np.unique(y_resampled, return_counts=True)
    _, expected_counts = np.unique(Y_MULTICLASS_EXPECTED, return_counts=True)
    assert (actual_counts == expected_counts).all()
    assert X_resampled.shape == X_MULTICLASS_SHAPE_EXPECTED

    if plot:
        plot_resampled(X_MULTICLASS, X_resampled, Y_MULTICLASS, y_resampled,
                       'multiclass_test')
def test_documentation_example():
    """Test basic code example shown in documentation.

    Loads the ``'oil'`` dataset through ``imblearn.datasets.fetch_datasets``,
    checks that the first class has more samples than the second, oversamples
    with k-means SMOTE and checks that both classes end up with the same
    number of samples.
    """
    from imblearn.datasets import fetch_datasets

    oil = fetch_datasets(filter_data=['oil'])['oil']
    X, y = oil['data'], oil['target']

    _, counts_before = np.unique(y, return_counts=True)
    assert counts_before[0] > counts_before[1]

    oversampler = KMeansSMOTE(kmeans_args={'n_clusters': 100},
                              smote_args={'k_neighbors': 10})
    X_resampled, y_resampled = oversampler.fit_sample(X, y)

    _, counts_after = np.unique(y_resampled, return_counts=True)
    assert counts_after[0] == counts_after[1]
def test_smote_limit_case(plot=False):
    """Execute k-means SMOTE with parameters equivalent to SMOTE.

    With one cluster and an infinite imbalance ratio threshold the output is
    asserted to be identical to plain ``SMOTE`` seeded with ``RND_SEED``.

    :param plot: when true, both resampled datasets are passed to
        ``plot_resampled`` before the comparison.
    """
    limit_case_sampler = KMeansSMOTE(
        random_state=RND_SEED,
        imbalance_ratio_threshold=float('Inf'),
        kmeans_args={'n_clusters': 1},
    )
    reference_sampler = SMOTE(random_state=RND_SEED)

    X_resampled, y_resampled = limit_case_sampler.fit_sample(X, Y)
    X_resampled_smote, y_resampled_smote = reference_sampler.fit_sample(X, Y)

    if plot:
        plot_resampled(X, X_resampled, Y, y_resampled,
                       'smote_limit_case_test_kmeans_smote')
        plot_resampled(X, X_resampled_smote, Y, y_resampled_smote,
                       'smote_limit_case_test_smote')

    assert_array_equal(X_resampled, X_resampled_smote)
    assert_array_equal(y_resampled, y_resampled_smote)
def test_multiclass_irt_dict(plot=False):
    """
    Execute k-means SMOTE for multi-class dataset with different imbalance ratio thresholds per class.

    ``imbalance_ratio_threshold`` is given as a dict (``{1: 1, 2: np.inf}``)
    together with ``n_clusters=10``. The resulting class counts must match
    ``Y_MULTICLASS_EXPECTED`` and the feature shape must equal
    ``X_MULTICLASS_SHAPE_EXPECTED``.

    :param plot: when true, the original and resampled data are passed to
        ``plot_resampled`` after the assertions.
    """
    kmeans_smote = KMeansSMOTE(
        random_state=RND_SEED,
        kmeans_args={'n_clusters': 10},
        imbalance_ratio_threshold={1: 1, 2: np.inf},
    )
    X_resampled, y_resampled = kmeans_smote.fit_sample(X_MULTICLASS, Y_MULTICLASS)

    _, actual_counts = np.unique(y_resampled, return_counts=True)
    _, expected_counts = np.unique(Y_MULTICLASS_EXPECTED, return_counts=True)
    assert (actual_counts == expected_counts).all()
    assert X_resampled.shape == X_MULTICLASS_SHAPE_EXPECTED

    if plot:
        # Use a label of its own; 'multiclass_test' is already taken by
        # test_multiclass, so the two plots could not be told apart.
        plot_resampled(X_MULTICLASS, X_resampled, Y_MULTICLASS, y_resampled,
                       'multiclass_irt_dict_test')
def test_smote_limit_case_multiclass(plot=False):
    """Execute k-means SMOTE with parameters equivalent to SMOTE.

    Multi-class variant: one cluster, an infinite imbalance ratio threshold
    and ``k_neighbors=3`` on both samplers. The two outputs are asserted to
    be identical.

    :param plot: when true, both resampled datasets are passed to
        ``plot_resampled`` before the comparison.
    """
    neighbours = 3
    limit_case_sampler = KMeansSMOTE(
        random_state=RND_SEED,
        imbalance_ratio_threshold=float('Inf'),
        kmeans_args={'n_clusters': 1},
        smote_args={'k_neighbors': neighbours},
    )
    reference_sampler = SMOTE(random_state=RND_SEED, k_neighbors=neighbours)

    X_resampled, y_resampled = limit_case_sampler.fit_sample(
        X_MULTICLASS, Y_MULTICLASS)
    X_resampled_smote, y_resampled_smote = reference_sampler.fit_sample(
        X_MULTICLASS, Y_MULTICLASS)

    if plot:
        plot_resampled(X_MULTICLASS, X_resampled, Y_MULTICLASS, y_resampled,
                       'smote_limit_case_multiclass_test_kmeans_smote')
        plot_resampled(X_MULTICLASS, X_resampled_smote, Y_MULTICLASS,
                       y_resampled_smote,
                       'smote_limit_case_multiclass_test_smote')

    assert_array_equal(X_resampled, X_resampled_smote)
    assert_array_equal(y_resampled, y_resampled_smote)
def split_dataset(X_1, y_1, X_0, y_0, test_size=0.2, random_seed=0, smote=True):
    """Split two sample groups into shuffled train/test sets, optionally oversampled.

    Each group is split on its own with ``train_test_split``; the held-out
    parts of both groups are joined and shuffled to form the test set, and
    the remaining parts are joined and shuffled to form the training set.
    When ``smote`` is true the training set is additionally resampled with
    k-means SMOTE (2 clusters, 2 neighbours) and shuffled again.

    Parameters
    ----------
    X_1, y_1 : array-like
        Features and labels of the first group (presumably class 1 - confirm
        against callers).
    X_0, y_0 : array-like
        Features and labels of the second group (presumably class 0 - confirm
        against callers).
    test_size : float
        Accepted for interface compatibility but not used by the body; the
        hold-out fractions are fixed per group (0.16 / 0.082 with ``smote``,
        0.2 / 0.067 without).
    random_seed : int
        Seed forwarded to every split, shuffle and the sampler.
    smote : bool
        Whether to oversample the training set.

    Returns
    -------
    tuple
        ``(train_X, test_X, train_y, test_y)``.
    """
    # Hold-out fractions are fixed per group and depend only on ``smote``.
    if smote:
        holdout_1, holdout_0 = 0.16, 0.082
    else:
        holdout_1, holdout_0 = 0.2, 0.067

    dataX_1, testX_1, datay_1, testy_1 = train_test_split(
        X_1, y_1, test_size=holdout_1, random_state=random_seed)
    dataX_0, testX_0, datay_0, testy_0 = train_test_split(
        X_0, y_0, test_size=holdout_0, random_state=random_seed)

    test_X, test_y = shuffled(array_joint(testX_1, testX_0),
                              array_joint(testy_1, testy_0),
                              random_seed=random_seed)
    train_X, train_y = shuffled(array_joint(dataX_1, dataX_0),
                                array_joint(datay_1, datay_0),
                                random_seed=random_seed)

    if smote:
        # Oversample the joined training data only; the test set stays as split.
        sampler = KMeansSMOTE(kmeans_args={'n_clusters': 2},
                              smote_args={'k_neighbors': 2},
                              random_state=random_seed)
        X_resampled, y_resampled = sampler.fit_sample(train_X, train_y)
        train_X, train_y = shuffled(X_resampled, y_resampled,
                                    random_seed=random_seed)

    return train_X, test_X, train_y, test_y
def main():
    """Run the oversampling experiment and store its results and configuration.

    Builds a ``BinaryExperiment`` over the datasets read from
    ``cfg['dataset_dir']`` with one classifier (logistic regression) and four
    oversampling setups (none, random oversampling, SMOTE, k-means SMOTE),
    runs it, and writes ``experiment.p`` and ``experiment_config.csv`` into
    ``<cfg['results_dir']>/<session_id>/``.

    The session id is taken from a module-level ``session_id`` when one
    exists; otherwise it is derived from the current UTC time plus two hours.

    Raises
    ------
    OSError
        From ``os.makedirs`` when the session directory already exists.
    """
    experiment_config = {
        'comment': 'Keel run',
        'experiment_repetitions': 1,
        'n_splits': 2,
        # Four random bytes give a uniform integer in [0, 2**32 - 1], which is
        # the seed range NumPy accepts. Scaling a single byte by 2**32 / 255
        # could produce 2**32 itself and offered only 256 distinct seeds.
        'random_seed': int.from_bytes(os.urandom(4), 'big'),
    }
    classifiers = [('LR', LogisticRegression())]
    oversampling_methods = [
        ('None', None),
        ('RandomOverSampler', RandomOverSampler()),
        ('SMOTE', SMOTE(), [{
            'k_neighbors': [3]
        }]),
        (
            'KMeansSMOTE', KMeansSMOTE(),
            [{
                'imbalance_ratio_threshold': [1],
                'density_power': [None],  # None corresponds to n_features
                'smote_args': [
                    {
                        'k_neighbors': 5
                    },
                ],
                'kmeans_args': [{
                    'n_clusters': 2
                }],
                'use_minibatch_kmeans': [True],
                'n_jobs': [-1]
            }])
    ]
    datasets = read_csv_dir(cfg['dataset_dir'])
    experiment = BinaryExperiment(
        datasets,
        classifiers,
        oversampling_methods,
        n_jobs=-1,
        experiment_repetitions=experiment_config['experiment_repetitions'],
        random_state=experiment_config['random_seed'],
        n_splits=experiment_config['n_splits'],
        scoring=[
            'geometric_mean_score', 'average_precision', 'roc_auc', 'f1',
            'fp', 'fn', 'tp', 'tn'
        ])

    with warnings.catch_warnings():
        # Raw string: the pattern is a regex and '\.' is not a valid escape
        # sequence in a normal string literal.
        warnings.filterwarnings(action='ignore',
                                message=r'Adapting smote_args\.k_neighbors')
        experiment.run()

    path = cfg['results_dir']

    # ``session_id`` is assigned in this function and is therefore a local
    # name; a global of the same name has to be read through globals(),
    # otherwise using it below raises UnboundLocalError.
    if 'session_id' in globals():
        session_id = globals()['session_id']
    else:
        session_id = (datetime.utcnow() +
                      timedelta(hours=2, minutes=0)).strftime("%Y-%m-%d %Hh%M")

    os.makedirs('{}/{}'.format(path, session_id))
    experiment.save('{}/{}/experiment.p'.format(path, session_id))

    # stringify oversampling methods
    experiment_config['oversampling_methods'] = re.sub(
        '\\n *', ' ', str(oversampling_methods))

    # save experiment config
    pd.Series(experiment_config).to_csv('{}/{}/experiment_config.csv'.format(
        path, session_id))
from kmeans_smote import KMeansSMOTE
from TestDataGeneration.data_generation import DataGenerator
from experiment import Experiment

if __name__ == "__main__":
    # Generate a training and a test set with the same size and ratio,
    # oversample the training set with k-means SMOTE, then print the
    # confusion-matrix counts on the test set for several neighbour counts.
    total_number = 10000
    ratio = 0.9

    train_generator = DataGenerator(total_number=total_number, ratio=ratio)
    data_train, label_train = train_generator.generate()
    test_generator = DataGenerator(total_number=total_number, ratio=ratio)
    data_test, label_test = test_generator.generate()

    kmeans_smote = KMeansSMOTE(
        ratio="minority",
        kmeans_args={"n_clusters": 20},
        smote_args={"k_neighbors": 10},
        imbalance_ratio_threshold=20,
    )
    X_resampled, Y_resampled = kmeans_smote.fit_sample(data_train, label_train)

    exp = Experiment(data=X_resampled, label=Y_resampled)

    # Odd neighbour counts 1, 3, ..., 17.
    for neighbor in range(1, 18, 2):
        counts = exp.get_confusion_matrix(
            data=data_test, label=label_test, n_neighbors=neighbor)
        true_posi, false_posi, true_neg, false_neg = counts
        print("true_posi:", true_posi, "false_posi:", false_posi,
              "true_neg:", true_neg, "false_neg:", false_neg)
import numpy as np
from kmeans_smote import KMeansSMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# We perform target encoding: the labels in ``y`` are replaced by the integer
# codes produced by LabelEncoder. ``ly`` is kept at module level so the fitted
# encoder stays available to later code.
ly = LabelEncoder()
y = ly.fit_transform(y)

# Class distribution before oversampling. A plain loop is used because the
# prints are side effects; a list comprehension would only build a throwaway
# list of None.
for label, count in zip(*np.unique(y, return_counts=True)):
    print('Class {} has {} instances'.format(label, count))

kmeans_smote = KMeansSMOTE(sampling_strategy='minority',
                           kmeans_args={'n_clusters': 100},
                           smote_args={'k_neighbors': 10})
X_resampled, y_resampled = kmeans_smote.fit_sample(X, y)

# Class distribution after oversampling.
for label, count in zip(*np.unique(y_resampled, return_counts=True)):
    print('Class {} has {} instances after oversampling'.format(label, count))

# Splitting Training and Test Set
# Since we have a very small dataset, we will train our model with all
# available data.
# NOTE(review): the split below still holds out 20% of the resampled data as
# a test set, which contradicts the sentence above - confirm which is meant.
x_train, x_test, y_train, y_test = train_test_split(X_resampled,
                                                    y_resampled,
                                                    test_size=0.2)
# Dataset A, plain SMOTE: the first two columns are passed as features and
# the third column as the target.
plot_before_after_oversampling(
    dataset_a.iloc[:, :2],
    dataset_a.iloc[:, 2],
    ('SMOTE', SMOTE()),
    'A',
)

# <markdowncell>

# ## Oversampling with k-means SMOTE

# <codecell>

# Reset NumPy's global seed before the run (presumably so the plot is
# reproducible).
np.random.seed(1)
plot_before_after_oversampling(
    dataset_a.iloc[:, :2],
    dataset_a.iloc[:, 2],
    (
        'k-means SMOTE',
        KMeansSMOTE(
            use_minibatch_kmeans=False,
            kmeans_args={'n_clusters': 6},
        ),
    ),
    'A',
    additional_text_after_oversampling='k = 6',
)

# <markdowncell>

# # Dataset B

# <codecell>

# b.csv is read without a header row.
dataset_b = pd.read_csv(
    os.path.join(cfg['dataset_dir'], 'b.csv'),
    header=None,
)

# <markdowncell>

# ## Oversampling with SMOTE

# <codecell>

np.random.seed(1)