Exemplo n.º 1
0
def test_multiclass():
    """Smoke-test MulticlassOversampling on the wine dataset.

    Exercises both the default strategy and the explicit
    'equalize_1_vs_many' strategy and checks that each produces a
    non-empty oversampled dataset.
    """
    dataset = datasets.load_wine()

    oversampler = sv.MulticlassOversampling(sv.distance_SMOTE())

    X_samp, y_samp = oversampler.sample(dataset['data'], dataset['target'])

    assert len(X_samp) > 0

    oversampler = sv.MulticlassOversampling(sv.distance_SMOTE(),
                                            strategy='equalize_1_vs_many')

    X_samp, y_samp = oversampler.sample(dataset['data'], dataset['target'])

    # Fix: the second strategy's output was computed but never checked,
    # so a regression in 'equalize_1_vs_many' could pass silently.
    assert len(X_samp) > 0
Exemplo n.º 2
0
    def balanceData(self):
        """Rebalance the training set with multiclass SMOTE.

        Flattens ``self.trainInputDict["data"]`` to 2-D, oversamples it
        against ``self.trainInputDict["label"]``, then restores the
        original (N, C, H, W)-style shape and returns float data plus
        labels as torch tensors.
        """
        orig_shape = self.trainInputDict["data"].shape
        print("trainInputDict[data].shape : ", orig_shape)
        flat = self.trainInputDict["data"].reshape(orig_shape[0], -1)
        print("copy.shape : ", flat.shape)
        features = flat.numpy()
        label_tensor = self.trainInputDict["label"]
        print("copyLabel.shape : ", label_tensor.shape)
        targets = label_tensor.numpy()

        oversampler = sv.MulticlassOversampling(sv.SMOTE(n_jobs=6))
        X_resampled, y_resampled = oversampler.sample(features, targets)

        for label, count in zip(*np.unique(y_resampled, return_counts=True)):
            print('Class {} has {} instances after oversampling'.format(
                label, count))

        # restore the original per-sample shape before handing back to torch
        restored = X_resampled.reshape(len(X_resampled), orig_shape[1],
                                       orig_shape[2], orig_shape[3])
        newData = torch.from_numpy(restored).float()
        newLabel = torch.from_numpy(y_resampled)
        return newData, newLabel
Exemplo n.º 3
0
def smote_data(images, labels):
    """Oversample an image dataset with Borderline-SMOTE2 and persist it.

    Flattens the images, oversamples with a multiclass wrapper, reshapes
    back to (N, H, W, C) uint8, writes each image as ./new_train/<i>.jpg
    and appends "<i>.jpg,<label+1>" rows to new_train_label.csv.

    Args:
        images: array-like of shape (N, H, W, C); assumed uint8-compatible
            pixel data — TODO confirm with callers.
        labels: array-like of N integer class labels (stored 1-based in
            the CSV and in the final histogram).
    """
    shape = np.shape(images)
    # NOTE: removed unused local `nums = shape[0] // 2` from the original.
    oversampler = sv.MulticlassOversampling(
        sv.Borderline_SMOTE2(proportion=0.7,
                             n_neighbors=3,
                             k_neighbors=3,
                             n_jobs=12))  # MDO
    X, y = oversampler.sample(np.reshape(images, (len(images), -1)), labels)
    X = X.reshape((len(y), shape[1], shape[2], shape[3])).astype(np.uint8)
    mkdir('new_train')
    # newline='' is required by the csv module to avoid spurious blank
    # rows on Windows when writing csv files
    with open('new_train_label.csv', 'a', encoding='utf-8', newline='') as f:
        f_csv = csv.writer(f)
        for i, x in enumerate(X):
            im = Image.fromarray(x)
            im.save('./new_train/' + str(i) + '.jpg', 'jpeg')
            f_csv.writerow([str(i) + '.jpg', y[i] + 1])

    print('org: %d -> x: %d' % (len(labels), len(y)))
    # per-class histogram of the oversampled labels (shifted by +1)
    ys = [0] * 10
    for i in y:
        ys[i + 1] += 1
    print(ys)
Exemplo n.º 4
0
def over_sample(X, y, len_data, random_seed=None):  # -> currently unused
    """Oversample under-represented classes to balance the class counts.

    Wraps distance_SMOTE in a multiclass oversampler and returns the
    resampled (X, y) pair.
    """
    sampler = sv.MulticlassOversampling(
        sv.distance_SMOTE(random_state=random_seed))
    return sampler.sample(X, y)
Exemplo n.º 5
0
def generate_multiclass_figures():
    """Plot the wine dataset before and after multiclass oversampling.

    Runs every oversampler that (a) does not change the majority class and
    (b) exposes a 'proportion' parameter, and saves one scatter plot of
    the first two features per oversampler under figures/.
    """
    oversamplers = sv.get_all_oversamplers()
    oversamplers = [
        o for o in oversamplers
        if not sv.OverSampling.cat_changes_majority in o.categories
        and 'proportion' in o().get_params()
    ]

    import sklearn.datasets as datasets

    dataset = datasets.load_wine()

    X = dataset['data']
    y = dataset['target']

    import matplotlib.pyplot as plt

    import sklearn.preprocessing as preprocessing

    ss = preprocessing.StandardScaler()

    X_ss = ss.fit_transform(X)

    def plot_and_save(X, y, filename, oversampler_name):
        # scatter the first two features, one colour/marker per class
        plt.figure(figsize=(4, 3))
        plt.scatter(X[y == 0][:, 0],
                    X[y == 0][:, 1],
                    c='r',
                    marker='o',
                    label='class 0')
        plt.scatter(X[y == 1][:, 0],
                    X[y == 1][:, 1],
                    c='b',
                    marker='P',
                    label='class 1')
        plt.scatter(X[y == 2][:, 0],
                    X[y == 2][:, 1],
                    c='green',
                    marker='x',
                    label='class 2')
        plt.xlabel('feature 0')
        plt.ylabel('feature 1')
        plt.title(", ".join(["wine dataset", oversampler_name]))
        plt.savefig(filename)
        plt.show()

    plot_and_save(X, y, 'figures/multiclass-base.png', "No Oversampling")

    for o in oversamplers:
        print(o.__name__)
        mcos = sv.MulticlassOversampling(o())
        X_samp, y_samp = mcos.sample(X_ss, y)
        # Fix: the filename previously lacked the '.png' extension,
        # inconsistent with the base figure saved above.
        plot_and_save(ss.inverse_transform(X_samp), y_samp,
                      "figures/multiclass-%s.png" % o.__name__, o.__name__)
 def oversampling_data(self):
     """Oversample every CV fold with each configured method and save CSVs.

     For each fold i, loads Fold_{i}_X_train.npy / Fold_{i}_y_train.npy
     from self.saving_dir, label-encodes the targets, then for each
     analysis group in self.groups_to_analyse runs every oversampling
     method in self.Groups[analysis], writing one CSV per method plus a
     combined CSV per analysis under self.oversampled_data_dir.
     """
     for i in range(1, self.no_of_splits + 1):
         train_data = np.load(
             os.sep.join([self.saving_dir,
                          'Fold_{}_X_train.npy'.format(i)]))
         train_data = train_data.reshape(-1, self.data_shape)
         train_onehot = np.load(
             os.sep.join([self.saving_dir,
                          'Fold_{}_y_train.npy'.format(i)]))
         le_encoder = LabelEncoder()
         y_train = le_encoder.fit_transform(train_onehot)
         for analysis in self.groups_to_analyse:
             group = self.Groups[analysis]
             X = []
             Y = []
             # Fix: exist_ok=True replaces a broad `except Exception`
             # that swallowed *every* makedirs error (permissions, bad
             # path, ...) while claiming the directory already existed.
             os.makedirs(os.sep.join([
                 self.oversampled_data_dir, 'fold_' + str(i), analysis
             ]),
                         exist_ok=True)
             for method in group.keys():
                 oversampler = sv.MulticlassOversampling(group[method])
                 x, y = oversampler.sample(train_data, y_train)
                 # accumulate all methods' output for the combined CSV
                 X.extend(x)
                 Y.extend(y)
                 x = pd.DataFrame(x)
                 x['label'] = y
                 x.to_csv(os.sep.join([
                     self.oversampled_data_dir, 'fold_' + str(i), analysis,
                     str(method) + '.csv'
                 ]),
                          index=False)
             X = pd.DataFrame(X)
             X['label'] = Y
             X.to_csv(os.sep.join([
                 self.oversampled_data_dir, 'fold_' + str(i), analysis,
                 str(analysis) + '.csv'
             ]),
                      index=False)
Exemplo n.º 7
0
# In[4]:

# show the per-class sample counts before oversampling

for label in np.unique(y):
    count = np.sum(y == label)
    print("class %d - samples: %d" % (label, count))

# ## Oversampling
#
# In this section multiclass oversampling is driven by the binary oversampler ```distance_SMOTE```.

# In[5]:

# pick an oversampler that supports multiclass oversampling

oversampler = sv.MulticlassOversampling(sv.distance_SMOTE())

# In[6]:

X_samp, y_samp = oversampler.sample(X, y)

# ## Illustrating the outcome

# In[7]:

# show the per-class sample counts after oversampling

for label in np.unique(y_samp):
    count = np.sum(y_samp == label)
    print("class %d - samples: %d" % (label, count))

# In[8]:
Exemplo n.º 8
0
    data_x_tst = pd.read_csv('nepal_earthquake_tst.csv')
    # Comprueba balanceo de clases
    # GraficoComprobarVar(data_y, "damage_grade")
    # Comprueba valores perdidos
    #ComprobarValPer(data_x)
    # Calcula la matriz de correlación
    #MatrizCorrelacion(data_x)
    # Elimina etiquetas
    eliminaLabels(data_x, data_y, data_x_tst, ['building_id'])
    # Preprocesado category to number

    X = catToNum(data_x).values
    y = np.ravel(data_y.values)
    X_tst = catToNum(data_x_tst).values

    oversampler = sv.MulticlassOversampling(sv.distance_SMOTE(proportion=0.75))

    X_sample, y_sample = oversampler.sample(X, y)
    '''
  print("------ RandomForest...")
  rfm = RandomForestClassifier(max_features = 'sqrt', criterion='gini', n_estimators=500, \
                               max_depth=25, random_state=76592621, \
                               n_jobs=-1)
  # Hago la validación cruzada para el algoritmo
  #rfm, y_train_clf, y_test_clf = validacion_cruzada(rfm, X_sample, y_sample)
  print("------ Generando submission...")
  submission(X_sample, y_sample, X_tst, rfm)
  '''

    print("------ Catboost...")
    cbc = CatBoostClassifier(n_estimators=450,
Exemplo n.º 9
0
                           validator= RepeatedStratifiedKFold(n_repeats= 8,
                                                              n_splits= 5))
print(results.T[['sampler', 'auc', 'gacc']])

# baseline cross-validation with NoSMOTE (i.e. no oversampling) for comparison
np.random.seed(random_seed)
results= sv.cross_validate(dataset= libras, 
                           sampler= sv.NoSMOTE(), 
                           classifier= KNeighborsClassifier(),
                           validator= RepeatedStratifiedKFold(n_repeats= 8,
                                                              n_splits= 5))
print(results.T[['sampler', 'auc', 'gacc']])

#%% running multiclass oversampling

# multiclass oversampling of the wine dataset using the
# 'equalize_1_vs_many_successive' strategy
np.random.seed(random_seed)
mc_oversampler= sv.MulticlassOversampling(sv.distance_SMOTE(), strategy= 'equalize_1_vs_many_successive')
X_os, y_os= mc_oversampler.sample(wine['data'], wine['target'])

# plot before/after; plot_mc is defined elsewhere — the trailing args are
# presumably feature indices and the output file name (verify at definition)
plot_mc(wine['data'], wine['target'], 'wine', 0, 1, 2, 'wine.eps')
plot_mc(X_os, y_os, 'wine oversampled by distance-SMOTE', 0, 1, 2, 'wine_distance_smote.eps')

#%% oversampler evaluation

import os.path

ecoli['name']= 'ecoli'
# cache directory for evaluation results, placed in the user's home
cache_path= os.path.join(os.path.expanduser('~'), 'smote_cache')

np.random.seed(random_seed)
results= sv.evaluate_oversamplers(datasets= [ecoli], 
                                  samplers= [sv.SPY,
Exemplo n.º 10
0
# preprocess train/test data; PreprocesadoDatos is defined elsewhere —
# presumably returns (train features, test features, train labels)
X, X_tst, y = PreprocesadoDatos(3, data_training, data_test, data_labels)


# Replace the value of n with the number of the model you want to run
n=1
    
print("##################################")
print("Ejecutando algoritmo número " + str(n))
print("##################################")
    
    
    
import smote_variants as sv
# alternative oversamplers tried during experimentation:
#oversampler = sv.ProWSyn(proportion=1.0, n_neighbors=5,L=5, theta=1.0, n_jobs=-1, random_state=2)
#oversampler= sv.MulticlassOversampling(sv.distance_SMOTE(random_state=2))
oversampler = sv.MulticlassOversampling(sv.polynom_fit_SMOTE(topology='star',random_state=2))
    
    
# X_sam and y_sam contain the oversampled dataset
X_sam, y_sam= oversampler.sample(X, y)
'''
import collections
height=[collections.Counter(y_sam)[1],collections.Counter(y_sam)[2],collections.Counter(y_sam)[3]]
print(height)
print(collections.Counter(y))
plt.bar( ['low damage','medium damage','high damage'], height,color=['blue', 'orange', 'green'])

colors = {'class1 - Smote Polynom/star':'blue', 'class2 - Smote Polynom/star':'orange', 'class3 - Smote Polynom/star':'green'}         
labels = list(colors.keys())
handles = [plt.Rectangle((1,0),-1,-1, color=colors[label]) for label in labels]