Exemplo n.º 1
0
def pca_components(X, y, X_train, y_train, X_test, y_test):
    """Print GaussianNB test accuracy for 1 to 10 PCA components.

    The training split is oversampled with ADASYN, a PCA is fitted on the
    oversampled training data, and the untouched test split is projected with
    that same PCA before a Gaussian naive Bayes classifier is scored on it.

    :param X: full feature set. Not used; kept so existing callers still work.
    :param y: full label set. Not used; kept so existing callers still work.
    :param X_train: training features.
    :param y_train: training labels.
    :param X_test: test features (never resampled).
    :param y_test: test labels.
    :return: None; one accuracy line is printed per component count.
    """
    # The sampler is seeded with a fixed integer, so resampling the training
    # split gives the same data for every component count: do it once instead
    # of once per iteration. The former resampling/PCA of the full (X, y) set
    # was dropped because its result was never used.
    sampler = ADASYN(random_state=2)
    X_train_sm, y_train_sm = sampler.fit_sample(X_train, y_train)

    for n in range(1, 11):
        # Fit the projection on training data only, then reuse it for the
        # test data so both live in the same component space.
        pca = PCA(n_components=n).fit(X_train_sm)
        X_train_sm_pca = pca.transform(X_train_sm)
        X_test_pca = pca.transform(X_test)

        clf = GaussianNB()
        clf.fit(X_train_sm_pca, y_train_sm)
        y_pred = clf.predict(X_test_pca)
        print("Accuracy score for %d components: %f" %
              (n, (accuracy_score(y_test, y_pred))))
Exemplo n.º 2
0
def dataSampling(dados, label):
    """Oversample the minority class with ADASYN.

    :param dados: feature data to resample.
    :param label: class labels matching ``dados``.
    :return: tuple ``(resampled features, resampled labels)``.
    """
    sampler = ADASYN(ratio='minority')
    resampled_data, resampled_label = sampler.fit_sample(dados, label)
    return resampled_data, resampled_label
Exemplo n.º 3
0
def _ANASYN(self):
    """Rebalance ``self.x_train`` / ``self.y_train`` in two steps.

    1. Resample with ``uns.InstanceHardnessThreshold``
       (``sampling_strategy=0.2``).
    2. Oversample the result with ADASYN (ADAptive SYNthetic sampling), which
       adaptively generates minority samples according to their distribution
       using K nearest neighbours, without assumptions about the underlying
       data distribution.

    After each step ``self.x_train`` and ``self.y_train`` are rebuilt as
    DataFrames; the label frame always gets the single column
    ``"Local Relapse Y(1) /N(0)"``. Intermediate results stay available as
    ``self.X_train_smote2`` / ``self.y_train_smote2`` (step 1) and
    ``self.X_train_smote`` / ``self.y_train_smote`` (step 2). Progress is
    reported with ``print``; nothing is returned.
    """
    print("before: ", len(self.x_train))
    # Step 1: instance-hardness-threshold resampling, seeded from self.seed.
    resampler = uns.InstanceHardnessThreshold(
        sampling_strategy=0.2, random_state=self.seed
    )
    self.X_train_smote2, self.y_train_smote2 = resampler.fit_resample(
        self.x_train, self.y_train
    )
    # Re-wrap as DataFrames, reusing the original feature column names.
    self.x_train = pd.DataFrame(self.X_train_smote2, columns=self.x_train.columns)
    self.y_train = pd.DataFrame(
        self.y_train_smote2, columns=["Local Relapse Y(1) /N(0)"]
    )
    print("after: ", len(self.x_train))

    # Step 2: ADASYN oversampling of the already reduced training set.
    adasyn = ADASYN(random_state=self.seed)
    self.X_train_smote, self.y_train_smote = adasyn.fit_sample(
        self.x_train, self.y_train
    )
    # NOTE(review): `[1]` presumably selects the second row of an array result;
    # confirm the return type of fit_sample for the installed version.
    print("X_train_SMOTE:\n", self.X_train_smote[1])

    self.x_train = pd.DataFrame(self.X_train_smote, columns=self.x_train.columns)
    self.y_train = pd.DataFrame(
        self.y_train_smote, columns=["Local Relapse Y(1) /N(0)"]
    )

    print("len smote: \n", len(self.X_train_smote))
    print("len new x_train: \n", len(self.x_train))

    # Rows of the rebuilt label frame whose label equals 1.
    number_pos_x = self.y_train.loc[self.y_train["Local Relapse Y(1) /N(0)"] == 1]
    print("number positive responses y_train:\n", len(number_pos_x))
Exemplo n.º 4
0
def over_sampling(x_train, y_train, model='ADASYN', ratio='minority'):
    """
    Generate synthetic samples for the minority class with the chosen model.
    It must only ever be applied to the training set.

    The number of neighbours is read from ``config.parameters["neighbors"]``.

    :param x_train: X training set (DataFrame).
    :param y_train: Y training set (DataFrame).
    :param model: 'ADASYN' or 'SMOTE'. Any non-string object is used as-is and
        must provide ``fit_sample``.
    :param ratio: sampling ratio forwarded to the sampler.
    :return: x_train and y_train oversampled, as DataFrames carrying the
        original column names.
    :raises ValueError: if ``model`` is a string other than 'ADASYN'/'SMOTE'.
    """
    # Fail early with a clear message instead of the former
    # "'str' object has no attribute 'fit_sample'".
    if isinstance(model, str) and model not in ('ADASYN', 'SMOTE'):
        raise ValueError(
            "model must be 'ADASYN' or 'SMOTE', got %r" % (model,))

    neighbors = config.parameters.get("neighbors")
    x_train_names = x_train.columns.values.tolist()
    y_train_names = y_train.columns.values.tolist()

    if model == 'ADASYN':
        sampler = ADASYN(random_state=42, ratio=ratio, n_neighbors=neighbors)
    elif model == 'SMOTE':
        # NOTE(review): m_neighbors='svm' looks like it was meant to be
        # kind='svm'; left unchanged pending confirmation.
        sampler = SMOTE(random_state=42,
                        ratio=ratio,
                        k_neighbors=neighbors,
                        m_neighbors='svm')
    else:
        # A pre-built sampler object was passed in.
        sampler = model

    x_resampled, y_resampled = sampler.fit_sample(x_train, y_train)

    # Pass the name lists directly: wrapping them in another list made pandas
    # build a one-level MultiIndex instead of plain column labels.
    x_train = pd.DataFrame(x_resampled, columns=x_train_names)
    y_train = pd.DataFrame(y_resampled, columns=y_train_names)

    return x_train, y_train
Exemplo n.º 5
0
def ada_model(X, y, names):
    """Evaluate an L1 logistic regression and a random forest on ADASYN data.

    The whole data set is oversampled with ADASYN, an L1-penalised
    ``LogisticRegressionCV`` is scored with 5-fold cross validation (ROC AUC,
    accuracy, recall), its coefficients are printed after fitting on a 75/25
    split, and the ten largest random-forest feature importances are drawn as
    a horizontal bar chart.

    NOTE(review): oversampling happens before the cross-validation split, so
    synthetic points may end up in validation folds — confirm this is intended.

    :param X: feature data.
    :param y: class labels.
    :param names: feature names, used as the index of the importance table.
    :return: None.
    """
    X_syn, y_syn = ADASYN(random_state=42).fit_sample(X, y)

    logistic = linear_model.LogisticRegressionCV(penalty='l1',
                                                 solver='liblinear')

    # Same estimator, same folds, one pass per metric (order preserved).
    scores = {}
    for metric in ('roc_auc', 'accuracy', 'recall'):
        scores[metric] = cross_val_score(logistic, X_syn, y_syn, cv=5,
                                         scoring=metric).mean()

    print("cross validation results:")
    print("-------------------------")
    print("auc:", scores['roc_auc'])
    print("acc:", scores['accuracy'])
    print("recall:", scores['recall'])

    X_train, X_test, y_train, y_test = train_test_split(
        X_syn, y_syn, test_size=0.25, random_state=1)
    print(logistic.fit(X_train, y_train).coef_)

    forest = RandomForestClassifier(max_depth=8, random_state=0)
    forest.fit(X_train, y_train)
    forest.predict(X_test)  # predictions are not used further

    importance = pd.DataFrame(forest.feature_importances_,
                              columns=['Feature Importance'])
    importance.index = names
    ranked = importance.sort_values('Feature Importance', ascending=True)
    # The last ten rows of the ascending sort are the ten largest importances.
    top_features = ranked[len(names) - 10:len(names)]
    top_features.plot.barh(figsize=(8, 16))
Exemplo n.º 6
0
def resampling(datadict, labldict, savepath):
  """Oversample every domain with ADASYN and save the samples as JPEG images.

  For each domain in ``datadict`` the data in ``['X']`` / ``['Y']`` is
  resampled to a fixed per-class target (145 for 'amazon', 100 for 'dslr' and
  'webcam', label codes 0..30). Every resulting sample is reshaped to
  256x256x3 and written to ``savepath/<domain>/<label name>/img_<k>.jpg``,
  where ``k`` counts from 1 within each label and is zero-padded to the width
  of that label's sample count.

  :param datadict: mapping domain -> {'X': samples, 'Y': label codes}; the
      code indexes 'amazon', 'dslr' and 'webcam' unconditionally.
  :param labldict: mapping domain -> (label code -> label name).
  :param savepath: root output directory.
  :return: None.
  """
  ratiodic = {}
  for domnitem in datadict:
    ratiodic[domnitem] = {}
  for lablcode in range(0, 31):
    ratiodic['amazon'][lablcode] = 145
    ratiodic['dslr'][lablcode]   = 100
    ratiodic['webcam'][lablcode] = 100

  for domnitem in datadict:
    sorcdata = datadict[domnitem]['X']
    sorclabl = datadict[domnitem]['Y']
    print('Resampling data in domain {}'.format(domnitem))
    adasyn = ADASYN(ratio = ratiodic[domnitem], random_state = 42)
    targdata, targlabl = adasyn.fit_sample(sorcdata, sorclabl)
    print('Saving data in domain {}'.format(domnitem))

    # Count ALL samples per label. The previous version started each count at
    # 0 for the first sample, so a label with exactly 10/100/1000 samples got
    # a padding width one digit too short (img_01 ... img_99, img_100).
    lablcout = {}
    for lablcode in targlabl:
      lablcout[lablcode] = lablcout.get(lablcode, 0) + 1

    lablnumb = {}
    for targimag, lablcode in zip(targdata, targlabl):
      lablname = labldict[domnitem][lablcode]
      lablnumb[lablcode] = lablnumb.get(lablcode, 0) + 1
      strsleng = len(str(lablcout[lablcode]))
      numbstrs = str(lablnumb[lablcode]).zfill(strsleng)
      targpath = os.path.join(savepath, domnitem, lablname)
      if not os.path.exists(targpath): os.makedirs(targpath)
      imagpath = os.path.join(targpath, 'img_' + numbstrs)
      targimag = targimag.reshape(256, 256, 3)
      cv2.imwrite(imagpath + '.jpg', targimag)
Exemplo n.º 7
0
def plot_roc_curves(X, y):
    """Plot validation ROC curves for three classifiers trained on ADASYN data.

    The data is split 80/20, the training part is oversampled with ADASYN,
    and logistic regression, Bernoulli naive Bayes and an SVC are each fitted
    on it. Their ROC curves on the untouched validation part are drawn on one
    figure, which is saved as 'ROC Curves for top 3 Contending Models'.

    :param X: feature data.
    :param y: binary class labels.
    :return: None.
    """
    plt.figure(figsize=(10, 6))
    lw = 2

    # Hold out the validation set first so that only training data is resampled.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=.2, random_state=0)
    X_oversampled_train, y_oversampled_train = ADASYN(
        random_state=44).fit_sample(X_train, y_train)

    # (estimator, line colour, legend name), in plotting order.
    contenders = (
        (LogisticRegression(max_iter=5000, n_jobs=-1, random_state=44),
         'b', 'Logistic Regression'),
        (BernoulliNB(), 'r', 'Bernoulli Naive Bayes'),
        (SVC(probability=True, random_state=1), 'g', 'SVC'),
    )

    for estimator, colour, title in contenders:
        estimator.fit(X_oversampled_train, y_oversampled_train)
        # Column 1 holds the predicted probability of the positive class.
        positive_scores = estimator.predict_proba(X_val)[:, 1]

        fpr, tpr, _ = roc_curve(y_val, positive_scores)
        model_auc = roc_auc_score(y_val, positive_scores)
        plt.plot(fpr, tpr, color=colour, lw=lw,
                 label=f'{title}, AUC: {model_auc:.4f}')

    plt.plot([0, 1], [0, 1], c='violet', ls='--', label='Chance Line')
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])

    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC Curves for top 3 Contending Models')
    plt.legend(loc='lower right', prop={'size': 10}, frameon=True)
    plt.savefig('ROC Curves for top 3 Contending Models')
Exemplo n.º 8
0
def test_ada_fit_sample_half():
    """Check fit_sample against the stored ground truth for ratio=0.8."""
    sampler = ADASYN(ratio=0.8, random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    expected_X = np.array([
        [0.11622591, -0.0317206], [0.77481731, 0.60935141],
        [1.25192108, -0.22367336], [0.53366841, -0.30312976],
        [1.52091956, -0.49283504], [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342], [0.3084254, 0.33299982],
        [0.70472253, -0.73309052], [0.28893132, -0.38761769],
        [1.15514042, 0.0129463], [0.88407872, 0.35454207],
        [1.31301027, -0.92648734], [-1.11515198, -0.93689695],
        [-0.18410027, -0.45194484], [0.9281014, 0.53085498],
        [-0.14374509, 0.27370049], [-0.41635887, -0.38299653],
        [0.08711622, 0.93259929], [1.70580611, -0.11219234],
    ])
    expected_y = np.array([
        0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0
    ])

    assert_array_almost_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
Exemplo n.º 9
0
def test_ada_fit_sample_nn_obj():
    """Check fit_sample when n_neighbors is a NearestNeighbors object."""
    neighbours = NearestNeighbors(n_neighbors=6)
    sampler = ADASYN(random_state=RND_SEED, n_neighbors=neighbours)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    expected_X = np.array([
        [0.11622591, -0.0317206],
        [0.77481731, 0.60935141],
        [1.25192108, -0.22367336],
        [0.53366841, -0.30312976],
        [1.52091956, -0.49283504],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.3084254, 0.33299982],
        [0.70472253, -0.73309052],
        [0.28893132, -0.38761769],
        [1.15514042, 0.0129463],
        [0.88407872, 0.35454207],
        [1.31301027, -0.92648734],
        [-1.11515198, -0.93689695],
        [-0.18410027, -0.45194484],
        [0.9281014, 0.53085498],
        [-0.14374509, 0.27370049],
        [-0.41635887, -0.38299653],
        [0.08711622, 0.93259929],
        [1.70580611, -0.11219234],
        [0.29427267, 0.21740707],
        [0.68118697, -0.25220353],
        [1.37180201, 0.37279378],
        [-0.59243851, -0.80715327],
    ])
    expected_y = np.array([
        0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0,
        0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0,
    ])

    assert_array_almost_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
Exemplo n.º 10
0
def synthetic_balance(data):
    """
    Balance the classes of a dataframe with the ADASYN algorithm:
    http://sci2s.ugr.es/keel/pdf/algorithm/congreso/2008-He-ieee.pdf

    The mean of the TARGET column is printed before and after resampling. If
    ADASYN raises ``ValueError`` the data is passed through unchanged.

    :param data: the dataframe, including the TARGET column
    :return: dataframe with the feature columns followed by TARGET
    """
    features = data.drop(TARGET, axis=1)
    target = data[TARGET]

    print('unbalanced positive weight: ' + str(np.mean(target)))

    sampler = ADASYN()
    try:
        features, target = sampler.fit_sample(features, target)
    except ValueError:
        # Deliberate best effort, e.g. "No samples will be generated with the
        # provided ratio settings.": keep the original data.
        pass

    print('balanced positive weight: ' + str(np.mean(target)))

    feature_names = list(data.columns)
    feature_names.remove(TARGET)
    balanced = pd.DataFrame(features, columns=feature_names)
    balanced.loc[:, TARGET] = target
    return balanced
def over_sampling(xTrain, yTrain, model='ADASYN', neighbors=200):
    """
    Generate synthetic samples for the minority class with the chosen model.
    It must only ever be applied to the training set.

    :param xTrain: X training set (DataFrame).
    :param yTrain: Y training set.
    :param model: 'ADASYN' or 'SMOTE'. Any non-string object is used as-is and
        must provide ``fit_sample``.
    :param neighbors: number of nearest neighbours used to construct synthetic
        samples.
    :return: xTrain and yTrain oversampled, as DataFrames; the label frame has
        the single column 'target'.
    :raises ValueError: if ``model`` is a string other than 'ADASYN'/'SMOTE'.
    """
    # Fail early with a clear message instead of the former
    # "'str' object has no attribute 'fit_sample'".
    if isinstance(model, str) and model not in ('ADASYN', 'SMOTE'):
        raise ValueError(
            "model must be 'ADASYN' or 'SMOTE', got %r" % (model,))

    xTrainNames = xTrain.columns.values.tolist()
    yTrainNames = ['target']

    if model == 'ADASYN':
        sampler = ADASYN(random_state=42,
                         ratio='minority',
                         n_neighbors=neighbors)
    elif model == 'SMOTE':
        # NOTE(review): m_neighbors='svm' looks like it was meant to be
        # kind='svm'; left unchanged pending confirmation.
        sampler = SMOTE(random_state=42,
                        ratio='minority',
                        k_neighbors=neighbors,
                        m_neighbors='svm')
    else:
        # A pre-built sampler object was passed in.
        sampler = model

    xResampled, yResampled = sampler.fit_sample(xTrain, yTrain)

    # Pass the name lists directly: wrapping them in another list made pandas
    # build a one-level MultiIndex instead of plain column labels.
    xTrain = pd.DataFrame(xResampled, columns=xTrainNames)
    yTrain = pd.DataFrame(yResampled, columns=yTrainNames)

    return xTrain, yTrain
Exemplo n.º 12
0
def ADASYN_oversampling(X, y):
    """Oversample ``X``/``y`` with a default-configured ADASYN sampler.

    :param X: independent variables (DataFrame).
    :param y: dependent variable (pandas DataFrame format).
    :return: tuple ``(X, y)`` after resampling.
    """
    X_resampled, y_resampled = ADASYN().fit_sample(X, y)
    return (X_resampled, y_resampled)
Exemplo n.º 13
0
def test_ada_fit_sample_nn_obj():
    """Check fit_sample when n_neighbors is a NearestNeighbors object."""
    neighbours = NearestNeighbors(n_neighbors=6)
    sampler = ADASYN(random_state=RND_SEED, n_neighbors=neighbours)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    expected_X = np.array([
        [0.11622591, -0.0317206],
        [0.77481731, 0.60935141],
        [1.25192108, -0.22367336],
        [0.53366841, -0.30312976],
        [1.52091956, -0.49283504],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.3084254, 0.33299982],
        [0.70472253, -0.73309052],
        [0.28893132, -0.38761769],
        [1.15514042, 0.0129463],
        [0.88407872, 0.35454207],
        [1.31301027, -0.92648734],
        [-1.11515198, -0.93689695],
        [-0.18410027, -0.45194484],
        [0.9281014, 0.53085498],
        [-0.14374509, 0.27370049],
        [-0.41635887, -0.38299653],
        [0.08711622, 0.93259929],
        [1.70580611, -0.11219234],
        [0.29427267, 0.21740707],
        [0.68118697, -0.25220353],
        [1.37180201, 0.37279378],
        [-0.59243851, -0.80715327],
    ])
    expected_y = np.array([
        0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0,
        0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0,
    ])

    assert_allclose(X_resampled, expected_X, rtol=R_TOL)
    assert_array_equal(y_resampled, expected_y)
Exemplo n.º 14
0
def oversample_ADASYN(X, y, ratio=0.15):
    """ Oversample minority class using the ADASYN algorithm

    Arguments:
        X (2d array-like): feature set
        y (1d array-like): target values
        ratio (float): desired ratio between minority and majority (optional)

    Return:
        X_os (2d array-like): oversampled feature set
        y_os (1d array-like): oversampled target values

    Example:
        X_train_os, y_train_os = models.oversample_ADASYN(X_train, y_train, 0.3)
    """

    # construct the ADASYN object (not named `os`, which would shadow the
    # standard-library module of that name inside this function)
    sampler = ADASYN(sampling_strategy=ratio, n_neighbors=5, random_state=42)

    # oversample X and y data
    X_os, y_os = sampler.fit_sample(X, y)

    # len() works for numpy arrays as well as pandas Series; the previous
    # y_os.count() only exists on pandas objects and failed for array output.
    # The printed share assumes the minority class is labelled 1.
    minority_pct = 100 * sum(y_os) / len(y_os)
    print('Oversampled minority-ratio of: {:3.1f}%'.format(minority_pct))

    return X_os, y_os
Exemplo n.º 15
0
def test_ada_fit_sample_nn_obj():
    """Check fit_sample when n_neighbors is a NearestNeighbors object."""
    neighbours = NearestNeighbors(n_neighbors=6)
    sampler = ADASYN(random_state=RND_SEED, n_neighbors=neighbours)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    expected_X = np.array([
        [0.11622591, -0.0317206], [0.77481731, 0.60935141],
        [1.25192108, -0.22367336], [0.53366841, -0.30312976],
        [1.52091956, -0.49283504], [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342], [0.3084254, 0.33299982],
        [0.70472253, -0.73309052], [0.28893132, -0.38761769],
        [1.15514042, 0.0129463], [0.88407872, 0.35454207],
        [1.31301027, -0.92648734], [-1.11515198, -0.93689695],
        [-0.18410027, -0.45194484], [0.9281014, 0.53085498],
        [-0.14374509, 0.27370049], [-0.41635887, -0.38299653],
        [0.08711622, 0.93259929], [1.70580611, -0.11219234],
        [0.94899098, -0.30508981], [0.28204936, -0.13953426],
        [1.58028868, -0.04089947], [0.66117333, -0.28009063],
    ])
    expected_y = np.array([
        0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0,
        0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0,
    ])

    assert_allclose(X_resampled, expected_X, rtol=R_TOL)
    assert_array_equal(y_resampled, expected_y)
Exemplo n.º 16
0
def augment_data_adasyn(input_data, desired_samples=50):
    """
    Augment the training data with synthetic samples generated by ADASYN.
    See the documentation:  http://contrib.scikit-learn.org/imbalanced-learn/stable/generated/imblearn.over_sampling.ADASYN.html # noqa

    A user warning such as "The number of samples in class x will be larger
    than the number of samples in the majority class" is to be expected and
    can be ignored: ADASYN is used here to augment data, not to correct for
    class imbalance.

    Args:
        input_data (tuple): x_train, y_train, x_test, y_test
        desired_samples (int): The number of samples to be added to each class.

    Returns:
        tuple: x_train, y_train, x_test, y_test, with samples added to x_train
               and y_train.
    """
    x_train, y_train = input_data[0], input_data[1]
    x_test, y_test = input_data[2], input_data[3]

    # Target size per class = current size + requested number of extra samples.
    classes, counts = np.unique(y_train, return_counts=True)
    ratio = {
        label: count + desired_samples
        for label, count in zip(classes, counts)
    }

    sampler = ADASYN(ratio=ratio)
    x_train, y_train = sampler.fit_sample(x_train, y_train)
    return (x_train, y_train, x_test, y_test)
Exemplo n.º 17
0
def apply_simple_adasyn(X, y):
    """Oversample the minority class with ADASYN and print class counts.

    The label distribution is printed before and after resampling.

    :param X: feature data.
    :param y: class labels.
    :return: tuple ``(X_smt, y_smt)`` with the resampled data.
    """
    from imblearn.over_sampling import ADASYN
    from collections import Counter

    sampler = ADASYN(sampling_strategy='minority')
    print(Counter(y))
    resampled_X, resampled_y = sampler.fit_sample(X, y)
    print(Counter(resampled_y))
    return resampled_X, resampled_y
Exemplo n.º 18
0
def makeOverSamplesADASYN(X, y):
    """Oversample ``X``/``y`` with a default-configured ADASYN sampler.

    :param X: independent variables (DataFrame).
    :param y: dependent variable (pandas DataFrame format).
    :return: tuple ``(X, y)`` after resampling.
    """
    from imblearn.over_sampling import ADASYN

    sampler = ADASYN()
    X_resampled, y_resampled = sampler.fit_sample(X, y)
    return (X_resampled, y_resampled)
Exemplo n.º 19
0
def ADASYN_oversampling(x, y):
    """Oversample with seeded ADASYN, printing class counts before and after.

    :param x: feature data.
    :param y: class labels.
    :return: tuple ``(x_sampled, y_sampled)``.
    """
    print('Original dataset shape {}'.format(Counter(y)))

    sampler = ADASYN(random_state=42)
    x_sampled, y_sampled = sampler.fit_sample(x, y)

    resampled_counts = Counter(y_sampled)
    print('With ADASYN sampled dataset shape {}'.format(resampled_counts))

    return x_sampled, y_sampled
 def ADASYNOversampling(self, featureMatrix, Labels):
     """Oversample a feature matrix and its labels with seeded ADASYN.

     :param featureMatrix: feature data.
     :param Labels: class labels matching ``featureMatrix``.
     :return: tuple ``(resampled features, resampled labels)``.
     """
     sampler = ADASYN(random_state=42)
     resampled = sampler.fit_sample(featureMatrix, Labels)
     feature_Resampled, Labels_Resampled = resampled
     return feature_Resampled, Labels_Resampled
Exemplo n.º 21
0
def makeOverSamplesADASYN(X, y):
    """
    Build an oversampled copy of the data with a default ADASYN sampler.
    @param X: independent variables in a DataFrame
    @param y: dependent variable in pandas DataFrame format
    @return: the oversampled ``(X, y)`` pair
    """
    oversampled_X, oversampled_y = ADASYN().fit_sample(X, y)
    return oversampled_X, oversampled_y
Exemplo n.º 22
0
def oversample_dataset(X, y):
    """Under-sample class 0.0 to 700 samples, then oversample with ADASYN.

    :param X: feature data.
    :param y: class labels (class ``0.0`` is the one reduced).
    :return: tuple ``(X_rs, y_rs)`` with the rebalanced data.
    """
    reducer = RandomUnderSampler(sampling_strategy={0.0: 700})
    X_reduced, y_reduced = reducer.fit_sample(X, y)

    X_rs, y_rs = ADASYN(random_state=42).fit_sample(X_reduced, y_reduced)
    return X_rs, y_rs
Exemplo n.º 23
0
def resample_data(x, y, sample_choice=RUS_CONSTANT):
    """Resample ``x``/``y`` with the strategy selected by ``sample_choice``.

    :param x: feature data.
    :param y: class labels.
    :param sample_choice: SMOTE_CONSTANT, ADASYN_CONSTANT or RUS_CONSTANT
        (random under-sampling, the default). Any other value leaves the data
        untouched.
    :return: tuple ``(x, y)``.
    """
    if sample_choice == SMOTE_CONSTANT:
        sampler_cls = SMOTE
    elif sample_choice == ADASYN_CONSTANT:
        sampler_cls = ADASYN
    elif sample_choice == RUS_CONSTANT:
        sampler_cls = RandomUnderSampler
    else:
        # Unknown choice: pass the data through unchanged.
        return x, y

    x, y = sampler_cls(random_state=42).fit_sample(x, y)
    return x, y
Exemplo n.º 24
0
def test_ada_fit_sample():
    """Check fit_sample output against the ground truth stored in data/."""
    sampler = ADASYN(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Ground-truth arrays live in the 'data' folder next to this test file.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    expected_X = np.load(os.path.join(data_dir, 'ada_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'ada_y.npy'))

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
Exemplo n.º 25
0
def test_ada_fit_sample():
    """Check fit_sample output against the ground truth stored in data/."""
    sampler = ADASYN(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Ground-truth arrays live in the 'data' folder next to this test file.
    here = os.path.dirname(os.path.abspath(__file__))
    x_path = os.path.join(here, 'data', 'ada_x.npy')
    y_path = os.path.join(here, 'data', 'ada_y.npy')
    expected_X = np.load(x_path)
    expected_y = np.load(y_path)

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
Exemplo n.º 26
0
 def runADASYN(self):
     """Rebalance ``self.X``/``self.Y`` with ADASYN and record the result.

     The resampled data is stored in ``self.Xadasyn``/``self.Yadasyn`` and,
     together with ``self.featureList``, under ``self.rebalanced['ADASYN']``.
     Class counts before and after are emitted through ``self.log``.
     """
     sampler = ADASYN()
     self.Xadasyn, self.Yadasyn = sampler.fit_sample(self.X, self.Y)

     entry = {'X': self.Xadasyn, 'y': self.Yadasyn, 'f': self.featureList}
     self.rebalanced['ADASYN'] = entry

     original_msg = 'ADASYN: Original dataset shape {}'.format(
         Counter(self.Y))
     self.log.emit(original_msg, indents=1)

     resampled_msg = 'ADASYN: Resampled dataset shape {}'.format(
         Counter(self.Yadasyn))
     self.log.emit(resampled_msg, indents=1)
Exemplo n.º 27
0
 def balanceDataset(self,train):
     """Oversample the minority class of ``train`` with seeded ADASYN.

     :param train: DataFrame containing a "TARGET" column plus features.
     :return: DataFrame with the resampled feature columns followed by the
         resampled "TARGET" column.
     """
     from imblearn.over_sampling import ADASYN

     sampler = ADASYN(random_state=10, ratio="minority")

     # Boolean column masks split the frame into features and target.
     feature_mask = train.columns != "TARGET"
     target_mask = train.columns == "TARGET"
     x = train.loc[:, feature_mask]
     y = train.loc[:, target_mask]

     X, Y = sampler.fit_sample(x, y)

     resampled_features = pd.DataFrame(X, columns=x.columns)
     resampled_target = pd.DataFrame(Y, columns=y.columns)
     return pd.concat([resampled_features, resampled_target], axis=1)
Exemplo n.º 28
0
def cross_validate(X, y, model):
    """Train and evaluate the chosen model on one ADASYN-balanced split.

    The data is split into train and test parts; only the training part is
    balanced with ADASYN before being handed to the model routine.

    :param X: feature data.
    :param y: class labels.
    :param model: 'RF', 'GBC' or 'ABC'.
    :return: whatever the selected model routine returns; ``None`` (after
        printing a hint) for any other ``model`` value.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    # Balance the training data only; the test split stays untouched.
    balancer = ADASYN(random_state=10)
    X_train_b, y_train_b = balancer.fit_sample(X_train, y_train)
    split = (X_train_b, y_train_b, X_test, y_test)

    if model == 'RF':
        return RF(*split)
    if model == 'GBC':
        return GBC(*split)
    if model == 'ABC':
        return ABC(*split)
    print('Enter a valid model')
Exemplo n.º 29
0
def oversample(X: pd.DataFrame, y: pd.DataFrame, technique: str = 'adasyn'):
    """
    Oversamples the minority class to balance the classes
    :param X: unbalanced dataset as a dataframe
    :param y: labels for the dataset
    :param technique: either 'smote' or 'adasyn' (case-insensitive)
    :return: the balanced dataset and labels
    :raises ValueError: if ``technique`` is not one of the supported names
    """
    # Compare by value: `is` tests object identity and only appeared to work
    # because CPython happens to intern short string literals.
    name = technique.lower() if isinstance(technique, str) else technique
    if name == 'adasyn':
        os_method = ADASYN()
    elif name == 'smote':
        os_method = SMOTE()
    else:
        # Previously this fell through to an UnboundLocalError on os_method.
        raise ValueError(
            "technique must be 'adasyn' or 'smote', got %r" % (technique,))
    X, y = os_method.fit_sample(X, y)
    return X, y
    def runAdasyn(self, ensem_folder, model_h5, save_dir):
        """Oversample every ensemble in the autoencoder's latent space.

        For each ensemble index below ``self.Config.NUM_ENSEMBLES`` the data
        and one-hot labels are loaded from ``ensem_folder``, the data is
        encoded, ADASYN oversamples the minority class on the encoded data,
        the result is decoded again and saved to ``save_dir`` together with
        two-column one-hot labels.

        :param ensem_folder: path prefix of the input ``ensem_dat<i>.npy`` /
            ``ensem_lab<i>.npy`` files (concatenated as-is, so it needs its
            trailing separator).
        :param model_h5: model file handed to ``self.loadAutoencoder``.
        :param save_dir: output path prefix; created if missing.
        :return: None.
        """
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        # build and load models (only encoder and decoder are used below)
        autoencoder, encoder, decoder = self.loadAutoencoder(model_h5)

        for ensem in range(self.Config.NUM_ENSEMBLES):
            file_suffix = str(ensem) + '.npy'

            dat = np.load(ensem_folder + 'ensem_dat' + file_suffix)
            lab = np.load(ensem_folder + 'ensem_lab' + file_suffix)
            encoded = encoder.predict(dat)

            # 3-D labels: keep only the last entry along axis 1, then turn
            # one-hot rows into class indices.
            if len(lab.shape) == 3:
                lab = lab[:, -1, :]
            lab = np.argmax(lab, axis=1)

            # run adasyn
            print(ensem)
            print('run ADASYN')
            sampler = ADASYN(ratio='minority', random_state=42)

            print('fit smote object for ensem ' + str(ensem))
            x_res, y_res = sampler.fit_sample(encoded, lab)

            x_syn = decoder.predict(x_res)

            # Back to two-column one-hot: class 0 -> [1, 0], others -> [0, 1].
            y_res_ = np.array([
                np.array([1, 0]) if label == 0 else np.array([0, 1])
                for label in y_res
            ])

            # save data
            print('save ensem ' + str(ensem))
            np.save(save_dir + 'ensem_dat' + file_suffix, x_syn)
            np.save(save_dir + 'ensem_lab' + file_suffix, y_res_)

        return
Exemplo n.º 31
0
def balance_classes_adasyn(X, y, ratio='auto', random_state=None, k=5):
    """
    Balance the class distribution with the Adaptive Synthetic Sampling
    Approach for Imbalanced Learning (ADASYN).
    :param X: Feature data
    :param y: Class labels
    :param ratio: (str/float) If 'auto', the ratio is chosen automatically to
        balance the dataset. Otherwise it is the number of samples in the
        minority class over the number of samples in the majority class.
    :param random_state: (None/Int) If int, seed used for random number generator
    :param k: (int) Number of nearest neighbors used to construct synthetic samples
    :return: Data set with synthetic samples added
    """
    sampler = ADASYN(ratio=ratio, random_state=random_state, n_jobs=1, k=k)
    resampled_X, resampled_y = sampler.fit_sample(X, y)
    return resampled_X, resampled_y
Exemplo n.º 32
0
def sensor_balancing(X_train, y_train):
    """Drop very rare classes, then oversample all non-majority classes.

    Classes with three or fewer samples are removed first because the
    sampler cannot handle them; ADASYN (2 neighbours) then oversamples every
    class except the majority.

    :param X_train: training features, indexed like ``y_train``.
    :param y_train: training labels as a pandas Series.
    :return: tuple ``(X_train DataFrame, y_train Series)`` after resampling.
    """
    # Identify classes with at most three samples and drop their rows.
    class_counts = y_train.value_counts()
    rare_classes = class_counts[class_counts <= 3].index.values
    y_train = y_train[~y_train.isin(rare_classes)]
    kept_rows = X_train.index.isin(list(y_train.index))
    X_train = pd.DataFrame(X_train[kept_rows])

    y_train = pd.Series(y_train)
    feature_names = pd.DataFrame(X_train).columns.values

    # Perform oversampling
    sampler = ADASYN(sampling_strategy='not majority', n_neighbors=2, n_jobs=1)
    X_train, y_train = sampler.fit_sample(X_train, y_train)

    X_train = pd.DataFrame(X_train, columns=list(feature_names))
    return X_train, pd.Series(y_train)
Exemplo n.º 33
0
def oversample(X, y, bal_strategy):

	if(bal_strategy == "SMOTESVN"  or bal_strategy == "ALL"):
		# Apply SMOTE SVM
		sm = SMOTE(kind='svm')
		X_sampled, y_sampled = sm.fit_sample(X, y)

		print 'Shape of X_sampled: ', X_sampled.shape
		print 'Shape of y_sampled: ', y_sampled.shape

	elif(bal_strategy == "SMOTE"  or bal_strategy == "ALL"):
		# Apply regular SMOTE
		sm = SMOTE(kind='regular')
		X_sampled, y_sampled = sm.fit_sample(X, y)

		print 'Shape of X_sampled: ', X_sampled.shape
		print 'Shape of y_sampled: ', y_sampled.shape

	elif(bal_strategy == "ADASYN"  or bal_strategy == "ALL"):
	# Apply the random over-sampling
		ada = ADASYN()
		X_sampled, y_sampled = ada.fit_sample(X, y)

		print 'Shape of X_sampled: ', X_sampled.shape
		print 'Shape of y_sampled: ', y_sampled.shape

	elif(bal_strategy == 'NONE'):
		X_sampled = X
		y_sampled = y

		print 'Shape of X_sampled: ', X_sampled.shape
		print 'Shape of y_sampled: ', y_sampled.shape

	else:
		print 'bal_stragegy not in SMOTESVN, SMOTE, ADASYN, ALL, NONE'
		sys.exit(1)


	return (X_sampled, y_sampled)
Exemplo n.º 34
0
from imblearn.over_sampling import ADASYN

# Generate an imbalanced two-class dataset (class weights 0.1 / 0.9)
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=5000, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform X to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply ADASYN over-sampling, then project the resampled data with the PCA
# that was fitted on the original data
ada = ADASYN()
X_resampled, y_resampled = ada.fit_sample(X, y)
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)

# Left panel: original data, one scatter per class
ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[2], linewidth=0.15)
ax1.set_title('Original set')

# Right panel: resampled data
ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5, edgecolor=almost_black,
            facecolor=palette[0], linewidth=0.15)
ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
Exemplo n.º 35
0
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    
# Define X (every column except 'state') and y (the 'state' column)
X, y = data.loc[:,data.columns != 'state'].values, data.loc[:,data.columns == 'state'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
# test_size sets the share of the data held out for testing.
# sklearn.model_selection.train_test_split randomly splits the data into a
# training set and a test set:
# train_test_split(train_data, train_target, test_size=<number>, random_state=0)

# ADASYN: oversample the training split only
ada = ADASYN()
os_X,os_y = ada.fit_sample(X_train,y_train)
os_X = pd.DataFrame(os_X)
os_y = pd.DataFrame(os_y)

# Logistic regression: pick C with printing_Kfold_scores, then fit with L1
best_c = printing_Kfold_scores(os_X,os_y)
clf_l = LogisticRegression(C = best_c, penalty = 'l1')
clf_l.fit(os_X,os_y.values.ravel())
y_pred = clf_l.predict(X_test)
# ravel() is called above to turn the label matrix into a 1-D array.
# (Difference between ravel() and flatten():)
# both do the same job (reduce a multi-dimensional array to one dimension);
# they differ in whether a copy or a view is returned:
# numpy.flatten() returns a copy, so changes to it do not affect the original
# matrix, while numpy.ravel() returns a view (where possible), so changes to
# it do affect the original matrix.
y_true, y_pred = y_test, clf_l.predict(X_test)
Exemplo n.º 36
0
def test_ada_wrong_nn_obj():
    """fit_sample must reject an unsupported n_neighbors value."""
    sampler = ADASYN(random_state=RND_SEED, n_neighbors='rnd')
    with raises(ValueError, match="has to be one of"):
        sampler.fit_sample(X, Y)
Exemplo n.º 37
0
def test_ada_fit_ratio_error():
    """fit_sample must raise when the ratio generates no new samples."""
    sampler = ADASYN(ratio={0: 9, 1: 12}, random_state=RND_SEED)
    with raises(ValueError, match="No samples will be generated."):
        sampler.fit_sample(X, Y)