Exemplos de ADASYN.fit_sample em Python, exemplos de imblearn.over_sampling.ADASYN.fit_sample em Python

Exemplo n.º 1

0

Exibir arquivo

def pca_components(X, y, X_train, y_train, X_test, y_test):
    """Print Gaussian naive Bayes test accuracy for 1 to 10 PCA components.

    The training split is oversampled once with ADASYN. For every component
    count ``n`` in ``1..10`` a PCA is fitted on the oversampled training data,
    the same projection is applied to the untouched test split, and a
    ``GaussianNB`` classifier is trained and scored on the projections.

    :param X: full feature matrix. Unused; kept so existing callers still work.
    :param y: full label vector. Unused; kept so existing callers still work.
    :param X_train: training features.
    :param y_train: training labels.
    :param X_test: test features (never resampled).
    :param y_test: test labels.
    :return: None; one accuracy line is printed per component count.
    """
    # The resampling uses a fixed seed and does not depend on ``n``, so it is
    # done once instead of being repeated in every loop iteration.
    X_train_sm, y_train_sm = ADASYN(random_state=2).fit_sample(X_train,
                                                               y_train)

    for n in range(1, 11):
        # Fit the projection on training data only, then reuse it for test.
        pca = PCA(n_components=n).fit(X_train_sm)
        X_train_proj = pca.transform(X_train_sm)
        X_test_proj = pca.transform(X_test)

        clf = GaussianNB()
        clf.fit(X_train_proj, y_train_sm)
        y_pred = clf.predict(X_test_proj)
        print("Accuracy score for %d components: %f" %
              (n, accuracy_score(y_test, y_pred)))

Exemplo n.º 2

0

Exibir arquivo

Arquivo: create-model-test.py Projeto: tinchoa/normalizer

def dataSampling(dados, label):
    """Oversample ``dados``/``label`` with ADASYN using ``ratio='minority'``.

    :param dados: feature data handed to ``ADASYN.fit_sample``.
    :param label: labels handed to ``ADASYN.fit_sample``.
    :return: tuple ``(resampled_features, resampled_labels)``.
    """
    # SMOTE(ratio='minority') was the previously used alternative sampler.
    oversampler = ADASYN(ratio='minority')
    resampled_features, resampled_labels = oversampler.fit_sample(dados, label)
    return resampled_features, resampled_labels

Exemplo n.º 3

0

Exibir arquivo

def _ANASYN(self):
    """Rebalance the training split: undersample first, then apply ADASYN.

    Step 1 runs ``uns.InstanceHardnessThreshold`` (``sampling_strategy=0.2``)
    on ``self.x_train``/``self.y_train``. Step 2 runs ADASYN on the result.
    After each step ``self.x_train`` and ``self.y_train`` are rebuilt as
    DataFrames; the raw sampler outputs stay available on
    ``self.X_train_smote2``/``self.y_train_smote2`` (step 1) and
    ``self.X_train_smote``/``self.y_train_smote`` (step 2). Sizes and the
    number of positive labels are printed along the way.
    """
    label_column = "Local Relapse Y(1) /N(0)"

    # --- Step 1: instance-hardness undersampling ---
    print("before: ", len(self.x_train))
    undersampler = uns.InstanceHardnessThreshold(
        sampling_strategy=0.2, random_state=self.seed
    )
    undersampled = undersampler.fit_resample(self.x_train, self.y_train)
    self.X_train_smote2, self.y_train_smote2 = undersampled
    self.x_train = pd.DataFrame(
        self.X_train_smote2, columns=self.x_train.columns
    )
    self.y_train = pd.DataFrame(self.y_train_smote2, columns=[label_column])
    print("after: ", len(self.x_train))

    # --- Step 2: ADASYN oversampling ---
    oversampler = ADASYN(random_state=self.seed)
    oversampled = oversampler.fit_sample(self.x_train, self.y_train)
    self.X_train_smote, self.y_train_smote = oversampled
    print("X_train_SMOTE:\n", self.X_train_smote[1])

    self.x_train = pd.DataFrame(
        self.X_train_smote, columns=self.x_train.columns
    )
    self.y_train = pd.DataFrame(self.y_train_smote, columns=[label_column])

    print("len smote: \n", len(self.X_train_smote))
    print("len new x_train: \n", len(self.x_train))

    positives = self.y_train.loc[self.y_train[label_column] == 1]
    print("number positive responses y_train:\n", len(positives))

Exemplo n.º 4

0

Exibir arquivo

Arquivo: model_utils.py Projeto: sebalp1987/fraud_detection

def over_sampling(x_train, y_train, model='ADASYN', ratio='minority'):
    """
    Generate synthetic samples for the minority class using the given model.
    It should only ever be applied to the training set.

    The number of nearest neighbours is read from
    ``config.parameters.get("neighbors")``; it is not a parameter.

    :param x_train: X training set (DataFrame; its column names are reused).
    :param y_train: Y training set (DataFrame; its column names are reused).
    :param model: 'ADASYN' or 'SMOTE'. An already built sampler exposing
        ``fit_sample`` is also accepted, as before.
    :param ratio: ratio setting forwarded to the sampler.
    :return: x_train and y_train oversampled, as DataFrames with the original
        flat column names.
    :raises ValueError: if ``model`` is neither a known name nor a sampler.
    """
    neighbors = config.parameters.get("neighbors")
    x_train_names = x_train.columns.values.tolist()
    y_train_names = y_train.columns.values.tolist()

    if model == 'ADASYN':
        sampler = ADASYN(random_state=42, ratio=ratio, n_neighbors=neighbors)
    elif model == 'SMOTE':
        # NOTE(review): m_neighbors='svm' looks like it was meant to be
        # kind='svm'; left untouched because it would change the algorithm.
        sampler = SMOTE(random_state=42,
                        ratio=ratio,
                        k_neighbors=neighbors,
                        m_neighbors='svm')
    elif hasattr(model, 'fit_sample'):
        sampler = model
    else:
        raise ValueError(
            "model must be 'ADASYN', 'SMOTE' or a sampler object, got %r"
            % (model,))

    x_train, y_train = sampler.fit_sample(x_train, y_train)

    # Pass the names as a flat list: wrapping them in another list would
    # build a one-level MultiIndex instead of plain column labels.
    x_train = pd.DataFrame(x_train, columns=x_train_names)
    y_train = pd.DataFrame(y_train, columns=y_train_names)

    return x_train, y_train

Exemplo n.º 5

0

Exibir arquivo

Arquivo: model.py Projeto: heyouxin/PythonCodes

def ada_model(X, y, names):
    """Oversample with ADASYN, report CV scores and plot feature importances.

    An L1-penalised ``LogisticRegressionCV`` is scored with 5-fold cross
    validation (ROC AUC, accuracy, recall) on the oversampled data. Then the
    oversampled data is split 75/25, the logistic coefficients are printed,
    and a random forest is fitted whose feature importances are drawn as a
    horizontal bar chart for the slice ``[len(names) - 10:len(names)]`` of the
    ascending ranking.

    :param X: feature data.
    :param y: labels.
    :param names: feature names, used as the index of the importance table.
    """
    X_syn, y_syn = ADASYN(random_state=42).fit_sample(X, y)
    logistic = linear_model.LogisticRegressionCV(penalty='l1',
                                                 solver='liblinear')

    def mean_cv_score(metric):
        # Mean 5-fold score of the logistic model on the oversampled data.
        scores = cross_val_score(logistic, X_syn, y_syn, cv=5, scoring=metric)
        return scores.mean()

    auc = mean_cv_score('roc_auc')
    acc = mean_cv_score('accuracy')
    recall = mean_cv_score('recall')
    print("cross validation results:")
    print("-------------------------")
    print("auc：", auc)
    print("acc：", acc)
    print("recall：", recall)

    split = train_test_split(X_syn, y_syn, test_size=0.25, random_state=1)
    X_train, X_test, y_train, y_test = split
    fitted_logistic = logistic.fit(X_train, y_train)
    print(fitted_logistic.coef_)

    forest = RandomForestClassifier(max_depth=8, random_state=0)
    # The predictions are not used afterwards; the call is kept as-is.
    forest.fit(X_train, y_train).predict(X_test)

    importance = pd.DataFrame(forest.feature_importances_,
                              columns=['Feature Importance'])
    importance.index = names
    ranked = importance.sort_values('Feature Importance', ascending=True)
    top_slice = ranked[len(names) - 10:len(names)]
    top_slice.plot.barh(figsize=(8, 16))

Exemplo n.º 6

0

Exibir arquivo

def resampling(datadict, labldict, savepath):
  """Oversample every domain with ADASYN and save the samples as JPEG files.

  For each domain in ``datadict`` the arrays under ``'X'`` and ``'Y'`` are
  resampled to fixed per-label targets (145 for 'amazon', 100 for 'dslr' and
  'webcam', labels 0..30). Every resulting sample is reshaped to
  ``(256, 256, 3)`` and written to
  ``<savepath>/<domain>/<label name>/img_<k>.jpg``, where ``k`` is a 1-based
  counter per label, zero-padded to the width of that label's sample count.

  :param datadict: mapping domain -> {'X': samples, 'Y': labels}.
  :param labldict: mapping domain -> (label code -> label name).
  :param savepath: root output directory.
  :raises KeyError: if 'amazon', 'dslr' or 'webcam' is missing in ``datadict``.
  """
  ratiodic = {}
  for domnitem in datadict:
    ratiodic[domnitem] = {}
  for lablcode in range(0, 31):
    ratiodic['amazon'][lablcode] = 145
    ratiodic['dslr'][lablcode]   = 100
    ratiodic['webcam'][lablcode] = 100

  for domnitem in datadict:
    sorcdata = datadict[domnitem]['X']
    sorclabl = datadict[domnitem]['Y']
    print('Resampling data in domain {}'.format(domnitem))
    adasyn = ADASYN(ratio = ratiodic[domnitem], random_state = 42)
    targdata, targlabl = adasyn.fit_sample(sorcdata, sorclabl)
    print('Saving data in domain {}'.format(domnitem))

    # Count samples per label. The count starts at 1 for the first sample,
    # so the padding width always fits the largest file number.
    lablcout = {}
    for lablcode in targlabl:
      lablcout[lablcode] = lablcout.get(lablcode, 0) + 1

    lablnumb = {}
    for targimag, lablcode in zip(targdata, targlabl):
      lablname = labldict[domnitem][lablcode]
      lablnumb[lablcode] = lablnumb.get(lablcode, 0) + 1
      strsleng = len(str(lablcout[lablcode]))
      numbstrs = str(lablnumb[lablcode]).zfill(strsleng)
      targpath = os.path.join(savepath, domnitem, lablname)
      # exist_ok avoids the check-then-create race of exists() + makedirs().
      os.makedirs(targpath, exist_ok=True)
      imagpath = os.path.join(targpath, 'img_' + numbstrs)
      targimag = targimag.reshape(256, 256, 3)
      cv2.imwrite(imagpath + '.jpg', targimag)

Exemplo n.º 7

0

Exibir arquivo

def plot_roc_curves(X, y):
    """Plot validation ROC curves for three classifiers trained on ADASYN data.

    The data is split 80/20; only the training part is oversampled. Logistic
    regression, Bernoulli naive Bayes and an SVC are fitted in that order and
    their ROC curves (with AUC in the legend) are drawn on one figure together
    with the chance line. The figure is saved as
    'ROC Curves for top 3 Contending Models'.

    :param X: feature data.
    :param y: labels.
    """
    plt.figure(figsize=(10, 6))
    line_width = 2

    # train-val split, then oversample the training part only
    X_train, X_val, y_train, y_val = train_test_split(X,
                                                      y,
                                                      test_size=.2,
                                                      random_state=0)
    oversampler = ADASYN(random_state=44)
    X_oversampled_train, y_oversampled_train = oversampler.fit_sample(
        X_train, y_train)

    # (legend name, line colour, estimator), in plotting order
    contenders = (
        ('Logistic Regression', 'b',
         LogisticRegression(max_iter=5000, n_jobs=-1, random_state=44)),
        ('Bernoulli Naive Bayes', 'r', BernoulliNB()),
        ('SVC', 'g', SVC(probability=True, random_state=1)),
    )

    for legend_name, colour, estimator in contenders:
        # fit model and predict probabilities of validation data
        estimator.fit(X_oversampled_train, y_oversampled_train)
        y_pred = estimator.predict_proba(X_val)

        fpr, tpr, thresholds = roc_curve(y_val, y_pred[:, 1])
        model_auc = roc_auc_score(y_val, y_pred[:, 1])
        plt.plot(fpr,
                 tpr,
                 color=colour,
                 lw=line_width,
                 label=f'{legend_name}, AUC: {model_auc:.4f}')

    plt.plot([0, 1], [0, 1], c='violet', ls='--', label='Chance Line')
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])

    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC Curves for top 3 Contending Models')
    plt.legend(loc='lower right', prop={'size': 10}, frameon=True)
    plt.savefig('ROC Curves for top 3 Contending Models')

Exemplo n.º 8

0

Exibir arquivo

Arquivo: test_adasyn.py Projeto: yiweichen04/imbalanced-learn

def test_ada_fit_sample_half():
    """Test fit_sample with a float ratio.

    The ratio used here is 0.8, despite the "half" in the test name. The
    resampled output is compared against the hard-coded fixtures below.
    """

    # Resample the data
    ratio = 0.8
    ada = ADASYN(ratio=ratio, random_state=RND_SEED)
    X_resampled, y_resampled = ada.fit_sample(X, Y)

    # Expected features after resampling
    X_gt = np.array([[0.11622591, -0.0317206],
                     [0.77481731, 0.60935141],
                     [1.25192108, -0.22367336],
                     [0.53366841, -0.30312976],
                     [1.52091956, -0.49283504],
                     [-0.28162401, -2.10400981],
                     [0.83680821, 1.72827342],
                     [0.3084254, 0.33299982],
                     [0.70472253, -0.73309052],
                     [0.28893132, -0.38761769],
                     [1.15514042, 0.0129463],
                     [0.88407872, 0.35454207],
                     [1.31301027, -0.92648734],
                     [-1.11515198, -0.93689695],
                     [-0.18410027, -0.45194484],
                     [0.9281014, 0.53085498],
                     [-0.14374509, 0.27370049],
                     [-0.41635887, -0.38299653],
                     [0.08711622, 0.93259929],
                     [1.70580611, -0.11219234]])
    # Expected labels after resampling
    y_gt = np.array([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1,
                     0])
    # Features are compared approximately, labels exactly.
    assert_array_almost_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)

Exemplo n.º 9

0

Exibir arquivo

Arquivo: test_adasyn.py Projeto: kellyhennigan/cueexp_scripts

def test_ada_fit_sample_nn_obj():
    """Test fit_sample when ``n_neighbors`` is a NearestNeighbors object.

    A ``NearestNeighbors(n_neighbors=6)`` instance is passed instead of an
    integer, and the output is compared against the fixtures below.
    """

    # Resample the data
    nn = NearestNeighbors(n_neighbors=6)
    ada = ADASYN(random_state=RND_SEED, n_neighbors=nn)
    X_resampled, y_resampled = ada.fit_sample(X, Y)

    # Expected features after resampling
    X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141],
                     [1.25192108, -0.22367336], [0.53366841, -0.30312976],
                     [1.52091956, -0.49283504], [-0.28162401, -2.10400981],
                     [0.83680821, 1.72827342], [0.3084254, 0.33299982],
                     [0.70472253, -0.73309052], [0.28893132, -0.38761769],
                     [1.15514042, 0.0129463], [0.88407872, 0.35454207],
                     [1.31301027, -0.92648734], [-1.11515198, -0.93689695],
                     [-0.18410027, -0.45194484], [0.9281014, 0.53085498],
                     [-0.14374509, 0.27370049], [-0.41635887, -0.38299653],
                     [0.08711622, 0.93259929], [1.70580611, -0.11219234],
                     [0.29427267, 0.21740707], [0.68118697, -0.25220353],
                     [1.37180201, 0.37279378], [-0.59243851, -0.80715327]])
    # Expected labels after resampling
    y_gt = np.array([
        0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0
    ])
    # Features are compared approximately, labels exactly.
    assert_array_almost_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)

Exemplo n.º 10

0

Exibir arquivo

def synthetic_balance(data):
    """
    Balance the samples of a dataframe with the ADASYN algorithm:
    http://sci2s.ugr.es/keel/pdf/algorithm/congreso/2008-He-ieee.pdf

    The mean of the ``TARGET`` column is printed before and after resampling.
    If ADASYN raises ``ValueError`` the data is returned without resampling.

    :param data: the dataframe, containing the ``TARGET`` column
    :return: balanced dataframe (feature columns plus ``TARGET``)
    """
    target = data[TARGET]
    features = data.drop(TARGET, axis=1)

    print('unbalanced positive weight: ' + str(np.mean(target)))

    # Oversample with ADASYN on a best-effort basis.
    sampler = ADASYN()
    try:
        features, target = sampler.fit_sample(features, target)
    except ValueError:
        # Raised e.g. as "No samples will be generated with the provided
        # ratio settings."; the unsampled data is then used as-is.
        pass

    print('balanced positive weight: ' + str(np.mean(target)))

    feature_names = list(data)
    feature_names.remove(TARGET)
    balanced = pd.DataFrame(features, columns=feature_names)
    balanced.loc[:, TARGET] = target
    return balanced

Exemplo n.º 11

0

Exibir arquivo

Arquivo: sampling.py Projeto: sebalp1987/anomaly_detection_answers

def over_sampling(xTrain, yTrain, model='ADASYN', neighbors=200):
    """
    Generate synthetic samples for the minority class using the given model.
    It should only ever be applied to the training set.

    :param xTrain: X training set (DataFrame; its column names are reused).
    :param yTrain: Y training set.
    :param model: 'ADASYN' or 'SMOTE'. An already built sampler exposing
        ``fit_sample`` is also accepted, as before.
    :param neighbors: number of nearest neighbours used to construct the
        synthetic samples.
    :return: xTrain and yTrain oversampled, as DataFrames. xTrain keeps its
        original flat column names; yTrain has the single column 'target'.
    :raises ValueError: if ``model`` is neither a known name nor a sampler.
    """

    xTrainNames = xTrain.columns.values.tolist()
    yTrainNames = ['target']

    if model == 'ADASYN':
        sampler = ADASYN(random_state=42,
                         ratio='minority',
                         n_neighbors=neighbors)
    elif model == 'SMOTE':
        # NOTE(review): m_neighbors='svm' looks like it was meant to be
        # kind='svm'; left untouched because it would change the algorithm.
        sampler = SMOTE(random_state=42,
                        ratio='minority',
                        k_neighbors=neighbors,
                        m_neighbors='svm')
    elif hasattr(model, 'fit_sample'):
        sampler = model
    else:
        raise ValueError(
            "model must be 'ADASYN', 'SMOTE' or a sampler object, got %r"
            % (model,))

    xTrain, yTrain = sampler.fit_sample(xTrain, yTrain)

    # Pass the names as a flat list: wrapping them in another list would
    # build a one-level MultiIndex instead of plain column labels.
    xTrain = pd.DataFrame(xTrain, columns=xTrainNames)
    yTrain = pd.DataFrame(yTrain, columns=yTrainNames)

    return xTrain, yTrain

Exemplo n.º 12

0

Exibir arquivo

def ADASYN_oversampling(X, y):
    """Oversample ``X``/``y`` with a default-configured ADASYN sampler.

    :param X: independent variables (the original note says a DataFrame).
    :param y: dependent variable (the original note says a pandas DataFrame).
    :return: tuple ``(X_resampled, y_resampled)``.
    """
    oversampler = ADASYN()
    X_resampled, y_resampled = oversampler.fit_sample(X, y)
    return (X_resampled, y_resampled)

Exemplo n.º 13

0

Exibir arquivo

def test_ada_fit_sample_nn_obj():
    """Test fit_sample when ``n_neighbors`` is a NearestNeighbors object.

    A ``NearestNeighbors(n_neighbors=6)`` instance is passed instead of an
    integer, and the output is compared against the fixtures below.
    """

    # Resample the data
    nn = NearestNeighbors(n_neighbors=6)
    ada = ADASYN(random_state=RND_SEED, n_neighbors=nn)
    X_resampled, y_resampled = ada.fit_sample(X, Y)

    # Expected features after resampling
    X_gt = np.array([[0.11622591, -0.0317206], [0.77481731, 0.60935141],
                     [1.25192108, -0.22367336], [0.53366841, -0.30312976],
                     [1.52091956, -0.49283504], [-0.28162401, -2.10400981],
                     [0.83680821, 1.72827342], [0.3084254, 0.33299982],
                     [0.70472253, -0.73309052], [0.28893132, -0.38761769],
                     [1.15514042, 0.0129463], [0.88407872, 0.35454207],
                     [1.31301027, -0.92648734], [-1.11515198, -0.93689695],
                     [-0.18410027, -0.45194484], [0.9281014, 0.53085498],
                     [-0.14374509, 0.27370049], [-0.41635887, -0.38299653],
                     [0.08711622, 0.93259929], [1.70580611, -0.11219234],
                     [0.29427267, 0.21740707], [0.68118697, -0.25220353],
                     [1.37180201, 0.37279378], [-0.59243851, -0.80715327]])
    # Expected labels after resampling
    y_gt = np.array([
        0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0
    ])
    # Features are compared within relative tolerance R_TOL, labels exactly.
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)

Exemplo n.º 14

0

Exibir arquivo

def oversample_ADASYN(X, y, ratio=0.15):
    """ Oversample minority class using the ADASYN algorithm

    Arguments:
        X (2d array-like): feature set
        y (1d array-like): target values
        ratio (float): desired ratio between minority and majority (optional)

    Return:
        X_os (2d array-like): oversampled feature set
        y_os (1d array-like): oversampled target values

    Example:
        X_train_os, y_train_os = models.oversample_ADASYN(X_train, y_train, 0.3)
    """

    # construct the ADASYN object; it is not called ``os`` so the standard
    # library module of that name is not shadowed
    sampler = ADASYN(sampling_strategy=ratio, n_neighbors=5, random_state=42)

    # oversample X and y data
    X_os, y_os = sampler.fit_sample(X, y)

    # len() works for both pandas and numpy outputs, whereas the pandas-only
    # .count() fails when the sampler returns an ndarray.
    # NOTE(review): the percentage assumes 0/1 labels with 1 as the minority.
    minority_pct = 100 * sum(y_os) / len(y_os)
    print('Oversampled minority-ratio of: {:3.1f}%'.format(minority_pct))

    return X_os, y_os

Exemplo n.º 15

0

Exibir arquivo

Arquivo: test_adasyn.py Projeto: RonKG/Imbalanced-Learn

def test_ada_fit_sample_nn_obj():
    """Test fit_sample when ``n_neighbors`` is a NearestNeighbors object.

    A ``NearestNeighbors(n_neighbors=6)`` instance is passed instead of an
    integer, and the output is compared against the fixtures below.
    """
    nn = NearestNeighbors(n_neighbors=6)
    ada = ADASYN(random_state=RND_SEED, n_neighbors=nn)
    X_resampled, y_resampled = ada.fit_sample(X, Y)
    # Expected features after resampling
    X_gt = np.array([[0.11622591, -0.0317206],
                     [0.77481731, 0.60935141],
                     [1.25192108, -0.22367336],
                     [0.53366841, -0.30312976],
                     [1.52091956, -0.49283504],
                     [-0.28162401, -2.10400981],
                     [0.83680821, 1.72827342],
                     [0.3084254, 0.33299982],
                     [0.70472253, -0.73309052],
                     [0.28893132, -0.38761769],
                     [1.15514042, 0.0129463],
                     [0.88407872, 0.35454207],
                     [1.31301027, -0.92648734],
                     [-1.11515198, -0.93689695],
                     [-0.18410027, -0.45194484],
                     [0.9281014, 0.53085498],
                     [-0.14374509, 0.27370049],
                     [-0.41635887, -0.38299653],
                     [0.08711622, 0.93259929],
                     [1.70580611, -0.11219234],
                     [0.94899098, -0.30508981],
                     [0.28204936, -0.13953426],
                     [1.58028868, -0.04089947],
                     [0.66117333, -0.28009063]])
    # Expected labels after resampling
    y_gt = np.array([
        0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0
    ])
    # Features are compared within relative tolerance R_TOL, labels exactly.
    assert_allclose(X_resampled, X_gt, rtol=R_TOL)
    assert_array_equal(y_resampled, y_gt)

Exemplo n.º 16

0

Exibir arquivo

def augment_data_adasyn(input_data, desired_samples=50):
    """
    Augment the training data using the ADASYN algorithm. For more
    information see the documentation:
    http://contrib.scikit-learn.org/imbalanced-learn/stable/generated/imblearn.over_sampling.ADASYN.html # noqa

    This will probably emit a user warning stating that the number of samples
    in class x will be larger than the number of samples in the majority
    class. That can be ignored here, since ADASYN is used to augment the data
    and not to correct for imbalance.

    Args:
        input_data (tuple): x_train, y_train, x_test, y_test
        desired_samples (int): The number of samples to be added to each class.

    Returns:
        tuple: x_train, y_train, x_test, y_test, with samples added to x_train
               and y_train.
    """
    x_train, y_train = input_data[0], input_data[1]
    x_test, y_test = input_data[2], input_data[3]

    # Target size per class = current size + desired_samples.
    classes, counts = np.unique(y_train, return_counts=True)
    ratio = {
        label: count + desired_samples
        for label, count in zip(classes, counts)
    }

    oversampler = ADASYN(ratio=ratio)
    x_train, y_train = oversampler.fit_sample(x_train, y_train)
    return (x_train, y_train, x_test, y_test)

Exemplo n.º 17

0

Exibir arquivo

Arquivo: MLAlgo.py Projeto: umkhanqta/MLwakelockleak

def apply_simple_adasyn(X, y):
    """Oversample with ADASYN (``sampling_strategy='minority'``).

    The class counts are printed before and after resampling.

    :param X: feature data.
    :param y: labels.
    :return: tuple ``(X_resampled, y_resampled)``.
    """
    from imblearn.over_sampling import ADASYN
    from collections import Counter

    oversampler = ADASYN(sampling_strategy='minority')
    counts_before = Counter(y)
    print(counts_before)
    X_smt, y_smt = oversampler.fit_sample(X, y)
    counts_after = Counter(y_smt)
    print(counts_after)
    return X_smt, y_smt

Exemplo n.º 18

0

Exibir arquivo

Arquivo: test.py Projeto: ryanliwag/suicide_visualization

def makeOverSamplesADASYN(X, y):
    """Oversample ``X``/``y`` with a default-configured ADASYN sampler.

    :param X: independent variables (the original note says a DataFrame).
    :param y: dependent variable (the original note says a pandas DataFrame).
    :return: tuple ``(X_resampled, y_resampled)``.
    """
    from imblearn.over_sampling import ADASYN

    oversampler = ADASYN()
    X_resampled, y_resampled = oversampler.fit_sample(X, y)
    return (X_resampled, y_resampled)

Exemplo n.º 19

0

Exibir arquivo

Arquivo: sampling.py Projeto: brunnurs/PA1

def ADASYN_oversampling(x, y):
    """Oversample ``x``/``y`` with ADASYN (seed 42) and print class counts.

    :param x: feature data.
    :param y: labels.
    :return: tuple ``(x_sampled, y_sampled)``.
    """
    counts_before = Counter(y)
    print('Original dataset shape {}'.format(counts_before))

    oversampler = ADASYN(random_state=42)
    x_sampled, y_sampled = oversampler.fit_sample(x, y)

    counts_after = Counter(y_sampled)
    print('With ADASYN sampled dataset shape {}'.format(counts_after))

    return x_sampled, y_sampled

Exemplo n.º 20

0

Exibir arquivo

Arquivo: Resampling.py Projeto: tudou2015/classification_with_feature_selection

 def ADASYNOversampling(self, featureMatrix, Labels):
     """Oversample ``featureMatrix``/``Labels`` with ADASYN (seed 42).

     :param featureMatrix: feature data.
     :param Labels: labels.
     :return: tuple ``(features_resampled, labels_resampled)``.
     """
     oversampler = ADASYN(random_state=42)
     resampled = oversampler.fit_sample(featureMatrix, Labels)
     feature_Resampled, Labels_Resampled = resampled
     return feature_Resampled, Labels_Resampled

Exemplo n.º 21

0

Exibir arquivo

Arquivo: views.py Projeto: abdullah-coskun/Cmpe-492

def makeOverSamplesADASYN(X, y):
    """
    Create an oversampled version of the data using ADASYN.
    @param X: independent variables in a DataFrame
    @param y: dependent variable in pandas DataFrame format
    @return: the oversampled ``(X, y)`` pair
    """
    oversampler = ADASYN()
    X_resampled, y_resampled = oversampler.fit_sample(X, y)
    return X_resampled, y_resampled

Exemplo n.º 22

0

Exibir arquivo

def oversample_dataset(X, y):
    """Undersample class ``0.0`` to 700 samples, then oversample with ADASYN.

    :param X: feature data.
    :param y: labels.
    :return: tuple ``(X_rs, y_rs)`` after both resampling steps.
    """
    # Step 1: cap class 0.0 at 700 samples.
    undersampler = RandomUnderSampler(sampling_strategy={0.0: 700})
    X_under, y_under = undersampler.fit_sample(X, y)

    # Step 2: ADASYN on the reduced data (fixed seed).
    oversampler = ADASYN(random_state=42)
    X_rs, y_rs = oversampler.fit_sample(X_under, y_under)

    return X_rs, y_rs

Exemplo n.º 23

0

Exibir arquivo

Arquivo: getUsers.py Projeto: jonli123/229-221-joint-project

def resample_data(x, y, sample_choice=RUS_CONSTANT):
    """Resample ``x``/``y`` with the sampler selected by ``sample_choice``.

    ``SMOTE_CONSTANT`` selects SMOTE, ``ADASYN_CONSTANT`` selects ADASYN and
    ``RUS_CONSTANT`` (the default) selects random undersampling, all with
    seed 42. Any other value returns the inputs unchanged.

    :return: tuple ``(x, y)``.
    """
    sampler = None
    if sample_choice == SMOTE_CONSTANT:
        sampler = SMOTE(random_state=42)
    elif sample_choice == ADASYN_CONSTANT:
        sampler = ADASYN(random_state=42)
    elif sample_choice == RUS_CONSTANT:
        sampler = RandomUnderSampler(random_state=42)

    if sampler is not None:
        x, y = sampler.fit_sample(x, y)
    return x, y

Exemplo n.º 24

0

Exibir arquivo

def test_ada_fit_sample():
    """Check default ADASYN output against the stored ``.npy`` fixtures."""

    # Resample the data
    sampler = ADASYN(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Fixtures live in the ``data`` directory next to this test file.
    here = os.path.dirname(os.path.abspath(__file__))
    fixture_dir = os.path.join(here, 'data')
    X_gt = np.load(os.path.join(fixture_dir, 'ada_x.npy'))
    y_gt = np.load(os.path.join(fixture_dir, 'ada_y.npy'))
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)

Exemplo n.º 25

0

Exibir arquivo

Arquivo: test_adasyn.py Projeto: vivounicorn/imbalanced-learn

def test_ada_fit_sample():
    """Check default ADASYN output against the stored ``.npy`` fixtures."""

    # Resample the data
    sampler = ADASYN(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Fixtures live in the ``data`` directory next to this test file.
    here = os.path.dirname(os.path.abspath(__file__))
    fixture_dir = os.path.join(here, 'data')
    X_gt = np.load(os.path.join(fixture_dir, 'ada_x.npy'))
    y_gt = np.load(os.path.join(fixture_dir, 'ada_y.npy'))
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)

Exemplo n.º 26

0

Exibir arquivo

 def runADASYN(self):
     """Resample ``self.X``/``self.Y`` with ADASYN and record the result.

     The output is stored on ``self.Xadasyn``/``self.Yadasyn`` and, together
     with ``self.featureList``, under ``self.rebalanced['ADASYN']``. The
     class counts before and after are sent to ``self.log.emit``.
     """
     oversampler = ADASYN()
     self.Xadasyn, self.Yadasyn = oversampler.fit_sample(self.X, self.Y)

     entry = {}
     entry['X'] = self.Xadasyn
     entry['y'] = self.Yadasyn
     entry['f'] = self.featureList
     self.rebalanced['ADASYN'] = entry

     original_msg = 'ADASYN: Original dataset shape {}'.format(
         Counter(self.Y))
     self.log.emit(original_msg, indents=1)
     resampled_msg = 'ADASYN: Resampled dataset shape {}'.format(
         Counter(self.Yadasyn))
     self.log.emit(resampled_msg, indents=1)

Exemplo n.º 27

0

Exibir arquivo

Arquivo: Model - bkup.py Projeto: sidtandon2014/KaggleProblems

 def balanceDataset(self, train):
     """Oversample ``train`` with ADASYN and return one combined DataFrame.

     The "TARGET" column is used as the label; all other columns are
     features. ADASYN runs with ``ratio="minority"`` and seed 10.

     :param train: DataFrame containing a "TARGET" column.
     :return: DataFrame with the resampled features followed by "TARGET".
     """
     from imblearn.over_sampling import ADASYN

     oversampler = ADASYN(random_state=10, ratio="minority")
     feature_frame = train.loc[:, train.columns != "TARGET"]
     target_frame = train.loc[:, train.columns == "TARGET"]

     X, Y = oversampler.fit_sample(feature_frame, target_frame)

     resampled_features = pd.DataFrame(X, columns=feature_frame.columns)
     resampled_target = pd.DataFrame(Y, columns=target_frame.columns)
     return pd.concat([resampled_features, resampled_target], axis=1)

Exemplo n.º 28

0

Exibir arquivo

Arquivo: algorithms.py Projeto: zbn123/Fraud-Prediction

def cross_validate(X, y, model):
    """Split the data, oversample the training part and run one model.

    :param X: feature data.
    :param y: labels.
    :param model: 'RF', 'GBC' or 'ABC'; selects the function of that name.
    :return: whatever the selected function returns. For any other ``model``
        a hint is printed and ``None`` is returned.
    """
    # Split into train and test to crossvalidate
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    # Balance the training data only
    balancer = ADASYN(random_state=10)
    X_train_b, y_train_b = balancer.fit_sample(X_train, y_train)

    if model == 'RF':
        runner = RF
    elif model == 'GBC':
        runner = GBC
    elif model == 'ABC':
        runner = ABC
    else:
        print('Enter a valid model')
        return None
    return runner(X_train_b, y_train_b, X_test, y_test)

Exemplo n.º 29

0

Exibir arquivo

Arquivo: utils.py Projeto: tobyfielding1/data-mining

def oversample(X: pd.DataFrame, y: pd.DataFrame, technique: str = 'adasyn'):
    """
    Oversamples the minority class to balance the classes
    :param X: unbalanced dataset as a dataframe
    :param y: labels for the dataset
    :param technique: either 'smote' or 'adasyn' (case-insensitive)
    :return: the balanced dataset and labels
    :raises ValueError: if ``technique`` is not a supported name
    """
    # Compare by value: ``is`` tests object identity, which only works for
    # strings by accident of interning.
    name = str(technique).lower()
    if name == 'adasyn':
        os_method = ADASYN()
    elif name == 'smote':
        os_method = SMOTE()
    else:
        raise ValueError(
            "technique must be 'adasyn' or 'smote', got %r" % (technique,))
    X, y = os_method.fit_sample(X, y)
    return X, y

Exemplo n.º 30

0

Exibir arquivo

Arquivo: keras_models.py Projeto: stephanieger/imbalanced-sequence-classification

    def runAdasyn(self, ensem_folder, model_h5, save_dir):
        """Oversample every ensemble in the autoencoder's encoded space.

        For each ensemble index below ``self.Config.NUM_ENSEMBLES`` the data
        and label arrays are loaded from ``ensem_folder``, the data is passed
        through the encoder, ADASYN (``ratio='minority'``, seed 42) is applied
        to the encoder output, and the result is passed through the decoder.
        The decoded samples and the labels, as two-column one-hot rows, are
        saved to ``save_dir`` under the same file names.

        :param ensem_folder: path prefix of the input ``.npy`` files.
        :param model_h5: model file handed to ``self.loadAutoencoder``.
        :param save_dir: path prefix of the output files; created if missing.
        """
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        # build and load models; only the encoder and decoder are used
        _, encoder, decoder = self.loadAutoencoder(model_h5)

        for ensem in range(self.Config.NUM_ENSEMBLES):
            file_suffix = str(ensem) + '.npy'
            dat = np.load(ensem_folder + 'ensem_dat' + file_suffix)
            lab = np.load(ensem_folder + 'ensem_lab' + file_suffix)
            encoded = encoder.predict(dat)

            # 3-D labels: keep the last entry along axis 1 before the argmax
            if len(lab.shape) == 3:
                lab = lab[:, -1, :]
            lab = np.argmax(lab, axis=1)

            # run adasyn
            print(ensem)
            print('run ADASYN')
            oversampler = ADASYN(ratio='minority', random_state=42)

            print('fit smote object for ensem ' + str(ensem))
            x_res, y_res = oversampler.fit_sample(encoded, lab)

            x_syn = decoder.predict(x_res)

            # label 0 -> [1, 0]; any other label -> [0, 1]
            y_res_ = np.array([
                np.array([1, 0]) if label == 0 else np.array([0, 1])
                for label in y_res
            ])

            # save data
            print('save ensem ' + str(ensem))
            np.save(save_dir + 'ensem_dat' + file_suffix, x_syn)
            np.save(save_dir + 'ensem_lab' + file_suffix, y_res_)

        return

Exemplo n.º 31

0

Exibir arquivo

def balance_classes_adasyn(X, y, ratio='auto', random_state=None, k=5):
    """
    Balance the class distribution with the Adaptive Synthetic Sampling
    Approach for Imbalanced Learning (ADASYN).
    :param X: Feature data
    :param y: Class labels
    :param ratio: (str/float) If 'auto', the ratio is chosen automatically to
        balance the data set. Otherwise it is the number of samples in the
        minority class over the number of samples in the majority class.
    :param random_state: (None/int) If int, seed for the random number
        generator
    :param k: (int) Number of nearest neighbors used to construct synthetic
        samples
    :return: Data set with synthetic samples added, as ``(X, y)``
    """
    sampler_options = dict(ratio=ratio,
                           random_state=random_state,
                           n_jobs=1,
                           k=k)
    oversampler = ADASYN(**sampler_options)
    X_adasyn, y_adasyn = oversampler.fit_sample(X, y)
    return X_adasyn, y_adasyn

Exemplo n.º 32

0

Exibir arquivo

def sensor_balancing(X_train, y_train):
    """Drop very rare labels, then oversample the rest with ADASYN.

    Labels occurring three times or fewer are removed together with their
    feature rows, because the sampler cannot handle them. ADASYN then runs
    with ``sampling_strategy='not majority'`` and ``n_neighbors=2``.

    :param X_train: feature DataFrame, indexed like ``y_train``.
    :param y_train: label Series.
    :return: tuple ``(X_train, y_train)`` as a DataFrame and a Series.
    """
    # Remove all rows whose label is too rare for the sampler.
    label_counts = y_train.value_counts()
    rare_labels = label_counts[label_counts <= 3]
    y_train = y_train[~y_train.isin(rare_labels.index.values)]
    kept_index = list(y_train.index)
    X_train = pd.DataFrame(X_train[X_train.index.isin(kept_index)])

    y_train = pd.Series(y_train)
    feature_names = pd.DataFrame(X_train).columns.values

    # Perform oversampling
    oversampler = ADASYN(sampling_strategy='not majority',
                         n_neighbors=2,
                         n_jobs=1)
    X_train, y_train = oversampler.fit_sample(X_train, y_train)

    X_train = pd.DataFrame(X_train, columns=list(feature_names))
    return X_train, pd.Series(y_train)

Exemplo n.º 33

0

Exibir arquivo

Arquivo: oversample.py Projeto: brettin/pilot1-docs

def oversample(X, y, bal_strategy):
    """Resample ``X``/``y`` with the strategy named by ``bal_strategy``.

    Supported values:
      * "SMOTESVN" or "ALL": SMOTE with ``kind='svm'``. "ALL" has always
        resolved to this first branch, so that behaviour is kept.
      * "SMOTE": SMOTE with ``kind='regular'``.
      * "ADASYN": ADASYN with default settings.
      * "NONE": the inputs are returned unchanged.

    Any other value prints an error and exits the process with status 1.
    The shapes of the returned arrays are printed.

    :param X: feature array (must expose ``.shape``).
    :param y: label array (must expose ``.shape``).
    :param bal_strategy: strategy name, see above.
    :return: tuple ``(X_sampled, y_sampled)``.
    """
    if bal_strategy in ("SMOTESVN", "ALL"):
        # Apply SMOTE SVM
        X_sampled, y_sampled = SMOTE(kind='svm').fit_sample(X, y)
    elif bal_strategy == "SMOTE":
        # Apply regular SMOTE
        X_sampled, y_sampled = SMOTE(kind='regular').fit_sample(X, y)
    elif bal_strategy == "ADASYN":
        # Apply ADASYN over-sampling
        X_sampled, y_sampled = ADASYN().fit_sample(X, y)
    elif bal_strategy == 'NONE':
        X_sampled, y_sampled = X, y
    else:
        print('bal_stragegy not in SMOTESVN, SMOTE, ADASYN, ALL, NONE')
        sys.exit(1)

    # A single %-formatted argument prints the same text on Python 2 and 3.
    # The two spaces reproduce the output of the old print statement.
    print('Shape of X_sampled:  %s' % (X_sampled.shape,))
    print('Shape of y_sampled:  %s' % (y_sampled.shape,))

    return (X_sampled, y_sampled)

Exemplo n.º 34

0

Exibir arquivo

Arquivo: plot_adasyn.py Projeto: apyeh/UnbalancedDataset

from imblearn.over_sampling import ADASYN

# Generate an imbalanced two-class dataset (class weights 0.1 / 0.9,
# 5000 samples, 20 features)
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=5000, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform X to visualise it inside a 2D feature space
X_vis = pca.fit_transform(X)

# Oversample with ADASYN (default settings), then project the resampled data
# with the PCA that was fitted on the original X
ada = ADASYN()
X_resampled, y_resampled = ada.fit_sample(X, y)
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)

# Left panel: the original data, one scatter call per class
ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[2], linewidth=0.15)
ax1.set_title('Original set')

ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5, edgecolor=almost_black,
            facecolor=palette[0], linewidth=0.15)
ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],

Exemplo n.º 35

0

Exibir arquivo

Arquivo: logistic+ADASYN.py Projeto: non27/The-final-assignment

                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    
# Define X (every column except 'state') and y (the 'state' column)
X, y = data.loc[:,data.columns != 'state'].values, data.loc[:,data.columns == 'state'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
# Define the size of the test set (30% here)
# sklearn.model_selection.train_test_split randomly splits the data into a training set and a test set
# train_test_split(train_data, train_target, test_size=<number>, random_state=0)

# ADASYN: oversample the training split only
ada = ADASYN()
os_X,os_y = ada.fit_sample(X_train,y_train)
os_X = pd.DataFrame(os_X)
os_y = pd.DataFrame(os_y)

# Logistic regression; C is chosen by printing_Kfold_scores
# NOTE(review): penalty='l1' needs a solver that supports it in recent scikit-learn - confirm the version in use
best_c = printing_Kfold_scores(os_X,os_y)
clf_l = LogisticRegression(C = best_c, penalty = 'l1')
clf_l.fit(os_X,os_y.values.ravel())
y_pred = clf_l.predict(X_test)
# ravel() is called to turn the matrix into a one-dimensional array
# (difference between ravel() and flatten()):
# both do the same job (reduce a multi-dimensional array to one dimension);
# the difference is whether a copy or a view is returned.
# numpy.flatten() returns a copy, so changes to it do not affect the original matrix,
# whereas numpy.ravel() returns a view where possible, so changes do affect the original matrix.
# NOTE(review): y_pred is computed again here; the prediction above is redundant
y_true, y_pred = y_test, clf_l.predict(X_test)

Exemplo n.º 36

0

Exibir arquivo

Arquivo: test_adasyn.py Projeto: glemaitre/imbalanced-learn

def test_ada_wrong_nn_obj():
    """An invalid ``n_neighbors`` value must make fit_sample raise ValueError."""
    sampler = ADASYN(random_state=RND_SEED, n_neighbors='rnd')
    with raises(ValueError, match="has to be one of"):
        sampler.fit_sample(X, Y)

Exemplo n.º 37

0

Exibir arquivo

Arquivo: test_adasyn.py Projeto: glemaitre/imbalanced-learn

def test_ada_fit_ratio_error():
    """The ratio ``{0: 9, 1: 12}`` must make fit_sample raise ValueError."""
    sampler = ADASYN(ratio={0: 9, 1: 12}, random_state=RND_SEED)
    with raises(ValueError, match="No samples will be generated."):
        sampler.fit_sample(X, Y)