def test_sample_wrong_X():
    """Check that sampling with an X that differs from the one used for
    fitting raises a RuntimeError."""

    # Fit the sampler on the reference data
    sampler = ADASYN(random_state=RND_SEED)
    sampler.fit(X, Y)

    # Build data unrelated to what was fitted and try to sample from it
    other_X = np.random.random((100, 40))
    other_y = np.array([0] * 50 + [1] * 50)
    assert_raises(RuntimeError, sampler.sample, other_X, other_y)
示例#2
0
def test_sample_wrong_X():
    """Sampling must fail with RuntimeError when the X passed to ``sample``
    is not the X passed to ``fit``."""

    fitted = ADASYN(random_state=RND_SEED)
    fitted.fit(X, Y)

    # Arguments for ``sample`` that do not correspond to the fitted data
    mismatched_args = (
        np.random.random((100, 40)),
        np.array([0] * 50 + [1] * 50),
    )
    assert_raises(RuntimeError, fitted.sample, *mismatched_args)
def test_ada_fit():
    """Check the attributes that ADASYN exposes after ``fit``."""
    sampler = ADASYN(random_state=RND_SEED)
    sampler.fit(X, Y)

    # Values stored on the sampler by the fit step
    assert_equal(sampler.min_c_, 0)
    assert_equal(sampler.maj_c_, 1)

    fitted_stats = sampler.stats_c_
    assert_equal(fitted_stats[0], 8)
    assert_equal(fitted_stats[1], 12)
def test_ada_fit():
    """Fit ADASYN and verify the information it stores about the data."""

    # Build and fit the sampler
    adasyn = ADASYN(random_state=RND_SEED)
    adasyn.fit(X, Y)

    # ``min_c_`` / ``maj_c_`` must hold the expected values
    expected_min_c, expected_maj_c = 0, 1
    assert_equal(adasyn.min_c_, expected_min_c)
    assert_equal(adasyn.maj_c_, expected_maj_c)

    # ``stats_c_`` must hold the expected entry for each of the two keys
    assert_equal(adasyn.stats_c_[0], 8)
    assert_equal(adasyn.stats_c_[1], 12)
示例#5
0
def test_ada_fit():
    """Fitting ADASYN must set ``ratio_`` to the expected mapping."""
    sampler = ADASYN(random_state=RND_SEED)
    sampler.fit(X, Y)

    expected_ratio = {0: 4, 1: 0}
    assert sampler.ratio_ == expected_ratio
示例#6
0
def execute(trainfile, testfile, modeldir, logdir, epochs, batch_size, seed):
    """Train a DenseModelFour on ``trainfile``, score it and save the model.

    A 10% sample of the loaded training frame is held out for validation and
    scoring; the remaining rows are used for training. The ADASYN
    oversampling step is present but switched off through the local
    ``is_oversampled`` flag.

    Args:
        trainfile: Location handed to ``Data.load`` for the training data.
            The loaded frame must contain an ``is_attributed`` column, which
            is used as the target.
        testfile: Location handed to ``Data.load`` for the test data.
        modeldir: Forwarded to ``model.fit``; the final ``.h5`` file is also
            written below it.
        logdir: Forwarded to ``model.fit``.
        epochs: Forwarded to ``model.fit``.
        batch_size: Forwarded to ``model.fit``.
        seed: Random seed for the validation sampling and, when oversampling
            is enabled, for ADASYN.
    """
    target_column = "is_attributed"

    print("--- Executing")
    print("Using trainfile:  ", trainfile)
    print("Using testfile:   ", testfile)
    print("Using modeldir:   ", modeldir)
    print("Using logdir:     ", logdir)
    print("Using epochs:     ", epochs)
    print("Using batch_size: ", batch_size)
    print("Using seed:       ", seed)

    print("--- Loading (transformed) data")
    data = Data.Data()
    df = data.load(trainfile)

    # For scoring (uses sample of training data). Seeding the sample makes
    # the train/validation split reproducible for a given ``seed``.
    fraction = 0.1
    validation = df.sample(frac=fraction, random_state=seed)
    Y_validation = validation[target_column].values
    # ``axis`` must be passed by keyword: pandas 2.x no longer accepts it
    # positionally. A non-inplace drop also avoids mutating a slice of ``df``.
    X_validation = validation.drop([target_column], axis=1)

    # For training, use the part of data NOT in the validation fraction
    train = df.loc[~df.index.isin(validation.index)]
    Y_train = train[target_column].values
    X_train = train.drop([target_column], axis=1)
    del df, validation, train
    gc.collect()

    # Oversampling is currently disabled; flip the flag to rebalance the
    # training data with ADASYN before fitting.
    is_oversampled = False
    if is_oversampled:
        print("Loaded X_train: ", X_train.shape)
        print("---- BEFORE:\n", X_train.head())
        print(Y_train)
        print("unique values: ", set(Y_train))

        columns = X_train.columns.values
        total = len(Y_train)
        ones = np.sum(Y_train)
        zeros = total - ones

        # Requested number of samples per class: keep the zeros and ask for
        # ``target_ratio`` times as many ones.
        target_ratio = 1.0
        nzeros = int(zeros)
        nones = int(nzeros * target_ratio)
        ratio = {0: nzeros, 1: nones}
        print("ratio: ", ratio)

        oversampler = ADASYN(random_state=seed, ratio=ratio)
        oversampler.fit(X_train, Y_train)
        X_resampled, y_resampled = oversampler.sample(X_train, Y_train)
        X_resampled = X_resampled.astype(int)
        y_resampled = y_resampled.astype(int)
        X_train = pd.DataFrame(data=X_resampled, columns=columns)
        Y_train = y_resampled
        del X_resampled, y_resampled
        gc.collect()

        print("Oversampled (Random), ratio: ", ratio, " X_train: ", X_train.shape)
        print("---- AFTER:\n", X_train.head())
        print(Y_train)
        print("unique values: ", set(Y_train))

    X_test = data.load(testfile)

    print("X_train shape:      ", X_train.shape)
    print("Y_train shape:      ", Y_train.shape)
    print("X_test shape:       ", X_test.shape)
    print("X_validation shape: ", X_validation.shape)
    print("Y_validation shape: ", Y_validation.shape)

    print("--- Creating model")
    model = DenseModelFour.DenseModelFour()

    print("--- Configuring model")
    model.configure(X_train, X_test, X_validation)
    model.set_validation(X_validation, Y_validation)

    print("--- Training model")
    model.fit(
        X_train, Y_train, X_test,
        modeldir=modeldir, logdir=logdir,
        epochs=epochs, batch_size=batch_size)

    # Training inputs are no longer needed; release them before scoring.
    del X_train, Y_train, X_test
    gc.collect()

    print("--- Scoring model")
    print("Fraction used for scoring: ", fraction)
    print("X_validation shape: ", X_validation.shape)
    print("Y_validation shape: ", Y_validation.shape)

    roc_auc, probabilities = model.score(X_validation, Y_validation)
    print("Score probabilities shape: ", probabilities.shape)
    roc_auc = "{:0.6f}".format(roc_auc)
    print("Score: ROC-AUC: ",roc_auc)

    print("--- Saving model")
    # The formatted score is embedded in the file name.
    modelfile = modeldir + "/" + "dense-model-final-" + roc_auc + ".h5"
    model.save(modelfile)
    print("Model saved to: ", modelfile)
示例#7
0
def test_ada_fit():
    """After ``fit``, ``ratio_`` must equal ``{0: 4, 1: 0}``."""
    adasyn = ADASYN(random_state=RND_SEED)
    adasyn.fit(X, Y)

    fitted_ratio = adasyn.ratio_
    assert fitted_ratio == {0: 4, 1: 0}
示例#8
0
def oversample(method, pose_feats, d_list, labels):
    """Split off a class-balanced validation set and oversample the rest.

    The normalised features are truncated to the same number of samples for
    classes 0, 1 and 2, shuffled, and the first quarter is kept as the
    validation set. The rows chosen for validation are removed from the raw
    inputs, the remainder is oversampled with SMOTE or ADASYN, normalised
    and shuffled to form the training set.

    Args:
        method: 1 to oversample with SMOTE, 2 to oversample with ADASYN.
        pose_feats: 2-D array of pose features, one row per sample. The
            validation buffer is allocated with 66 columns.
        d_list: 2-D array of depth features, one row per sample. The
            validation buffer is allocated with 6 columns.
        labels: Array of class labels; classes 0, 1 and 2 are used.

    Returns:
        Tuple ``(test, train, gt_test, gt_train, depth_train, depth_test)``.

    Raises:
        ValueError: If ``method`` is neither 1 nor 2.
    """
    # Fail early: previously an unsupported method only surfaced later as a
    # NameError on the unbound sampler.
    if method not in (1, 2):
        raise ValueError(
            "method must be 1 (SMOTE) or 2 (ADASYN), got {!r}".format(method))

    n_pose_cols = 66
    n_depth_cols = 6

    # Normalize data
    pose_feats_n, d_list_n = norm_feats(pose_feats, d_list)

    # Extract class indices and equalize: every class keeps its first
    # ``dom - 2`` samples, where ``dom`` is the size of the smallest class.
    class_indices = [np.flatnonzero(labels == cls) for cls in (0, 1, 2)]
    dom = np.min([len(idx) for idx in class_indices])
    nidx = np.concatenate([idx[0:dom - 2] for idx in class_indices])

    pose_feats_n = pose_feats_n[nidx]
    d_list_n = d_list_n[nidx]
    labels_n = labels[nidx]

    # Randomize the equalized data, applying one permutation to all arrays
    # so that rows stay aligned.
    permutation = np.random.permutation(pose_feats_n.shape[0])
    pose_feats_n = pose_feats_n[permutation]
    labels_n = labels_n[permutation]
    d_list_n = d_list_n[permutation]
    nidx = nidx[permutation]

    # Save validation data: the first quarter of the shuffled balanced set.
    n_val = len(pose_feats_n) // 4
    test = np.zeros([n_val, n_pose_cols], dtype=np.float64)
    test[:, :] = pose_feats_n[0:n_val, :]
    depth_test = np.zeros([n_val, n_depth_cols], dtype=np.float64)
    depth_test[:, :] = d_list_n[0:n_val, :]
    gt_test = np.transpose(np.array(labels_n[0:n_val]))
    # Original row positions of the validation samples.
    nidx_test = nidx[0:n_val]

    # Remove the rows used for validation from the raw training inputs.
    pose_feats = np.delete(pose_feats, nidx_test, axis=0)
    d_list = np.delete(d_list, nidx_test, axis=0)
    labels = np.delete(labels, nidx_test)
    labels = labels.astype(int)

    # SMOTE or ADASYN on the training data; pose and depth features are
    # oversampled jointly as one feature matrix.
    pose_feats = np.concatenate([pose_feats, d_list], axis=1)
    if method == 2:
        fm = ADASYN(ratio='all', n_neighbors=5)
    else:
        fm = SMOTE(ratio='all', kind='regular', k_neighbors=5)
    fm = fm.fit(pose_feats, labels)

    pose_feats, labels_train = fm.sample(pose_feats, labels)
    d_list = pose_feats[:, n_pose_cols:n_pose_cols + n_depth_cols]

    # Apply normalization to the over-sampled training data.
    # NOTE(review): ``pose_feats`` still contains the depth columns here;
    # confirm that ``norm_feats`` expects that.
    pose_feats, d_list = norm_feats(pose_feats, d_list)

    # Shuffle the training arrays with one shared permutation.
    permutation = np.random.permutation(pose_feats.shape[0])
    train = pose_feats[permutation]
    gt_train = labels_train[permutation]
    depth_train = d_list[permutation]

    return test, train, gt_test, gt_train, depth_train, depth_test
示例#9
0
def test_ada_fit():
    """Fitting ADASYN must set ``sampling_strategy_`` to ``{0: 4}``."""
    sampler = ADASYN(random_state=RND_SEED)
    sampler.fit(X, Y)

    expected_strategy = {0: 4}
    assert sampler.sampling_strategy_ == expected_strategy