def test_sample_wrong_X():
    """Sampling with data different from the fitted data must raise."""
    adasyn = ADASYN(random_state=RND_SEED)
    adasyn.fit(X, Y)
    assert_raises(RuntimeError, adasyn.sample,
                  np.random.random((100, 40)),
                  np.array([0] * 50 + [1] * 50))
def test_ada_fit():
    # Fit ADASYN on the toy data and verify the class statistics it records.
    sampler = ADASYN(random_state=RND_SEED)
    sampler.fit(X, Y)
    # Minority / majority class labels.
    assert_equal(sampler.min_c_, 0)
    assert_equal(sampler.maj_c_, 1)
    # Per-class sample counts.
    assert_equal(sampler.stats_c_[0], 8)
    assert_equal(sampler.stats_c_[1], 12)
def test_ada_fit():
    """Check the class statistics computed by the fitting method."""
    est = ADASYN(random_state=RND_SEED)
    est.fit(X, Y)
    # Minority / majority class labels identified during fit.
    assert_equal(est.min_c_, 0)
    assert_equal(est.maj_c_, 1)
    # Per-class sample counts identified during fit.
    assert_equal(est.stats_c_[0], 8)
    assert_equal(est.stats_c_[1], 12)
def test_ada_fit():
    # After fitting, the computed sampling ratio should request 4 new
    # minority-class samples and none for the majority class.
    estimator = ADASYN(random_state=RND_SEED)
    estimator.fit(X, Y)
    expected_ratio = {0: 4, 1: 0}
    assert estimator.ratio_ == expected_ratio
def execute(trainfile, testfile, modeldir, logdir, epochs, batch_size, seed):
    """Train, score and save a DenseModelFour model.

    Loads the (already transformed) training data, holds out a random
    fraction for scoring, optionally oversamples the training split with
    ADASYN, fits the model, computes a ROC-AUC on the hold-out, and saves
    the model to ``modeldir`` with the score embedded in the filename.

    Parameters
    ----------
    trainfile : str
        Path to the transformed training data (loaded via Data.Data()).
    testfile : str
        Path to the transformed test data.
    modeldir : str
        Directory where the trained model is written.
    logdir : str
        Directory for training logs.
    epochs : int
        Number of training epochs.
    batch_size : int
        Training batch size.
    seed : int
        Random seed, also forwarded to the oversampler.
    """
    print("--- Executing")
    print("Using trainfile: ", trainfile)
    print("Using testfile: ", testfile)
    print("Using modeldir: ", modeldir)
    print("Using logdir: ", logdir)
    print("Using epochs: ", epochs)
    print("Using batch_size: ", batch_size)
    print("Using seed: ", seed)

    print("--- Loading (transformed) data")
    data = Data.Data()
    df = data.load(trainfile)
    # print("df: ", df.shape)

    # For scoring (uses sample of training data).
    # .copy() so the in-place drop below operates on an owned frame rather
    # than a view of df (avoids SettingWithCopy hazards).
    fraction = 0.1
    X_validation = df.sample(frac=fraction).copy()
    Y_validation = X_validation["is_attributed"].values
    # columns= keyword instead of positional axis=1: the positional axis
    # argument to DataFrame.drop is removed in pandas 2.x.
    X_validation.drop(columns=["is_attributed"], inplace=True)
    # print("X_validation: ", X_validation.shape)

    # For training, use the part of data NOT in the validation fraction.
    X_train = df.loc[~df.index.isin(X_validation.index)].copy()
    Y_train = X_train["is_attributed"].values
    X_train.drop(columns=["is_attributed"], inplace=True)
    del df
    gc.collect()

    # Optional ADASYN oversampling of the training split (disabled by default).
    is_oversampled = False
    if is_oversampled:
        print("Loaded X_train: ", X_train.shape)
        print("---- BEFORE:\n", X_train.head())
        print(Y_train)
        print("unique values: ", set(Y_train))
        columns = X_train.columns.values
        total = len(Y_train)
        ones = np.sum(Y_train)
        zeros = total - ones
        # Target a 1:1 class balance (nration controls ones per zero).
        nration = 1.0
        nzeros = int(zeros)
        nones = int(nzeros * nration)
        ratio = {0: nzeros, 1: nones}
        print("ratio: ", ratio)
        oversampler = ADASYN(random_state=seed, ratio=ratio)
        oversampler.fit(X_train, Y_train)
        X_resampled, y_resampled = oversampler.sample(X_train, Y_train)
        X_resampled = X_resampled.astype(int)
        y_resampled = y_resampled.astype(int)
        X_train = pd.DataFrame(data=X_resampled, columns=columns)
        Y_train = y_resampled
        del X_resampled
        del y_resampled
        gc.collect()
        print("Oversampled (Random), ratio: ", ratio,
              " X_train: ", X_train.shape)
        print("---- AFTER:\n", X_train.head())
        print(Y_train)
        print("unique values: ", set(Y_train))

    X_test = data.load(testfile)
    print("X_train shape: ", X_train.shape)
    print("Y_train shape: ", Y_train.shape)
    print("X_test shape: ", X_test.shape)
    print("X_validation shape: ", X_validation.shape)
    print("Y_validation shape: ", Y_validation.shape)

    print("--- Creating model")
    model = DenseModelFour.DenseModelFour()

    print("--- Configuring model")
    model.configure(X_train, X_test, X_validation)
    model.set_validation(X_validation, Y_validation)

    print("--- Training model")
    model.fit(
        X_train,
        Y_train,
        X_test,
        modeldir=modeldir,
        logdir=logdir,
        epochs=epochs,
        batch_size=batch_size)
    # Free the large frames before scoring to keep peak memory down.
    del X_train
    gc.collect()
    del Y_train
    gc.collect()
    del X_test
    gc.collect()

    print("--- Scoring model")
    print("Fraction used for scoring: ", fraction)
    print("X_validation shape: ", X_validation.shape)
    print("Y_validation shape: ", Y_validation.shape)
    roc_auc, probabilities = model.score(X_validation, Y_validation)
    print("Score probabilities shape: ", probabilities.shape)
    roc_auc = "{:0.6f}".format(roc_auc)
    print("Score: ROC-AUC: ", roc_auc)

    print("--- Saving model")
    modelfile = modeldir + "/" + "dense-model-final-" + roc_auc + ".h5"
    model.save(modelfile)
    print("Model saved to: ", modelfile)
def oversample(method, pose_feats, d_list, labels):
    """Equalize classes, split off a validation quarter, and oversample
    the remaining training data with SMOTE or ADASYN.

    Parameters
    ----------
    method : int
        2 selects ADASYN, 1 selects SMOTE. Any other value now raises
        ValueError (previously it crashed with a NameError on the
        undefined sampler variable).
    pose_feats : ndarray
        Pose features; assumed shape (n, 66) — the code slices columns
        66:72 for depth after concatenation. TODO confirm with caller.
    d_list : ndarray
        Depth features; assumed shape (n, 6).
    labels : ndarray
        Class ids; the code handles classes {0, 1, 2}.

    Returns
    -------
    tuple
        (test, train, gt_test, gt_train, depth_train, depth_test)
    """
    # Normalize a copy used to build the equalized validation split.
    pose_feats_n, d_list_n = norm_feats(pose_feats, d_list)

    # Extract class indices and trim each class to the rarest one (minus 2).
    idx0 = np.flatnonzero(labels == 0)
    idx1 = np.flatnonzero(labels == 1)
    idx2 = np.flatnonzero(labels == 2)
    dom = np.min([len(idx0), len(idx1), len(idx2)])
    n_idx0 = idx0[0:dom - 2]
    n_idx1 = idx1[0:dom - 2]
    n_idx2 = idx2[0:dom - 2]
    pose_feats_n = np.concatenate(
        [pose_feats_n[n_idx0], pose_feats_n[n_idx1], pose_feats_n[n_idx2]])
    d_list_n = np.concatenate(
        [d_list_n[n_idx0], d_list_n[n_idx1], d_list_n[n_idx2]])
    labels_n = np.concatenate(
        [labels[n_idx0], labels[n_idx1], labels[n_idx2]])
    nidx = np.concatenate([n_idx0, n_idx1, n_idx2])

    # Shuffle the equalized data with one shared permutation so rows,
    # labels, depths and original indices stay aligned.
    permutation = np.random.permutation(pose_feats_n.shape[0])
    pose_feats_n = pose_feats_n[permutation]
    labels_n = labels_n[permutation]
    d_list_n = d_list_n[permutation]
    nidx = nidx[permutation]

    # The first quarter of the shuffled, equalized data is validation.
    # n_val hoists the int(floor(len/4)) expression repeated in the
    # original; d_list_n has the same length as pose_feats_n here.
    n_val = int(np.floor(len(pose_feats_n) / 4))
    test = np.array(pose_feats_n[0:n_val, :], dtype=np.float64)
    depth_test = np.array(d_list_n[0:n_val, :], dtype=np.float64)
    gt_test = np.transpose(np.array(labels_n[0:n_val]))
    nidx_test = np.transpose(np.array(nidx[0:n_val]))

    # Remove the rows used for validation from the raw training pool.
    pose_feats = np.delete(pose_feats, nidx_test, axis=0)
    d_list = np.delete(d_list, nidx_test, axis=0)
    labels = np.delete(labels, nidx_test)
    labels = labels.astype(int)

    # Oversample the training data with SMOTE or ADASYN.
    if method == 2:
        pose_feats = np.concatenate([pose_feats, d_list], axis=1)
        fm = ADASYN(ratio='all', n_neighbors=5)
        fm = fm.fit(pose_feats, labels)
    elif method == 1:
        pose_feats = np.concatenate([pose_feats, d_list], axis=1)
        fm = SMOTE(ratio='all', kind='regular', k_neighbors=5)
        fm = fm.fit(pose_feats, labels)
    else:
        # Fail fast: the original code would hit a NameError on `fm`
        # below for any other method value.
        raise ValueError(
            "method must be 1 (SMOTE) or 2 (ADASYN), got %r" % (method,))
    pose_feats, labels_train = fm.sample(pose_feats, labels)
    # Depth columns were appended at positions 66:72 above.
    d_list = pose_feats[:, 66:72]

    # Apply normalization to over-sampled training data, then shuffle.
    pose_feats, d_list = norm_feats(pose_feats, d_list)
    permutation = np.random.permutation(pose_feats.shape[0])
    pose_feats = pose_feats[permutation]
    labels_train = labels_train[permutation]
    d_list = d_list[permutation]

    train = pose_feats
    gt_train = labels_train
    depth_train = d_list
    return test, train, gt_test, gt_train, depth_train, depth_test
def test_ada_fit():
    # Fitting should record a sampling strategy that asks for 4 extra
    # samples of class 0 and nothing else.
    resampler = ADASYN(random_state=RND_SEED)
    resampler.fit(X, Y)
    assert resampler.sampling_strategy_ == {0: 4}