import numpy as np
from irf import irf_utils  # import path may differ between irf versions


def test_iRF_weight1():
    # When the labels are random noise, every feature's importance should be
    # roughly the same.
    n_samples = 1000
    n_features = 10
    random_state_classifier = 2018
    np.random.seed(random_state_classifier)
    X_train = np.random.uniform(low=0, high=1, size=(n_samples, n_features))
    y_train = np.random.choice([0, 1], size=(n_samples,), p=[.5, .5])
    X_test = np.random.uniform(low=0, high=1, size=(n_samples, n_features))
    y_test = np.random.choice([0, 1], size=(n_samples,), p=[.5, .5])
    all_rf_weights, all_K_iter_rf_data, \
        all_rf_bootstrap_output, all_rit_bootstrap_output, \
        stability_score = irf_utils.run_iRF(X_train=X_train,
                                            X_test=X_test,
                                            y_train=y_train,
                                            y_test=y_test,
                                            K=5,
                                            n_estimators=20,
                                            B=30,
                                            random_state_classifier=2018,
                                            propn_n_samples=.2,
                                            bin_class_type=1,
                                            M=20,
                                            max_depth=5,
                                            noisy_split=False,
                                            num_splits=2,
                                            n_estimators_bootstrap=5)
    assert np.max(all_rf_weights['rf_weight5']) < .135
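# For context on the 0.135 threshold above: random-forest importances are
# normalised to sum to 1, so with 10 equally uninformative features each one
# should sit near 1/10, and the assertion leaves ~0.035 of slack above that.
# A quick standalone check of the arithmetic (a sketch, not part of the test suite):
def _check_uniform_importance_expectation():
    n_features = 10
    uniform_importance = 1.0 / n_features   # expected per-feature importance under pure noise
    slack = 0.135 - uniform_importance      # headroom the assertion allows above uniform
    assert np.isclose(uniform_importance, 0.1)
    assert np.isclose(slack, 0.035)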
def test_iRF_weight2():
    # When the feature at index 1 fully predicts the label, its importance
    # should be 1.
    n_samples = 1000
    n_features = 10
    random_state_classifier = 2018
    np.random.seed(random_state_classifier)
    X_train = np.random.uniform(low=0, high=1, size=(n_samples, n_features))
    y_train = np.random.choice([0, 1], size=(n_samples,), p=[.5, .5])
    X_test = np.random.uniform(low=0, high=1, size=(n_samples, n_features))
    y_test = np.random.choice([0, 1], size=(n_samples,), p=[.5, .5])

    # Make the feature at index 1 perfectly predictive of the label.
    X_train[:, 1] = X_train[:, 1] + y_train
    X_test[:, 1] = X_test[:, 1] + y_test

    all_rf_weights, all_K_iter_rf_data, \
        all_rf_bootstrap_output, all_rit_bootstrap_output, \
        stability_score = irf_utils.run_iRF(X_train=X_train,
                                            X_test=X_test,
                                            y_train=y_train,
                                            y_test=y_test,
                                            K=5,
                                            n_estimators=20,
                                            B=30,
                                            random_state_classifier=2018,
                                            propn_n_samples=.2,
                                            bin_class_type=1,
                                            M=20,
                                            max_depth=5,
                                            noisy_split=False,
                                            num_splits=2,
                                            n_estimators_bootstrap=5)
    print(all_rf_weights['rf_weight5'])
    assert all_rf_weights['rf_weight5'][1] == 1
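# Why the weight collapses onto index 1: adding the binary label to a
# Uniform(0, 1) feature puts class-0 values in [0, 1) and class-1 values in
# [1, 2), so a single split at 1 separates the classes perfectly.
# A small standalone check of that construction (a sketch, independent of the test above):
def _check_label_shift_is_separable():
    rng = np.random.RandomState(2018)
    x = rng.uniform(0, 1, size=1000)   # Uniform(0, 1) feature, values in [0, 1)
    y = rng.choice([0, 1], size=1000)  # binary labels
    x_shifted = x + y                  # class 0 stays in [0, 1), class 1 moves to [1, 2)
    assert np.array_equal((x_shifted >= 1).astype(int), y)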
from operator import itemgetter

import numpy as np
from sklearn.model_selection import train_test_split
# irf imports; exact module paths may differ between irf versions
from irf import irf_utils
from irf.ensemble import RandomForestClassifierWithWeights

# prep
# `dmx` (data matrix with feature ids in column 0) and `idx` (sample index with
# group labels in column 2) are assumed to be loaded earlier in the script.
X = dmx[1:, 1:].astype(float).T

# Assign a numerical value to the string class labels:
# 0 for control ('Control'), 1 for the compared group ('D12') -- binary classification.
y = np.array(itemgetter(*idx[1:, 2])({'Control': 0, 'D12': 1}))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# iRF
irfres = irf_utils.run_iRF(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    rf=RandomForestClassifierWithWeights(n_estimators=100),
    K=5,    # number of iterations; value recommended by the developers
    B=100,  # number of bootstrap samples; worth varying to see how results change
    M=137,  # number of Random Intersection Trees (RIT) to build; worth exploring this parameter's effect
    max_depth=5,
)
rf_weights, K_iter_rf_data, rf_bootstrap_output, rit_bootstrap_output, stability_score = irfres

# feature importance
# (the rest of this block should not need modification)
fids = dmx[1:, 0]
iteration = 'rf_iter5'
impt = K_iter_rf_data[iteration]['feature_importances']
impt_std = K_iter_rf_data[iteration]['feature_importances_std']
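# A minimal follow-up sketch (using the `fids`, `impt` and `impt_std` arrays
# computed above; `top_n` is a hypothetical cutoff): list the highest-importance
# features from the final iteration.
top_n = 10
order = np.argsort(impt)[::-1]  # feature indices sorted by descending importance
for rank, i in enumerate(order[:top_n], start=1):
    print(f"{rank:2d}. {fids[i]}  importance={impt[i]:.4f} +/- {impt_std[i]:.4f}")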
from sklearn.model_selection import KFold

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
# Replace the single train/test split above with 5-fold cross-validation.
kf = KFold(n_splits=5, random_state=15, shuffle=True)
for count_k, (train_index, test_index) in enumerate(kf.split(X)):
    X_train = X[train_index]
    X_test = X[test_index]
    y_train = y[train_index]
    y_test = y[test_index]

    irfres = irf_utils.run_iRF(
        X_train=X_train,
        X_test=X_test,
        y_train=y_train,
        y_test=y_test,
        rf=RandomForestClassifierWithWeights(n_estimators=30),
        K=10,   # number of iterations
        B=30,   # number of bootstrap samples
        M=20,   # number of Random Intersection Trees (RIT) to build
        max_depth=5,
    )
    rf_weights, K_iter_rf_data, rf_bootstrap_output, rit_bootstrap_output, stability_score = irfres

    # feature importance (inspecting iteration 5 of the K=10 iterations)
    fids = dmx[1:, 0]
    iteration = 'rf_iter5'
    impt = K_iter_rf_data[iteration]['feature_importances']
    impt_std = K_iter_rf_data[iteration]['feature_importances_std']
    impt_rank_idx = K_iter_rf_data[iteration]['feature_importances_rank_idx']
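# One way to summarise the cross-validated runs (a sketch, not part of the
# original loop): collect each fold's importance vector, e.g. by appending
# `impt` to a list inside the loop above, then average across folds.
def summarize_folds(fold_impts, fids, top_n=10):
    """Average per-fold feature importances and print the top features.

    `fold_impts` is a list of importance vectors, one per fold (for example
    built by appending `impt` inside the KFold loop above); `fids` holds the
    feature ids.
    """
    mean_impt = np.mean(fold_impts, axis=0)
    std_impt = np.std(fold_impts, axis=0)
    for i in np.argsort(mean_impt)[::-1][:top_n]:
        print(f"{fids[i]}: {mean_impt[i]:.4f} +/- {std_impt[i]:.4f}")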