示例#1
0
def test_iRF_weight1():
    """Check that no feature dominates when the labels are pure noise.

    Features are drawn uniformly from [0, 1) and labels are drawn
    independently with equal probability, so no feature carries signal.
    The test passes when the largest entry of the ``'rf_weight5'`` weights
    returned by ``irf_utils.run_iRF`` stays below 0.135.

    NOTE(review): presumably the weights are normalized to sum to 1, so a
    perfectly even spread over 10 features would be 0.1 each and 0.135 is
    the allowed slack -- confirm against ``irf_utils``.
    """
    seed = 2018
    n_rows, n_cols = 1000, 10
    label_shape = (n_rows, )
    feature_shape = (n_rows, n_cols)

    # The draw order (X_train, y_train, X_test, y_test) determines the data
    # produced from the seed, so it must not be rearranged.
    np.random.seed(seed)
    X_train = np.random.uniform(low=0, high=1, size=feature_shape)
    y_train = np.random.choice([0, 1], size=label_shape, p=[.5, .5])
    X_test = np.random.uniform(low=0, high=1, size=feature_shape)
    y_test = np.random.choice([0, 1], size=label_shape, p=[.5, .5])

    irf_params = dict(
        K=5,
        n_estimators=20,
        B=30,
        random_state_classifier=seed,
        propn_n_samples=.2,
        bin_class_type=1,
        M=20,
        max_depth=5,
        noisy_split=False,
        num_splits=2,
        n_estimators_bootstrap=5,
    )

    # run_iRF returns five values; only the per-iteration weights are
    # inspected, but all five are unpacked so an unexpected return shape
    # still fails loudly.
    (all_rf_weights,
     _k_iter_rf_data,
     _rf_bootstrap_output,
     _rit_bootstrap_output,
     _stability_score) = irf_utils.run_iRF(X_train=X_train,
                                           X_test=X_test,
                                           y_train=y_train,
                                           y_test=y_test,
                                           **irf_params)

    largest_weight = np.max(all_rf_weights['rf_weight5'])
    assert largest_weight < .135
示例#2
0
def test_iRF_weight2():
    """Check that a perfectly predictive feature receives all the weight.

    Features are drawn uniformly from [0, 1) and labels are drawn from
    {0, 1} with equal probability.  The label is then added to column
    index 1 (the *second* column), so that column lies in [0, 1) for label
    0 and in [1, 2) for label 1 and separates the classes exactly.  The
    test passes when the ``'rf_weight5'`` weight of that column, as
    returned by ``irf_utils.run_iRF``, equals 1 up to floating-point
    tolerance.
    """
    n_samples = 1000
    n_features = 10
    random_state_classifier = 2018
    # Column whose values are made to encode the label (0-based index).
    informative_col = 1

    np.random.seed(random_state_classifier)
    X_train = np.random.uniform(low=0, high=1, size=(n_samples, n_features))
    y_train = np.random.choice([0, 1], size=(n_samples, ), p=[.5, .5])
    X_test = np.random.uniform(low=0, high=1, size=(n_samples, n_features))
    y_test = np.random.choice([0, 1], size=(n_samples, ), p=[.5, .5])

    # Shift the informative column by the label so it fully determines it.
    X_train[:, informative_col] = X_train[:, informative_col] + y_train
    X_test[:, informative_col] = X_test[:, informative_col] + y_test

    all_rf_weights, all_K_iter_rf_data, \
        all_rf_bootstrap_output, all_rit_bootstrap_output, \
        stability_score = irf_utils.run_iRF(
            X_train=X_train,
            X_test=X_test,
            y_train=y_train,
            y_test=y_test,
            K=5,
            n_estimators=20,
            B=30,
            # Reuse the seed defined above instead of repeating the literal.
            random_state_classifier=random_state_classifier,
            propn_n_samples=.2,
            bin_class_type=1,
            M=20,
            max_depth=5,
            noisy_split=False,
            num_splits=2,
            n_estimators_bootstrap=5)

    final_weights = all_rf_weights['rf_weight5']
    print(final_weights)
    # The weight is a computed float, so compare with a tolerance rather
    # than exact equality.
    assert np.isclose(final_weights[informative_col], 1), (
        "expected all weight on column {}, got {}".format(
            informative_col, final_weights))
示例#3
0
# ---------------------------------------------------------------------------
# Data preparation
# ---------------------------------------------------------------------------
# Drop the first row and first column of `dmx`, cast the rest to float and
# transpose.
# NOTE(review): presumably `dmx` is laid out features-by-samples with a header
# row and an id column, so X ends up samples-by-features -- confirm upstream.
X = dmx[1:, 1:].astype(float).T

# Encode the string class labels as integers for binary classification:
# 0 for the control group, 1 for the compared group.
label_codes = {'Control': 0, 'D12': 1}
y = np.array(itemgetter(*idx[1:, 2])(label_codes))

# Single hold-out split: 30% of the samples are reserved for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# ---------------------------------------------------------------------------
# iRF run
# ---------------------------------------------------------------------------
irfres = irf_utils.run_iRF(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    rf=RandomForestClassifierWithWeights(n_estimators=100),
    K=5,  # number of iterations (value recommended by the developers)
    B=100,  # number of bootstrap samples; worth experimenting with
    M=137,  # number of trees (RIT) to build; effect still to be explored
    max_depth=5,
)

(rf_weights,
 K_iter_rf_data,
 rf_bootstrap_output,
 rit_bootstrap_output,
 stability_score) = irfres

# ---------------------------------------------------------------------------
# Feature importance
# ---------------------------------------------------------------------------
# First column of `dmx` without its first entry.
# NOTE(review): presumably these are the feature identifiers -- confirm.
fids = dmx[1:, 0]

# Read the importances (and their std) stored under the 'rf_iter5' entry.
iteration = 'rf_iter5'
iteration_data = K_iter_rf_data[iteration]
impt = iteration_data['feature_importances']
impt_std = iteration_data['feature_importances_std']
# Cross-validated variant of the train/test split: 5 shuffled folds with a
# fixed seed so the fold assignment is reproducible.
kf = KFold(n_splits=5, random_state=15, shuffle=True)

# Keep every fold's iRF output.  Without this, each iteration overwrites
# `irfres` and only the last fold's results survive the loop.
fold_results = []

for count_k, (train_index, test_index) in enumerate(kf.split(X)):
    X_train = X[train_index]
    X_test = X[test_index]
    y_train = y[train_index]
    y_test = y[test_index]
    irfres = irf_utils.run_iRF(
        X_train=X_train,
        X_test=X_test,
        y_train=y_train,
        y_test=y_test,
        rf=RandomForestClassifierWithWeights(n_estimators=30),
        K=10,  # number of iterations
        B=30,  # number of bootstrap samples
        M=20,  # number of trees (RIT) to build
        max_depth=5,
    )
    # fold_results[count_k] holds the 5-tuple returned for fold `count_k`.
    fold_results.append(irfres)

# The names below refer to the LAST fold only (unchanged from the original
# behaviour); use `fold_results` to inspect or aggregate the other folds.
rf_weights, K_iter_rf_data, rf_bootstrap_output, rit_bootstrap_output, stability_score = irfres

# feature importance
# First column of `dmx` without its first entry.
# NOTE(review): presumably these are the feature identifiers -- confirm.
fids = dmx[1:, 0]

# NOTE(review): the run above uses K=10 but the results are read from the
# 'rf_iter5' entry -- confirm that an intermediate iteration is intended.
iteration = 'rf_iter5'
impt = K_iter_rf_data[iteration]['feature_importances']
impt_std = K_iter_rf_data[iteration]['feature_importances_std']
impt_rank_idx = K_iter_rf_data[iteration]['feature_importances_rank_idx']