Example #1
import os
from multiprocessing import current_process

from rulefit import RuleFit
from sklearn.metrics import accuracy_score


def train_job(train_idx, test_idx, t_size, rf_mode, m_rules, r_seed, X, y,
              feas, n_samples):
    """
    每个 fold 中进行训练和验证
    """
    p = current_process()
    print('process counter:', p._identity[0], 'pid:', os.getpid())
    # Initialize the estimator and fit it on the training set
    rf = RuleFit(tree_size=t_size,
                 rfmode=rf_mode,
                 max_rules=m_rules,
                 random_state=r_seed)
    print(
        "\nTree generator:{0}, \n\nMax rules:{1}, Tree size:{2}, Random state:{3}"
        .format(rf.tree_generator, rf.max_rules, rf.tree_size,
                rf.random_state))
    rf.fit(X[train_idx], y[train_idx], feas)
    # Validate on the test set (drop fake/oversampled data by index)
    real_test_index = test_idx[test_idx < n_samples]
    batch_test_x = X[real_test_index]
    batch_test_y = y[real_test_index]
    batch_test_size = len(real_test_index)
    y_pred = rf.predict(batch_test_x)
    # Compute test-set accuracy
    accTest = accuracy_score(batch_test_y, y_pred)
    print("\nTest Accuracy:", "{:.6f}".format(accTest), "Test Size:",
          batch_test_size)

    print(
        "\n========================================================================="
    )
    # Return the test labels and predictions for aggregate statistics
    return batch_test_y, y_pred
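A minimal sketch of dispatching train_job across folds with multiprocessing.Pool (assumptions: X, y, and the feature list feas are already loaded, KFold stands in for whatever splitter the original used, and the hyperparameter values are illustrative; with no oversampling, n_samples is simply len(y)):

from multiprocessing import Pool
from sklearn.model_selection import KFold

if __name__ == '__main__':
    kf = KFold(n_splits=10, shuffle=True, random_state=0)
    jobs = [(tr, te, 4, 'classify', 2000, 0, X, y, feas, len(y))
            for tr, te in kf.split(X)]
    with Pool() as pool:
        # each job returns (batch_test_y, y_pred) for one fold
        results = pool.starmap(train_job, jobs)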
Example #2
    # Generate k-fold train/test indices
    cv_index_set = rs.split(y)
    k_fold_step = 1  # initialize the fold counter
    # Cache the test labels and corresponding predictions picked in each fold
    test_cache = pred_cache = np.array([], dtype=int)
    # Iterate over the k-fold cross-validation splits
    for train_index, test_index in cv_index_set:
        print("\nFold:", k_fold_step)
        # Initialize the estimator and fit it on the training set
        rf = RuleFit(tree_size=args.treesize,
                     rfmode=args.rfmode,
                     max_rules=args.maxrules,
                     random_state=args.randomseed)
        rf.fit(X[train_index], y[train_index], features)
        # Validate on the test set
        y_pred = rf.predict(X[test_index])
        # Compute test-set accuracy
        accTest = accuracy_score(y[test_index], y_pred)
        print("\nFold:", k_fold_step, "Test Accuracy:",
              "{:.6f}".format(accTest), "Test Size:", test_index.size)
        # Cache this fold's test labels and predictions
        test_cache = np.concatenate((test_cache, y[test_index]))
        pred_cache = np.concatenate((pred_cache, y_pred))
        print(
            "\n========================================================================="
        )
        # Advance the fold counter after each fold finishes
        k_fold_step += 1

    # Print the RuleFit model parameters at the end
    print("\n=== Model parameters ===")
Example #3
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from rulefit import RuleFit

# NOTE: the opening of this snippet was truncated; the constructor call below
# is reconstructed on the assumption that the listed kwargs configure a
# GradientBoostingRegressor used as RuleFit's tree generator (presort and
# min_impurity_split date the dump to an older scikit-learn).
rgb = RuleFit(tree_generator=GradientBoostingRegressor(
                  max_depth=100,
                  max_features=None,
                  max_leaf_nodes=15,
                  min_impurity_decrease=0.0,
                  min_impurity_split=None,
                  min_samples_leaf=1,
                  min_samples_split=2,
                  min_weight_fraction_leaf=0.0,
                  n_estimators=500,
                  n_iter_no_change=None,
                  presort='auto',
                  random_state=572,
                  subsample=0.46436099318265595,
                  tol=0.0001,
                  validation_fraction=0.1,
                  verbose=0,
                  warm_start=False),
              tree_size=3)
rgb.fit(x_train, y_train)
y_pred = rgb.predict(x_test)
rules = rgb.get_rules()


def scaled_absolute_error(y_test, y_pred):
    # Mean absolute error of the model, scaled by the error of a constant
    # median predictor; a ratio below 1 means the model beats that baseline.
    e1 = np.mean(np.abs(y_test - y_pred))
    e2 = np.mean(np.abs(y_test - np.median(y_test)))
    return np.round(e1 / e2, 4)


scaled_absolute_error(y_test, y_pred)
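For intuition, a ratio below 1 means the model beats a constant median prediction; a toy check with made-up numbers:

import numpy as np

y_true = np.array([10.0, 20.0, 30.0])
y_hat = np.array([12.0, 19.0, 28.0])
print(scaled_absolute_error(y_true, y_hat))  # 0.25, well under 1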
Example #4
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from rulefit import RuleFit

boston_data = pd.read_csv("boston.csv", index_col=0)

y = boston_data.medv.values
X = boston_data.drop("medv", axis=1)
features = X.columns
X = X.values

typ = 'regressor'  # 'regressor' or 'classifier'

if typ == 'regressor':
    rf = RuleFit(
        rfmode='regress',
        tree_generator=RandomForestRegressor()
    )
    rf.fit(X, y, feature_names=features)
    y_pred = rf.predict(X)
    insample_rmse = np.sqrt(np.sum((y_pred - y)**2)/len(y))
elif typ == 'classifier':
    y_class = y.copy()
    y_class[y_class < 21] = -1
    y_class[y_class >= 21] = +1
    N = X.shape[0]
    rf = RuleFit(rfmode='classify',
                 tree_generator=RandomForestClassifier())
    rf.fit(X, y_class, feature_names=features)
    y_pred = rf.predict(X)
    y_proba = rf.predict_proba(X)
    insample_acc = sum(y_pred == y_class) / len(y_class)
rules = rf.get_rules()
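get_rules() returns a pandas DataFrame; a short sketch of pulling the most influential rules by absolute coefficient (column names rule, type, coef, support as produced by the rulefit package):

rules = rules[rules.coef != 0].copy()
rules["abs_coef"] = rules.coef.abs()
print(rules.sort_values("abs_coef", ascending=False)
           .head(10)[["rule", "type", "coef", "support"]])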
Example #5
# -*- coding: utf-8 -*-
"""
Created on Sun Nov 25 23:54:35 2018

@author: Melanie
"""

import numpy as np
import pandas as pd

from rulefit import RuleFit

boston_data = pd.read_csv("prism_numeric.csv", index_col=0)

y = boston_data.medv.values
X = boston_data.drop("medv", axis=1)
features = X.columns
X = X.values  # DataFrame.as_matrix() was removed in pandas 1.0

rf = RuleFit()
rf.fit(X, y, feature_names=features)
rf.predict(X)
rules = rf.get_rules()

rules = rules[rules.coef != 0].sort_values("support", ascending=False)

print(rules)
Example #6
import pandas as pd
from rulefit import RuleFit

train = data[200:, :]
test = data[:200, :]
train_target = target[200:]
test_target = target[:200]

from sklearn.ensemble import GradientBoostingRegressor
gb = GradientBoostingRegressor(n_estimators=500,
                               max_depth=10,
                               learning_rate=0.01)

relu_fit = RuleFit()
relu_fit.max_iter = 4000
relu_fit.tree_generator = gb
relu_fit.fit(train, train_target, feature_names=feature_name)
f = relu_fit.predict(test)
ff = relu_fit.predict(train)
rule = relu_fit.get_rules()
# count test samples whose relative error is below 10%
truth = 0
for i in range(test_target.shape[0]):
    if abs(test_target[i] - f[i]) / test_target[i] < 0.1:
        truth += 1

print("truth: ", truth / test_target.shape[0])
#print(rule)
ruleset = pd.DataFrame(data=rule)
# write via a context manager (ExcelWriter.save() is gone in modern pandas)
with pd.ExcelWriter('./rules.xlsx') as writer:
    ruleset.to_excel(writer)
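The per-sample loop above collapses into one vectorized expression; a short sketch, with abs() in the denominator as a guard against negative targets:

import numpy as np

# fraction of test samples whose prediction is within 10% of the target
within_10pct = np.abs(test_target - f) / np.abs(test_target) < 0.1
print("truth:", within_10pct.mean())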
Example #7
    # Iterate over the k-fold cross-validation splits
    for train_index, test_index in resampled_index_set:
        print("\nFold:", k_fold_step)
        # Initialize the estimator and fit it on the training set
        rf = RuleFit(tree_size=args.treesize,
                     rfmode=args.rfmode,
                     max_rules=args.maxrules,
                     random_state=args.randomseed)
        rf.fit(x_resampled[train_index], y_resampled[train_index], features)
        # Validate on the test set (drop fake/oversampled data by index)
        real_test_index = test_index[test_index < X.shape[0]]
        batch_test_x = x_resampled[real_test_index]
        batch_test_y = y_resampled[real_test_index]
        batch_test_size = len(real_test_index)
        y_pred = rf.predict(batch_test_x)
        # Compute test-set accuracy
        accTest = accuracy_score(batch_test_y, y_pred)
        print("\nFold:", k_fold_step, "Test Accuracy:",
              "{:.6f}".format(accTest), "Test Size:", batch_test_size)
        # Cache this fold's test labels and predictions
        test_cache = np.concatenate((test_cache, batch_test_y))
        pred_cache = np.concatenate((pred_cache, y_pred))
        print(
            "\n========================================================================="
        )
        # Advance the fold counter after each fold finishes
        k_fold_step += 1

    # Print the RuleFit model parameters at the end
    print("\n=== Model parameters ===")