def train_job(train_idx, test_idx, t_size, rf_mode, m_rules, r_seed, X, y, feas, n_samples):
    """Train and validate a RuleFit model on one cross-validation fold.

    Args:
        train_idx: Index array selecting the training rows of ``X`` and ``y``.
        test_idx: Index array of candidate test rows; it is filtered with a
            boolean mask, so it is assumed to be a NumPy array.
        t_size: Forwarded to ``RuleFit`` as ``tree_size``.
        rf_mode: Forwarded to ``RuleFit`` as ``rfmode``.
        m_rules: Forwarded to ``RuleFit`` as ``max_rules``.
        r_seed: Forwarded to ``RuleFit`` as ``random_state``.
        X: Feature array, indexed with the index arrays above.
        y: Label array aligned with ``X``.
        feas: Feature names, passed as the third argument of ``RuleFit.fit``.
        n_samples: Test indices ``>= n_samples`` are dropped before
            evaluation (they point at fake data, per the original comment).

    Returns:
        Tuple ``(batch_test_y, y_pred)``: the true labels of the retained
        test rows and the model's predictions for them.
    """
    proc = current_process()
    # The identity tuple is empty when this runs in the main process (e.g. a
    # serial debugging run); fall back to 0 instead of raising IndexError.
    identity = getattr(proc, '_identity', ())
    worker_no = identity[0] if identity else 0
    print('process counter:', worker_no, 'pid:', os.getpid())

    # Build the estimator and fit it on the training part of the fold
    rf = RuleFit(tree_size=t_size,
                 rfmode=rf_mode,
                 max_rules=m_rules,
                 random_state=r_seed)
    print(
        "\nTree generator:{0}, \n\nMax rules:{1}, Tree size:{2}, Random state:{3}"
        .format(rf.tree_generator, rf.max_rules, rf.tree_size,
                rf.random_state))
    rf.fit(X[train_idx], y[train_idx], feas)

    # Evaluate on the test part (fake data removed through its index)
    real_test_index = test_idx[test_idx < n_samples]
    batch_test_x = X[real_test_index]
    batch_test_y = y[real_test_index]
    batch_test_size = len(real_test_index)
    y_pred = rf.predict(batch_test_x)

    # Accuracy on the test part
    accTest = accuracy_score(batch_test_y, y_pred)
    print("\nTest Accuracy:", "{:.6f}".format(accTest), "Test Size:",
          batch_test_size)
    print(
        "\n========================================================================="
    )

    # Hand back labels and predictions so the caller can aggregate statistics
    return batch_test_y, y_pred
def handle(self, *args, **kwargs):
    """Fit RuleFit on the encoded training log of a stored job and rank rules.

    Loads the job with primary key ``TARGET_MODEL``, encodes its logs, fits a
    default ``RuleFit`` on the training features and keeps the non-linear
    rules with a positive coefficient, adding an ``effect`` column
    (``coef * support``). Nothing is returned or printed.
    """
    # get model
    TARGET_MODEL = 5
    job = Job.objects.filter(pk=TARGET_MODEL)[0]
    # The stored model is loaded but not used further in this method.
    model = joblib.load(job.predictive_model.model_path)
    model = model[0]

    training_df, test_df = get_encoded_logs(job)

    # `axis=1` is passed by keyword: the positional form is no longer
    # accepted by current pandas.
    X_train = training_df.drop(['trace_id', 'label'], axis=1)
    # NOTE(review): the target is whatever remains after dropping these
    # columns -- confirm this is the intended target column.
    Y_train = training_df.drop(
        ['trace_id', 'prefix_1', 'prefix_3', 'prefix_4', 'label'], axis=1)

    rf = RuleFit()
    columns = list(X_train.columns)
    # `.values` replaces the removed `DataFrame.as_matrix()`.
    X = X_train.values
    rf.fit(X, Y_train.values.ravel(), feature_names=columns)

    rules = rf.get_rules()
    # rules = rules[rules.coef != 0].sort_values("support", ascending=False)
    # Copy the filtered frame so adding a column does not write into a slice.
    rules = rules[(rules.coef > 0.) & (rules.type != 'linear')].copy()
    rules['effect'] = rules['coef'] * rules['support']

    pd.set_option('display.max_colwidth', -1)
    # NOTE(review): the two expressions below only display something in an
    # interactive session; their results are discarded here.
    rules.nlargest(10, 'effect')
    # print(rules)
    rules
def _getRulesRulefit(df_aux, model_params):
    """Extract positive-coefficient RuleFit rules as a rule dataframe.

    Fits ``RuleFit`` on ``df_aux[feature_cols]`` against the ``predictions``
    column, keeps entries of type ``"rule"`` with positive coefficient and
    importance, converts them with ``turn_rules_to_df`` and, when
    ``simplify_rules`` is set, prunes them with ``simplifyRules``.

    ``feature_cols``, ``simplify_rules`` and ``categorical_cols`` come from
    the enclosing scope. Missing ``tree_size`` / ``rfmode`` entries are
    written into ``model_params`` itself.

    Returns the rule dataframe, including a ``size_rules`` column holding the
    number of ``&``-joined terms of each original rule.
    """

    def _term_count(row):
        # Number of conditions joined by "&" in the textual rule
        return len(row["rule"].split("&"))

    # Training matrix and target
    features = df_aux[feature_cols]
    target = df_aux["predictions"]

    # Defaults for hyper-parameters the caller did not provide
    model_params.setdefault("tree_size", len(feature_cols) * 2)
    model_params.setdefault("rfmode", "classify")

    estimator = RuleFit(**model_params)
    estimator.fit(features.values, target.values, feature_names=feature_cols)

    # Rule extraction; filters and sorts run in this exact order
    print("Obtaining Rules using RuleFit...")
    kept = estimator.get_rules()
    kept = kept[kept.coef != 0]
    kept = kept[kept.importance > 0].sort_values("support", ascending=False)
    kept = kept[kept.coef > 0]
    kept = kept.sort_values("support", ascending=False)
    kept = kept[kept["type"] == "rule"]
    kept["size_rules"] = kept.apply(_term_count, axis=1)

    # Rule strings -> dataframe of hypercubes
    print("Turning rules to hypercubes...")
    df_rules = turn_rules_to_df(list_rules=list(kept["rule"].values),
                                list_cols=feature_cols)

    # Rule size refers to the rules from the extraction model, not to the
    # hypercubes built from them
    df_rules["size_rules"] = list(kept["size_rules"].values)

    if not simplify_rules:
        return df_rules

    # Pruning works without the size column; it is merged back afterwards
    # through the original row index
    print("Prunning the rules obtained...")
    pruned = df_rules.drop(columns=["size_rules"]).copy()
    pruned = simplifyRules(pruned, categorical_cols)
    size_lookup = df_rules.reset_index()[["index", "size_rules"]]
    pruned = pruned.reset_index().merge(size_lookup, how="left")
    pruned.index = pruned["index"]
    pruned = pruned.drop(columns=["index"], errors="ignore")
    return pruned
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from rulefit import RuleFit

# Load the data; `medv` is the regression target, the rest are features
boston_data = pd.read_csv("boston.csv", index_col=0)
y = boston_data.medv.values
X = boston_data.drop("medv", axis=1)
features = X.columns
# `.values` replaces `DataFrame.as_matrix()`, which pandas removed
X = X.values

# Gradient boosting model used by RuleFit as its tree generator
gb = GradientBoostingRegressor(n_estimators=100,
                               max_depth=3,
                               learning_rate=0.01)
rf = RuleFit(gb)
rf.fit(X, y, feature_names=features)

# Keep the rules with a non-zero coefficient, ordered by support.
# `sort_values` replaces `DataFrame.sort`, which pandas removed.
rules = rf.get_rules()
rules = rules[rules.coef != 0].sort_values("support")
shuffle=True, random_state=args.randomseed) # 生成 k-fold 训练集、测试集索引 cv_index_set = rs.split(y) k_fold_step = 1 # 初始化折数 # 暂存每次选中的测试集和对应预测结果 test_cache = pred_cache = np.array([], dtype=np.int) # 迭代训练 k-fold 交叉验证 for train_index, test_index in cv_index_set: print("\nFold:", k_fold_step) # 初始化 estimator 训练集进入模型 rf = RuleFit(tree_size=args.treesize, rfmode=args.rfmode, max_rules=args.maxrules, random_state=args.randomseed) rf.fit(X[train_index], y[train_index], features) # 测试集验证 y_pred = rf.predict(X[test_index]) # 计算测试集 ACC accTest = accuracy_score(y[test_index], y_pred) print("\nFold:", k_fold_step, "Test Accuracy:", "{:.6f}".format(accTest), "Test Size:", test_index.size) # 暂存每次选中的测试集和预测结果 test_cache = np.concatenate((test_cache, y[test_index])) pred_cache = np.concatenate((pred_cache, y_pred)) print( "\n=========================================================================" ) # 每个fold训练结束后次数 +1 k_fold_step += 1
max_depth=100, max_features=None, max_leaf_nodes=15, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=500, n_iter_no_change=None, presort='auto', random_state=572, subsample=0.46436099318265595, tol=0.0001, validation_fraction=0.1, verbose=0, warm_start=False), tree_size=3) rgb.fit(x_train, y_train) y_pred = rgb.predict(x_test) rules = rgb.get_rules() def scaled_absolute_error(y_test, y_pred): e1 = np.mean(y_test - y_pred) e2 = np.mean(y_test - np.median(y_test)) return np.round(e1 / e2, 4) scaled_absolute_error(y_test, y_pred)
# Load the data; `medv` is the target, every other column is a feature
boston_data = pd.read_csv("boston.csv", index_col=0)
y = boston_data["medv"].values
X = boston_data.drop(columns=["medv"])
features = X.columns
X = X.values

typ = 'regressor'  # regressor or classifier

if typ == 'regressor':
    # Regression: random-forest regressor as tree generator
    rf = RuleFit(rfmode='regress', tree_generator=RandomForestRegressor())
    rf.fit(X, y, feature_names=features)
    y_pred = rf.predict(X)
    # Root mean squared error on the training data
    insample_rmse = np.sqrt(np.sum(np.square(y_pred - y)) / len(y))
elif typ == 'classifier':
    # Binary target: -1 below 21, +1 from 21 upwards
    y_class = y.copy()
    y_class[y_class < 21] = -1
    y_class[y_class >= 21] = +1
    N = X.shape[0]
    rf = RuleFit(rfmode='classify', tree_generator=RandomForestClassifier())
    rf.fit(X, y_class, feature_names=features)
    y_pred = rf.predict(X)
    y_proba = rf.predict_proba(X)
    # Share of correctly classified training rows
    insample_acc = sum(y_pred == y_class) / len(y_class)

# Rules of whichever model was fitted above
rules = rf.get_rules()
class FeatureVec(object):
    """Feature-vector class.

    Fits a RuleFit tree ensemble, builds a feature co-occurrence matrix from
    the extracted rules and reduces it to two dimensions with truncated SVD,
    giving one 2-D vector per feature.
    """

    def __init__(
        self,
        mode,
        max_depth=3,
        feature_names=None,
        max_sentences=20000,
        exp_rand_tree_size=True,
        tree_generator=None,
    ):
        '''
        mode: 'classify' or 'regress'
        max_depth: maximum depth of trained trees
        feature_names: names of features
        max_sentences: maximum number of extracted sentences
        exp_rand_tree_size: Having trees with different sizes
        tree_generator: Tree generator model (overwrites above features)
        '''
        self.feature_names = feature_names
        self.mode = mode
        max_leafs = 2**max_depth
        num_trees = max_sentences // max_leafs
        if tree_generator is None:
            # NOTE(review): a classifier is created even when mode is
            # 'regress' -- confirm whether a regressor is needed there.
            tree_generator = RandomForestClassifier(num_trees,
                                                    max_depth=max_depth)
        self.exp_rand_tree_size = exp_rand_tree_size
        # Forward the caller's exp_rand_tree_size instead of a fixed True,
        # so the constructor argument actually takes effect.
        self.rf = RuleFit(rfmode=mode,
                          tree_size=max_leafs,
                          max_rules=max_sentences,
                          tree_generator=tree_generator,
                          exp_rand_tree_size=exp_rand_tree_size,
                          fit_lasso=False,
                          Cs=10.**np.arange(-4, 1),
                          cv=3)

    def fit(self, X, y, restart=True, bagging=0):
        '''Fit the tree model.
        X: inputs
        y: outputs (integer class label or real value)
        restart: To train from scratch tree generator model
        bagging: If >0 applies bagging on trees to compute confidence intervals
        '''
        if not bagging:
            bagging = 0
        dimred = TruncatedSVD(2)
        self.rf.fit(X, y, restart=restart)
        rules = self.rf.get_rules()['rule'].values
        cm = cooccurance_matrix(rules, X.shape[-1])
        vectors = dimred.fit_transform(cm)
        vectors = normalize_angles(vectors)
        # Norms are clipped away from zero before they are used as divisors
        self.norms = np.clip(np.linalg.norm(vectors, axis=-1, keepdims=True),
                             1e-12, None)
        # Scale so that the longest vector has length 1
        vectors /= np.max(self.norms)
        self.vectors = vectors
        self.importance = np.linalg.norm(self.vectors, axis=-1)
        self.angles = np.arctan2(self.vectors[:, 1], self.vectors[:, 0])
        self.stds = np.zeros(vectors.shape)
        self.predictor = self.rf.tree_generator
        if bagging:
            all_vectors = []
            for _ in range(bagging):
                self.rf.bag_trees(X, y)
                rules_bag = self.rf.get_rules()['rule'].values
                cm_bag = cooccurance_matrix(rules_bag, X.shape[-1])
                vectors_bag = dimred.fit_transform(cm_bag)
                vectors_bag = normalize_angles(vectors_bag)
                norms_bag = np.clip(
                    np.linalg.norm(vectors_bag, axis=-1, keepdims=True),
                    1e-12, None)
                all_vectors.append(vectors_bag / norms_bag)
            # Per-feature, per-coordinate spread over the bagging rounds
            self.stds = np.std(all_vectors, 0)

    def plot(self, dynamic=True, confidence=True, path=None):
        '''Plot the feature-vectors.
        dynamic: If True the output is a dynamic html plot. Otherwise, it will
            be an image.
        confidence: To show confidence intervals or not
        path: Path to save the figure. For a dynamic figure it must end in
            'html'. Missing parent directories are created.
        '''
        mx = 1.1
        angles = np.arctan2(self.vectors[:, 1], self.vectors[:, 0])
        max_angle = np.max(np.abs(angles))
        # Accept any sequence of names (list, array, pandas Index); fall back
        # to placeholder names when none were given to the constructor.
        if self.feature_names is None:
            base_names = [
                'feature_' + str(i) for i in range(len(self.vectors))
            ]
        else:
            base_names = list(self.feature_names)
        # Two extra points at the origin carry -max_angle / +max_angle so the
        # colour scale is symmetric
        feature_names = base_names + ['origin', '']
        plot_vectors = np.concatenate([self.vectors, [[0, 0], [0, 0]]])
        vectors_sizes = np.linalg.norm(plot_vectors, axis=-1)
        plot_angles = np.concatenate([angles, [-max_angle, max_angle]])
        plot_data = np.stack([
            plot_vectors[:, 1], plot_vectors[:, 0], plot_angles, feature_names
        ],
                             axis=-1)
        plot_df = pd.DataFrame(data=plot_data,
                               columns=['x', 'y', 'angles', 'names'])
        # Stacking with the names turned every column into strings
        plot_df[["x", "y", "angles"]] = plot_df[["x", "y", "angles"
                                                 ]].apply(pd.to_numeric)
        # The two modes differ only in how hover names are supplied
        hover_name = feature_names if dynamic else 'names'
        fig = px.scatter(
            plot_df,
            x='x',
            y='y',
            color='angles',
            width=1000,
            height=500,
            hover_name=hover_name,
            hover_data={
                'x': False,
                'y': False,
                'angles': False,
                'names': False
            },
            color_continuous_scale=px.colors.sequential.Rainbow)
        if not dynamic:
            # A static image has no hover text, so label the vectors directly
            max_name_len = max(len(name) for name in feature_names)
            for i in range(len(plot_vectors) - 2):
                if vectors_sizes[i] < 0.2:
                    continue  # short vectors get no label
                x_i = plot_vectors[i, 1]
                y_i = plot_vectors[i, 0]
                # Label anchor sits 0.2 further out than the point
                ax = x_i + 0.2 if x_i > 0 else x_i - 0.2
                # NOTE(review): the text is right-padded on both sides of the
                # plot; a left-padded variant for x <= 0 was built before but
                # never used -- confirm which was intended.
                padding = ' ' * (max_name_len - len(feature_names[i]))
                fig.add_annotation(
                    x=x_i,
                    y=y_i,
                    text=feature_names[i] + padding,
                    font=dict(size=15),
                    axref="x",
                    ayref="y",
                    ax=ax,
                    ay=y_i,
                    arrowhead=2,
                )
        fig.update_yaxes(visible=False, showticklabels=False, range=[0, mx])
        fig.update_xaxes(visible=False, showticklabels=False, range=[-mx, mx])
        fig.update_traces(marker=dict(size=10), textfont_size=15)
        fig.update(layout_coloraxis_showscale=False)
        fig.update_layout(showlegend=False)
        # Ten concentric reference circles up to radius mx
        for i in range(10):
            fig.add_shape(type='circle',
                          x0=(i + 1) / 10 * mx,
                          y0=(i + 1) / 10 * mx,
                          x1=-(i + 1) / 10 * mx,
                          y1=-(i + 1) / 10 * mx,
                          line_color="red",
                          opacity=0.5,
                          line=dict(dash='dot', width=3))
        if confidence:
            # Ellipse of +/- 3 standard deviations around every vector
            for vector, std in zip(self.vectors, self.stds):
                fig.add_shape(type='circle',
                              x0=vector[1] + 3 * std[1],
                              y0=vector[0] + 3 * std[0],
                              x1=vector[1] - 3 * std[1],
                              y1=vector[0] - 3 * std[0],
                              line_color='gray',
                              opacity=0.5,
                              line=dict(dash='solid', width=1))
        fig.show()
        if path:
            # dirname copes with paths such as '/plot.html' and with
            # platform separators; exist_ok avoids a check-then-create race
            parent = os.path.dirname(path)
            if parent:
                os.makedirs(parent, exist_ok=True)
            if dynamic:
                assert path.split(
                    '.'
                )[-1] == 'html', 'For a dynamic figure, path should be an html file!'
                fig.write_html(path)
            else:
                fig.write_image(path)
feature_name = ['QNH', 'TEMP', 'RH', 'absolute_temp', 'WS2A', 'CW2A']

# The first 200 rows are held out for testing, the rest is training data
train = data[200:, :]
test = data[:200, :]
train_target = target[200:]
test_target = target[:200]

from sklearn.ensemble import GradientBoostingRegressor

gb = GradientBoostingRegressor(n_estimators=500,
                               max_depth=10,
                               learning_rate=0.01)
relu_fit = RuleFit()
relu_fit.max_iter = 4000
relu_fit.tree_generator = gb
relu_fit.fit(train, train_target, feature_names=feature_name)

f = relu_fit.predict(test)
ff = relu_fit.predict(train)
rule = relu_fit.get_rules()

# Count the test rows whose error relative to the target is below 10 %
truth = sum(1 for actual, predicted in zip(test_target, f)
            if abs(actual - predicted) / actual < 0.1)
print("truth: ", truth / test_target.shape[0])

#print(rule)
ruleset = pd.DataFrame(data=rule)
# The context manager writes and closes the workbook on exit; the separate
# `save()` call it replaces is no longer available in current pandas.
with pd.ExcelWriter('./rules.xlsx') as writer:
    ruleset.to_excel(writer)
import numpy as np
import pandas as pd

from rulefit import RuleFit

## Create artificial data set with n samples
n = 10000
x1 = np.random.normal(scale=1, size=n)
x2 = np.random.normal(loc=0, scale=1, size=n)
x3 = np.random.normal(size=n)
x4 = np.random.normal(size=n)
eps = np.random.normal(loc=0, scale=0.1, size=n)
# Target: an interaction of x1 and x2, a linear effect of x4, and noise;
# x3 does not enter the target
y = 5 * ((x1 > 1).astype(int) * (x2 < -1).astype(int)) + 0.3 * x4 + eps
X = pd.DataFrame({'x1': x1, 'x2': x2, 'x3': x3, 'x4': x4})

rf = RuleFit()
# Fit once, with the column names. A second fit without names would replace
# this model and lose the x1..x4 labels in the reported rules.
rf.fit(X.values, y, X.columns)
rules = rf.get_rules(exclude_zero_coef=True)
print(rules)
def fitrf(train, labels):
    """Fit a default-configured ``RuleFit`` on ``train`` / ``labels``.

    Returns the fitted ``RuleFit`` instance.
    """
    model = RuleFit()
    model.fit(train, labels)
    return model
shuffle=True, random_state=args.randomseed) # 生成 k-fold 训练集、测试集索引 resampled_index_set = rs.split(y_resampled) k_fold_step = 1 # 初始化折数 # 暂存每次选中的测试集和对应预测结果 test_cache = pred_cache = np.array([], dtype=np.int) # 迭代训练 k-fold 交叉验证 for train_index, test_index in resampled_index_set: print("\nFold:", k_fold_step) # 初始化 estimator 训练集进入模型 rf = RuleFit(tree_size=args.treesize, rfmode=args.rfmode, max_rules=args.maxrules, random_state=args.randomseed) rf.fit(x_resampled[train_index], y_resampled[train_index], features) # 测试集验证 # 验证测试集 (通过 index 去除 fake data) real_test_index = test_index[test_index < X.shape[0]] batch_test_x = x_resampled[real_test_index] batch_test_y = y_resampled[real_test_index] batch_test_size = len(real_test_index) y_pred = rf.predict(batch_test_x) # 计算测试集 ACC accTest = accuracy_score(batch_test_y, y_pred) print("\nFold:", k_fold_step, "Test Accuracy:", "{:.6f}".format(accTest), "Test Size:", batch_test_size) # 暂存每次选中的测试集和预测结果 test_cache = np.concatenate((test_cache, batch_test_y)) pred_cache = np.concatenate((pred_cache, y_pred)) print(
# Number of samples per class, grouped on the first column of the dataset
class_info = dataset.groupby(dataset.columns[0]).size()
print('\n', class_info)
# print("\nClass info:", np.unique(y, return_counts=True))
print("\nTraining Start...")

# Standardise the features
scaler = StandardScaler()
scaler.fit(X_origin)
X = scaler.transform(X_origin)

# Build the estimator
rf = RuleFit(tree_size=args.treesize,
             rfmode=args.rfmode,
             max_rules=args.maxrules,
             random_state=args.randomseed)

# Train the model
rf.fit(X, y, features)

# Report the RuleFit training parameters
print("\n=== Model parameters ===")
print(
    "\nTree generator:{0}, \n\nMax rules:{1}, Tree size:{2}, Random state:{3}"
    .format(rf.tree_generator, rf.max_rules, rf.tree_size, rf.random_state))

# Predict on the data the model was trained on
y_pred = rf.predict(X)

# Report the evaluation; two-class problems use the binary report
num_categories = class_info.values.size
if num_categories <= 2:
    utils.bi_model_evaluation(y, y_pred)
else:
    utils.model_evaluation(num_categories, y, y_pred)
# Output the rules