def rf_discrete(self, x, y):
    """Discretize the 1-D array ``x`` guided by a tree model.

    NOTE(review): as written this is effectively a no-op -- ``res`` stays all
    zeros and ``interval_list`` stays empty because the percentile-bucketing
    logic is commented out below. The treeinterpreter result is computed but
    never used.

    :param x: 1-D numpy array of feature values
    :param y: 1-D numpy array, target variable used to fit the tree
    :return: tuple ``(res, interval_list)`` -- currently always
        (all-zeros array, empty list)
    """
    res = np.array([0] * x.shape[-1], dtype=int)
    interval_list = []
    # Pair each value with a zero label column so the tree sees two features.
    x = np.column_stack((x, res))
    # model = RandomForestRegressor(n_estimators=60, max_depth=10)
    model = DecisionTreeRegressor(max_depth=10)
    model.fit(x, y)
    # treeinterpreter decomposition; result is currently unused.
    prediction, bias, contribution = ti.predict(model, x)
    # print(prediction, "\n", bias, "\n", contribution)
    '''
    for i in range(n):
        point1, point2 = stats.scoreatpercentile(x, [i*100/n, (i+1)*100/n])
        x1 = x[np.where((x >= point1) & (x <= point2))]
        mask = np.in1d(x, x1)
        res[mask] = (i + 1)
        # logging.info("discrete: " + str(res) + str((point1, point2)))
        # logging.info("mask: " + str(mask))
    # logging.info("discrete_main: " + str(res))
    '''
    # raise ValueError
    return res, interval_list
def analize_company_explain(loaded_model, y_predict_r, my_company_data):
    """
    : param loaded_model machine learning model
    : param y_predict_r rounded prediction of company
    : param my_company_data data for company
    """
    y_predict_r = np.round(y_predict_r, 1)
    # Last pipeline step; [1][1] digs out the wrapped forest estimator.
    final_step = loaded_model.steps[-1]
    prediction, bias, contributions = ti.predict(final_step[1][1],
                                                 my_company_data)
    util.myprint(prediction)
    util.myprint(bias)

    # Map each rounded prediction onto a traffic-light report color.
    y_pred_r_colors = []
    for value in y_predict_r:
        if value <= ariskparametri.YearlyYellow:
            y_pred_r_colors.append(ariskparametri.ReportBarVerde)
        elif value > ariskparametri.YearlyRed:
            y_pred_r_colors.append(ariskparametri.ReportBarRosso)
        else:
            y_pred_r_colors.append(ariskparametri.ReportBarGiallo)

    # Collect the second element of every (value, contribution) pair per row.
    local_res = []
    for row in contributions:
        first, second = map(list, zip(*row))
        util.myprint(first)
        local_res.append(second)
    return local_res
def test_that_tree_works():
    """Compare our TreeInterpreter wrapper against raw treeinterpreter output."""
    from treeinterpreter import treeinterpreter as ti

    # Refactored from:
    # http://blog.datadive.net/random-forest-interpretation-with-scikit-learn/
    boston = load_boston()
    X, y = boston.data[:300], boston.target[:300]
    feature_names = boston.feature_names
    X_new = boston.data[[300, 309]]
    y_new = boston.target[[300, 309]]

    rf = RandomForestRegressor()
    rf.fit(X, y)

    # Expected local explanation straight from treeinterpreter.
    prediction, bias, contributions = ti.predict(rf, X_new)

    # Actual local explanation from the wrapper under test.
    explainer = TreeInterpreter(rf, X, feature_names=feature_names)
    local_expl = explainer.explain_local(X_new, y_new)
    a_local_data = local_expl.data(key=0)

    for i in range(len(feature_names)):
        assert feature_names[i] == a_local_data["names"][i]
        assert contributions[0, i] == a_local_data["scores"][i]
    assert a_local_data["extra"]["names"][0] == "Bias"
    assert a_local_data["extra"]["scores"][0] == bias[0]
def MeaningfulSampling(instance2explain, blackbox, training_data, N_samples):
    """Generate a dense neighborhood around ``instance2explain``.

    Random points are first drawn from the training-data distribution, then
    pulled toward the instance by comparing feature values and feature
    importances extracted from a surrogate random forest.
    """
    # Random data from the training distribution, discretized for
    # feature-value comparison.
    random_samples = RandomSampling(instance2explain, training_data, N_samples)
    random_samples_dc = QuartileDiscretization(random_samples)

    # Surrogate random forest mimicking the black-box model.
    surrogate_model = RandomForestClassifier(n_estimators=10)
    surrogate_model.fit(random_samples, blackbox.predict(random_samples))

    # Feature contributions via treeinterpreter, discretized for
    # feature-importance comparison.
    prediction, bias, contributions = ti.predict(surrogate_model,
                                                 random_samples)
    contributions_dc = SturgesDiscretization(contributions)

    # Densify the neighborhood with respect to the instance to explain.
    dense_samples = SampleManipulation(prediction, random_samples,
                                       random_samples_dc, contributions_dc)
    interpretable_dense_samples = InterpretableRepresentation(dense_samples)
    return interpretable_dense_samples, dense_samples
def classify(datapath, commithash=None, index=None):
    """Load the saved model and classify a single data point.

    Also determines the most significant feature for the prediction.

    :param datapath: path to the data file understood by ``load_data``
    :param commithash: if given, classify the row whose hash matches
    :param index: if given (and no ``commithash``), classify this row index
    """
    # pylint: disable = too-many-locals
    clf = joblib.load('model.pkl')
    data, _, hashes, names = load_data(datapath)
    if commithash:
        temp, = np.where(hashes == commithash)
        sample = temp[0]
    elif index is not None:
        # BUG FIX: ``elif index:`` silently ignored a requested index of 0
        # and fell through to the default sample.
        sample = index
    else:
        sample = 1
    prediction, _, contributions = ti.predict(clf, data[[sample]])
    # Per-class contribution vectors for the single sample.
    label1 = np.array(contributions)[0, :, 0]
    label2 = np.array(contributions)[0, :, 1]
    if prediction[0][0] > prediction[0][1]:
        res = label1
        labeltext = 'clean'
    else:
        res = label2
        labeltext = 'buggy'
    # Feature with the largest contribution to the winning class;
    # +1 skips the leading (non-feature) column in ``names``.
    top = max(res)
    index, = np.where(res == top)
    feature = names[index[0] + 1]
    print('Predicted result: ' + labeltext)
    print('Top factor: ' + feature)
def predict(row):
    """Predict fraud probability for one event row and attach explanations."""
    df = pd.DataFrame.from_dict([row], orient='columns')
    # Load the trained model.
    with open('website/model.pkl', 'rb') as f:
        model = pickle.load(f)
    df1 = transform_test(df)
    # Probability of the positive (fraud) class.
    row['prediction'] = model.predict_proba(df1.values.reshape(1, -1))[0][1]
    # Per-feature contributions via treeinterpreter.
    prediction, bias, contributions = ti.predict(model,
                                                 df1.values.reshape(1, -1))
    # Names for the model's feature columns.
    column_features = [
        'name_length', 'num_payouts', 'user_age', 'org_facebook',
        'org_twitter', 'body_length', 'gts', 'sale_duration', 'tickets_sold'
    ]
    # Keep the three features with the largest absolute contribution.
    ranked = sorted(zip(abs(contributions[0][:, 1]), column_features))[::-1]
    important_features = [name for _, name in ranked[:3]]
    row['contributions'] = important_features
    # Returns the prob of fraud, classified later with the chosen threshold.
    return row
def randomForstClassifier(data):
    """Train and evaluate a RandomForestClassifier on search-result data.

    Prints accuracy, confusion matrix, classification report, and the
    treeinterpreter decomposition of the first test prediction.

    :param data: DataFrame with the feature columns below plus 'score'
    """
    labels = [
        'srch_id',
        # 'site_id',
        'prop_id',
        # 'prop_starrating',
        # 'prop_review_score',
        # 'prop_brand_bool',
        'prop_location_score1',
        'prop_location_score2',
        # 'position',
        'price_usd',
        # 'promotion_flag',
        # 'srch_saturday_night_bool'
        # 'random_bool',
        # 'click_bool',
        # 'booking_bool',
        # 'price_usd_normalized',
        # 'consumer'
        # 'Pclass'
        # 'score'
    ]
    # testdata = (testdata[labels])
    y = (data['score'])
    x = data[labels]
    X_train, X_test, y_train, y_test = train_test_split(x, y,
                                                        test_size=0.4,
                                                        random_state=42)
    # pprint(X_test)
    print("random forest")
    # FIX: class_weight='auto' was removed from scikit-learn; 'balanced'
    # is the supported equivalent and raises no error on modern versions.
    rf = RandomForestClassifier(n_jobs=-1,
                                class_weight='balanced',
                                n_estimators=1000,
                                max_depth=400,
                                verbose=0)
    rf.fit(X_train, y_train)
    predictions = rf.predict(X_test)
    print('accuracy_score ', accuracy_score(y_test, predictions))
    print('confusion_matrix ', confusion_matrix(y_test, predictions))
    print('classification_report', classification_report(y_test, predictions))
    prediction, bias, contributions = ti.predict(rf, X_test)
    print("Prediction", prediction)
    print("Bias (trainset prior)", bias)
    print("Feature contributions:")
    for c, feature in zip(contributions[0], labels):
        print(feature, c)
def test_tree_regressor(self):
    """Decision-tree regression: predict() == bias + sum(contributions)."""
    X, Y = self.boston.data, self.boston.target
    half = int(len(X) / 2)

    dt = DecisionTreeRegressor()
    dt.fit(X[:half], Y[:half])

    # Check both the held-out half and a single trailing row.
    for testX in (X[half:], X[-1:]):
        base_prediction = dt.predict(testX)
        pred, bias, contrib = treeinterpreter.predict(dt, testX)
        self.assertTrue(np.allclose(base_prediction, pred))
        self.assertTrue(np.allclose(pred, bias + np.sum(contrib, axis=1)))
def test_tree_regressor(self):
    """treeinterpreter must reconstruct a decision-tree regressor exactly."""
    X = self.boston.data
    Y = self.boston.target
    split = len(X) // 2
    dt = DecisionTreeRegressor()
    dt.fit(X[:split], Y[:split])

    # Held-out half.
    testX = X[split:]
    pred, bias, contrib = treeinterpreter.predict(dt, testX)
    self.assertTrue(np.allclose(dt.predict(testX), pred))
    self.assertTrue(np.allclose(pred, bias + np.sum(contrib, axis=1)))

    # Single row.
    testX = X[-1:]
    pred, bias, contrib = treeinterpreter.predict(dt, testX)
    self.assertTrue(np.allclose(dt.predict(testX), pred))
    self.assertTrue(np.allclose(pred, bias + np.sum(contrib, axis=1)))
def iterforest(clf, i, metricarr):
    """Repeatedly fit a balanced random forest, report metrics and collect
    per-class treeinterpreter contribution medians.

    :param clf: baseline classifier (unused; kept for interface compatibility)
    :param i: number of iterations to run
    :param metricarr: list that each iteration's classification report is
        appended to
    NOTE(review): relies on module-level X_train/X_test/y_train/y_test,
    header, instance and y_pred_rf50 -- confirm these exist at call time.
    """
    from treeinterpreter import treeinterpreter as ti
    from collections import defaultdict
    import statistics as sts

    for iteration in range(i):
        clf_spec = RandomForestClassifier(max_depth=None, random_state=1,
                                          oob_score=True, n_estimators=10,
                                          class_weight="balanced")
        clf_spec.fit(X_train, y_train)
        # BUG FIX: predictions previously came from the global ``clf_rf``
        # instead of the forest fitted in this iteration.
        y_pred_iter = pd.DataFrame(clf_spec.predict(X_test))
        np.set_printoptions(precision=2)
        classes_spr = list(unique_labels(y_test, y_pred_rf50))
        report = classification_report(y_test, y_pred_iter,
                                       target_names=classes_spr)
        print(report)
        metricarr.append(report)
        # BUG FIX: ``'... iteration ' + iteration`` raised TypeError
        # (str + int); convert explicitly.
        plot_confusion_matrix(
            y_test, y_pred_rf50, classes=classes_spr, normalize=False,
            title='confusion matrix for Normal Random Forest iteration '
                  + str(iteration))
        plt.show()

        # Accumulate per-feature contributions for each class.
        class1 = defaultdict(list)
        class2 = defaultdict(list)
        prediction, bias, contributions = ti.predict(clf_spec, instance)
        print ("Feature contributions for rf100:")
        count = 0
        for row in range(len(instance)):
            for c, feature in zip(contributions[row], header):
                class1[feature].append(c[0])
                class2[feature].append(c[1])
                count += 1
        print(count)

        # Median contribution per feature, per class.
        class1_fixed = [sts.median(class1[name]) for name in header]
        class2_fixed = [sts.median(class2[name]) for name in header]
        # BUG FIX: ``for i in enumerate(...)`` used the (index, value) tuple
        # as a list index and raised TypeError; iterate indices properly.
        for idx in range(len(class1_fixed)):
            if class1_fixed[idx] > 0:
                setnum1 = class1_fixed[idx]
            if class2_fixed[idx] > 0:
                # TODO(review): the original also assigned class1_fixed to
                # setnum1 here -- looks like a copy-paste bug; confirm the
                # intended target (setnum2 / class2_fixed?) before changing.
                setnum1 = class1_fixed[idx]
def test_tree_classifier(self):
    """Decision-tree classification: proba == bias + sum(contributions)."""
    X, Y = self.iris.data, self.iris.target
    half = int(len(X) / 2)
    dt = DecisionTreeClassifier()
    dt.fit(X[:half], Y[:half])
    # Single held-out row.
    testX = X[half:half + 1]
    pred, bias, contrib = treeinterpreter.predict(dt, testX)
    self.assertTrue(np.allclose(dt.predict_proba(testX), pred))
    self.assertTrue(np.allclose(pred, bias + np.sum(contrib, axis=1)))
def test_tree_classifier(self):
    """Single-row decision-tree proba is reconstructed by treeinterpreter."""
    X = self.iris.data
    Y = self.iris.target
    split = len(X) // 2
    dt = DecisionTreeClassifier()
    dt.fit(X[:split], Y[:split])
    testX = X[split:split + 1]
    base_prediction = dt.predict_proba(testX)
    pred, bias, contrib = treeinterpreter.predict(dt, testX)
    reconstructed = bias + np.sum(contrib, axis=1)
    self.assertTrue(np.allclose(base_prediction, pred))
    self.assertTrue(np.allclose(pred, reconstructed))
def tree_interpret(model, X_test, cols=None):
    """Decompose a binary classifier's predictions into feature contributions.

    :param model: fitted sklearn tree/forest classifier
    :param X_test: samples (DataFrame or 2-D array)
    :param cols: optional column names; defaults to ``X_test.columns`` for
        DataFrames, else stringified column indices
    :return: (predicted class indices, bias array, DataFrame of class-1
        contributions)
    """
    pred, bias, contrib = ti.predict(model, X_test)
    # Sanity check: treeinterpreter must reconstruct the predictions.
    assert np.allclose(pred, np.sum(contrib, axis=1) + bias[0]), "something wrong!!!"
    if cols is None:
        # BUG FIX: the original ``X_test.columns()`` called the property and
        # always raised (swallowed by a bare except), so real column names
        # were never used.
        if hasattr(X_test, 'columns'):
            cols = X_test.columns
        else:
            # ``np.str`` was removed from numpy; plain ``str`` is equivalent.
            cols = np.arange(X_test.shape[1]).astype(str)
    _contrib = pd.DataFrame(contrib[:, :, 1], columns=cols)
    _pred = np.argmax(pred, axis=1)
    return _pred, bias, _contrib
def test_forest_classifier_parallel(self):
    """Parallel (n_jobs=2) forest interpretation matches predict_proba.

    FIX for Python 3: ``np.random.shuffle(range(...))`` fails because
    ``range`` is immutable, and ``len(X)/2`` is a float that cannot index
    a slice; use ``np.arange`` and integer division.
    """
    idx = np.arange(len(self.iris.data))
    np.random.shuffle(idx)
    X = self.iris.data[idx]
    Y = self.iris.target[idx]
    dt = RandomForestClassifier(max_depth=20, n_estimators=500)
    half = len(X) // 2
    dt.fit(X[:half], Y[:half])
    testX = X[half:]
    base_prediction = dt.predict_proba(testX)
    pred, bias, contrib = treeinterpreter.predict(dt, testX, n_jobs=2)
    self.assertTrue(np.allclose(base_prediction, pred))
    self.assertTrue(np.allclose(pred, bias + np.sum(contrib, axis=1)))
def test_forest_classifier(self):
    """Forest classifier: proba decomposes into bias + contributions."""
    idx = np.arange(len(self.iris.data))
    np.random.shuffle(idx)
    X, Y = self.iris.data[idx], self.iris.target[idx]
    half = len(X) // 2
    dt = RandomForestClassifier(max_depth=3)
    dt.fit(X[:half], Y[:half])
    testX = X[half:]
    pred, bias, contrib = treeinterpreter.predict(dt, testX)
    self.assertTrue(np.allclose(dt.predict_proba(testX), pred))
    self.assertTrue(np.allclose(pred, bias + np.sum(contrib, axis=1)))
def make_prediction():
    """Flask view: read the form, predict attrition, explain contributions.

    Renders result.html with the prediction, class confidences and the
    per-class rounded feature contributions.
    """
    if request.method == 'POST':
        sample = []
        data = {}
        feature_names = [
            'satisfaction', 'evaluation', 'projectCount',
            'averageMonthlyHours', 'yearsAtCompany', 'workAccident',
            'promotion', 'salary', 'department'
        ]
        # Collect and coerce each form field in model-column order.
        satisfaction = request.form.get('Satisfaction')
        sample.append(float(satisfaction))
        evaluation = request.form.get('Evaluation')
        sample.append(float(evaluation))
        projectCount = request.form.get('Project Count')
        sample.append(int(projectCount))
        averageMonthlyHours = request.form.get('Average Monthly Hours')
        sample.append(int(averageMonthlyHours))
        yearsAtCompany = request.form.get('Years At Company')
        sample.append(int(yearsAtCompany))
        workAccident = request.form.get('Work Accident')
        sample.append(int(workAccident))
        promotion = request.form.get('Promotion')
        sample.append(int(promotion))
        salary = request.form.get('Salary')
        sample.append(int(salary))
        department = request.form.get('Department')
        sample.append(department)
        sampleDf = pd.DataFrame(sample)
        if len(sample) < 9:
            return render_template(
                'result.html',
                label1="Missing data or incorrect data entered")
        # make prediction
        prediction = model.predict(sampleDf.T)
        pred, bias, contributions = ti.predict(model, sampleDf.T)
        confidence = model.predict_proba(sampleDf.T)
        data["confidence_0"] = confidence[0][0]
        # BUG FIX: the second assignment previously reused the key
        # "confidence_0", overwriting class-0 confidence and never
        # exposing class 1.
        data["confidence_1"] = confidence[0][1]
        data["Prediction"] = prediction[0]
        # Per-class rounded contributions keyed by feature name.
        data[0] = {}
        data[1] = {}
        for c in range(len(contributions[0])):
            data[0][feature_names[c]] = round(contributions[0][c][0], 2)
            data[1][feature_names[c]] = round(contributions[0][c][1], 2)
        json_data = json.dumps(data)
        jsonify(data=data)
        return render_template('result.html', data=data)
def treeinterpreter(self):
    """Rank features by their summed treeinterpreter contribution.

    Only the first 300 rows of self.X are interpreted for convenience;
    adjust the slice (e.g. [200:300]) to inspect specific instances.
    """
    prediction, bias, contributions = ti.predict(self.model, self.X[:300])
    totals = np.sum(contributions, axis=0)
    ranked = dict(zip(self.names, totals))
    featurerank = sorted(ranked.items(), key=lambda kv: kv[1], reverse=True)
    print("Features sorted by their score under treeinterpreter criteria:")
    print(featurerank)
    return featurerank
def test_forest_classifier_joint(self):
    """Joint contributions plus bias reconstruct predict_proba."""
    idx = np.arange(len(self.iris.data))
    np.random.shuffle(idx)
    X, Y = self.iris.data[idx], self.iris.target[idx]
    half = len(X) // 2
    dt = RandomForestClassifier(max_depth=3)
    dt.fit(X[:half], Y[:half])
    testX = X[half:]
    base_prediction = dt.predict_proba(testX)
    pred, bias, contribs = treeinterpreter.predict(dt, testX,
                                                   joint_contribution=True)
    self.assertTrue(np.allclose(base_prediction, pred))
    joint_sums = np.array([sum(contrib.values()) for contrib in contribs])
    self.assertTrue(np.allclose(base_prediction, joint_sums + bias))
def test_gradient_boosting_regressor(self):
    """GradientBoostingRegressor predictions decompose additively.

    FIX for Python 3: ``len(X)/2`` is a float and cannot be used as a
    slice index; use integer division.
    """
    X = self.boston.data
    Y = self.boston.target
    half = len(X) // 2
    testX = X[half:]
    dt = GradientBoostingRegressor(n_estimators=10)
    dt.fit(X[:half], Y[:half])
    base_prediction = dt.predict(testX)
    pred, bias, contrib = treeinterpreter.predict(dt, testX)
    self.assertTrue(np.allclose(base_prediction, pred))
    self.assertTrue(np.allclose(pred, bias + np.sum(contrib, axis=1)))
def test_forest_classifier(self):
    """RandomForestClassifier output equals bias plus summed contributions."""
    idx = np.arange(len(self.iris.data))
    np.random.shuffle(idx)
    X = self.iris.data[idx]
    Y = self.iris.target[idx]
    dt = RandomForestClassifier(max_depth=3)
    split = int(len(X) / 2)
    dt.fit(X[:split], Y[:split])
    testX = X[split:]
    base_prediction = dt.predict_proba(testX)
    pred, bias, contrib = treeinterpreter.predict(dt, testX)
    reconstructed = bias + np.sum(contrib, axis=1)
    self.assertTrue(np.allclose(base_prediction, pred))
    self.assertTrue(np.allclose(pred, reconstructed))
def get_interpretation_for_trees(model, row, names, n_bests):
    """Return the ``n_bests`` largest-magnitude feature contributions.

    :param model: fitted tree/forest model accepted by treeinterpreter
    :param row: single-sample input
    :param names: feature names aligned with the model's columns
    :param n_bests: number of top contributions to keep
    :return: dict with parallel 'contribution' and 'columns' lists
    """
    explanation = treeinterpreter.predict(model, row)
    # Pair each contribution with its column name and sort by magnitude.
    pairs = sorted(zip(explanation[2][0], names),
                   key=lambda pair: np.abs(pair[0]),
                   reverse=True)[0:n_bests]
    return {
        "contribution": [contrib for contrib, _ in pairs],
        "columns": [name for _, name in pairs],
    }
def test_random_forest_regressor(self):
    """RandomForestRegressor predictions decompose additively.

    FIX for Python 3: ``len(X)/2`` is a float and cannot index a slice;
    use integer division.
    """
    X = self.boston.data
    Y = self.boston.target
    half = len(X) // 2
    testX = X[half:]
    # Predict for the forest regressor
    dt = RandomForestRegressor(n_estimators=10)
    dt.fit(X[:half], Y[:half])
    base_prediction = dt.predict(testX)
    pred, bias, contrib = treeinterpreter.predict(dt, testX)
    self.assertTrue(np.allclose(base_prediction, pred))
    self.assertTrue(np.allclose(pred, bias + np.sum(contrib, axis=1)))
def test_forest_regressor(self):
    """Forest regressor predictions decompose additively.

    FIX for Python 3: ``len(X)/2`` is a float and cannot index a slice;
    use integer division.
    """
    X = self.boston.data
    Y = self.boston.target
    half = len(X) // 2
    testX = X[half:]
    # Predict for the forest regressor
    dt = RandomForestRegressor(n_estimators=10)
    dt.fit(X[:half], Y[:half])
    base_prediction = dt.predict(testX)
    pred, bias, contrib = treeinterpreter.predict(dt, testX)
    self.assertTrue(np.allclose(base_prediction, pred))
    self.assertTrue(np.allclose(pred, bias + np.sum(contrib, axis=1)))
def get_interpretation(X, rf_model):
    """Print a per-row table of feature values and their contributions.

    :param X: DataFrame of samples to interpret
    :param rf_model: fitted random-forest model
    """
    for i, row in X.iterrows():
        data_point = pd.DataFrame([row])
        # BUG FIX: DataFrame.set_axis returns a new frame; the original
        # discarded the result, so the index (column name once transposed)
        # was never renamed.
        data_point = data_point.set_axis(['value_variable'])
        prediction, bias, contributions = ti.predict(rf_model, data_point)
        # Stack class-1 contributions under the values, transpose, and sort
        # features by contribution.
        local_interpretation = data_point.append(
            pd.DataFrame([[round(c[1], 3) for c in contributions[0]]],
                         columns=data_point.columns.tolist(),
                         index=['contribution_variable'])
        ).T.sort_values('contribution_variable', ascending=False)
        print(local_interpretation)
def test_forest_classifier_joint(self):
    """Joint contributions reconstruct proba for both forest variants."""
    for forest_cls in (RandomForestClassifier, ExtraTreesClassifier):
        idx = np.arange(len(self.iris.data))
        np.random.shuffle(idx)
        X = self.iris.data[idx]
        Y = self.iris.target[idx]
        half = len(X) // 2
        dt = forest_cls(max_depth=3)
        dt.fit(X[:half], Y[:half])
        testX = X[half:]
        base_prediction = dt.predict_proba(testX)
        pred, bias, contribs = treeinterpreter.predict(
            dt, testX, joint_contribution=True)
        self.assertTrue(np.allclose(base_prediction, pred))
        totals = np.array([sum(c.values()) for c in contribs])
        self.assertTrue(np.allclose(base_prediction, totals + bias))
def rePredict(request):
    """Django view: re-run the forest on feature values from the query string.

    Extracts, for each tree in the global ``estimator``, the decision path
    taken by the submitted sample and the per-feature value ranges implied
    by that path, plus the treeinterpreter contribution decomposition.

    NOTE(review): depends on module-level ``totaldata`` (training frame) and
    ``estimator`` (fitted forest) -- confirm both are initialized before use.

    :param request: Django request; one GET parameter per feature column
    :return: JsonResponse with paths, rules, per-tree predictions and
        treeinterpreter output
    """
    fixData = []
    temp = []
    # List of feature names (taken from the training data's columns).
    featureList = totaldata.columns.values.tolist()
    # print(selData)
    # Global min/max of each feature, used as the initial range per rule.
    fmin = []
    fmax = []
    for i in featureList:
        fmin.append(float(totaldata[i].min()))
        fmax.append(float(totaldata[i].max()))
    # Read the submitted value for every feature from the query string.
    for i in featureList:
        temp.append(float(request.GET.get(i)))
    fixData.append(temp);
    p = {}
    rule = {}
    predict = []
    result = estimator.predict_proba(fixData).tolist()
    # Walk every tree in the forest.
    for j in range(len(estimator.estimators_)):
        m = estimator.estimators_[j].decision_path(fixData)
        predict.append(estimator.estimators_[j].predict_proba(fixData)[0].tolist())
        d = []
        r = {}
        # Decision path and its corresponding feature ranges, e.g. 0: [-1.5, 2].
        for i in m[0].indices:
            temp = []
            f = int(estimator.estimators_[j].tree_.feature[i])  # feature index
            threshold = estimator.estimators_[j].tree_.threshold[i]  # split value at node
            # feature >= 0 filters out leaf nodes (marked with -2).
            if (estimator.estimators_[j].tree_.feature[i] >= 0):
                if f not in r.keys():
                    r[f] = [fmin[f], fmax[f]]
                temp.append(int(f))
                temp.append(float(threshold))
                # Tighten the feature's range according to the branch taken.
                if (fixData[0][f] <= threshold):
                    temp.append("<=")
                    if threshold < r[f][1]:
                        r[f][1] = float(threshold)
                else:
                    temp.append(">")
                    if threshold > r[f][0]:
                        r[f][0] = float(threshold)
                d.append(temp)
        p[j] = d
        rule[j] = r
    # contribution compute
    fixDataNDArray = np.array(fixData)
    prediction, bias, contributions = ti.predict(estimator, fixDataNDArray)
    print(prediction, bias, contributions)
    return JsonResponse({'path': p,
                         'rule': rule,
                         'predict': predict,
                         'featureList': featureList,
                         'result': result,
                         'data': fixData[0],
                         'prediction': prediction.tolist(),
                         'bias': bias.tolist(),
                         'contributions': contributions.tolist()},
                        safe=False)
def test_forest_regressor_joint(self):
    """Joint regression contributions plus bias reconstruct predict()."""
    X = self.boston.data
    Y = self.boston.target
    half = int(len(X) / 2)
    dt = RandomForestRegressor(n_estimators=10)
    dt.fit(X[:half], Y[:half])
    testX = X[half:]
    base_prediction = dt.predict(testX)
    pred, bias, contribs = treeinterpreter.predict(dt, testX,
                                                   joint_contribution=True)
    self.assertTrue(np.allclose(base_prediction, pred))
    totals = np.array([sum(c.values()) for c in contribs])
    self.assertTrue(np.allclose(base_prediction, totals + bias))
def _forest_importances(self, X, test_cache, result_channels, c, s):
    """Fill per-channel feature importances from treeinterpreter contributions.

    For a plain random forest the importance of each selected feature is the
    mean of its treeinterpreter contribution over the cached test samples
    (class-0 contributions for classification, raw contributions for
    regression). The multi-ROI branch does the same per ROI key ``k``.

    NOTE(review): shapes/semantics of ``self.selection``, ``self.scheme`` and
    ``test_cache`` are not visible here -- the comments below describe only
    what this code does with them.

    :param X: feature matrix (ndarray in the single-forest branch)
    :param test_cache: cached test samples per channel (array or dict of
        arrays in the multi-ROI branch)
    :param result_channels: nested result dict, mutated in place
    :param c: channel index
    :param s: sub-key under the channel's results
    :return: the (mutated) ``result_channels``
    """
    if isinstance(self.clf[c], (RandomForestRegressor, RandomForestClassifier)) and \
            isinstance(X, np.ndarray):
        # Single-forest branch: one boolean mask over all columns of X.
        features = np.ones(X.shape[1], dtype=bool)
        if 'feature_importance' not in result_channels[c][s]:
            result_channels[c][s]['feature_importance'] = np.full(X.shape[1], np.nan)
        # Compose the feature-selection stages into one mask.
        for sel in self.selection[c]:
            features[features] = sel.get_support()
        # contributions array is the third element of ti.predict's result.
        contrib = ti.predict(self.clf[c], np.array(test_cache[c]))[2]
        result_channels[c][s]['feature_importance'][features] = \
            np.mean(contrib if self.is_regression else contrib[:, :, 0], axis=0)
    elif self.scheme.is_multiroi[c] and \
            isinstance(self.clf[c].base_estimator, (RandomForestRegressor, RandomForestClassifier)):
        # Multi-ROI branch: importances are stored per ROI key.
        if 'feature_importance' not in result_channels[c][s]:
            result_channels[c][s]['feature_importance'] = dict()
        if sum(self.selection, []):
            # Compose per-ROI selection masks across selection stages.
            features = dict()
            for sel in self.selection[c]:
                for k, v in sel.get_support().items():
                    if k in test_cache[c].keys():
                        if k not in features:
                            features[k] = v
                        else:
                            features[k][features[k]] = v
        else:
            # No selection: every cached feature is kept.
            features = {k: np.ones(len(v[0]), dtype=bool)
                        for k, v in test_cache[c].items()}
        for k in test_cache[c].keys():
            if k not in result_channels[c][s]['feature_importance']:
                result_channels[c][s]['feature_importance'][k] = \
                    np.full(len(features[k]), np.nan)
            contrib = ti.predict(self.clf[c].estimators_[k], test_cache[c][k])[2]
            result_channels[c][s]['feature_importance'][k][features[k]] = \
                np.mean(contrib if self.is_regression else contrib[:, :, 0], axis=0)
    return result_channels
def test_forest_regressor(self):
    """Both forest regressors decompose into bias + contributions."""
    for regressor_cls in (RandomForestRegressor, ExtraTreesRegressor):
        X = self.boston.data
        Y = self.boston.target
        half = int(len(X) / 2)
        testX = X[half:]
        dt = regressor_cls(n_estimators=10)
        dt.fit(X[:half], Y[:half])
        base_prediction = dt.predict(testX)
        pred, bias, contrib = treeinterpreter.predict(dt, testX)
        self.assertTrue(np.allclose(base_prediction, pred.flatten()))
        self.assertTrue(
            np.allclose(pred.flatten(), bias + np.sum(contrib, axis=1)))
def Contribution(model_dict, data_to_anly, period):
    """Build a per-sample table of actual/predicted yield plus contributions.

    :param model_dict: dict of fitted models keyed like 'P<period>RF'
    :param data_to_anly: DataFrame with a 'Yield' column plus feature columns
    :param period: period number selecting which model to use
    :return: DataFrame with Actual/Predict/Biases columns followed by the
        sorted contribution columns
    """
    model = model_dict['P' + str(period) + 'RF']
    features = data_to_anly.drop(['Yield'], axis=1)
    predict, biases, contributions = ti.predict(model, features)
    # Sort each sample's contributions in descending order, pairing the
    # contribution values with their variable names.
    df_sort_contributions = cs.contributionSort(contributions,
                                                list(data_to_anly.columns))
    summary = pd.DataFrame(data={
        'Actual': data_to_anly.loc[:, 'Yield'],
        'Predict': list(predict),
        'Biases': list(biases),
    })
    contribution = pd.concat([summary, df_sort_contributions], axis=1)
    return contribution
def __calculate_variables_contribution(self, record):
    """Return a per-variable value/contribution table for ``record``.

    NOTE(review): the ``return`` sits inside the loop, so only the FIRST
    row of ``record`` is ever interpreted -- confirm that is intended
    before processing multi-row records.

    :param record: DataFrame whose rows are candidate samples
    :return: transposed DataFrame with 'value_variable' and
        'contribution_variable' columns, sorted by contribution descending
    """
    for i, row in record.iterrows():
        data_point = pd.DataFrame([row])
        data_point.set_axis(
            ['value_variable'],
            inplace=True)  # Once transposed, it will be the column name
        prediction, bias, contributions = ti.predict(
            self.rf_model, data_point)
        # Stack the class-1 contributions (rounded) under the raw values,
        # transpose, and sort features by contribution.
        local_interpretation = data_point.append(
            pd.DataFrame([[round(c[1], 3) for c in contributions[0]]],
                         columns=data_point.columns.tolist(),
                         index=['contribution_variable'
                                ])).T.sort_values('contribution_variable',
                                                  ascending=False)
        return local_interpretation
def test_forest_regressor_joint(self):
    """Joint-contribution check for both forest regressor variants.

    NOTE(review): the ``return None`` below disables this test entirely --
    everything after it is dead code. Remove the early return to
    re-enable the assertions.
    """
    return None
    for ForestRegressor in (RandomForestRegressor, ExtraTreesRegressor):
        X = self.boston.data
        Y = self.boston.target
        testX = X[int(len(X)/2):]
        #Predict for decision tree
        dt = ForestRegressor(n_estimators=10)
        dt.fit(X[:int(len(X)/2)], Y[:int(len(X)/2)])
        base_prediction = dt.predict(testX)
        pred, bias, contribs = treeinterpreter.predict(dt, testX,
                                                       joint_contribution=True)
        self.assertTrue(np.allclose(base_prediction, pred))
        self.assertTrue(np.allclose(base_prediction,
                                    np.array([sum(contrib.values())
                                              for contrib in contribs]) + bias))
def test_gradient_boosting_classifier(self):
    """GradientBoostingClassifier: contributions sum in raw score space.

    FIX for Python 3: ``np.random.shuffle(range(...))`` fails because
    ``range`` is immutable, and ``len(X)/2`` is a float that cannot index
    a slice; use ``np.arange`` and integer division.
    """
    idx = np.arange(len(self.iris.data))
    np.random.shuffle(idx)
    X = self.iris.data[idx]
    Y = self.iris.target[idx]
    dt = GradientBoostingClassifier(max_depth=3)
    half = len(X) // 2
    dt.fit(X[:half], Y[:half])
    testX = X[half:]
    base_prediction = dt.predict_proba(testX)
    pred, bias, contrib = treeinterpreter.predict(dt, testX)
    self.assertTrue(np.allclose(base_prediction, pred))
    # Contributions live in raw score space; convert back to probabilities
    # with the loss's link function before comparing.
    sum_contrib = dt.loss_._score_to_proba(bias + np.sum(contrib, axis=1))
    self.assertTrue(np.allclose(pred, sum_contrib))
def tree_interp(test_df, model):
    """
    Call treeinterpreter for RF model and build dataframe with output.
    ONLY WORKS FOR INDEP CONTRIBS FOR REGRESSION

    parameters:
        test_df, df: dataframe of test instances (first column 'Y')
        model: trained rf regression model from sklearn
    returns:
        a dataframe with the sampleID, label, bias, prediction and
        contributions for all features, for each instance
    """
    from treeinterpreter import treeinterpreter as ti

    # Seed the output with the instance labels, keeping the original index.
    interp_df_half = test_df['Y'].to_frame()
    interp_df_half.set_index(test_df.index)

    # Contribution column names are the feature names (everything after 'Y').
    test_featureNames = test_df.columns.values.tolist()[1:]

    print('\n\n===> Calculating independent feature contributions <===')

    # Drop Y so the feature matrix matches what the model was trained on.
    test_X = test_df.drop(['Y'], axis=1)
    prediction, bias, contributions = ti.predict(model, test_X)

    # Attach bias and prediction, then join the per-feature contributions.
    interp_df_half['bias'] = bias.tolist()
    interp_df_half['prediction'] = prediction.flatten().tolist()
    contrib_df = pd.DataFrame(contributions, index=test_df.index,
                              columns=test_featureNames)
    local_interp_df = pd.concat([interp_df_half, contrib_df], axis=1)
    print(f'Snapshot of the interpretation dataframe: {local_interp_df.head()}')
    return local_interp_df
def random_forest_regressor(numpy_df_train, numpy_df_test, y_train, y_test):
    """Fit a RandomForestRegressor and return its test-set predictions.

    NOTE(review): the result index comes from a module-level ``test``
    DataFrame, not from the arguments -- confirm ``test`` is in scope and
    aligned with ``numpy_df_test``. ``y_test`` is accepted but unused.

    :param numpy_df_train: training feature matrix
    :param numpy_df_test: test feature matrix
    :param y_train: training targets
    :param y_test: test targets (unused)
    :return: DataFrame with a single 'predicted_value' column
    """
    from treeinterpreter import treeinterpreter as ti
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.metrics import classification_report, confusion_matrix
    rf = RandomForestRegressor(verbose=True)
    rf.fit(numpy_df_train, y_train)
    print('Freature Importance ', rf.feature_importances_)
    print('Generating predictions')
    # treeinterpreter decomposition; only the raw prediction is used below.
    prediction, bias, contributions = ti.predict(rf, numpy_df_test)
    print('Predictions generated')
    idx = test.index
    predictions_df = pd.DataFrame(data=prediction[0:], index=idx,
                                  columns=['predicted_value'])
    return predictions_df
def predict_sklearn_wTreeInterpreter(ext_df, features_to_include):
    """Predict with the pickled model and return per-feature contributions.

    Stores the predictions in ``ext_df["pred_Bs"]`` as a side effect.

    :param ext_df: DataFrame holding the feature columns
    :param features_to_include: column names fed to the model
    :return: contribution array, or None if treeinterpreter fails
    """
    with open(MODEL_PATH, "rb") as pklr:
        clf = pickle.load(pklr)
    try:
        from treeinterpreter import treeinterpreter
        ext_df["pred_Bs"], bias, contribution = treeinterpreter.predict(
            clf, ext_df[features_to_include])
        return contribution
    # FIX: a bare ``except:`` also swallowed SystemExit/KeyboardInterrupt;
    # catching Exception keeps the best-effort behavior without that.
    except Exception:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        traceback.print_exception(exc_type, exc_value, exc_traceback,
                                  file=sys.stdout)
        return
def get_detailed_report(train, trained_filename):
    """Explain the pickled pipeline's predictions on ``train``.

    Prints the sorted feature contributions for the first 5 rows, then
    returns the absolute contribution matrix for all rows.

    : param train data file to do prediction and analyse
    : param trained_filename machine learning model pickle
    :return: DataFrame of absolute per-feature contributions
    """
    loaded_model = pickle.load(open(trained_filename, 'rb'))
    # Last pipeline step; [1][1] digs out the wrapped forest estimator.
    optimised_random_forest = loaded_model.steps[-1]
    prediction, bias, contributions = ti.predict(optimised_random_forest[1][1],
                                                 train)
    util.myprint(prediction)
    util.myprint(bias)
    # Feature names (Italian financial indicators), aligned with the
    # model's input columns.
    indices = [
        'Ricavi delle vendite', 'EBITDA/Vendite (%)', 'EBITDA migl EUR',
        'Indice di liquidità', 'Indice corrente',
        'Indice di indebitam. a lungo',
        'Indice di copertura delle immob. (patrimoniale)',
        'Grado di ammortamento', 'Debiti v/banche su fatt. (%)',
        'Grado di copertura degli interessi passivi',
        'Giac. media delle scorte (gg)', 'Giorni copertura scorte (gg)',
        'Redditività di tutto il capitale investito (ROI) (%)',
        'Flusso di cassa di gestione', 'Oneri finanz. su fatt. (%)'
    ]
    # Show the first 5 rows' contributions sorted by magnitude.
    for i in range(5):
        print("Feature contributions:")
        res0, res1 = map(list, zip(*contributions[i]))
        util.myprint(res0)
        for idx, feature in sorted(zip(res1, indices), key=lambda x: -abs(x[0])):
            print(feature, abs(round(idx, 2)))
        print("-" * 20)
    # Collect the second contribution component for every row.
    res = list()
    for i in range(len(contributions)):
        res0, res1 = map(list, zip(*contributions[i]))
        util.myprint(res0)
        res.append(res1)
    det_res = pd.DataFrame(np.abs(res))
    return det_res
print("Feature Contribution (Random Forest)") clf = joblib.load('D:\SLIIT\SoftwareIndustry\df_model.pkl') df_emp = mysql_cn.read('select * from employeesit_predict') emp_id = df_emp['Employee_ID'] emp_name = df_emp['Employee_Name'] id = emp_id.tolist() e_name = emp_name.tolist() arr_con = np.zeros(len(features)) arr_gain_lost = np.zeros(len(features)) arr_feature = np.zeros(len(features)) msg.update_message() for index in range(len(X)): i = X[index:index + 1] prediction, bias, contributions = ti.predict(clf, i) a = 0 for c, feature in zip(contributions[0], features): arr_con[a] = round(c[1] * 100, 2) # arr_feature[a] = feature # arr_gain_lost[a]=round(c[1], 2) a = a + 1 arr_con_list = arr_con.tolist() max_value = max(arr_con_list) max_index = arr_con_list.index(max_value) mysql_cn.insert_update( "INSERT INTO `employeesit_predict_feature_cont`(`Employee_ID`, `Employee_Name`, `Age`, `Gender`, `Marital_Status`, " "`Having_Degree`, `Job_Role`, `Department`, `WorkFrom`, `WorkTo`, `Tenure`, `Salary`, `Bonus`, `Claims`, `Worked_Project`, `No_of_Leaves`, " "`Distance`, `No_of_Parents`, `No_of_Children`, `No_of_Complaints`, `Bias`, `Prediction_Probability`, `Max_Feature`) " "VALUES('%s','%s',%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,'%s')" % (id[index], e_name[index], arr_con[0], arr_con[1], arr_con[2], arr_con[3], arr_con[4], arr_con[5],
print print('Analyzing') #get the class-1 (outlier/anomaly) rows from the feature matrix, and drop the prediction so we can investigate them outliers = featureMatrix[featureMatrix.prediction == opts.anomalyclass].drop('prediction',axis=1) num_outliers = len(outliers.index) print 'detected %d anomalies out of %d total rows (%.2f%%)' % (num_outliers, total_rows, (num_outliers * 1.0 / total_rows)*100) if num_outliers == 0: sys.exit(0) if (opts.verbose) and type(clf) is RandomForestClassifier: print 'investigating all the outliers' #investigate each outlier (determine the most influential columns in the prediction) prediction, bias, contributions = ti.predict(clf, outliers) print 'done' print(contributions.shape) i=0 #for each anomaly for index, row in outliers.iterrows(): print('-----------------------------------------') print 'line ',index #find the row in the original data of the anomaly. print it out as CSV. print pd.DataFrame(df.iloc[index]).T.to_csv(header=False, index=False) if (opts.verbose) and type(clf) is RandomForestClassifier: #if we are verbose print out the investigation by zipping the heavily weighted columns with the appropriate features instancecontributions = zip(contributions[i], outliers.columns.values) print "Top feature contributions to anomaly class:" for (c, feature) in sorted(instancecontributions, key=lambda (c,f): c[1], reverse=True)[:10]:
# Notebook cells: predict two Boston instances and print their
# treeinterpreter decomposition.
# FIX: converted Python 2 print statements to Python 3 print() calls
# so the fragment runs under the same interpreter as the rest of the file.

# In[17]:
fit1

# In[37]:
instances = boston.data[[300, 309]]
print("Instance 0 prediction:", rf.predict(instances[0].reshape(1, 13)))
print("Instance 1 prediction:", rf.predict(instances[1].reshape(1, 13)))

# In[38]:
prediction, bias, contributions = ti.predict(rf, instances)

# In[40]:
for i in range(len(instances)):
    print("Instance", i)
    print("Bias (trainset mean)", bias[i])
    print("Feature contributions:")
    # Sort features by absolute contribution, largest first.
    for c, feature in sorted(zip(contributions[i], boston.feature_names),
                             key=lambda x: -abs(x[0])):
        print(feature, round(c, 2))
    print("-" * 20)
# (y.size - x.shape[1] - 1 - 1)) #print("Residual sum of squares: %.2f" # % np.mean((rf.predict(x) - y) ** 2)) xy_df['predicted'] = rf.predict(x) xy_df['delta'] = xy_df[y_feat] - xy_df['predicted'] #xy_df = xy_df.sort(columns = y_feat, ascending = False) #xy_df.to_csv(y_feat + '_predict.txt', sep = '\t', decimal = ',') ################################################################################ # TreeInterpreter # Calculate feature importances feat_imp_df = pd.DataFrame(data = rf.feature_importances_, index = feat_names,\ columns = ['feature_importances']) prediction, bias, contributions = ti.predict(rf, x) contr_df = pd.DataFrame(contributions, index = xy_df.index, columns = feat_names) bias_df = pd.DataFrame(bias, index = xy_df.index, columns = ['bias']) feat_stat_df = bias_df.join(contr_df) feat_stat_df = pd.concat([feat_stat_df, feat_imp_df.transpose()]) feat_stat_df = feat_stat_df.T.sort(columns = 'feature_importances',\ na_position = 'first', ascending = False).T #feat_stat_df.to_csv(y_feat + '_feature_statistics.txt', sep = '\t',\ # decimal = ',') result = pd.concat([xy_df, feat_stat_df], axis = 1) result = result.sort(columns = y_feat, ascending = False, na_position = 'last') result.columns = [['DataSet', 'DataSet', 'DataSet', 'DataSet', 'DataSet', \ 'DataSet', 'DataSet', 'DataSet', 'DataSet', 'DataSet', 'DataSet',\ 'Feature_Statistics', 'Feature_Statistics', 'Feature_Statistics',\ 'Feature_Statistics', 'Feature_Statistics', 'Feature_Statistics',\