def skill_info(self, examples, feature_names=None):
    """Return the (feature name, inequality, threshold) conditions along the
    first example's decision path, from the root down to (but excluding) its leaf."""
    # print("SLOOP", examples)
    tree = self
    tree_ = tree.tree_
    # print("feature_names", feature_names)
    feature_name = [
        feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
        for i in tree_.feature
    ]
    node_indicator = tree.decision_path(examples)
    dense_ind = np.array(node_indicator.todense())

    def recurse(node, ind):
        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            l = tree_.children_left[node]
            less = ind[l]
            if not less:
                s = recurse(tree_.children_right[node], ind)
            else:
                s = recurse(tree_.children_left[node], ind)
            name = feature_name[node]
            ineq = "<=" if less else ">"
            thresh = str(tree_.threshold[node])
            return [(name.replace("?ele-", ""), ineq, thresh)] + s
        else:
            return []

    # Note: the return inside the loop means only the first example is processed.
    for ind in dense_ind:
        return recurse(0, ind)
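# Usage sketch for skill_info (not from the original source): since it is written
# as a method whose `self` is the fitted tree, one way to try it is to attach it
# to DecisionTreeClassifier. The iris data and imports below are illustrative
# assumptions.
import numpy as np
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier, _tree

iris = load_iris()
clf = DecisionTreeClassifier(max_depth=3, random_state=0).fit(iris.data, iris.target)

DecisionTreeClassifier.skill_info = skill_info
conditions = clf.skill_info(iris.data[:1], feature_names=iris.feature_names)
# A list of (feature name, "<=" or ">", threshold) triples for the first example.
print(conditions)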
def closest_decision(self, tree, sample, strategy='informativeness', beta=5):
    """Find the closest decision that is of a class other than the target class.

    Args:
        tree: sklearn tree
        sample: Entry to explain
        beta: Hyperparameter >= 1 to determine when to only search part
            of the tree (higher = search a smaller area)

    Returns:
        Ordered descriptive decision path difference, confidence of leaf decision
    """
    # Only search part of tree depending on tree size
    decision_path = tree.decision_path(sample.reshape(1, -1)).indices
    if len(decision_path) < 2:
        warnings.warn('Stub tree')
        return None, 0.0
    start_depth = int(round(len(decision_path) / beta))
    start_node = decision_path[start_depth]

    # Get decision for sample
    fact_leaf = tree.apply(sample.reshape(1, -1)).item(0)

    # TODO: Retrain tree if wrong prediction
    if np.argmax(tree.tree_.value[fact_leaf]) != 0:
        warnings.warn('Tree did not predict as fact')

    # Find closest leaf that does not predict output x, based on a strategy
    graph, foil_nodes = self._fact_foil_graph(tree.tree_,
                                              start_node=start_node)

    if self.verbose:
        print(f'[E] Found {len(foil_nodes)} contrastive decision regions, '
              f'starting from node {start_node}')

    if len(foil_nodes) == 0:
        return None, 0

    # Contrastive decision region
    foil_path, confidence = self._get_path(graph, fact_leaf, foil_nodes,
                                           tree.tree_, strategy)

    return self.descriptive_path(foil_path, sample, tree), confidence
def print_decision_path(tree, X, sample_id=0):
    node_indicator = tree.decision_path(X)
    leave_id = tree.apply(X)
    node_index = node_indicator.indices[
        node_indicator.indptr[sample_id]:node_indicator.indptr[sample_id + 1]]
    print('Rules used to predict sample %s: ' % sample_id)
    print(node_index)
    for node_id in node_index:
        # The leaf carries no split test, so skip it.
        if leave_id[sample_id] == node_id:
            continue
        if (X[sample_id, tree.tree_.feature[node_id]] <=
                tree.tree_.threshold[node_id]):
            threshold_sign = "<="
        else:
            threshold_sign = ">"
        print("decision id node %s : (X_test[%s, %s] (= %s) %s %s)"
              % (node_id,
                 sample_id,
                 tree.tree_.feature[node_id],
                 X[sample_id, tree.tree_.feature[node_id]],
                 threshold_sign,
                 tree.tree_.threshold[node_id]))
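# Usage sketch for print_decision_path (not from the original source): the iris
# data and classifier below are illustrative assumptions.
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

X_iris, y_iris = load_iris(return_X_y=True)
clf = DecisionTreeClassifier(max_depth=3, random_state=0).fit(X_iris, y_iris)

# Prints the node ids on sample 0's path and one "decision id node ..." line
# per split test the sample passed through.
print_decision_path(clf, X_iris, sample_id=0)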
def decision_path(self, tree, sample):
    """Get a descriptive decision path of a sample.

    Args:
        tree: sklearn tree
        sample: Sample to determine the decision path of

    Returns:
        Descriptive decision path for sample
    """
    dp = list(np.nonzero(tree.decision_path(sample.reshape(1, -1)))[1])
    if len(dp) == 0:
        return []
    turned_right = [dp[i] in tree.tree_.children_right
                    for i, node in enumerate(dp[:-1])] + [False]
    return self.descriptive_path(list(zip(dp, turned_right)), sample, tree)
def __true_positive(self, tree):
    """
    Takes in a decision tree and returns a numpy matrix of all the correctly
    classified transactions, along with an array containing the indices of the
    tree nodes visited along the decision paths that produced those correct
    classifications.

    :param tree: A decision tree, either the only one in the case of a
        decision tree classifier or a single member of the forest in the case
        of a random forest classifier.
    :return: A numpy matrix of all the correctly classified transactions and
        an array containing the indices of the tree nodes visited along the
        decision paths that produced those correct classifications.
    """
    p = tree.predict(self.__X)
    true_p_df = self.__X[(p == 1) & (p == self.__y)].copy()
    if true_p_df.shape[0]:
        return true_p_df.to_numpy(), tree.decision_path(true_p_df).toarray()
    else:
        return true_p_df, true_p_df
def NCA(forest, samples):
    """Pairwise nearest-common-ancestor measure: for each tree, count the nodes
    the two samples' decision paths share; the value per pair is the harmonic
    mean of those counts over the trees."""
    n_sample = samples.shape[0]
    d = np.zeros([n_sample, n_sample])
    n_estimator = len(forest)
    for k in range(n_estimator):
        tree = forest[k]
        path = tree.decision_path(samples).todense()
        for i in range(n_sample):
            for j in range(n_sample):
                sample_ids = [i, j]
                # Number of nodes visited by both samples in this tree.
                d[i, j] = d[i, j] + 1 / (path[sample_ids].sum(axis=0)
                                         == len(sample_ids)).sum()
    d = d / n_estimator
    d_Nearest_Common_Ancestor = [1 / x for x in d]
    return d_Nearest_Common_Ancestor
def SP(forest, samples):
    """Pairwise shortest-path distance: for each tree, the number of edges
    between the two samples' leaves (up through their deepest shared node),
    averaged over the trees in the forest."""
    n_sample = samples.shape[0]
    d = np.zeros([n_sample, n_sample])
    n_estimator = len(forest)
    for k in range(n_estimator):
        tree = forest[k]
        path = tree.decision_path(samples).todense()
        for i in range(n_sample):
            for j in range(n_sample):
                sample_ids = [i, j]
                # Nodes shared by both paths, and the two full path lengths.
                splitting_depth = (path[sample_ids].sum(axis=0)
                                   == len(sample_ids)).sum()
                depth_i = (path[[i, i]].sum(axis=0) == len(sample_ids)).sum()
                depth_j = (path[[j, j]].sum(axis=0) == len(sample_ids)).sum()
                d[i, j] = d[i, j] + depth_i + depth_j - 2 * splitting_depth
    d_shortest_path = d / n_estimator
    return d_shortest_path
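# Usage sketch for NCA and SP (not from the original source): the synthetic data
# and the small random forest below are illustrative assumptions.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X_demo, y_demo = make_classification(n_samples=40, n_features=5, random_state=0)
rf = RandomForestClassifier(n_estimators=10, random_state=0).fit(X_demo, y_demo)

samples = X_demo[:5]
nca = NCA(rf.estimators_, samples)  # per-pair harmonic mean of shared path nodes
sp = SP(rf.estimators_, samples)    # per-pair average leaf-to-leaf path length
print(np.round(sp, 2))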
def Tree_path(tree, samples):
    '''
    Inputs: a fitted tree (the best estimator, if GridSearchCV has been used)
        and the samples to trace (pure or not).
    Outputs: a list of dictionaries. Keys are feature indices (0 means
        sample[i][0], where i is any sample index); values are lists that
        alternate comparator and threshold, one pair appended for every path
        node that tests that feature. Each dictionary in the list represents
        a unique rule. Uncomment the prints to see it in action.
    '''
    number_of_nodes = tree.tree_.node_count
    feature = tree.tree_.feature
    threshold = tree.tree_.threshold
    decision_paths = tree.decision_path(samples)
    leave_ids = tree.apply(samples)
    dic = []
    for i in range(0, len(samples), 1):
        sample_id = i
        d = dict()
        indexes = decision_paths.indices[decision_paths.indptr[sample_id]:
                                         decision_paths.indptr[sample_id + 1]]
        # print('sample id: ', sample_id)
        comparator = ''
        for node_id in indexes:
            d[feature[node_id]] = []
        for node_id in indexes:
            if leave_ids[sample_id] == node_id:
                # The leaf has no test; drop its placeholder and keep the rule
                # if it has not been seen before.
                d.pop(feature[node_id], None)
                # print(d)
                if d not in dic:
                    dic.append(d)
                continue
            if samples[sample_id][feature[node_id]] <= threshold[node_id]:
                comparator = "<="
            else:
                comparator = ">"
            # print("X_test[%s,%s] %s %s " % (sample_id, feature[node_id],
            #                                 comparator, threshold[node_id]))
            d[feature[node_id]].append(comparator)
            d[feature[node_id]].append(threshold[node_id])
    # print(dic)
    return dic
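# Usage sketch for Tree_path (not from the original source): the iris data and
# classifier below are illustrative assumptions.
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

X_iris, y_iris = load_iris(return_X_y=True)
clf = DecisionTreeClassifier(max_depth=3, random_state=0).fit(X_iris, y_iris)

rules = Tree_path(clf, X_iris)
# Each entry maps feature index -> [comparator, threshold, ...] collected along
# one unique root-to-leaf path, e.g. a single root test for a setosa sample.
print(rules[0])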
if i < 3:
    p = clf.predict(x)
    temp = x[(p == 1) & (p == y)].copy()
    temp = temp.to_numpy()
    for n, row in enumerate(clf.decision_path(temp).toarray()):
        # Look only at the last decision node before the leaf.
        for indx in np.nonzero(row)[-1][-2:-1]:
            if temp[n, clf.tree_.feature[indx]] <= clf.tree_.threshold[indx]:
                feature_thresholds[clf.tree_.feature[indx]][1].append(clf.tree_.threshold[indx])
            else:
                feature_thresholds[clf.tree_.feature[indx]][0].append(clf.tree_.threshold[indx])
else:
    for tree in clf.estimators_:
        p = tree.predict(x)
        temp = x[(p == 1) & (p == y)].copy()
        temp = temp.to_numpy()
        for n, row in enumerate(tree.decision_path(temp).toarray()):
            for indx in np.nonzero(row)[-1][-2:-1]:
                if temp[n, tree.tree_.feature[indx]] <= tree.tree_.threshold[indx]:
                    feature_thresholds[tree.tree_.feature[indx]][1].append(tree.tree_.threshold[indx])
                else:
                    feature_thresholds[tree.tree_.feature[indx]][0].append(tree.tree_.threshold[indx])

for k in range(len(features)):
    if len(feature_thresholds[k][0]) == 0:
        feature_thresholds[k][0].append(0)
        if len(feature_thresholds[k][1]) == 0:
            feature_thresholds[k][1].append(0)
    elif len(feature_thresholds[k][1]) == 0:
        feature_thresholds[k][1].append(max(x[features[k]]))

number = sum(importances > 0.15)
print(x)
print(y)

n_estimators = 4
max_depth = 4
model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth)
model.fit(x, y)
_score = model.score(x, y)
print(_score)

for i_sample in range(len(x)):
    for i_estimator, tree in enumerate(model.estimators_):
        path = tree.decision_path(x)
        # print(path)
        print('i_estimator ', i_estimator)
        new_array = np.array(path.todense())[i_sample]
        print(i_sample)
        # print(dense)
        new_path = []
        for i in range(len(new_array)):
            if new_array[i] == 1:
                new_path.append(i)
        # print(new_path)
        for i in range(1, len(new_path)):
            label = features[tree.tree_.feature[new_path[i - 1]]]
            value_ = tree.tree_.value[new_path[i - 1]]
def predict_marginalized_over_instances(self, X: np.ndarray):
    """Predict mean and variance marginalized over all instances.

    Returns the predictive mean and variance marginalised over all
    instances for a set of configurations.

    Note
    ----
    This method overwrites the same method of ~smac.epm.base_epm.AbstractEPM;
    the following method is random forest specific and follows the SMAC2
    implementation; it requires no distribution assumption to marginalize
    the uncertainty estimates.

    Parameters
    ----------
    X : np.ndarray
        [n_samples, n_features (config)]

    Returns
    -------
    means : np.ndarray of shape = [n_samples, 1]
        Predictive mean
    vars : np.ndarray of shape = [n_samples, 1]
        Predictive variance
    """
    if self.instance_features is None or \
            len(self.instance_features) == 0:
        new_X = []
        for tree in self.rf.estimators_:
            tree_X = []
            path = tree.decision_path(X)
            for i in range(path.shape[0]):
                row = path.getrow(i).toarray().flatten().copy()
                new_row = []
                for j in range(len(row)):
                    # Mark nodes off the decision path as missing, mirroring
                    # _train below. (The original compared row[j] against
                    # np.NaN here, which is never true.)
                    if row[j] == 0:
                        new_row.append(np.NaN)
                    else:
                        threshold = tree.tree_.threshold[j]
                        feature_idx = tree.tree_.feature[j]
                        diff = (threshold - X[i][feature_idx])
                        new_row.append(diff)
                tree_X.append(new_row)
            new_X.append(np.array(tree_X))
        new_X = np.hstack(new_X)
        new_X = (new_X - self.X_min_) / self.diff_
        new_X[np.isnan(new_X)] = 0
        new_X = new_X / self.max_length_
        # new_X = self.scaler.transform(new_X)
        mean, std = self.gp.predict(new_X, return_std=True)
        mean = mean.reshape((-1, 1))
        var = (std).reshape((-1, 1)) ** 2
        # print(new_X, mean, var)
        var[var < self.var_threshold] = self.var_threshold
        var[np.isnan(var)] = self.var_threshold
        return mean, var
    else:
        raise NotImplementedError()
def _train(self, X: np.ndarray, y: np.ndarray):
    """Trains the random forest on X and y.

    Parameters
    ----------
    X : np.ndarray
        [n_samples, n_features (config + instance features)]
        Input data points.
    y : np.ndarray
        [n_samples, ]
        The corresponding target values.

    Returns
    -------
    self
    """
    self.X = X
    self.y = y.flatten()

    self.rf = sklearn.ensemble.RandomForestRegressor(
        max_features=1.0,
        bootstrap=False,
        n_estimators=1,
        max_depth=None,
    )
    # self.rf = sklearn.tree.DecisionTreeRegressor(max_depth=10)
    self.rf.fit(X, y)

    new_X = []
    for tree in self.rf.estimators_:
        tree_X = []
        # There's no reason to also take the leaves into account!
        path = tree.decision_path(X)
        for i in range(path.shape[0]):
            row = path.getrow(i).toarray().flatten().copy()
            new_row = []
            for j in range(len(row)):
                if row[j] == 0:
                    new_row.append(np.NaN)
                else:
                    threshold = tree.tree_.threshold[j]
                    feature_idx = tree.tree_.feature[j]
                    diff = (threshold - X[i][feature_idx])
                    new_row.append(diff)
            tree_X.append(new_row)
        new_X.append(np.array(tree_X))
    new_X = np.hstack(new_X)
    assert X.shape[0] == new_X.shape[0]

    X_min = np.nanmin(new_X, axis=0)
    X_max = np.nanmax(new_X, axis=0)
    diff = X_max - X_min
    diff[diff == 0] = 1
    self.X_min_ = X_min
    self.diff_ = diff
    new_X = (new_X - self.X_min_) / self.diff_
    new_X[np.isnan(new_X)] = 0
    self.max_length_ = np.max(np.sum(new_X, axis=1))
    new_X = new_X / self.max_length_

    # TODO compute the kernel manually by computing the tree similarities and
    # then only compute an additive kernel within each tree...
    # only compare 'same' paths of a tree
    self.gp = sklearn.pipeline.Pipeline([
        # Cannot use the scaler here as it would destroy all knowledge about
        # where the zeros are
        # ['preproc', sklearn.preprocessing.MinMaxScaler()],
        [
            'regressor',
            sklearn.gaussian_process.GaussianProcessRegressor(
                kernel=sklearn.gaussian_process.kernels.ConstantKernel()
                * sklearn.gaussian_process.kernels.Matern(),
                # + sklearn.gaussian_process.kernels.WhiteKernel(
                #     noise_level=1e-7, noise_level_bounds=(1e-14, 1e-6)
                # ),
                n_restarts_optimizer=10,
                normalize_y=True,
            )
        ]
    ])
    print(new_X.shape)
    self.gp.fit(new_X, y)
    print(self.gp.steps[-1][-1].kernel_)
    return self
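# The two methods above share one core construction: per tree, each sample is
# re-encoded with one value per node, threshold - x[feature] if the node lies on
# the sample's decision path and "missing" otherwise. A standalone sketch of that
# step follows (the helper name path_margin_features is hypothetical, not part of
# the original class).
import numpy as np

def path_margin_features(tree, X):
    """One row per sample, one column per tree node: threshold - x[feature] for
    nodes on the sample's decision path, NaN elsewhere (mirrors the loop used in
    _train / predict_marginalized_over_instances above)."""
    path = tree.decision_path(X)  # sparse (n_samples, n_nodes) indicator
    out = np.full((X.shape[0], tree.tree_.node_count), np.nan)
    for i in range(X.shape[0]):
        nodes = path.indices[path.indptr[i]:path.indptr[i + 1]]
        for j in nodes:
            out[i, j] = tree.tree_.threshold[j] - X[i, tree.tree_.feature[j]]
    return out

# Stacking these per-tree blocks column-wise, min-max rescaling them, and
# replacing the NaNs with zeros reproduces the new_X matrix that the Gaussian
# process in _train is fit on.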