def test_rf_classifier_decision_path_leaf(self):
    model = RandomForestClassifier(n_estimators=3, max_depth=3)
    X, y = make_classification(3, n_features=4, random_state=42)
    X = X[:, :2]
    model.fit(X, y)
    initial_types = [('input', FloatTensorType((None, X.shape[1])))]
    model_onnx = convert_sklearn(
        model, initial_types=initial_types,
        options={id(model): {'decision_leaf': True,
                             'decision_path': True,
                             'zipmap': False}},
        target_opset=TARGET_OPSET)
    sess = InferenceSession(model_onnx.SerializeToString())
    res = sess.run(None, {'input': X.astype(numpy.float32)})
    pred = model.predict(X)
    assert_almost_equal(pred, res[0].ravel())
    dec = model.decision_path(X)
    exp_leaf = path_to_leaf(model.estimators_, dec[0].todense(), dec[1])
    exp_path = binary_array_to_string(dec[0].todense())
    got_path = numpy.array([''.join(row) for row in res[2]])
    assert exp_path == got_path.ravel().tolist()
    assert exp_leaf.tolist() == res[3].tolist()
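# `binary_array_to_string` and `path_to_leaf` are helpers from the converter's
# test utilities and are not shown in this snippet. A minimal sketch of what
# they plausibly compute (an assumption, not the library's exact code):
import numpy

def binary_array_to_string(mat):
    # one '0'/'1' string per row of the dense node-indicator matrix
    mat2 = numpy.asarray(mat, dtype=numpy.int32)
    return [''.join(str(v) for v in row) for row in mat2.tolist()]

def path_to_leaf(estimators, mat, n_nodes_ptr):
    # for every sample and every tree, the index of the leaf reached; in
    # scikit-learn trees child indices are always larger than the parent's,
    # so the last visited node in each tree's slice is the leaf
    mat2 = numpy.asarray(mat)
    leaves = numpy.zeros((mat2.shape[0], len(estimators)), dtype=numpy.int64)
    for t in range(len(estimators)):
        block = mat2[:, n_nodes_ptr[t]:n_nodes_ptr[t + 1]]
        for i in range(block.shape[0]):
            leaves[i, t] = numpy.nonzero(block[i])[0][-1]
    return leaves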
def test_drf_classifier_backupsklearn(backend='auto'):
    df = pd.read_csv("./open_data/creditcard.csv")
    X = np.array(df.iloc[:, :df.shape[1] - 1], dtype='float32', order='C')
    y = np.array(df.iloc[:, df.shape[1] - 1], dtype='float32', order='C')
    import h2o4gpu
    Solver = h2o4gpu.RandomForestClassifier

    # Run the h2o4gpu version of RandomForest classification
    drf = Solver(backend=backend, random_state=1234, oob_score=True)
    print("h2o4gpu fit()")
    drf.fit(X, y)

    # Run the scikit-learn version of RandomForest classification
    from sklearn.ensemble import RandomForestClassifier
    drf_sk = RandomForestClassifier(random_state=1234, oob_score=True, max_depth=3)
    print("Scikit fit()")
    drf_sk.fit(X, y)

    if backend == "sklearn":
        assert (drf.predict(X) == drf_sk.predict(X)).all()
        assert (drf.predict_log_proba(X) == drf_sk.predict_log_proba(X)).all()
        assert (drf.predict_proba(X) == drf_sk.predict_proba(X)).all()
        assert drf.score(X, y) == drf_sk.score(X, y)
        assert (drf.decision_path(X)[1] == drf_sk.decision_path(X)[1]).all()
        assert (drf.apply(X) == drf_sk.apply(X)).all()

        print("Estimators")
        print(drf.estimators_)
        print(drf_sk.estimators_)

        print("n_features")
        print(drf.n_features_)
        print(drf_sk.n_features_)
        assert drf.n_features_ == drf_sk.n_features_

        print("n_classes_")
        print(drf.n_classes_)
        print(drf_sk.n_classes_)
        assert drf.n_classes_ == drf_sk.n_classes_

        print("classes_")
        print(drf.classes_)
        print(drf_sk.classes_)
        assert (drf.classes_ == drf_sk.classes_).all()

        print("n_outputs")
        print(drf.n_outputs_)
        print(drf_sk.n_outputs_)
        assert drf.n_outputs_ == drf_sk.n_outputs_

        print("Feature importance")
        print(drf.feature_importances_)
        print(drf_sk.feature_importances_)
        assert (drf.feature_importances_ == drf_sk.feature_importances_).all()

        print("oob_score")
        print(drf.oob_score_)
        print(drf_sk.oob_score_)
        assert drf.oob_score_ == drf_sk.oob_score_
def generate_mapping(self, X, y):
    X = self.dummy_encoder.transform(X.copy(deep=True))
    y = y.copy(deep=True)
    mapping = []

    for switch in self.dummy_encoder.mapping:
        col = switch.get('col')
        values = switch.get('mapping').copy(deep=True)

        if isinstance(self.max_depth, int):
            max_depth = self.max_depth
        elif isinstance(self.max_depth, float):
            max_depth = round(self.max_depth * values.shape[1])
        else:
            max_depth = min(self.max_depth[1],
                            round(self.max_depth[0] * values.shape[1]))
        if max_depth == 0:
            continue

        forest = RandomForestClassifier(
            max_depth=max_depth,
            n_estimators=self.n_estimators,
            n_jobs=self.n_jobs,
        )
        forest.fit(X[values.columns], y)
        subsets = self.get_subsets(forest.decision_path(values))
        subset_df = pd.DataFrame(
            data=subsets,
            index=values.index,
            columns=['{col}_subset_{i}'.format(col=col, i=i)
                     for i in range(subsets.shape[1])])
        base_df = values.join(subset_df)
        mapping.append({'col': col, 'mapping': base_df})

    return mapping
def test_randomforestclassifier_decision_path(self):
    model = RandomForestClassifier(max_depth=2, n_estimators=2)
    X, y = make_classification(10, n_features=4, random_state=42)
    X = X[:, :2].astype(numpy.float32)
    model.fit(X, y)
    model_onnx = to_onnx(
        model, X,
        options={id(model): {'decision_path': True, 'zipmap': False}})
    sess = OnnxInference(model_onnx)
    res = sess.run({'X': X})
    pred = model.predict(X)
    self.assertEqualArray(pred, res['label'].ravel())
    prob = model.predict_proba(X)
    self.assertEqualArray(prob, res['probabilities'])
    dec = model.decision_path(X)
    exp = binary_array_to_string(dec[0].todense())
    got = numpy.array([''.join(row) for row in res['decision_path']])
    self.assertEqual(exp, got.tolist())
def test_randomforestclassifier_decision_path(self):
    model = RandomForestClassifier(max_depth=2, n_estimators=2)
    X, y = make_classification(10, n_features=4, random_state=42)
    X = X[:, :2]
    model.fit(X, y)
    initial_types = [('input', FloatTensorType((None, X.shape[1])))]
    model_onnx = convert_sklearn(
        model, initial_types=initial_types,
        options={id(model): {'decision_path': True, 'zipmap': False}})
    sess = InferenceSession(model_onnx.SerializeToString())
    res = sess.run(None, {'input': X.astype(numpy.float32)})
    pred = model.predict(X)
    assert_almost_equal(pred, res[0].ravel())
    prob = model.predict_proba(X)
    assert_almost_equal(prob, res[1])
    dec = model.decision_path(X)
    exp = binary_array_to_string(dec[0].todense())
    got = numpy.array([''.join(row) for row in res[2]])
    assert exp == got.ravel().tolist()
n_outputs = clf.n_outputs_              # the number of outputs when the model was built
importance = clf.feature_importances_   # array with the fractional importance of each feature
oob_score = clf.oob_score_              # training-set score from the out-of-bag estimator,
                                        # i.e. the mean accuracy on samples each tree never saw
oob_decision_func = clf.oob_decision_function_

# now looking at the methods
leaf_indices = clf.apply(x_test)  # apply: which leaf each row in x_test ends up in, per tree
# decision_path: node-indicator matrix plus per-tree column offsets
indicator, n_nodes_ptr = clf.decision_path(x_test)
parameters = clf.get_params()
predicted_array = clf.predict(x_test)  # running the test set through the model
log_mean_predicted_class = clf.predict_log_proba(x_test)
mean_predicted_class = clf.predict_proba(x_test)
# for a classifier, score() returns the mean accuracy of the predictions
# against the true test labels (not an R**2 coefficient of determination)
mean_accuracy = clf.score(x_test, y_test)

# calculate the accuracy of the model on the training set
r_2_train = clf.score(x_train, y_train)
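# Self-contained illustration of the (indicator, n_nodes_ptr) pair above:
# indicator is a sparse (n_samples, total_nodes) matrix covering all trees,
# and n_nodes_ptr[t]:n_nodes_ptr[t+1] delimits tree t's columns. The toy
# data here is made up purely for the demonstration.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X_demo, y_demo = make_classification(n_samples=50, n_features=4, random_state=0)
clf_demo = RandomForestClassifier(n_estimators=3, max_depth=2,
                                  random_state=0).fit(X_demo, y_demo)
indicator, n_nodes_ptr = clf_demo.decision_path(X_demo[:2])
for t in range(clf_demo.n_estimators):
    block = indicator[:, n_nodes_ptr[t]:n_nodes_ptr[t + 1]].toarray()
    # each row lists the nodes of tree t visited by that sample (1 = visited)
    print("tree", t, "visited nodes:",
          [np.nonzero(row)[0].tolist() for row in block])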
labels = folhas.pop('Class').values
images = folhas.values

rf = RandomForestClassifier(n_estimators=25)
fit = rf.fit(images, labels)
predict = cross_val_predict(rf, images, labels, cv=10)
score = cross_val_score(rf, images, labels, cv=10)
prob = rf.predict_proba(images)

print("\n\n\n\n\nLeaf Values:\n")
print(folhas.head(340))
print("\n\nProbability:\n")
print(prob)
print("\n\nCross validation score:\n")
print(score)
print("\n\nDecision path:\n")
print(rf.decision_path(images))
print("\n\nAccuracy:\n")
print(accuracy_score(labels, predict))

feat_importances = pd.Series(rf.feature_importances_, index=folhas.columns)
feat_importances.nlargest(14).plot(kind='barh', title='Feature Importances')

cm = confusion_matrix(labels, predict)
plt.matshow(cm)
plt.ylabel('X')
plt.xlabel('Y')
plt.title('CONFUSION MATRIX')
plt.colorbar()
plt.show()
    ],
]

with open('dataset/gcc_bash.pickle', 'rb') as f:
    gcc_bash = pickle.load(f)
with open('dataset/clang_bash.pickle', 'rb') as f:
    clang_bash = pickle.load(f)
with open('dataset/tcc_bash.pickle', 'rb') as f:
    tcc_bash = pickle.load(f)

# grep_clang_tcc_features, grep_clang_tcc_labels, _ = prepare_pos(clang_grep, tcc_grep)
bash_gcc_clang_features, bash_gcc_clang_labels, bash_gcc_clang_names = prepare_pos(
    gcc_bash, clang_bash)
bash_tcc_clang_features, bash_tcc_clang_labels, bash_tcc_clang_names = prepare_pos(
    tcc_bash, clang_bash)

rfc = RandomForestClassifier(random_state=42)
train_features, train_labels = prepare_training_data(training_data_files)
rfc.fit(train_features, train_labels)

paths, _ = rfc.decision_path(bash_gcc_clang_features)
base, _ = rfc.decision_path([bash_tcc_clang_features[10]])

sims = [(cos_sim(path, base), i) for i, path in enumerate(paths)]
sims = sorted(sims, key=lambda x: x[0], reverse=True)
for sim, i in sims[:3]:
    print(bash_gcc_clang_names[i])
    print(sim)
for sim, i in sims[-3:]:
    print(bash_gcc_clang_names[i])
    print(sim)
print(bash_gcc_clang_names[sims[0][1]])
print(bash_tcc_clang_names[10])
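# `cos_sim` (like `prepare_pos` and `prepare_training_data`) is a project
# helper not shown here. A minimal sketch of a cosine similarity that works
# on the sparse rows returned by decision_path (an assumption about the
# original helper, not its actual code):
import numpy as np

def cos_sim(a, b):
    # accept either sparse rows or dense arrays
    a = np.asarray(a.todense()).ravel() if hasattr(a, 'todense') else np.asarray(a).ravel()
    b = np.asarray(b.todense()).ravel() if hasattr(b, 'todense') else np.asarray(b).ravel()
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))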
clf = RandomForestClassifier(max_features='sqrt', n_jobs=2, random_state=RANDOM_STATE)
clf.fit(train_data, train_label)

print('Read testing data...')
with open('testing.csv', 'r') as reader:
    test_data = []
    for line in reader.readlines():
        pixels = list(map(float, line.rstrip().split(',')))
        test_data.append(pixels)
print('Loaded ' + str(len(test_data)))

print('Predicting...')
test_data = np.array(test_data)
decision_path = clf.decision_path(test_data)

feature_label = []
for i in range(3088):
    feature_label.append(str(i))

# understand tree structure
# ref 1: http://scikit-learn.org/stable/auto_examples/tree/plot_unveil_tree_structure.html
# ref 2: https://stackoverflow.com/questions/40155128/plot-trees-for-a-random-forest-in-python-with-scikit-learn
# parse the tree structure and save the info as txt, png, and dot
f = open('rf_tree_explanation.txt', 'a')
for treeIndex in range(len(clf.estimators_)):
    f.write('Tree number %d\n' % (treeIndex + 1))
    n_nodes = clf.estimators_[treeIndex].tree_.node_count
    children_left = clf.estimators_[treeIndex].tree_.children_left
    children_right = clf.estimators_[treeIndex].tree_.children_right
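    # Continuation sketch (not part of the original snippet), adapted from
    # the scikit-learn "understanding the decision tree structure" example
    # referenced above; it continues the loop variables defined just before:
    feature = clf.estimators_[treeIndex].tree_.feature
    threshold = clf.estimators_[treeIndex].tree_.threshold
    node_depth = np.zeros(n_nodes, dtype=np.int64)
    is_leaf = np.zeros(n_nodes, dtype=bool)
    stack = [(0, 0)]  # (node id, depth), starting at the root
    while stack:
        node_id, depth = stack.pop()
        node_depth[node_id] = depth
        if children_left[node_id] != children_right[node_id]:  # split node
            stack.append((children_left[node_id], depth + 1))
            stack.append((children_right[node_id], depth + 1))
        else:
            is_leaf[node_id] = True
    for node_id in range(n_nodes):
        if is_leaf[node_id]:
            f.write('%snode=%d is a leaf node\n'
                    % ('\t' * node_depth[node_id], node_id))
        else:
            f.write('%snode=%d: go left if X[:, %d] <= %.3f, else go right\n'
                    % ('\t' * node_depth[node_id], node_id,
                       feature[node_id], threshold[node_id]))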
# data for both classification and regression
X_train = np.random.rand(n_samples, 10)
y_train = np.random.randint(num_classes, size=(n_samples))

if predicttype == 'classify':
    forest = RandomForestClassifier(n_estimators=n_trees, oob_score=True)
else:
    forest = RandomForestRegressor(n_estimators=n_trees, oob_score=True)

oob_indices, oob_leaves_id, OOB_tree_indicator = {}, {}, {}

# fit
forest.fit(X_train, y_train)
forest_oob_score = forest.oob_score_
n_trees, train_size = forest.n_estimators, len(y_train)
indicator, n_nodes_ptr = forest.decision_path(X_train)

node_indicator = {}
sample_index = {}
for t, estimator in enumerate(forest):
    oob_indices[t] = _generate_unsampled_indices(estimator.random_state,
                                                 X_train.shape[0])
    oob_leaves_id[t] = estimator.apply(X_train[oob_indices[t], :])
    sample_index[t] = _generate_sample_indices(estimator.random_state, n_samples)
    node_indicator[t] = indicator[:, n_nodes_ptr[t]:n_nodes_ptr[t + 1]]

mean_vals = {}
for t in range(n_trees):
    mean_vals[t] = np.zeros(node_indicator[t].shape[1])
    for node in range(node_indicator[t].shape[1]):
        r, c = node_indicator[t][:, node].nonzero()
        mean_vals[t][node] = np.mean(y_train[sample_index[t]][r])
class WaveRandomForestClassifier(BaseEstimator, ClassifierMixin):
    """RandomForest-based classifier whose least important nodes are removed.

    See the paper "Wavelet decomposition of Random Forests":
    http://www.jmlr.org/papers/volume17/15-203/15-203.pdf
    """

    def __init__(self,
                 n_estimators=100,
                 criterion="gini",
                 max_depth=None,
                 min_samples_split=2,
                 min_samples_leaf=1,
                 min_weight_fraction_leaf=0.0,
                 max_features="auto",
                 max_leaf_nodes=None,
                 min_impurity_decrease=0.0,
                 min_impurity_split=None,
                 bootstrap=True,
                 oob_score=False,
                 n_jobs=1,
                 random_state=None,
                 verbose=0,
                 warm_start=False,
                 class_weight=None,
                 nodes_to_keep=0.9):
        self.n_estimators = n_estimators
        self.criterion = criterion
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.max_features = max_features
        self.max_leaf_nodes = max_leaf_nodes
        self.min_impurity_decrease = min_impurity_decrease
        self.min_impurity_split = min_impurity_split
        self.bootstrap = bootstrap
        self.oob_score = oob_score
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.verbose = verbose
        self.warm_start = warm_start
        self.class_weight = class_weight
        self.nodes_to_keep = nodes_to_keep
        self.forest = None

    def fit(self, X, y):
        # 1) create the RandomForest
        self.forest = RandomForestClassifier(
            n_estimators=self.n_estimators,
            criterion=self.criterion,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            min_weight_fraction_leaf=self.min_weight_fraction_leaf,
            max_features=self.max_features,
            max_leaf_nodes=self.max_leaf_nodes,
            min_impurity_decrease=self.min_impurity_decrease,
            min_impurity_split=self.min_impurity_split,
            bootstrap=self.bootstrap,
            oob_score=self.oob_score,
            n_jobs=self.n_jobs,
            random_state=self.random_state,
            verbose=self.verbose,
            warm_start=self.warm_start,
            class_weight=self.class_weight,
        )

        # 2) fit it
        self.forest.fit(X, y)
        self.n_outputs_ = self.forest.n_outputs_

        # 3) retrieve node norms and values
        self.nodes_norm, self.nodes_value = compute_node_norm_classification_forest(
            self.forest)

        # 4) filter nodes
        self._nodes_order = np.argsort(-self.nodes_norm)

        if self.nodes_to_keep is not None:
            if self.nodes_to_keep < 1:
                nodes_to_keep = int(len(self._nodes_order) * self.nodes_to_keep)
            else:
                nodes_to_keep = int(self.nodes_to_keep)
            self._ind_nodes_to_keep = self._nodes_order[:nodes_to_keep]
        else:
            self._ind_nodes_to_keep = None

        return self

    def _set_nodes_to_keep(self, nodes_to_keep):
        """Change the number of wavelets to keep without refitting the
        underlying random forest."""
        self.nodes_to_keep = nodes_to_keep
        if self.forest is not None:
            if self.nodes_to_keep is None:
                self._ind_nodes_to_keep = None
            else:
                if self.nodes_to_keep < 1:
                    nodes_to_keep = int(len(self._nodes_order) * self.nodes_to_keep)
                else:
                    nodes_to_keep = int(self.nodes_to_keep)
                self._ind_nodes_to_keep = self._nodes_order[:nodes_to_keep]

    def predict_proba(self, X):
        if self.forest is None:
            raise NotFittedError("You should fit the model first")

        path, _ = self.forest.decision_path(X)

        if self._ind_nodes_to_keep is not None:
            predict_proba_filtered = [
                path[:, self._ind_nodes_to_keep].dot(
                    self.nodes_value[self._ind_nodes_to_keep, n, :])
                for n in range(self.nodes_value.shape[1])
            ]
        else:
            predict_proba_filtered = [
                path[:, :].dot(self.nodes_value[:, n, :])
                for n in range(self.nodes_value.shape[1])
            ]

        # clip the reconstructed probabilities into [0, 1]
        for p in predict_proba_filtered:
            p[p < 0] = 0
            p[p > 1] = 1

        if len(predict_proba_filtered) == 1:
            return predict_proba_filtered[0]
        else:
            return predict_proba_filtered

    @property
    def classes_(self):
        return self.forest.classes_

    def predict(self, X):
        """Predict class for X.

        The predicted class of an input sample is a vote by the trees in
        the forest, weighted by their probability estimates. That is,
        the predicted class is the one with the highest mean probability
        estimate across the trees.

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            The input samples. Internally, its dtype will be converted to
            ``dtype=np.float32``. If a sparse matrix is provided, it will be
            converted into a sparse ``csr_matrix``.

        Returns
        -------
        y : array of shape = [n_samples] or [n_samples, n_outputs]
            The predicted classes.
        """
        # Copied from base forest
        proba = self.predict_proba(X)
        if self.n_outputs_ == 1:
            return self.classes_.take(np.argmax(proba, axis=1), axis=0)
        else:
            n_samples = proba[0].shape[0]
            predictions = np.zeros((n_samples, self.n_outputs_))
            for k in range(self.n_outputs_):
                predictions[:, k] = self.classes_[k].take(
                    np.argmax(proba[k], axis=1), axis=0)
            return predictions

    def predict_log_proba(self, X):
        """Predict class log-probabilities for X.

        The predicted class log-probabilities of an input sample are computed
        as the log of the mean predicted class probabilities of the trees in
        the forest.

        Parameters
        ----------
        X : array-like or sparse matrix of shape = [n_samples, n_features]
            The input samples. Internally, its dtype will be converted to
            ``dtype=np.float32``. If a sparse matrix is provided, it will be
            converted into a sparse ``csr_matrix``.

        Returns
        -------
        p : array of shape = [n_samples, n_classes], or a list of n_outputs
            such arrays if n_outputs > 1. The class probabilities of the
            input samples. The order of the classes corresponds to that in
            the attribute `classes_`.
        """
        # Copied from base forest
        proba = self.predict_proba(X)
        if self.n_outputs_ == 1:
            return np.log(proba)
        else:
            for k in range(self.n_outputs_):
                proba[k] = np.log(proba[k])
            return proba
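# `compute_node_norm_classification_forest` is not shown in this snippet.
# A minimal sketch under the paper's wavelet view (an assumption about the
# original helper, not its actual code): each node carries the *change* in
# class distribution relative to its parent, so summing the values of all
# nodes on a sample's decision path reconstructs the leaf prediction; the
# filtering norm weighs that change by the fraction of samples reaching
# the node.
import numpy as np

def compute_node_norm_classification_forest(forest):
    norms, values = [], []
    n_trees = len(forest.estimators_)
    for est in forest.estimators_:
        t = est.tree_
        # class distribution per node, shape (n_nodes, n_outputs, n_classes)
        value = t.value / t.value.sum(axis=2, keepdims=True)
        delta = value.copy()  # the root keeps its own distribution
        for node in range(t.node_count):
            for child in (t.children_left[node], t.children_right[node]):
                if child != -1:
                    delta[child] = value[child] - value[node]
        weight = t.weighted_n_node_samples / t.weighted_n_node_samples[0]
        norm = np.sqrt(weight) * np.linalg.norm(
            delta.reshape(t.node_count, -1), axis=1)
        norms.append(norm)
        values.append(delta / n_trees)  # dividing averages over trees
    return np.concatenate(norms), np.concatenate(values, axis=0)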
print(X_test.head())

rf = RandomForestClassifier(n_estimators=30,
                            max_depth=None,
                            min_samples_split=10,
                            class_weight="balanced"
                            # min_weight_fraction_leaf=0.02
                            )
rf.fit(X_train, y_train)

print("\n\n ---Random Forest Model---")
rf_roc_auc = roc_auc_score(y_test, rf.predict(X_test))
print("Random Forest AUC = %2.2f" % rf_roc_auc)
rf_accuracy = accuracy_score(y_test, rf.predict(X_test))
print("Random Forest Accuracy = %2.2f" % rf_accuracy)
tn, fp, fn, tp = confusion_matrix(y_test, rf.predict(X_test)).ravel()
print("True -ve:", tn)
print("True +ve:", tp)
print("False -ve:", fn)
print("False +ve:", fp)
print(classification_report(y_test, rf.predict(X_test)))
print(rf.predict_proba(X_test))
print(confusion_matrix(y_test, rf.predict(X_test)))
print(len(rf.decision_path(X_test)[1]))

from sklearn.externals import joblib
# save model
joblib.dump(rf, 'model.pkl')
combined_score = []

# function returning a boolean mask for the slicing condition
def condition_combination(index_vals, df=x):
    condition = True
    for indx in index_vals:
        condition = (condition
                     & (df[features[indices[indx]]] >= min(feature_thresholds[indices[indx]][0]))
                     & (df[features[indices[indx]]] <= max(feature_thresholds[indices[indx]][1])))
    return condition

print("Feature ranking " + name + ":")
if i < 3:
    p = clf.predict(x)
    temp = x[(p == 1) & (p == y)].copy()
    temp = temp.to_numpy()
    for n, row in enumerate(clf.decision_path(temp).toarray()):
        for indx in np.nonzero(row)[-1][-2:-1]:
            if temp[n, clf.tree_.feature[indx]] <= clf.tree_.threshold[indx]:
                feature_thresholds[clf.tree_.feature[indx]][1].append(clf.tree_.threshold[indx])
            else:
                feature_thresholds[clf.tree_.feature[indx]][0].append(clf.tree_.threshold[indx])
else:
    for tree in clf.estimators_:
        p = tree.predict(x)
        temp = x[(p == 1) & (p == y)].copy()
        temp = temp.to_numpy()
        for n, row in enumerate(tree.decision_path(temp).toarray()):
            for indx in np.nonzero(row)[-1][-2:-1]:
                if temp[n, tree.tree_.feature[indx]] <= tree.tree_.threshold[indx]:
                    feature_thresholds[tree.tree_.feature[indx]][1].append(tree.tree_.threshold[indx])
                else:
    print('Wrong value of codebook')

# fit the forest
forest = RandomForestClassifier(n_estimators=100, max_depth=20,
                                min_samples_split=2, max_features=None,
                                criterion='entropy', n_jobs=-1).fit(data_tr, labels)

# make predictions
hs = forest.predict(data_te)

# what's the accuracy
(forest.score(data_te, labels)) * 100

# how many matches is that
sum(labels == hs)

forest.decision_path(data_tr)

from IPython.display import display, Image
import pydotplus
import sklearn.tree as tree

for dtree in forest.estimators_:
    dot_data = tree.export_graphviz(dtree,
                                    out_file=None,
                                    filled=True,
                                    rounded=True,
                                    special_characters=True)
    graph = pydotplus.graph_from_dot_data(dot_data)
    img = Image(graph.create_png())
    display(img)
def Classifier_random_forest(Xfeat_test, Xfeat, y_each_patient_test, y_each_patient,
                             selected_babies, selected_test, label, classweight,
                             Used_classifier, drawing, lst, ChoosenKind, SamplingMeth,
                             probability_threshold, ASprobLimit, N, crit, msl,
                             deciding_performance_measure, dispinfo):
    #### CREATING THE sampleweight FOR SELECTED BABIES
    #### TRAIN CLASSIFIER
    meanaccLOO = []
    accLOO = []
    testsubject = []
    tpr_mean = []
    counter = 0
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    F1_macro_collect = []
    F1_micro_collect = []
    F1_weight_collect = []
    F1_all_collect = []
    K_collect = []
    preliminaryK = zeros(len(probthres_Grid))

    # CREATING TEST AND TRAIN SETS
    Selected_training = selected_babies
    X_train = [Xfeat[np.where(np.array(selected_babies) == k)[0][0]]
               for k in Selected_training]  # combine only the babies to train on in a list
    y_train = [y_each_patient[np.where(np.array(selected_babies) == k)[0][0]]
               for k in Selected_training]
    X_test = Xfeat_test[selected_test]
    y_test = y_each_patient_test[selected_test]
    X_train = vstack(X_train)  # merge the data from each list element into one matrix
    y_train = vstack(y_train)
    y_old = y_train[:]

    # SAMPLING TO EQUALIZE CLASS IMBALANCE
    X_train, y_train = cmplx_Oversampling(X_train, y_train, ChoosenKind,
                                          SamplingMeth, label)

    # CALCULATE THE WEIGHTS DUE TO CLASS IMBALANCE
    class_weight = 'balanced'
    classlabels = ravel(y_old)  # has to be a 1d array for compute_class_weight
    # Test whether all labels actually occur in the data; otherwise
    # compute_class_weight raises an error. If some are missing, make the found
    # labels the new labels. If only one label is found, classification cannot
    # work, so skip the class weights (CW).
    if (classweight == 1) and len(unique(classlabels)) == len(label):
        cW = compute_class_weight(class_weight, label, classlabels)
        # the class weights need to be a dictionary {class_label: value}
        cWdict = dict(zip(label, cW))
        CW = 1
    elif (classweight == 1) and len(unique(classlabels)) != len(label):
        CW_label = unique(classlabels)  # which labels are present in the array
        if len(CW_label) == 1:
            if dispinfo:
                print('classweight config skipped once as only one class exists')
            CW = 0
        else:
            if dispinfo:
                print('used labels are:', CW_label, 'instead of:', label)
            cW = compute_class_weight(class_weight, CW_label, classlabels)
            cWdict = dict(zip(label, cW))
            CW = 1
            if dispinfo:
                disp(cWdict)

    # The Decision Tree / Random Forest / Extremely Randomized Trees / Gradient Boosting
    if Used_classifier == 'TR':
        if (classweight == 1) and CW == 1:
            clf = tree.DecisionTreeClassifier(criterion=crit, splitter="best", max_depth=None,
                                              min_samples_split=2, min_samples_leaf=msl,
                                              min_weight_fraction_leaf=0.0, max_features=None,
                                              random_state=42, max_leaf_nodes=None,
                                              min_impurity_decrease=0.0, min_impurity_split=None,
                                              class_weight=cWdict, presort=False)
        else:
            clf = tree.DecisionTreeClassifier(criterion=crit, splitter="best", max_depth=None,
                                              min_samples_split=2, min_samples_leaf=msl,
                                              min_weight_fraction_leaf=0.0, max_features=None,
                                              random_state=42, max_leaf_nodes=None,
                                              min_impurity_decrease=0.0, min_impurity_split=None,
                                              presort=False)
    if Used_classifier == 'RF':
        if (classweight == 1) and CW == 1:
            clf = RandomForestClassifier(n_estimators=N, criterion=crit, max_depth=None,
                                         min_samples_split=2, min_samples_leaf=msl,
                                         min_weight_fraction_leaf=0.0, max_features="auto",
                                         max_leaf_nodes=None, min_impurity_decrease=0.0,
                                         min_impurity_split=None, bootstrap=True, oob_score=False,
                                         n_jobs=1, random_state=42, verbose=0, warm_start=False,
                                         class_weight=cWdict)
        else:
            clf = RandomForestClassifier(n_estimators=N, criterion=crit, max_depth=None,
                                         min_samples_split=2, min_samples_leaf=msl,
                                         min_weight_fraction_leaf=0.0, max_features="auto",
                                         max_leaf_nodes=None, min_impurity_decrease=0.0,
                                         min_impurity_split=None, bootstrap=True, oob_score=False,
                                         n_jobs=1, random_state=42, verbose=0, warm_start=False)
    elif Used_classifier == 'ERF':
        if (classweight == 1) and CW == 1:
            clf = ExtraTreesClassifier(n_estimators=N, criterion=crit, max_depth=None,
                                       min_samples_split=2, min_samples_leaf=msl,
                                       max_features="auto", max_leaf_nodes=None,
                                       min_impurity_decrease=0.0, min_impurity_split=None,
                                       bootstrap=True, oob_score=False, n_jobs=1,
                                       random_state=42, verbose=0, warm_start=False,
                                       class_weight=cWdict)
        else:
            clf = ExtraTreesClassifier(n_estimators=N, criterion=crit, max_depth=None,
                                       min_samples_split=2, min_samples_leaf=msl,
                                       min_weight_fraction_leaf=0.0, max_features="auto",
                                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                                       min_impurity_split=None, bootstrap=True, oob_score=False,
                                       n_jobs=1, random_state=42, verbose=0, warm_start=False)
    elif Used_classifier == 'GB':
        clf = GradientBoostingClassifier(loss="deviance", learning_rate=0.1, n_estimators=1000,
                                         subsample=1, criterion='friedman_mse',
                                         min_samples_split=2, min_samples_leaf=1,
                                         min_weight_fraction_leaf=0.0, max_depth=30,
                                         min_impurity_decrease=0.0, min_impurity_split=None,
                                         init=None, random_state=42, max_features=None,
                                         verbose=0, max_leaf_nodes=None, warm_start=False,
                                         presort='auto')
    elif Used_classifier == 'LR':
        clf = LinearRegression(fit_intercept=True, normalize=False, copy_X=True, n_jobs=1)

    # Performance analysis
    # sys.exit('Jan werth')
    if len(label) < 2:
        print("please use at least two labels")
    # F1 / Kappa
    else:
        # prediction decides on a 0.5 probability which class to take
        prediction = clf.fit(X_train, y_train.ravel()).predict(X_test)
        # with the calculated probabilities we can choose our own threshold
        probs = clf.fit(X_train, y_train.ravel()).predict_proba(X_test)

        if probability_threshold:
            # predict() always uses probability 0.5 to decide. Here we decide
            # based on lower probabilities, checking whether any class other
            # than AS has slightly elevated probabilities.
            for k in range(len(probthres_Grid)):  # try different probability thresholds
                preliminary_pred = copy(prediction[:])
                probthres = probthres_Grid[k]
                for i in range(len(probs)):
                    if len(label) == 3:
                        # if the probability is higher than the threshold, use that
                        # class instead of AS, but if AS is over ~0.7 still take AS
                        if any(probs[i, 1:] >= probthres) and probs[i, 0] < ASprobLimit[0]:
                            # search for the max probability among the labels other than AS
                            highprob = np.argmax(probs[i, 1:])
                            # change the label in predictions to the newly found label;
                            # +1 because we cut the array by 1 above, otherwise wrong index
                            preliminary_pred[i] = label[highprob + 1]
                    if len(label) > 3:
                        if any(probs[i, 1:] >= probthres) and probs[i, 0] < ASprobLimit[1]:
                            highprob = np.argmax(probs[i, 1:])
                            preliminary_pred[i] = label[highprob + 1]
                    elif (probs[i, 1]) >= probthres:
                        # with only two labels, searching for the max does not work
                        preliminary_pred[i] = label[1]  # switch to the second label

                # To change the classifier used for the performance measure
                if deciding_performance_measure == 'Kappa':
                    # find the threshold where kappa gets maximal
                    preliminaryK[k] = cohen_kappa_score(y_test.ravel(), preliminary_pred,
                                                        labels=label)
                elif deciding_performance_measure == 'F1_second_label':
                    preliminaryK[k] = f1_score(y_test.ravel(), preliminary_pred,
                                               labels=label, average=None)[1]
                elif deciding_performance_measure == 'F1_third_label':
                    preliminaryK[k] = f1_score(y_test.ravel(), preliminary_pred,
                                               labels=label, average=None)[2]
                elif deciding_performance_measure == 'F1_fourth_label':
                    preliminaryK[k] = f1_score(y_test.ravel(), preliminary_pred,
                                               labels=label, average=None)[3]

            maxK = preliminaryK.argmax(axis=0)
            if dispinfo:
                print('Used probability Thresh: %.2f' % probthres_Grid[maxK])
            # repeat creating the predictions with the optimal probability threshold
            probthres = probthres_Grid[maxK]
            for i in range(len(probs)):
                if len(label) == 3:
                    if any(probs[i, 1:] >= probthres) and probs[i, 0] < ASprobLimit[0]:
                        highprob = np.argmax(probs[i, 1:])
                        prediction[i] = label[highprob + 1]
                if len(label) > 3:
                    if any(probs[i, 1:] >= probthres) and probs[i, 0] < ASprobLimit[1]:
                        highprob = np.argmax(probs[i, 1:])
                        prediction[i] = label[highprob + 1]
                elif (probs[i, 1]) >= probthres:
                    # with only two labels, searching for the max does not work
                    prediction[i] = label[1]

        scoring = clf.score(X_test, y_test.ravel(), sample_weight=None)
        Fimportances = clf.feature_importances_
        if Used_classifier != 'GB':
            Dpath = clf.decision_path(X_train)
        resultsF1_macro = f1_score(y_test.ravel(), prediction, average='macro')  # , pos_label=None)
        resultsF1_micro = f1_score(y_test.ravel(), prediction, average='micro')
        resultsF1_weight = f1_score(y_test.ravel(), prediction, average='weighted')
        resultsF1_all = f1_score(y_test.ravel(), prediction, labels=label, average=None)  # , pos_label=None)
        resultsK = cohen_kappa_score(y_test.ravel(), prediction, labels=label)

        if drawing and Used_classifier == 'TR':
            import graphviz
            from Loading_5min_mat_files_cECG import Class_dict, features_dict
            # build lists with only the used features in lst / the used labels in label
            usedfeatures = list((features_dict[k]) for k in lst)
            usedlabels = list((Class_dict[k]) for k in label)
            with open("RF.txt", "w") as f:
                f = tree.export_graphviz(clf, out_file=f,
                                         feature_names=usedfeatures,
                                         class_names=usedlabels,
                                         filled=True, rounded=True)
            # with open("RF.dot", "w") as f:
            #     f = tree.export_graphviz(clf, out_file=f)
            with open("RF.svc", "w") as f:
                f = tree.export_graphviz(clf, out_file=f)
            # dot -Tpdf RF.dot -o RF.pdf
            # open -a preview RF.pdf
            # dot_data = tree.export_graphviz(clf, out_file=None)
            # graph = graphviz.Source(dot_data)
            # graph.render("Jan")
            #
            # dot_data = tree.export_graphviz(clf, out_file=None,
            #                                 feature_names=usedfeatures,
            #                                 class_names=usedlabels,
            #                                 filled=True, rounded=True,
            #                                 special_characters=True)
            # graph = graphviz.Source(dot_data)
            # graph

    return resultsF1_macro, resultsK, resultsF1_micro, resultsF1_weight, resultsF1_all, Fimportances, scoring, prediction, probs
print(ypred)
print(list(le.inverse_transform(ypred)))
print(lgb1.predict_proba(xtest))
print(accuracy_score(ytest, ypred))
print(accuracy_score(ytrain, ypred1))

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

rf = RandomForestClassifier(random_state=0, class_weight="balanced")
print(rf.fit(xtrain, ytrain))
ypred = rf.predict(xtest)
ypred1 = rf.predict(xtrain)
print(ypred)
print(list(le.inverse_transform(ypred)))
print(classification_report(ypred, ytest))
print(rf.apply(xtest))
print(rf.decision_path(xtest))
print(rf.predict_proba(xtest))
print(rf.predict_log_proba(xtest))
print(rf.score(xtrain, ytrain))
rmse = math.sqrt(mean_squared_error(ytest, ypred))
print(rmse)
print(r2_score(ytest, ypred))
confusionmatrix = confusion_matrix(ypred, ytest)
print(confusionmatrix)
print(accuracy_score(ytest, ypred))
print(accuracy_score(ytrain, ypred1))

from sklearn.svm import SVC

svc = SVC(kernel="rbf", random_state=0, gamma=1, C=1, class_weight="balanced")
print(svc.fit(xtrain, ytrain))
ypred = svc.predict(xtest)
ypred1 = svc.predict(xtrain)
print(trainframe['label'])
trainframe['label'] = trainframe['label'].astype('int')
testframe['label'] = testframe['label'].astype('int')
print(testframe['label'])

with open('actualpredictionsRF', 'wb') as fp:
    pickle.dump(testframe['label'], fp)
print("completed encoding labels")

# clf = RandomForestClassifier()
# clf = tree.DecisionTreeClassifier(criterion='entropy')
clf = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
clf = clf.fit(matrix, trainframe['label'])
# clf = clf.fit(matrix, trainframe['label'])
# predictions = clf.predict(matrix2)

# pickle to save the data
print(clf.decision_path(matrix))
filename = 'randomforestFinal'
pickle.dump(clf, open(filename, 'wb'))

loaded_model = pickle.load(open(filename, 'rb'))
predictions = loaded_model.predict(matrix2)
'''
filename1 = 'RandomForestpredict'
pickle.dump(predictions, open(filename1, 'wb'))
'''
# print(clf.feature_importances_)
# predictions = clf.predict(matrix2)

ts2 = time.time()
ts2 = ts2 - ts
print("time taken")
print(ts2)
    'oob_score': [True, False],
    'random_state': [None, 2017]
}

RF_GS = RandomForestClassifier()
clf_GS_RF = GridSearchCV(RF_GS, RFparameters)
clf_GS_RF.fit(x_train_tfidf, y_train)
sorted(clf_GS_RF.cv_results_.keys())

GridPredRF = clf_GS_RF.predict(x_test_tfidf)
GS_RF_accuracy = metrics.accuracy_score(y_test, GridPredRF)
print('Accuracy score for clf_GS_RF: ', GS_RF_accuracy)
print('Best params for clf_GS_RF: ', clf_GS_RF.best_params_)
print('Detailed clf_GS_RF results: ', clf_GS_RF.cv_results_)
# print('OOB score from clf_GS_RF : ', clf_GS_RF.oob_score_)
# print('OOB decision function from clf_GS_RF : ', RF_GS.oob_decision_function_)
# GridSearchCV clones the estimator, so RF_GS itself stays unfitted;
# the fitted forest lives in best_estimator_
print('Feature importances from clf_GS_RF : ',
      clf_GS_RF.best_estimator_.feature_importances_)
clf_GS_RF.best_estimator_.decision_path(x_train_tfidf)
print(clf_GS_RF.best_params_)
print(clf_GS_RF.best_score_)
print(clf_GS_RF.grid_scores_)
# clf_GS_RF.error_score
print(metrics.confusion_matrix(y_test, GridPredRF))

# plt.plot( , clf_GS_RF)
for e in df['param_max_features'].value_counts().index:
    print(e)
    estimators = df.param_n_estimators[(df.param_criterion == 'entropy')
                                       & (df.param_oob_score == True)]
    mean_score = df.mean_test_score[(df.param_criterion == 'entropy')
                                    & (df.param_oob_score == True)]
    plt.plot(estimators[df['param_max_features'] == e],
             mean_score[df['param_max_features'] == e],
plt.show()
# plt.plot()

y = train_data["Survived"]
# PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
features = ["Sex", "Age", "Pclass", "Embarked", "SibSp", "Parch"]
X = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features])

model = RandomForestClassifier(n_estimators=100, max_depth=20, random_state=1)
model.fit(X, y)
predictions = model.predict(X_test)
print(model.feature_importances_)
print(model.decision_path(X_test))

from sklearn.tree import export_graphviz

estimator = model.estimators_[5]
# Export as dot file; after get_dummies the tree is trained on the expanded
# dummy columns, so the feature names must come from X, not the raw list
export_graphviz(estimator, out_file='tree.dot',
                feature_names=list(X.columns),
                class_names="Survived",
                rounded=True, proportion=False,
                precision=2, filled=True)

# Convert to png using system command (requires Graphviz)
from subprocess import call
# process = subprocess.Popen(command, stdout=tempFile, shell=True)
# call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])
# set path=%path%;C:\Anaconda3\graphviz-2.38\release\bin
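# To actually render the exported dot file, the commented-out conversion
# above can be run once Graphviz's `dot` binary is on the PATH (assuming
# tree.dot was written to the working directory):
call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])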
clf = RandomForestClassifier(random_state=0).fit(X_train, y_train)
print('RF Model fitted with ' + repr(40) + ' features\nRF feature importances\n')
important_features = pd.Series(data=clf.feature_importances_, index=fdataDF.columns)

feats = {}  # a dict to hold feature_name: feature_importance
for feature, importance in zip(fdataDF.columns, clf.feature_importances_):
    feats[feature] = importance  # add the name/value pair

importances = pd.DataFrame.from_dict(
    feats, orient='index').rename(columns={0: 'Gini-importance'})
importances.sort_values(by='Gini-importance').plot(kind='bar', rot=90)
plt.rcParams.update({'font.size': 5})
plt.show()

print('RF decision path')
pprint.pprint(clf.decision_path(X_train))

y_pred = clf.predict(X_test)
print('RF Score')
print(clf.score(X_test, y_test))
print('RMSE\n')
print(np.sqrt(metrics.mean_squared_error(y_pred, y_test)))

# computing permutation importance of each feature in the dataset
perm = PermutationImportance(clf, random_state=1).fit(X_train, y_train)
# # create a structured array
# dtype = [('feature', str), ('permutation_weights', float)]
features = np.array(X_train.columns.to_list())
permF = np.array([X_train.columns.to_list(), perm.feature_importances_])
# [::-1] reverses the ascending result of argsort, indices of arrays sorted
rankedFeatureIds = perm.feature_importances_.argsort()[::-1]
score = clf.score(X_test, y_test)
if score > current_best_score:
    current_best_score = score
    best_tree_shuffle = clf
print(current_best_score)

#%%
res = clf.predict(X_test)
compteur = 0
nb_nuls_predis = 0
for i in range(len(X_test)):
    # print(res[i] == labels_test[i])
    if res[i] == y_test[i]:
        compteur += 1
    # print('\n')
print(str(compteur * 100 / len(X_test)) + "%")

#%%
plot_confusion_matrix(clf, X_test, y_test)

#%%
clf.predict_proba(X_test)
clf.decision_path(X_test)
print(clf.decision_path(X_test))
clf.score(X_test, y_test)

#%% do not overwrite foretV1 !!!
# from joblib import dump, load
# dump(clf, 'foretV2.joblib')

#%%
clf = load('foretV1.joblib')

#%%
clf.score(X_test, y_test)
#########################################
# Negative figures. We still have raw scores.

#######################################
# Option *decision_path*
# ++++++++++++++++++++++
#
# *scikit-learn* implements a function to retrieve the
# decision path. It can be enabled by option *decision_path*.

clrrf = RandomForestClassifier(n_estimators=2, max_depth=2)
clrrf.fit(X_train, y_train)
clrrf.predict(X_test[:2])
paths, n_nodes_ptr = clrrf.decision_path(X_test[:2])
print(paths.todense())

model_def = to_onnx(clrrf, X_train.astype(numpy.float32),
                    options={id(clrrf): {'decision_path': True,
                                         'zipmap': False}})
sess = InferenceSession(model_def.SerializeToString())

##########################################
# The model produces 3 outputs.

print([o.name for o in sess.get_outputs()])

##########################################
# Let's display the last one.
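# A plausible continuation (not shown in the original excerpt): run the
# session and print the decision-path output. The input name 'X' is the
# skl2onnx default for to_onnx; adjust it if your model was converted
# with a different initial type name.
res = sess.run(None, {'X': X_test[:2].astype(numpy.float32)})
print(res[-1])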
# In[536]:

# Implies 300 trees generates the highest accuracy
plt.plot(xlabels, n_trees)
plt.xlabel('Trees in RandomForest')
plt.ylabel('Accuracy')
plt.title("RandomForest Optimization")

# RandomForestClassifier(n_estimators=10, criterion='gini', max_depth=None,
#                        min_samples_split=2, min_samples_leaf=1,
#                        min_weight_fraction_leaf=0.0, max_features='auto',
#                        max_leaf_nodes=None, min_impurity_split=1e-07,
#                        bootstrap=True, oob_score=False, n_jobs=1,
#                        random_state=None, verbose=0, warm_start=False,
#                        class_weight=None)

# In[514]:

# Why does this decision path look like it has more features than 5?
print(eeg_rf.score(test_x, test_y) * 100)
eeg_rf.decision_path(test_x)

# ## Ensemble Classifiers

# In[64]:

# Voting Classifier ... accuracy lower than RandomForest (with 500 trees) ... possible?
voting = VotingClassifier(estimators=[('lr', lr_eeg), ('rf', eeg_rf), ('gnb', clf)],
                          voting='hard')
voting = voting.fit(train_x_masked, train_y_masked)
print("Confusion Matrix ")
print("Classif:   0    1")
print(confusion_matrix(y_true=test_y_masked, y_pred=voting.predict(test_x_masked)))
print()
print("Voting Classifier Accuracy: ")
print((1 - (902 + 289) / (666 + 902 + 289 + 1346)) * 100)
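# Answer to the question asked above: decision_path returns a node-indicator
# matrix of shape (n_samples, total number of nodes across all trees), so its
# width reflects the forest's node count, not the number of input features.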
# To set the working directory
os.chdir("/Users/steven/Documents/dataMining/Kaggle/digitRecognizer")
cwd = os.getcwd()

# get digit train & test csv files as a DataFrame
digit_train_df = pd.read_csv("input/train.csv")
digit_test_df = pd.read_csv("input/test.csv")

# preview the data
digit_train_df.head()
digit_test_df.head()

# defining the training data set
X_train = digit_train_df.drop("label", axis=1)
Y_train = digit_train_df["label"]
X_test = digit_test_df

random_forest = RandomForestClassifier(n_estimators=10)
random_forest.fit(X_train, Y_train)
random_forest.decision_path(X_train)
Y_pred = random_forest.predict(X_test)
random_forest.score(X_train, Y_train)

submission = pd.DataFrame({"ImageId": X_test.index + 1, "Label": Y_pred})
submission.to_csv('RF_10.csv', index=False)