def TreeTest():
    """Fit a depth-2 ``TreeOptimal`` on each fold and print the errors.

    The spam data is partitioned into ``k = 10`` folds.  For every fold but
    the last, a tree is fitted on that fold and scored on the same fold;
    the mean of those errors is printed as the average training error.
    The last fold is processed the same way and its error is printed as
    the testing error.
    """
    spamDat = spamData()
    k = 10
    all_folds = hw3.partition_folds(spamDat, k)

    err_in_fold = []
    for fold_index in range(len(all_folds) - 1):
        truth, f_data = decTree.split_truth_from_data(all_folds[fold_index])
        tree = decTree.TreeOptimal(max_depth=2)
        tree.fit(f_data, truth)
        print('Prediction...\n')
        predict = tree.predict(f_data)
        print(predict)
        print(truth)
        error = 1. - hw3.get_accuracy(predict, truth)
        err_in_fold.append(error)
        print('Tree error is: {}'.format(error))

    # NOTE(review): the "testing" tree is fitted on the very fold it is
    # scored on, so this is not a held-out estimate -- confirm the intent.
    truth, f_data = decTree.split_truth_from_data(all_folds[k - 1])
    tree = decTree.TreeOptimal(max_depth=2)
    tree.fit(f_data, truth)
    error = 1. - hw3.get_accuracy(tree.predict(f_data), truth)

    average_training_error = float(sum(err_in_fold)) / len(err_in_fold)
    print('Average training error: {}\nAverage testing error: {}'.format(
        average_training_error, error))
def d_tree():
    """Train a decision tree on the crime data and print its scores.

    Columns 0-7 and 9 of the crime data are used as features and column 8
    as the target.  The function prints train/test accuracy, a 10-fold
    cross-validation summary computed on the test split, and the result of
    a two-sample t-test between predictions and true test labels.
    """
    from scipy.stats import ttest_ind

    # ``DataFrame.as_matrix()`` no longer exists in current pandas;
    # ``.values`` yields the same ndarray on old and new versions.
    data = get_crime_data().values
    X = data[:, [0, 1, 2, 3, 4, 5, 6, 7, 9]]
    y = data[:, 8]
    xTr, xTe, yTr, yTe = train_test_split(
        X, y, test_size=0.4, random_state=0)

    dt = DecisionTreeClassifier(min_samples_split=20, random_state=99)
    tree = dt.fit(xTr, yTr)
    preds = tree.predict(xTe)
    print("Train Accuracy :: ", accuracy_score(yTr, tree.predict(xTr)))
    print("Test Accuracy :: ", accuracy_score(yTe, preds))

    print("-- 10-fold cross-validation --")
    cv_dt = cross_val_score(dt, xTe, yTe, cv=10)
    print("mean: {:.3f} (std: {:.3f})".format(cv_dt.mean(), cv_dt.std()))
    # Output recorded by the original author:
    #   Train Accuracy :: 0.9437
    #   Test Accuracy :: 0.87695
    #   -- 10-fold cross-validation --
    #   mean: 0.876 (std: 0.007)

    # ---- STAT TEST ----
    # NOTE(review): a t-test between predicted and true labels only compares
    # their means; confirm this is the intended quality criterion.
    value, pvalue = ttest_ind(preds, yTe, equal_var=True)
    print(value, pvalue)
    if pvalue >= 0.05:
        print('Dtree is a good predictor for classification')
    else:
        print('Dtree is a bad predictor for classification')
def testRunWithIrisData(self):
    """Fit a depth-2 tree on iris, export it as DOT and print two predictions."""
    # Load data and store it into pandas DataFrame objects
    iris = load_iris()
    X = pd.DataFrame(iris.data[:, :], columns=iris.feature_names[:])
    y = pd.DataFrame(iris.target, columns=["Species"])

    # Defining and fitting a DecisionTreeClassifier instance
    tree = DecisionTreeClassifier(max_depth=2)
    tree.fit(X, y)

    # Write the fitted tree to ../output/IrisOutput_DT.dot
    export_graphviz(tree,
                    out_file="../output/IrisOutput_DT.dot",
                    feature_names=list(X.columns),
                    class_names=iris.target_names,
                    filled=True,
                    rounded=True)

    # ``predict`` returns an array; take element 0 explicitly instead of
    # calling ``int()`` on the whole array (deprecated in recent NumPy).
    sample_one_pred = int(tree.predict([[5, 5, 1, 3]])[0])
    sample_two_pred = int(tree.predict([[5, 5, 2.6, 1.5]])[0])
    print(
        f"The first sample most likely belongs a {iris.target_names[sample_one_pred]} flower."
    )
    print(
        f"The second sample most likely belongs a {iris.target_names[sample_two_pred]} flower."
    )
def landmark_decision_tree(X, y):  # pylint: disable=C0103
    """Return the mean 10-fold accuracy of a default decision tree.

    Sparse ``X`` is not supported and yields ``nan``.  For a single-output
    ``y`` the folds are stratified and a plain tree is used; otherwise a
    plain K-fold and a one-vs-rest wrapper are used.  Any exception is
    logged as a warning and ``0.`` is returned instead.
    """
    n_splits = 10
    try:
        if scipy.sparse.issparse(X):
            # ``np.NaN`` was removed in NumPy 2.0; ``np.nan`` works everywhere.
            return np.nan

        # Import every submodule used below explicitly: a missing attribute
        # would otherwise be swallowed by the broad ``except`` and silently
        # turn the statistic into 0.
        import sklearn.metrics
        import sklearn.model_selection
        import sklearn.tree
        import sklearn.utils

        # pylint: disable=C0103
        single_output = len(y.shape) == 1 or y.shape[1] == 1
        if single_output:
            kf = sklearn.model_selection.StratifiedKFold(n_splits=n_splits)
        else:
            kf = sklearn.model_selection.KFold(n_splits=n_splits)

        accuracy = 0.
        for train, test in kf.split(X, y):
            random_state = sklearn.utils.check_random_state(42)
            tree = sklearn.tree.DecisionTreeClassifier(
                random_state=random_state)
            if not single_output:
                tree = OneVsRestClassifier(tree)
            tree.fit(X[train], y[train])

            predictions = tree.predict(X[test])
            accuracy += sklearn.metrics.accuracy_score(
                predictions, y[test])
        return accuracy / n_splits
    except Exception as ex:  # pylint: disable=W0703
        # Adjacent literals keep the message free of the indentation that a
        # backslash continuation inside the string would embed.
        automl_log(
            "Landmark Decision Tree could not be computed. Returning 0 "
            "instead. Originally failed with exception '{ex}'".format(ex=ex),
            'WARNING')
        return 0.
def process_query(tree, query):
    """Predict the target of ``query``, print it and plot the query.

    When the prediction differs from ``query[TARGET]`` the module-level
    counter ``classification_error_samples`` is incremented.
    """
    # Without this declaration the ``+=`` below makes the name local and
    # raises UnboundLocalError on the first misclassification.
    global classification_error_samples
    target_value = tree.predict(query)
    print_query(query)
    print("Predicted Target Value:", target_value, '\n')
    if target_value != query[TARGET]:
        classification_error_samples += 1
    # NOTE(review): plotted for every query, not only misclassified ones --
    # confirm against the original layout.
    plot_categorical_decision_tree(query)
def get_best_tree(model, X, keep_scores=False):
    """
    Given a model of ensembled trees with an `estimators_` attribute, finds
    the tree whose predictions on `X` most closely resemble the predictions
    of the whole ensemble.

    Each tree is scored with its own `score(X, ensemble_prediction)`; the
    first tree with the highest score wins.

    Parameters
    ----------
    model
        Fitted ensemble exposing `predict` and `estimators_`.
    X
        Samples on which tree and ensemble are compared.
    keep_scores
        When true, also return the per-tree scores.

    Returns
    -------
    (best_tree_number, nearest_tree) or, with `keep_scores`,
    (best_tree_number, nearest_tree, scores) where `scores` maps tree
    number to score.

    Raises
    ------
    ValueError
        If no estimator produced a score greater than negative infinity
        (for example when `estimators_` is empty).
    """
    overall_prediction = model.predict(X)
    scores = dict()
    # Start from -inf rather than a magic number so arbitrarily low scores
    # (e.g. strongly negative R^2) are still ranked correctly.
    best_score, best_tree_number = float("-inf"), None
    for tree_num, tree in enumerate(model.estimators_):
        new_score = tree.score(X, overall_prediction)
        scores[tree_num] = new_score
        if new_score > best_score:
            best_score = new_score
            best_tree_number = tree_num
    if best_tree_number is None:
        raise ValueError("no estimator produced a comparable score")
    nearest_tree = model.estimators_[best_tree_number]
    if keep_scores:
        return best_tree_number, nearest_tree, scores
    return best_tree_number, nearest_tree
def check_accuracy(dt, dataset, test_percentage=20, num_repeats=3):
    """Estimate the accuracy of ``dt`` by repeated random hold-out.

    A deep copy of ``dataset`` is shuffled ``num_repeats`` times.  Each time
    ``test_percentage`` percent of the samples are drawn at random as a
    test set, ``dt.fit`` is called on the remainder, and the returned tree
    predicts every test sample.  A prediction counts as correct when it
    equals the sample's ``classification`` attribute.

    Returns:
        ``(average_accuracy, std_dev)`` in percent; the standard deviation
        is the population form (divides by the number of repeats).

    Raises:
        ValueError: if the split would contain no test samples or
            ``num_repeats`` is smaller than one.
    """
    training_set = copy.deepcopy(dataset)
    num_test_samples = int(len(training_set) / (100 / test_percentage))
    if num_test_samples < 1 or num_repeats < 1:
        # Either case would otherwise end in a bare ZeroDivisionError.
        raise ValueError(
            "need at least one test sample and one repeat to measure accuracy")

    accuracies = []
    for _ in range(num_repeats):
        random.shuffle(training_set)
        test_set = [
            training_set.pop(random.randrange(len(training_set)))
            for _ in range(num_test_samples)
        ]

        # Fit the dt
        tree = dt.fit(training_set)
        same = sum(
            1 for sample in test_set
            if tree.predict(sample) == sample.classification)
        accuracies.append((same / len(test_set)) * 100)

        # Hand the test samples back so the next repeat sees the full set.
        training_set.extend(test_set)

    average_accuracy = sum(accuracies) / len(accuracies)
    squared_deviation = 0
    for accuracy in accuracies:
        squared_deviation += ((accuracy - average_accuracy)**2)
    std_dev = math.sqrt(1 / len(accuracies) * squared_deviation)
    print("Accuracy: {}, Std dev: {}".format(average_accuracy, std_dev))
    return (average_accuracy, std_dev)
def predict(self, X):
    """
    Run the bagging ensemble on a set of samples.

    Input:
    X: pd.DataFrame with rows as samples and columns as features.  A
       column named "y", if present, is dropped before predicting.
    Output:
    y: pd.Series with one row per sample of X.  Each value is the label
       predicted most often by the trees in ``self.trees``; ties resolve
       to the smallest label.  Labels must be non-negative integers
       because the vote is counted with ``np.bincount``.
    """
    out = "y"
    if isinstance(X, pd.DataFrame) and out in X.columns:
        X = X.drop([out], axis=1)

    all_predictions = [tree.predict(X) for tree in self.trees]
    # Transpose so that each row holds every tree's vote for one sample.
    pred_arr = np.array(all_predictions).T

    # Most frequent label per sample.
    y_hat = [np.argmax(np.bincount(votes)) for votes in pred_arr]
    return pd.Series(y_hat)
def k_fold_cross_validation(folds, noisy_folds, criteria, m):
    """Cross-validate a ``FeaturesClassifier`` over the given folds.

    For each fold index, a classifier is fitted on the rows of every other
    fold taken from ``noisy_folds`` and evaluated on the matching fold of
    ``folds``.  The last element of each row is the label, the rest are
    features.

    Returns:
        ``(trees, mean_accuracy)``: the fitted tree of every round and the
        accuracy averaged over all rounds.
    """
    n_folds = len(folds)
    n_features = len(folds[0][0]) - 1
    fitted_trees = []
    accuracy_total = 0.0

    for held_out in range(n_folds):
        # Training data: every noisy fold except the held-out one.
        train_x = []
        train_y = []
        for fold_idx in range(n_folds):
            if fold_idx != held_out:
                train_x += [row[:-1] for row in noisy_folds[fold_idx]]
                train_y += [row[-1] for row in noisy_folds[fold_idx]]

        classifier = FeaturesClassifier(criteria, m)
        tree = classifier.fit(train_x, train_y, list(range(n_features)))

        # Evaluation data: the clean version of the held-out fold.
        test_x = [row[:-1] for row in folds[held_out]]
        test_y = [row[-1] for row in folds[held_out]]
        results = tree.predict(test_x)
        hits = sum(
            1 for pos in range(len(results)) if results[pos] == test_y[pos])
        accuracy_total += hits / float(len(results))
        fitted_trees.append(tree)

    accuracy_total /= float(len(folds))
    return fitted_trees, accuracy_total
def test_errors(treeName, tree, merged):
    """Compute relative prediction errors and save their histogram.

    Each row of ``merged`` is read as: columns 1-13 are the inputs passed
    to ``tree.predict`` and column 14 is the actual value.  The error of a
    row is ``(actual - prediction) / actual``; an actual value of exactly
    zero is nudged to 1e-5 before dividing.  Min/max absolute error and
    the median error are printed, and a 100-bin histogram is written to
    ``<treeName>/<treeName>ErrorHistogram.png``.

    Returns:
        The list of per-row errors.
    """
    import os

    errors = []
    for r in np.array(merged):
        prediction = tree.predict(list(r[1:14]))
        actual = r[14]
        error = actual - prediction
        if actual == 0.0:
            # Avoid dividing by zero; the numerator keeps the true value.
            actual += 0.00001
        error /= actual
        errors.append(error)

    errorsArray = np.array(errors)
    abs_errors = abs(errorsArray)
    min_error = np.min(abs_errors)
    max_error = np.max(abs_errors)
    median_error = np.median(errorsArray)
    print("Min error: " + str(min_error))
    print("Max error: " + str(max_error))
    print("Median error: " + str(median_error))

    figure = plt.figure()
    plt.suptitle("Histogram of Error: " + treeName + "\nMinError: " +
                 str(round(min_error, 4)) + ", Max error: " +
                 str(round(max_error, 4)) + ", Median error: " +
                 str(round(median_error, 4)))
    plt.hist(errorsArray, bins=100)
    # os.path.join gives the same path as the hard-coded "\\" on Windows
    # and a real sub-directory path everywhere else.
    fileName = os.path.join(treeName, treeName + "ErrorHistogram.png")
    plt.savefig(fileName)
    # Release the figure; pyplot keeps every open figure alive otherwise.
    plt.close(figure)
    return errors
def predict(self, df):
    """
    Predict the expected effect for every row of ``df``.

    The columns in ``self.index_cols`` (minus ``'new_index'``), the
    outcome columns in ``self.y_var`` and, unless ``self.use_w_in_tree``
    is set, the columns in ``self.w_var`` are moved into the index so the
    trees only see the remaining columns.

    Returns a DataFrame indexed by those columns with one
    ``pred_tree_<i>`` column per tree in ``self.model`` and a
    ``prediction`` column holding their row-wise mean.

    Raises ``Exception`` if the instance has no ``model`` attribute.
    """
    if not hasattr(self, 'model'):
        raise Exception('Model not fitted.')

    # removing 'new_index' from index_cols if necessary
    index_cols = self.index_cols.copy()
    if 'new_index' in index_cols:
        index_cols.remove('new_index')

    # removing w from df, if necessary
    if not self.use_w_in_tree:
        index_cols += [col for col in self.w_var if col in df.columns]

    # removing y from df, if necessary
    index_cols += [col for col in self.y_var if col in df.columns]

    # The feature frame is the same for every tree, so build it once.
    features = df.set_index(index_cols)
    preds = features[[]].copy()

    for index, tree in enumerate(self.model):
        if self.algorithm == 'propensity':
            # Map each sample's leaf id to that tree's propensity score.
            leaves = pd.Series(tree.apply(features))
            preds[f'pred_tree_{index}'] = leaves.map(
                self.propensity_score[index]).tolist()
        else:
            preds[f'pred_tree_{index}'] = tree.predict(features)

    preds['prediction'] = preds.mean(axis=1)
    return preds
def suggest(session_id, results):
    """Return the 1-based key of the first result the session's tree accepts.

    ``results`` is read for the keys ``1 .. len(results)``; each entry
    contributes the row ``[reviews, rating]``.  The tree stored for
    ``session_id`` in the module-level ``all_trees`` predicts all rows and
    the position of the first prediction equal to 1 is returned.  When the
    session has no tree, or no prediction equals 1, the default ``1`` is
    returned.
    """
    if session_id not in all_trees:
        print("---------- 1 suggesting default")
        return 1

    tree = all_trees[session_id]
    data = [
        [results[key]["reviews"], results[key]["rating"]]
        for key in range(1, len(results) + 1)
    ]
    predictions = tree.predict(data)
    for i, prediction in enumerate(predictions, start=1):
        print(prediction)
        if prediction == 1:
            print("suggesting", i)
            return i
    print("-------- suggesting default")
    return 1
def _calculate(self, X, y, logger, categorical):
    """Return the mean 5-fold accuracy of a default decision tree on (X, y).

    Binary and multiclass targets use stratified folds; every other target
    type uses a plain K-fold.  A target with more than one column is
    fitted through a one-vs-rest wrapper.  ``X`` and ``y`` may be pandas
    objects (sliced with ``.iloc``) or anything indexable by an index
    array.
    """
    import sklearn.metrics
    import sklearn.model_selection
    import sklearn.tree
    import sklearn.utils
    import sklearn.utils.multiclass

    n_splits = 5

    def _take(data, indices):
        # pandas objects need positional ``.iloc``; arrays index directly.
        return data.iloc[indices] if hasattr(data, 'iloc') else data[indices]

    # The previous check compared ``type(y)`` (a class) with strings, which
    # is never true, so stratification was silently skipped.
    target_type = sklearn.utils.multiclass.type_of_target(y)
    if target_type in ('binary', 'multiclass'):
        kf = sklearn.model_selection.StratifiedKFold(n_splits=n_splits)
    else:
        kf = sklearn.model_selection.KFold(n_splits=n_splits)

    single_output = len(y.shape) == 1 or y.shape[1] == 1
    accuracy = 0.
    for train, test in kf.split(X, y):
        random_state = sklearn.utils.check_random_state(42)
        tree = sklearn.tree.DecisionTreeClassifier(
            random_state=random_state)
        if not single_output:
            tree = OneVsRestClassifier(tree)
        tree.fit(_take(X, train), _take(y, train))

        predictions = tree.predict(_take(X, test))
        accuracy += sklearn.metrics.accuracy_score(
            predictions, _take(y, test))
    return accuracy / n_splits
def get_track(genre, tree):
    """Return an embed URL for a playlist track classified as ``genre``.

    One of five playlist names is chosen at random and looked up with
    ``search``; its tracks are then visited in random order.  For each
    track the audio features are fetched and passed to ``tree.predict``;
    the first track whose prediction equals ``str(genre)`` is returned as
    an ``open.spotify.com/embed`` URL.

    Returns:
        The embed URL, or ``None`` when no track of the playlist matches.
    """
    genre = str(genre)
    pl_list = [
        'new music friday', 'singled out', 'new music friday uk',
        'new music friday canada', 'new music friday au'
    ]
    rand_int = random.randint(0, 4)
    print(rand_int)
    print(f"the playlist: {pl_list[rand_int]}")
    # returns id of first search result in a string
    playlist_id = search(pl_list[rand_int], 'playlist')
    # array of song data
    playlist_data = get(f'v1/playlists/{playlist_id}/tracks')

    # Visit every returned track exactly once, in random order, instead of
    # assuming the playlist has 99 entries.
    candidates = list(range(len(playlist_data['items'])))
    random.shuffle(candidates)
    for n in candidates:
        print(n)
        track_id = None
        try:
            track_id = playlist_data['items'][n]['track']['id']
            track_data = get(f'v1/audio-features/{track_id}')
            track_features = [
                track_data['speechiness'],
                track_data['instrumentalness'],
                track_data['key'],
                track_data['time_signature'],
                track_data['acousticness'],
                track_data['danceability'],
                track_data['energy'],
                track_data['loudness'],
                track_data['tempo'],
            ]
            result = tree.predict([track_features])
        except Exception:
            # Best effort: a track with missing data or a failed request is
            # skipped.  Unlike a bare ``except`` this lets KeyboardInterrupt
            # and SystemExit through.
            result = 'invalid'
        print(result, track_id)
        if result == genre:
            print("hello")
            return f'https://open.spotify.com/embed/track/{track_id}'
    return None
def calc_dna_dist_mat(
    model: t.Union[sklearn.ensemble.RandomForestClassifier,
                   sklearn.ensemble.RandomForestRegressor],
    X: np.ndarray,
) -> t.Tuple[np.ndarray, str, float]:
    """Calculate DNA distance matrix between trees.

    The "DNA" of a tree is its vector of predictions on ``X``.  The
    function returns the condensed pairwise distance vector between all
    trees, a textual description of the distance formula, and the largest
    value that distance can take.
    """
    inst_num = X.shape[0]

    # Store predictions as floats: allocating with ``X.dtype`` would
    # truncate them whenever X is integer-typed or lower precision.
    dna = np.zeros((model.n_estimators, inst_num), dtype=float)

    for tree_ind, tree in enumerate(model.estimators_):
        dna[tree_ind, :] = tree.predict(X)

    if isinstance(model, sklearn.ensemble.RandomForestClassifier):
        # Shift Cohen's Kappa to prevent negative values, and also transform
        # it to a distance measure (i.e., the higher is the correlation, the
        # smaller will be the dna_dists value.)
        # Note: this distance measure is in [0, 2], with 0 being 'totally
        # equal' and 2 being 'totally distinct.'
        dna_dists = 1.0 - scipy.spatial.distance.pdist(
            X=dna, metric=sklearn.metrics.cohen_kappa_score)
        dist_formula = "1 - Cohen_kappa(x)"
        max_limit = 2.0
    else:
        # Min-max scale every instance (column) across trees; the epsilon
        # guards against columns on which all trees agree.
        dna_min, dna_max = np.quantile(dna, (0, 1), axis=0)
        dna = (dna - dna_min) / (1e-8 + dna_max - dna_min)
        dna_dists = scipy.spatial.distance.pdist(X=dna, metric="euclidean")
        dist_formula = "Euclidean_dist(x)"
        max_limit = inst_num**0.5

    return dna_dists, dist_formula, max_limit
def predict(self, input_data):
    """Return the class that receives the most votes from the forest.

    Every tree in ``self.random_forest`` predicts ``input_data`` once and
    the predictions are tallied.  When several classes share the highest
    vote count, the one with the smallest value is returned, which matches
    scikit-learn's tie-breaking (e.g. votes ``0: 5, 3: 5`` yield ``0``
    regardless of the order in which the trees voted).
    """
    votes = {}
    for member in self.random_forest:
        label = member.predict(input_data)
        votes[label] = votes.get(label, 0) + 1

    top_count = max(votes.values())
    # Among the tied leaders, prefer the lowest class value.
    leaders = [label for label, count in votes.items() if count == top_count]
    return min(leaders)
def process_query(tree, query, index):
    """Predict, print and plot one query; return the predicted target.

    The prediction is concatenated onto a string for printing, so
    ``tree.predict`` must return a ``str`` here.
    """
    global classification_error_samples
    predicted = tree.predict(query)
    print_query(query, index)
    headline = "Predicted Target Value : " + predicted
    print(headline, '\n')
    plot_categorical_decision_tree(index, query)
    return predicted
def LandmarkDecisionTree(X, y, categorical):
    """Return the mean 10-fold accuracy of a default decision tree.

    Sparse ``X`` is not supported and yields ``nan``.  For a single-output
    ``y`` the folds are stratified and a plain tree is used; otherwise a
    plain K-fold and a one-vs-rest wrapper are used.  ``categorical`` is
    accepted but not used.
    """
    if sps.issparse(X):
        # ``np.NaN`` was removed in NumPy 2.0; ``np.nan`` works everywhere.
        return np.nan

    # Import every submodule used below explicitly rather than relying on
    # ``sklearn.tree`` pulling them in as a side effect.
    import sklearn.metrics
    import sklearn.model_selection
    import sklearn.tree
    import sklearn.utils

    n_splits = 10
    single_output = len(y.shape) == 1 or y.shape[1] == 1
    if single_output:
        kf = sklearn.model_selection.StratifiedKFold(n_splits=n_splits)
    else:
        kf = sklearn.model_selection.KFold(n_splits=n_splits)

    accuracy = 0.
    for train, test in kf.split(X, y):
        random_state = sklearn.utils.check_random_state(42)
        tree = sklearn.tree.DecisionTreeClassifier(
            random_state=random_state)
        if not single_output:
            tree = OneVsRestClassifier(tree)
        tree.fit(X[train], y[train])

        predictions = tree.predict(X[test])
        accuracy += sklearn.metrics.accuracy_score(predictions, y[test])
    return accuracy / n_splits
def kfold_cross_validation(df, tree, folds_num, features, target):
    """Evaluate ``tree`` with shuffled k-fold cross-validation.

    ``df[features]`` is split into ``folds_num`` folds.  For every fold the
    estimator is refitted on the remaining rows and scored on the fold with
    accuracy and with weighted precision, recall and F1.

    Returns:
        ``[mean accuracy, mean precision, mean recall, mean f1]``.
    """
    kf = KFold(n_splits=folds_num, shuffle=True)
    attributes = df[features]
    labels = df[target]

    accuracy = []
    precision = []
    recall = []
    f1score = []

    # Consume one split generator to the end.  Creating a fresh generator
    # per round and taking its first item re-shuffles every time, so the
    # test folds overlap instead of partitioning the data.
    for train_idx, test_idx in kf.split(attributes):
        x_train = attributes.iloc[train_idx]
        x_test = attributes.iloc[test_idx]
        y_train = labels.iloc[train_idx]
        y_test = labels.iloc[test_idx]

        tree.fit(x_train, y_train)
        y_pred = tree.predict(x_test)

        accuracy.append(accuracy_score(y_test, y_pred))
        # labels=np.unique(y_pred) can be added to calculate the measure
        # only for the labels that have predicted samples
        precision.append(precision_score(y_test, y_pred, average="weighted"))
        recall.append(recall_score(y_test, y_pred, average="weighted"))
        f1score.append(f1_score(y_test, y_pred, average="weighted"))

    scores = [np.mean(accuracy), np.mean(precision),
              np.mean(recall), np.mean(f1score)]
    return scores
def predictAndCreateDataFrame(tree, testFeatures, testData):
    """Predict survival, save it to ``solution.csv`` and return the frame.

    The returned DataFrame has a single ``Survived`` column holding
    ``tree.predict(testFeatures)`` and is indexed by the integer
    ``PassengerId`` values of ``testData``.  The CSV is written to the
    current working directory with ``PassengerId`` as the index label.
    """
    import numpy as np

    survived = tree.predict(testFeatures)
    passenger_ids = np.array(testData['PassengerId']).astype(int)
    solution = pd.DataFrame(
        data=survived, index=passenger_ids, columns=['Survived'])
    solution.to_csv('solution.csv', index_label=['PassengerId'])
    return solution
def oob_regression_mse_score(rf, X_train, y_train):
    """
    Compute out-of-bag (OOB) MSE for a scikit-learn random forest regressor.

    Every tree predicts only the samples reported as unsampled for its
    ``random_state``; the per-sample predictions are averaged over the
    trees that covered the sample and compared with ``y_train``.  Samples
    that no tree left out trigger a warning and keep a prediction of 0.

    We learned the guts of scikit's RF from the BSD licensed code:
    https://github.com/scikit-learn/scikit-learn/blob/a24c8b46/sklearn/ensemble/forest.py#L702
    """
    X = X_train.values if isinstance(X_train, pd.DataFrame) else X_train
    y = y_train.values if isinstance(y_train, pd.Series) else y_train

    n_samples = len(X)
    oob_sum = np.zeros(n_samples)
    oob_count = np.zeros(n_samples)
    for estimator in rf.estimators_:
        left_out = _generate_unsampled_indices(
            estimator.random_state, n_samples)
        oob_sum[left_out] += estimator.predict(X[left_out, :])
        oob_count[left_out] += 1

    never_left_out = oob_count == 0
    if never_left_out.any():
        warnings.warn("Too few trees; some variables do not have OOB scores.")
        # Avoid 0/0 below; the accumulated prediction for these stays 0.
        oob_count[never_left_out] = 1

    oob_sum /= oob_count
    return mean_squared_error(y, oob_sum)
def plot_rf_mae(rf, X_test, y_test):
    """Plot MAE for all trees in a RandomForestRegressor sklearn object.

    One bar is drawn per tree of ``rf.estimators_`` and the MAE of the
    whole forest is overlaid as a dashed horizontal line.

    :param rf: trained RandomForestRegressor
    :param X_test: X test DataFrame
    :param y_test: y test DataFrame
    """
    estimators = rf.estimators_
    mae_trees = []
    for estimator in estimators:
        mae_trees.append(
            mean_absolute_error(estimator.predict(X_test), y_test))
    index_trees = np.arange(len(estimators))
    mae_ens = mean_absolute_error(rf.predict(X_test), y_test)

    # Bars for the single trees, dashed line for the ensemble.
    plt.figure(figsize=FIGSIZE)
    plt.bar(x=index_trees, height=mae_trees, color="cornflowerblue")
    plt.ylim(0, 10)
    plt.axhline(
        mae_ens,
        color="tomato",
        linewidth=3,
        linestyle="dashed",
        label="Random Forest MAE",
    )
    plt.xticks(index_trees)
    plt.xlabel("Single Decision Tree")
    plt.ylabel("MAE")
    plt.legend()
def Tree_graph(X_train, y_train, X_test, y_test, file_out, name,
               X_dev=None, y_dev=None,
               max_depth=5):
    """Plot test recall and accuracy of trees of depth 1..``max_depth``.

    For every depth a tree is obtained from ``model_tree_fit`` and
    evaluated on the test set.  The per-class recalls at positions 0 and 1
    and the accuracy are plotted against depth and the figure is saved as
    ``Graphs/Tree_for_depth_<max_depth><name>.png``.
    """
    x_axis = np.linspace(1, max_depth, num=max_depth)
    plt.figure(figsize=(10, 7))
    y_acc = np.zeros(x_axis.shape)
    y_rec_pos = np.zeros(x_axis.shape)
    y_rec_neg = np.zeros(x_axis.shape)

    for n in range(max_depth):
        print(n + 1)
        tree = model_tree_fit(X_train, y_train, X_dev, y_dev, X_test, y_test,
                              file_out,
                              max_depth=n + 1,
                              out=False)
        # Predict once and reuse the result for both metrics.
        y_pred = tree.predict(X_test)
        # Unpack directly instead of binding a local named ``all``, which
        # would shadow the builtin.
        __, rec_all, __, __ = precision_recall_fscore_support(y_test, y_pred)
        y_rec_pos[n] = rec_all[0]
        y_rec_neg[n] = rec_all[1]
        y_acc[n] = accuracy_score(y_test, y_pred)
    print('finished')

    plt.plot(x_axis, y_rec_pos, color='b', lw=3, alpha=0.7,
             label='Recall_positive')
    plt.plot(x_axis, y_rec_neg, color='g', lw=3, alpha=0.7,
             label='Recall_negative')
    plt.plot(x_axis, y_acc, color='r', lw=3, alpha=0.7, label='Accuracy')
    plt.title('Tree')
    plt.xlabel('depth')
    plt.ylabel('Metric, %')
    plt.legend(loc='upper right')
    plt.grid(True)
    path = 'Graphs/Tree_for_'
    plt.savefig(path + 'depth_' + str(max_depth) + name + '.png')
def fit(self, X, y):
    '''
    Trains the boosted model.

    Arguments:
        X is a n-by-d numpy array
        y is an n-dimensional numpy array

    Class labels are encoded as 0..k-1 for training; ``self.class_map``
    maps each encoded index back to the original label.  Every boosting
    round fits a weighted tree, stores it in ``self.tree_array`` and its
    weight in ``self.beta_array``.  ``y`` itself is left untouched.
    '''
    n = len(X)

    # Encode the labels on a separate array rather than rewriting the
    # caller's ``y`` in place and restoring it afterwards.
    classes, y_idx = np.unique(y, return_inverse=True)
    self.k = len(classes)
    for index, label in enumerate(classes):
        self.class_map[index] = label

    w = np.ones(n) / n

    for _ in range(self.numBoostingIters):
        # fit based on weights
        tree = DecisionTreeClassifier(max_depth=self.maxTreeDepth)
        tree.fit(X, y_idx, sample_weight=w)
        self.tree_array.append(tree)

        # weighted training error: total weight of misclassified samples
        wrong = tree.predict(X) != y_idx
        epsilon = w[wrong].sum()

        # early stopping
        # NOTE(review): the perfect tree has been stored but gets no beta,
        # so tree_array ends up one longer than beta_array -- confirm that
        # the prediction code expects this.
        if epsilon == 0:
            break

        # calculate beta
        beta = 0.5 * np.log((1 - epsilon) / epsilon) + np.log(self.k - 1)
        self.beta_array.append(beta)

        # raise the weight of misclassified samples, lower the others
        w = np.where(wrong, w * np.exp(beta), w * np.exp(-1 * beta))

        # normalize weight vector
        w = w / np.sum(w)
def evaluate_regressor(tree, X, Y):
    """
    Score a fitted regressor on ``(X, Y)``.

    Returns ``(r2, mse)`` where ``r2`` is whatever ``tree.score`` reports
    and ``mse`` is the mean squared difference between ``tree.predict(X)``
    and ``Y.values``.
    """
    r2 = tree.score(X, Y)
    residuals = tree.predict(X) - Y.values
    mse = np.mean(residuals ** 2)
    return r2, mse
def calc_accuracy(tree, test_dataset):
    """Return the fraction of rows whose ``'y'`` value the tree predicts.

    Every row is predicted on its own: ``tree.predict`` receives a
    one-element list holding the row's non-``'y'`` columns, and the result
    is compared with the row's ``'y'`` value.
    """
    is_target = test_dataset.columns == 'y'
    features = test_dataset.loc[:, ~is_target]
    targets = test_dataset.loc[:, is_target]

    true_predicted = 0
    # Walk the rows by position: label-based ``.loc[i]`` only works while
    # the index happens to be 0..n-1, which a train/test split breaks.
    for position in range(len(test_dataset.index)):
        predicted = tree.predict([features.iloc[position]])
        if targets.iloc[position].values == predicted:
            true_predicted += 1
    return true_predicted / len(test_dataset.index)
def makeAcuracy(tree, x_test, y_test):
    """Return the share of ``tree.predict(x_test)`` values equal to ``y_test``.

    Predictions and expected values are compared pairwise in order.
    """
    predictions = tree.predict(x_test)
    # Pair the values by position: ``y_test[x]`` would be a label lookup on
    # a pandas Series and fail or mismatch when its index is not 0..n-1.
    erro = 0.0
    for predicted, expected in zip(predictions, y_test):
        if predicted != expected:
            erro += 1.
    acuracy = (1 - (erro / len(predictions)))
    return acuracy
def plot_n_predictions_rf(rf, X_test, N=10):
    """Box-plot the per-tree predictions for ``N`` sampled test rows.

    ``N`` rows are drawn from ``X_test`` with a fixed seed (42).  Every
    tree of ``rf.estimators_`` predicts them, giving one box per sampled
    row, labelled ``#<index>``.
    """
    sample = X_test.sample(N, random_state=42)
    per_tree_rows = [
        estimator.predict(sample).tolist() for estimator in rf.estimators_
    ]
    column_labels = ["#{}".format(row_label) for row_label in sample.index]
    predictions = pd.DataFrame(per_tree_rows, columns=column_labels)

    plt.figure(figsize=(13, 7))
    sns.boxplot(data=predictions)
    plt.xlabel("Index of sample")
    plt.ylabel("Prediction")
def predict(self, X):
    """Return, for every sample, the most common prediction of the trees.

    The predictions of all trees in ``self.decision_trees`` are stacked
    row-wise (one row per tree) and the mode is taken down each column.
    """
    predictions = [tree.predict(X) for tree in self.decision_trees]
    total_pred = np.vstack(predictions)
    column_modes = stats.mode(total_pred, axis=0).mode
    # Depending on the SciPy version ``mode`` comes back as (1, n) or (n,).
    # Normalising to 2-D first makes ``[0]`` the per-sample vector in both
    # cases instead of a single scalar on newer releases.
    mode_prediction = np.atleast_2d(column_modes)[0]
    return mode_prediction
def calc_prediction_result_success(tree, dataset):
    """Return ``1`` for every correctly predicted row and ``-1`` otherwise.

    Every row is predicted on its own: ``tree.predict`` receives a
    one-element list holding the row's non-``'class'`` columns, and the
    result is compared with the row's ``'class'`` value.
    """
    is_target = dataset.columns == 'class'
    features = dataset.loc[:, ~is_target]
    targets = dataset.loc[:, is_target]

    result = list()
    # Walk the rows by position: label-based ``.loc[i]`` only works while
    # the index happens to be 0..n-1.
    for position in range(len(dataset.index)):
        predicted = tree.predict([features.iloc[position]])
        if targets.iloc[position].values != predicted:
            result.append(-1)
        else:
            result.append(1)
    return result
def predictions(tree, path):
    """Predict a label for every ``*.html`` file found in ``path``.

    The process working directory is changed to ``path``.  Each HTML file
    is parsed, reduced to text with ``get_text``, and the texts are turned
    into a bag-of-words matrix over their own vocabulary before being
    passed to ``tree.predict``.

    Returns:
        Whatever ``tree.predict`` returns for that matrix, in the order in
        which ``glob`` listed the files.
    """
    # NOTE(review): this changes the working directory for the whole
    # process and does not restore it; kept because callers may rely on it.
    os.chdir(path)
    allsentences = []
    for html_file in glob.glob("*.html"):
        # Close each file as soon as it is parsed instead of leaving the
        # handle for the garbage collector.
        with codecs.open(html_file) as handle:
            sopa = BeautifulSoup(handle, 'html.parser')
        allsentences.append(get_text(sopa))
    vocab = tokenize(allsentences)
    X_baseline = bag_of_words_from_sentences(allsentences, vocab)
    return tree.predict(X_baseline)
def err(x, y):
    """Return the error rate on ``D_data`` of a tree fitted on ``(x, y)``.

    A ``DecisionTreeClassifier`` with ``random_state=0`` is fitted on the
    arguments and evaluated on the module-level ``D_data`` / ``D_target``.
    """
    from sklearn import tree

    classifier = tree.DecisionTreeClassifier(random_state=0)
    classifier.fit(x, y)
    predicted = classifier.predict(D_data)
    misses = sum(
        1 for position, value in enumerate(predicted)
        if value != D_target[position])
    return misses / float(len(D_target))
def decision_tree(train_set, test_set, features):
    """Fit an entropy-based decision tree and report its test results.

    The tree is trained on ``train_set[features]`` against
    ``train_set['target']`` and used to predict ``test_set[features]``.
    The elapsed time for fitting plus predicting, the true test targets
    and the predictions are handed to ``report_results``.
    """
    start_time = time.time()

    # Instantiate the classifier
    tree = skl.tree.DecisionTreeClassifier(criterion="entropy")

    # Train classifier
    tree.fit(train_set[features].values, train_set['target'])

    # Predict on a plain array as well, matching what the tree was fitted
    # on (fitting on an array and predicting on a DataFrame makes recent
    # scikit-learn warn about feature names).
    y_pred = tree.predict(test_set[features].values)

    # Report results
    report_results("Decision Tree",
                   time.time() - start_time, test_set["target"], y_pred)
def _calculate(self, X, y, categorical):
    """Return the mean stratified 10-fold accuracy of a default decision tree.

    ``categorical`` is accepted but not used.
    """
    import sklearn.metrics
    import sklearn.model_selection
    import sklearn.tree
    import sklearn.utils

    n_splits = 10
    # ``sklearn.cross_validation`` no longer exists; ``model_selection``
    # provides the same splitter through ``n_splits`` and ``split``.
    kf = sklearn.model_selection.StratifiedKFold(n_splits=n_splits)
    accuracy = 0.
    for train, test in kf.split(X, y):
        random_state = sklearn.utils.check_random_state(42)
        tree = sklearn.tree.DecisionTreeClassifier(random_state=random_state)
        tree.fit(X[train], y[train])
        predictions = tree.predict(X[test])
        accuracy += sklearn.metrics.accuracy_score(predictions, y[test])
    return accuracy / n_splits
def _calculate(self, X, y, categorical):
    """Return the mean stratified 10-fold accuracy of a default decision tree.

    ``categorical`` is accepted but not used.
    """
    import sklearn.metrics
    import sklearn.model_selection
    import sklearn.tree
    import sklearn.utils

    n_splits = 10
    # ``sklearn.cross_validation`` no longer exists; ``model_selection``
    # provides the same splitter through ``n_splits`` and ``split``.
    kf = sklearn.model_selection.StratifiedKFold(n_splits=n_splits)
    accuracy = 0.0
    for train, test in kf.split(X, y):
        random_state = sklearn.utils.check_random_state(42)
        tree = sklearn.tree.DecisionTreeClassifier(random_state=random_state)
        tree.fit(X[train], y[train])
        predictions = tree.predict(X[test])
        accuracy += sklearn.metrics.accuracy_score(predictions, y[test])
    return accuracy / n_splits
def predict(self, x):
    """Return the majority vote of the forest for every sample in ``x``.

    Each tree in ``self.forest`` predicts all samples.  A sample's result
    is ``True`` when at least as many trees voted 1 as voted 0, so ties
    resolve to ``True``.

    Raises ``Exception`` if ``self.forest`` is ``None``.
    """
    # Identity check: ``== None`` would be evaluated element-wise if the
    # forest were ever stored as an array.
    if self.forest is None:
        raise Exception('Model hasn\'t been trained yet!')

    yy = [tree.predict(x) for tree in self.forest]

    y = []
    for i in range(len(x)):
        votes = [tree_predictions[i] for tree_predictions in yy]
        y.append(votes.count(1) >= votes.count(0))
    return y
def predict(self, x):
    """Return the mean prediction of the forest for every sample in ``x``.

    Each tree in ``self.forest`` predicts all samples and the predictions
    for a sample are averaged over the trees.

    Raises ``Exception`` if ``self.forest`` is ``None``.
    """
    # Identity check: ``== None`` would be evaluated element-wise if the
    # forest were ever stored as an array.
    if self.forest is None:
        raise Exception('Model hasn\'t been trained yet!')

    yy = [tree.predict(x) for tree in self.forest]

    y = []
    for i in range(len(x)):
        m = 0
        for tree_predictions in yy:
            m += tree_predictions[i]
        m /= len(yy)
        y.append(m)
    return y
def cluster_then_forest(xs, ys, in_sample_size):
    """Fit one random forest per K-means cluster and print its scores.

    The data is split into in-sample and out-of-sample parts, the
    in-sample part is clustered into four groups, and for every cluster
    that occurs out of sample a 50-tree ``RandomForestRegressor`` is
    fitted on the in-sample members of that cluster.  The in-sample and
    out-of-sample ``score`` values are printed.  Returns ``None``.
    """
    isi, in_sample, osi, out_sample = create_in_out_samples(xs, in_sample_size)
    clf = cluster.KMeans(n_clusters=4)
    clf.fit(in_sample)
    oos_clusterid = clf.predict(out_sample)
    ins_clusterid = clf.predict(in_sample)

    for cluster_id in numpy.unique(oos_clusterid):
        print("Now working on Cluster " + str(cluster_id))
        oos_ind = oos_clusterid == cluster_id
        ins_ind = ins_clusterid == cluster_id
        tree = ensemble.RandomForestRegressor(50)
        tree.fit(in_sample[ins_ind], ys[isi][ins_ind])
        print("Score for in-sample")
        print(str(tree.score(in_sample[ins_ind], ys[isi][ins_ind])))
        print("Score for out-of sample")
        # ``score`` predicts internally, so no separate ``predict`` call
        # is needed here.
        print(str(tree.score(out_sample[oos_ind], ys[osi][oos_ind])))
    return None
def using_DecisionTree():
    """Train the custom decision tree and write a submission file.

    ``data/train.csv`` and ``data/test.csv`` are loaded through
    ``regular``.  The tree is built from the training values with the
    training index as labels, every test row is predicted on its own, and
    the ``id,country`` lines are written to ``submit_myDecisionTree.txt``.
    """
    trainfile = os.path.join('data', 'train.csv')
    testFile = os.path.join('data', 'test.csv')
    train, test = regular(trainfile, testFile)

    tree = DecisionTree(maxDeep=4)
    tree.buildTree(train.values, train.index.values)

    # Collect the lines and join once instead of growing a string with
    # ``+=`` on every row.
    lines = ['id,country\n']
    for sample_id, vector in zip(test.index.values, test.values):
        predicted = tree.predict(vector)
        lines.append('{0},{1}\n'.format(sample_id, predicted))

    with open('submit_myDecisionTree.txt', 'w') as f:
        f.write(''.join(lines))
def _calculate(self, X, y, categorical):
    """Return the mean 10-fold accuracy of a default decision tree.

    A single-output ``y`` uses stratified folds and a plain tree; a
    multi-column ``y`` uses a plain K-fold and a one-vs-rest wrapper.
    ``categorical`` is accepted but not used.
    """
    import sklearn.metrics
    import sklearn.model_selection
    import sklearn.tree
    import sklearn.utils

    n_splits = 10
    single_output = len(y.shape) == 1 or y.shape[1] == 1
    # ``sklearn.cross_validation`` no longer exists; ``model_selection``
    # provides the same splitters through ``n_splits`` and ``split``.
    if single_output:
        kf = sklearn.model_selection.StratifiedKFold(n_splits=n_splits)
    else:
        kf = sklearn.model_selection.KFold(n_splits=n_splits)

    accuracy = 0.
    for train, test in kf.split(X, y):
        random_state = sklearn.utils.check_random_state(42)
        tree = sklearn.tree.DecisionTreeClassifier(random_state=random_state)
        if not single_output:
            tree = OneVsRestClassifier(tree)
        tree.fit(X[train], y[train])

        predictions = tree.predict(X[test])
        accuracy += sklearn.metrics.accuracy_score(predictions, y[test])
    return accuracy / n_splits
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import tree, datasets
try:
    import joblib
except ImportError:
    # Older scikit-learn releases bundle joblib here; newer ones dropped it.
    from sklearn.externals import joblib
from numpy import *

print ('3.- Resultados del vector de prueba')

# Load the test data: the first four columns are the features and the
# fifth column is the expected label.
p = pd.read_csv('test.csv')
X_test = p.values[:, 0:4]
y_test = p.values[:, 4]

# Load the trained model.
# NOTE: joblib.load unpickles the file; only load model files you trust.
# The name ``tree`` rebinds the ``sklearn.tree`` module imported above.
tree = joblib.load('maquinaDTC.pkl')

# Check the hit rate on the test group.
P = tree.predict(X_test)
# Explicit ``np.sum`` so the count does not depend on the star import
# replacing the builtin ``sum``.
a = np.sum(P == y_test)
b = y_test.shape[0]
print ('Porcentaje de Acierto :'+str(a)+' / ' +str(b)+ ' => ' + str(float(a)/float(b)))
print ('')
# ## Pickle the clustering model

# In[77]:

clustering_model = {"model": clf}
# Use a context manager so the pickle file is flushed and closed.
with open('../tree_model.pickle', 'wb') as model_file:
    pickle.dump(clustering_model, model_file)


# In[78]:

# Assign a cluster to every subject of the "all" and "test" data sets and
# store the assignments next to the vectorized input files.
# NOTE(review): the pickled object is ``clf`` but predictions come from
# ``tree`` -- confirm both names refer to the intended model.
for t in ['all', 'test']:
    # NOTE(review): ``error_bad_lines`` is gone from recent pandas (its
    # replacement is ``on_bad_lines='skip'``); kept as-is so that older
    # installations keep working.
    cur_data = pd.read_csv('../' + t + '_data_vectorized.csv', sep = '|', error_bad_lines=False, index_col="SubjectID")
    cur_data = cur_data[clustering_columns]
    res = pd.DataFrame(index = cur_data.index.astype(str)) # SubjectID is always str for later joins
    res['cluster'] = tree.predict(cur_data)
    print(np.bincount(res.cluster))
    print(t, res.shape)
    res.to_csv('../' + t + '_tree_clusters.csv',sep='|')


# In[ ]:

res.head()


# In[ ]:
def predict(tree, locLs, X, goodLocsFile):
    """Predict ``X`` and write one ``location<TAB>prediction`` line per item.

    ``locLs`` and the predictions are paired in order and written to
    ``goodLocsFile``, which is created or overwritten.
    """
    result = tree.predict(X)
    with open(goodLocsFile, 'w') as fout:
        for k, r in zip(locLs, result):
            # str() lets non-string predictions (e.g. numeric labels) be
            # written too; plain strings pass through unchanged.
            ls = (str(k), str(r))
            fout.write('\t'.join(ls) + '\n')
''' Decision Tree ''' binary_data = pd.get_dummies(all_census_prep) X_train, X_test, y_train, y_test = cross_validation.train_test_split(binary_data[binary_data.columns.difference(["earning_class"])], binary_data["earning_class"], train_size=0.80) scaler = preprocessing.StandardScaler() X_train = pd.DataFrame(scaler.fit_transform(X_train.astype("f64")), columns=X_train.columns) X_test = scaler.transform(X_test.astype("f64")) from sklearn.tree import DecisionTreeClassifier, export_graphviz tree = DecisionTreeClassifier(criterion='entropy',max_depth=20) tree.fit(X_train, y_train) y_pred = tree.predict(X_test) cm = metrics.confusion_matrix(y_test, y_pred) (cm[0][0]+cm[1][1]).astype('f64')/sum(sum(cm)) feature_names = list(X_train.columns) export_graphviz(tree, out_file="tree.dot",feature_names=feature_names) import pydotplus import pyparsing import StringIO dotfile = StringIO.StringIO() export_graphviz(tree, out_file=dotfile,feature_names=feature_names) graph = pydotplus.graph_from_dot_data(dotfile.getvalue()) graph.write_png("dtree2.png") '''
from sklearn import tree


def classify(Xtrain, Ytrain):
    """ Use entirety of provided X, Y to predict

    Arguments
    Xtrain -- Training data
    Ytrain -- Training prediction

    Returns
    ready_tree -- a tree fitted to Xtrain and Ytrain
    """
    ready_tree = tree.DecisionTreeClassifier()
    ready_tree.fit(Xtrain, Ytrain)
    return ready_tree


if __name__ == "__main__":
    # Let's take our training data and train a decision tree
    # on a subset. Scikit-learn provides a good module for cross-
    # validation.
    import sys

    if len(sys.argv) < 2:
        print("Usage: $ python decision-tree.py /path/to/data/file/")
    else:
        training = sys.argv[1]
        X, Y, n, f = load_data(training)
        Xt, Xv, Yt, Yv = shuffle_split(X, Y)
        # Use a distinct name: rebinding ``tree`` would replace the
        # sklearn module that ``classify`` relies on.
        fitted_tree = classify(Xt, Yt)
        print("Decision Tree Accuracy:", acc(Yv, fitted_tree.predict(Xv)), "%")