def model_complexity(X_train, y_train, X_test, y_test):
    """Measure train/test performance of decision trees of growing depth.

    One DecisionTreeRegressor is fitted on the training data for every
    ``max_depth`` in 1..24.  ``performance_metric`` is evaluated on both the
    training and the testing set and the two resulting curves are handed to
    ``model_complexity_graph``.
    """
    print("Model Complexity: ")

    # Depths 1, 2, ..., 24 (np.arange excludes the upper bound).
    depths = np.arange(1, 25)
    n_depths = len(depths)
    train_err = np.zeros(n_depths)
    test_err = np.zeros(n_depths)

    for slot, depth in enumerate(depths):
        # A fresh tree limited to the current depth, trained on the full
        # training set.
        tree = DecisionTreeRegressor(max_depth=depth)
        tree.fit(X_train, y_train)

        # Score the same tree on the data it saw and on held-out data.
        train_err[slot] = performance_metric(y_train, tree.predict(X_train))
        test_err[slot] = performance_metric(y_test, tree.predict(X_test))

    model_complexity_graph(depths, train_err, test_err)
def test_rt():
    """Compare RegressionTree with the scikit-learn tree and smoke-test explanations."""
    dataset = load_boston()
    features, target = dataset.data, dataset.target

    reference = SKRT(random_state=1, max_depth=3)
    candidate = RegressionTree(feature_names=dataset.feature_names, random_state=1)
    reference.fit(features, target)
    candidate.fit(features, target)

    # Both trees must produce the same predictions on the fitting data.
    reference_pred = reference.predict(features)
    candidate_pred = candidate.predict(features)
    assert np.allclose(reference_pred, candidate_pred)

    # Local explanations: first with labels, then without.
    for explain_args in ((features, target), (features,)):
        local_viz = candidate.explain_local(*explain_args).visualize(0)
        assert local_viz is not None

    global_viz = candidate.explain_global().visualize()
    assert global_viz is not None
def learning_curve(depth, X_train, y_train, X_test, y_test, iteration=None):
    """Measure model performance as the training set grows.

    Fifty training-set sizes between 1 and ``len(X_train)`` are tried.  For
    each size a ``DecisionTreeRegressor(max_depth=depth)`` is fitted on the
    first rows of the training data and ``performance_metric`` is recorded
    on that subset and on the full test set.  The curves are passed to
    ``learning_curve_graph``.

    Args:
        depth: ``max_depth`` of every tree.
        X_train, y_train: training features and targets (sliceable).
        X_test, y_test: testing features and targets.
        iteration: when not None, the final test error is printed and
            stored in the module-level ``fully_trained_error`` table at
            ``[depth - 1][iteration]``.
    """
    # We will vary the training set size so that we have 50 different sizes
    sizes = np.linspace(1, len(X_train), 50)
    train_err = np.zeros(len(sizes))
    test_err = np.zeros(len(sizes))

    print("Decision Tree with Max Depth: ")
    print(depth)

    for i, s in enumerate(sizes):
        # np.linspace yields floats, but slice bounds must be integers;
        # truncate explicitly (the float values are still used for plotting).
        n_samples = int(s)
        X_subset = X_train[:n_samples]
        y_subset = y_train[:n_samples]

        # Create and fit the decision tree regressor model
        regressor = DecisionTreeRegressor(max_depth=depth)
        regressor.fit(X_subset, y_subset)

        # Find the performance on the training and testing set
        train_err[i] = performance_metric(y_subset, regressor.predict(X_subset))
        test_err[i] = performance_metric(y_test, regressor.predict(X_test))

    # Plot learning curve graph
    learning_curve_graph(depth, sizes, train_err, test_err)

    # added to produce figure 2
    if iteration is not None:
        print("Final error at max_depth={}: {}".format(depth, test_err[-1]))
        fully_trained_error[depth - 1][iteration] = test_err[-1]
def test_decision_tree_regression(filename):
    """Report the mean absolute error of a decision tree over 10 random splits.

    The CSV at ``filename`` is read with pandas; every column but the last is
    a feature and the last column is the target.  In each of the 10 rounds a
    random ~40% of the rows is used for training and the rest for testing.
    The mean of the 10 mean-absolute-error scores and the elapsed wall time
    are printed (both rounded to 3 decimals).

    Args:
        filename: path of the CSV file to evaluate.
    """
    start_time = time.time()

    from sklearn.metrics import mean_absolute_error
    from sklearn.tree import DecisionTreeRegressor

    df = pd.read_csv(filename)
    h_indep = df.columns[:-1]
    h_dep = df.columns[-1]

    scores = []
    for _ in range(10):
        # Boolean row mask: True (train) with probability 0.4.
        msk = np.random.rand(len(df)) < 0.4
        train_data = df[msk]
        test_data = df[~msk]
        assert len(train_data) + len(test_data) == len(df), "Something is wrong"

        dt = DecisionTreeRegressor()
        dt.fit(train_data[h_indep], train_data[h_dep].values.tolist())
        prediction = dt.predict(test_data[h_indep])
        scores.append(mean_absolute_error(test_data[h_dep], prediction))

    print("{} {} sec".format(round(np.mean(scores), 3),
                             round(time.time() - start_time, 3)))
def learning_curve(depth, X_train, y_train, X_test, y_test):
    """Calculate the performance improvement of the model as training size increases.

    Fifty rounded, equally spaced training-set sizes between 1 and
    ``len(X_train)`` are tried.  For each size a
    ``DecisionTreeRegressor(max_depth=depth)`` is fitted on the first rows of
    the training data and ``performance_metric`` is recorded on that subset
    and on the full test set.  The curves are passed to
    ``learning_curve_graph``.

    Returns:
        The test error of the model trained on the largest size.
    """
    # create 50 equally spaced markers for the graph's X axis
    sizes = np.round(np.linspace(1, len(X_train), 50))
    # create 50 open bins to fill in the training and test errors
    train_err = np.zeros(len(sizes))
    test_err = np.zeros(len(sizes))

    print("Decision Tree with Max Depth: ")
    print(depth)

    for i, s in enumerate(sizes):
        # np.round keeps the float dtype; slice bounds must be integers.
        n_samples = int(s)
        X_subset = X_train[:n_samples]
        y_subset = y_train[:n_samples]

        # train the regressor on the current amount of training data
        regressor = DecisionTreeRegressor(max_depth=depth)
        regressor.fit(X_subset, y_subset)

        # fill in the training and test error
        train_err[i] = performance_metric(y_subset, regressor.predict(X_subset))
        test_err[i] = performance_metric(y_test, regressor.predict(X_test))

    # create the learning curve graph, using the calculated information
    learning_curve_graph(sizes, train_err, test_err)
    return test_err[-1]
def arbolesRegresion(caract):
    """Cross-validate a regression tree on selected columns and report the results.

    ``caract`` is a sequence of column indices into the module-level
    ``boston_X`` array.  For every fold produced by ``KFold`` a tree is fitted
    on those columns, MAE / MSE / R2 are accumulated, and the normalised
    feature importances are summed.  Afterwards the averaged metrics are
    printed, the importances are drawn as a horizontal bar chart, and the
    last fitted tree is exported to ``bostonTree.pdf``.

    NOTE(review): the original indentation was lost; the nesting below is
    reconstructed from the data flow and should be confirmed.
    """
    clf = DecisionTreeRegressor(min_samples_leaf=10, min_samples_split=15, max_depth=13, compute_importances=True)
    # Running sum of per-fold importances (13 fixed slots).
    importancias = [0,0,0,0,0,0,0,0,0,0,0,0,0]
    mae=mse=r2=0
    kf = KFold(len(boston_Y), n_folds=10, indices=True)
    for train, test in kf:
        trainX, testX, trainY, testY=boston_X[train], boston_X[test], boston_Y[train], boston_Y[test]
        nCar=len(caract)
        # `train` / `test` are rebound here: from fold indices to the
        # column-restricted feature matrices.
        train=np.zeros((len(trainX), nCar))
        test=np.zeros((len(testX), nCar))
        trainYNuevo=trainY
        # Copy only the selected columns, element by element.
        for i in range(nCar):
            for j in range(len(trainX)):
                train[j][i]=trainX[j][caract[i]]
            for k in range(len(testX)):
                test[k][i]=testX[k][caract[i]]
        # Targets as a column vector.
        trainYNuevo=np.reshape(trainYNuevo, (len(trainY), -1))
        clf.fit(train, trainYNuevo)
        prediccion=clf.predict(test)
        # clf.fit(trainX, trainY)
        # prediccion=clf.predict(testX)
        mae+=metrics.mean_absolute_error(testY, prediccion)
        mse+=metrics.mean_squared_error(testY, prediccion)
        r2+=metrics.r2_score(testY, prediccion)
        feature_importance = clf.feature_importances_
        # Scale so the most important feature of this fold is 100.
        feature_importance = 100.0 * (feature_importance / feature_importance.max())
        # NOTE(review): indexes 13 importances regardless of len(caract);
        # presumably only valid when 13 columns are selected - confirm.
        for i in range(13):
            importancias[i] = importancias[i] + feature_importance[i]
    print 'Error abs: ', mae/len(kf), 'Error cuadratico: ', mse/len(kf), 'R cuadrado: ', r2/len(kf)
    # Average importance over the folds (divides by the literal 10, not len(kf)).
    for i in range(13):
        importancias[i] = importancias[i]/10
    sorted_idx = np.argsort(importancias)
    pos = np.arange(sorted_idx.shape[0]) + .5
    importancias = np.reshape(importancias, (len(importancias), -1))
    boston = datasets.load_boston()
    pl.barh(pos, importancias[sorted_idx], align='center')
    pl.yticks(pos, boston.feature_names[sorted_idx])
    pl.xlabel('Importancia relativa')
    pl.show()
    # Export the tree fitted in the last fold as a PDF via graphviz.
    import StringIO, pydot
    dot_data = StringIO.StringIO()
    tree.export_graphviz(clf, out_file=dot_data)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    graph.write_pdf("bostonTree.pdf")
def nn_lin(self, testX, neighbors):
    """Predict the target for ``testX`` as the mean target of its neighbours.

    Args:
        testX: the query sample.  Accepted for interface compatibility; the
            prediction is a plain neighbour average and does not use it.
        neighbors: indices into ``self.Y`` selecting the neighbouring
            training samples.

    Returns:
        ``np.mean`` of ``self.Y[neighbors]``.
    """
    return np.mean(self.Y[neighbors])
def test_thresholded_scorers():
    """Scorers that need thresholds or probabilities must match direct metric calls."""
    roc_auc_scorer = get_scorer('roc_auc')

    # Two-class problem.
    features, labels = make_blobs(random_state=0, centers=2)
    feat_fit, feat_eval, lab_fit, lab_eval = train_test_split(
        features, labels, random_state=0)

    # Estimator exposing both decision_function and predict_proba.
    logreg = LogisticRegression(random_state=0)
    logreg.fit(feat_fit, lab_fit)
    via_scorer = roc_auc_scorer(logreg, feat_eval, lab_eval)
    via_decision = roc_auc_score(lab_eval, logreg.decision_function(feat_eval))
    via_proba = roc_auc_score(lab_eval, logreg.predict_proba(feat_eval)[:, 1])
    assert_almost_equal(via_scorer, via_decision)
    assert_almost_equal(via_scorer, via_proba)

    # The log-loss scorer is the negated metric.
    scorer_value = get_scorer('log_loss')(logreg, feat_eval, lab_eval)
    metric_value = log_loss(lab_eval, logreg.predict_proba(feat_eval))
    assert_almost_equal(-scorer_value, metric_value)

    # same for an estimator without decision_function
    tree_clf = DecisionTreeClassifier()
    tree_clf.fit(feat_fit, lab_fit)
    via_scorer = roc_auc_scorer(tree_clf, feat_eval, lab_eval)
    via_proba = roc_auc_score(lab_eval, tree_clf.predict_proba(feat_eval)[:, 1])
    assert_almost_equal(via_scorer, via_proba)

    # test with a regressor (no decision_function)
    tree_reg = DecisionTreeRegressor()
    tree_reg.fit(feat_fit, lab_fit)
    via_scorer = roc_auc_scorer(tree_reg, feat_eval, lab_eval)
    via_predict = roc_auc_score(lab_eval, tree_reg.predict(feat_eval))
    assert_almost_equal(via_scorer, via_predict)

    # Test that an exception is raised on more than two classes
    features, labels = make_blobs(random_state=0, centers=3)
    feat_fit, feat_eval, lab_fit, lab_eval = train_test_split(
        features, labels, random_state=0)
    tree_clf.fit(feat_fit, lab_fit)
    assert_raises(ValueError, roc_auc_scorer, tree_clf, feat_eval, lab_eval)
def plot_curve():
    """Fit a decision tree on the module-level X / y and plot its learning curves.

    Prints the score of the tree on the data it was fitted on, then draws the
    mean training and testing scores returned by ``learning_curve`` against
    the training-set sizes.
    """
    # The regression algorithm, fitted on all of X and y.
    estimator = DecisionTreeRegressor()
    estimator.fit(X, y)
    print("Regressor score: {:.4f}".format(estimator.score(X, y)))

    # `cv` and `scoring` are passed by keyword because the optional
    # `train_sizes` parameter that precedes them is left at its default.
    sizes, fit_scores, held_out_scores = learning_curve(
        estimator, X, y, cv=cv, scoring=score)

    # Average the scores over the cross-validation folds.
    fit_mean = np.mean(fit_scores, axis=1)
    held_out_mean = np.mean(held_out_scores, axis=1)

    # Training curve first, testing curve second.
    curves = (
        (fit_mean, 'b', "train_scores_mean"),
        (held_out_mean, 'r', "test_scores_mean"),
    )
    for series, colour, legend_label in curves:
        plt.plot(sizes, series, '-o', color=colour, label=legend_label)

    # Plot aesthetics
    plt.ylim(-0.1, 1.1)
    plt.ylabel("Curve Score")
    plt.xlabel("Training Points")
    plt.legend(bbox_to_anchor=(1.1, 1.1))
    plt.show()
def train_decision_tree(sizes, depth, X_test, X_train, y_test, y_train):
    """Train one tree per training-set size and record its performance.

    Args:
        sizes (Numpy array): Array of training sample sizes to train on.
        depth (int): The maximum depth of the DecisionTreeRegressor
        X_test (Numpy array): Test set features
        X_train (Numpy array): Training set features
        y_test (Numpy array): Test set target variable
        y_train (Numpy array): Training set target variable

    Returns:
        test_err (Numpy array): ``performance_metric`` on the full test set,
            one entry per size.
        train_err (Numpy array): ``performance_metric`` on the training
            subset that was used for fitting, one entry per size.
    """
    train_err = np.zeros(len(sizes))
    test_err = np.zeros(len(sizes))

    for i, s in enumerate(sizes):
        # Sizes may be floats; slice bounds must be integers.  Cast once so
        # fitting and scoring use exactly the same subset.
        n_samples = int(s)
        X_subset = X_train[:n_samples]
        y_subset = y_train[:n_samples]

        # Create and fit the decision tree regressor model
        regressor = DecisionTreeRegressor(max_depth=depth)
        regressor.fit(X_subset, y_subset)

        # Find the performance on the training and testing set
        train_err[i] = performance_metric(y_subset, regressor.predict(X_subset))
        test_err[i] = performance_metric(y_test, regressor.predict(X_test))

    return test_err, train_err
def test_boston(self):
    """The oblivious tree's mean MSE must be within 50% of scikit-learn's tree."""
    from sklearn.tree import DecisionTreeRegressor as DecisionTreeRegressorSklearn

    ours = DecisionTreeRegressor(tree_type='oblivious', max_n_splits=3)
    reference = DecisionTreeRegressorSklearn()
    boston = load_boston()

    ours_mse, reference_mse = [], []
    # Five independent random train/test splits.
    for _ in range(5):
        X_train, X_test, y_train, y_test = train_test_split(
            boston.data, boston.target, test_size=0.33)

        ours.fit(X_train, y_train)
        ours_mse.append(mean_squared_error(ours.predict(X_test), y_test))

        reference.fit(X_train, y_train)
        reference_mse.append(mean_squared_error(reference.predict(X_test), y_test))

    mean_ours = np.mean(ours_mse)
    mean_reference = np.mean(reference_mse)
    print(mean_ours, mean_reference)

    # Relative gap between the two mean errors (absolute value, so a much
    # lower error than the reference fails as well).
    relative_gap = np.abs(mean_ours - mean_reference) / mean_reference
    self.assertTrue(relative_gap < 0.5)
def train_decision_tree(time_regression_df, test_size, random_state, max_depth, export_testset):
    """Fit a regression tree predicting ``trip_time`` and persist it.

    The frame is split into train and test parts.  The first six columns are
    the features and the ``trip_time`` column is the target.  After fitting,
    the fit duration is printed, ``export_meta_data`` is called with the test
    data, and the model is written with ``dump_model``.

    Args:
        time_regression_df: DataFrame with the features in its first six
            columns and a ``trip_time`` column.
        test_size: forwarded to ``train_test_split``.
        random_state: seed for both the split and the tree.
        max_depth: ``max_depth`` of the tree.
        export_testset: when truthy, the test features and target are written
            to ``../data/<filename_prefix>_testset.csv``.

    Returns:
        The fitted DecisionTreeRegressor.
    """
    train_df, test_df = cv.train_test_split(
        time_regression_df, test_size=test_size, random_state=random_state)

    # Positional column selection; `.iloc` replaces the removed `.ix` indexer.
    y_train = train_df['trip_time']
    x_train = train_df.iloc[:, 0:6]
    y_test = test_df['trip_time']
    x_test = test_df.iloc[:, 0:6]

    if export_testset:
        xy_test = pd.concat([x_test, y_test], axis=1)
        xy_test.to_csv('../data/' + filename_prefix + '_testset.csv')

    # Time only the fit itself.
    tic = time.time()
    regtree = DecisionTreeRegressor(max_depth=max_depth, min_samples_split=3,
                                    random_state=random_state)
    regtree.fit(x_train, y_train)
    elapsed = time.time() - tic
    print(elapsed)

    export_meta_data(regtree, x_test, y_test, elapsed)

    # The file name records the depth the tree actually reached.
    target_location = ('../treelib/' + filename_prefix + '_tree_depth_'
                       + str(regtree.tree_.max_depth))
    dump_model(regtree, target_location)
    return regtree
class TestDecisionTreeRegressorConverter(TestCase):
    """PMML conversion checks for a small depth-2 regression tree."""

    def setUp(self):
        np.random.seed(1)

        # Four samples over two binary inputs.
        samples = [[0, 0], [0, 1], [1, 0], [1, 1]]
        targets = [0, 1, 1, 1]
        self.est = DecisionTreeRegressor(max_depth=2)
        self.est.fit(samples, targets)

        def feature_schema():
            # A fresh list of fresh feature objects on every call, so the
            # input and model schemas do not share instances.
            return [
                IntegerNumericFeature('x1'),
                StringCategoricalFeature('x2', ['zero', 'one']),
            ]

        self.ctx = TransformationContext(
            input=feature_schema(),
            model=feature_schema(),
            derived=[],
            output=[IntegerNumericFeature('output')]
        )
        self.converter = DecisionTreeConverter(
            estimator=self.est,
            context=self.ctx,
            mode=DecisionTreeConverter.MODE_REGRESSION
        )

    def test_transform(self):
        tree_model = self.converter.pmml().TreeModel[0]

        # Mining schema checks.
        assert tree_model.MiningSchema is not None, 'Missing mining schema'
        n_fields = len(tree_model.MiningSchema.MiningField)
        assert n_fields == 3, 'Wrong number of mining fields'

        # Root node checks.
        assert tree_model.Node is not None, 'Missing root node'
        assert tree_model.Node.recordCount == 4
        assert tree_model.Node.True_ is not None, 'Root condition should always be True'
def learning_curve(depth, X_train, y_train, X_test, y_test):
    """Measure model performance for 50 growing training-set sizes.

    For every size a ``DecisionTreeRegressor(max_depth=depth)`` is fitted on
    the first rows of the training data; ``performance_metric`` is recorded
    on that subset and on the full test set, and the curves are passed to
    ``learning_curve_graph``.
    """
    # 50 rounded, evenly spaced sizes between 1 and the full training set,
    # converted to plain ints so they can be used as slice bounds.
    sizes = [int(value) for value in np.round(np.linspace(1, len(X_train), 50))]
    n_sizes = len(sizes)
    train_err = np.zeros(n_sizes)
    test_err = np.zeros(n_sizes)

    print("Decision Tree with Max Depth: ")
    print(depth)

    for slot, n_samples in enumerate(sizes):
        X_subset = X_train[:n_samples]
        y_subset = y_train[:n_samples]

        # Fit a fresh tree on the current subset.
        tree = DecisionTreeRegressor(max_depth=depth)
        tree.fit(X_subset, y_subset)

        # Performance on the subset itself and on the test set.
        train_err[slot] = performance_metric(y_subset, tree.predict(X_subset))
        test_err[slot] = performance_metric(y_test, tree.predict(X_test))

    learning_curve_graph(sizes, train_err, test_err)
class SimpleGB(BaseEstimator):
    """Skeleton of a simple gradient-boosting classifier over regression trees.

    The boosting step is not implemented yet: the TODO markers in ``fit``
    show the places that still hold placeholder code.
    """

    def __init__(self, tree_params_dict, iters, tau):
        self.tree_params_dict = tree_params_dict
        self.iters = iters
        self.tau = tau

    def fit(self, X_data, y_data):
        # Initial model: one tree fitted directly on the targets.
        initial_tree = DecisionTreeRegressor(**self.tree_params_dict)
        self.base_algo = initial_tree.fit(X_data, y_data)
        self.estimators = []
        curr_pred = self.base_algo.predict(X_data)

        for _ in range(self.iters):
            # The gradient of the loss function has to be computed here
            grad = 0.  # TODO
            # A DecisionTreeRegressor has to be trained to predict the
            # anti-gradient; do not forget about self.tree_params_dict
            algo = DecisionTreeRegressor().fit(X_data, y_data)  # TODO
            self.estimators.append(algo)
            # Update the predictions at every point
            curr_pred += 0.  # TODO
        return self

    def predict(self, X_data):
        # Prediction on the data: initial model plus the scaled corrections.
        res = self.base_algo.predict(X_data)
        for stage in self.estimators:
            res += self.tau * stage.predict(X_data)
        # Classification task, so the output has to be 0 and 1
        return res > 0.
def get_imp(X, y):
    """Rank the columns of ``X`` by decision-tree feature importance.

    Args:
        X: DataFrame of features (its ``columns`` supply the variable names).
        y: target values.

    Returns:
        DataFrame with columns ``variable`` and ``imp``, sorted by ``imp`` in
        descending order.
    """
    tree = DecisionTreeRegressor(random_state=9)
    tree.fit(X, y)
    importances = pd.DataFrame({'variable': X.columns,
                                'imp': tree.feature_importances_})
    # DataFrame.sort() no longer exists; sort_values is its replacement.
    return importances.sort_values('imp', ascending=False)
def decision_tree_regressor(X, y, labels):
    """Fit a depth-3 regression tree and report where it over-estimates.

    A sample is "flagged" when the prediction exceeds the target by more than
    0.05, the prediction is above 0.6 and the target is positive.  Flagged
    samples are counted per leaf id; all other samples are counted in a
    dedicated final slot.  The tree's split features, thresholds, the counts
    and the feature importances are printed, and ``visualize_tree`` is called
    with ``labels``.

    Args:
        X: feature matrix.
        y: target values, indexable by position.
        labels: passed through to ``visualize_tree``.

    Returns:
        The tree's predictions for ``X``.
    """
    regressor = DecisionTreeRegressor(max_depth=3)
    regressor.fit(X, y)
    estimates_z = regressor.predict(X)
    leaves = regressor.apply(X)

    # One counter per node id plus one extra trailing slot.  Without the
    # extra slot, index -1 would be the counter of the highest leaf id and
    # the "not flagged" tally would be mixed into that leaf's count.
    leaves_hash = np.zeros(np.max(leaves) + 2)
    for i in range(len(y)):
        flagged = ((estimates_z[i] - y[i]) > 0.05
                   and estimates_z[i] > 0.6
                   and y[i] > 0)
        if flagged:
            leaves_hash[leaves[i]] += 1
        else:
            leaves_hash[-1] += 1

    print(regressor.tree_.feature)
    print(regressor.tree_.threshold)
    print(leaves_hash)
    print(regressor.feature_importances_)
    visualize_tree(regressor.tree_, labels)
    return estimates_z
def CART(self):
    """Fit a CART regression tree on ``self.train`` and predict ``self.test``.

    When ``self.smoteit`` is set, ``self.train`` is first replaced by its
    SMOTE-resampled version.  With ``self.tuning`` set, the tree is built
    from ``self.tunings``: max_depth, min_samples_split, min_samples_leaf,
    max_features (given as a percentage) and max_leaf_nodes, in that order.
    All columns but the last two are features; the second-to-last column is
    the target.

    Returns:
        list: predictions for the rows of ``self.test``.
    """
    if self.smoteit:
        self.train = SMOTE(
            self.train, atleast=50, atmost=101, resample=self.duplicate)

    if not self.tuning:
        clf = DecisionTreeRegressor(random_state=1)
    else:
        # 'entropy' is a classification criterion that the regressor rejects,
        # so the default regression criterion is used instead.
        # The division happens after the float conversion so an integer
        # percentage is not truncated to 0 under Python 2.
        clf = DecisionTreeRegressor(max_depth=int(self.tunings[0]),
                                    min_samples_split=int(self.tunings[1]),
                                    min_samples_leaf=int(self.tunings[2]),
                                    max_features=float(self.tunings[3]) / 100,
                                    max_leaf_nodes=int(self.tunings[4]),
                                    random_state=1)

    features = self.train.columns[:-2]
    klass = self.train[self.train.columns[-2]]
    clf.fit(self.train[features].astype('float32'), klass.astype('float32'))
    preds = clf.predict(
        self.test[self.test.columns[:-2]].astype('float32')).tolist()
    return preds
def model_complexity(X_train, y_train, X_test, y_test):
    """
    Plot training and testing error against the maximum depth of the tree.

    A DecisionTreeRegressor is fitted for every ``max_depth`` in 1..13 and
    ``performance_metric`` is evaluated on both data sets.
    """
    print("Creating a model complexity graph. . . ")

    # Depths 1, 2, ..., 13 (np.arange excludes the upper bound).
    depths = np.arange(1, 14)
    n_depths = len(depths)
    train_err = np.zeros(n_depths)
    test_err = np.zeros(n_depths)

    for slot, depth in enumerate(depths):
        # A fresh tree limited to the current depth.
        tree = DecisionTreeRegressor(max_depth=depth)
        tree.fit(X_train, y_train)

        # Error on the data the tree saw and on the held-out data.
        train_err[slot] = performance_metric(y_train, tree.predict(X_train))
        test_err[slot] = performance_metric(y_test, tree.predict(X_test))

    # Draw both curves: testing first, training second.
    pl.figure(figsize=(7, 5))
    pl.title('Decision Tree Regressor Complexity Performance')
    for errors, legend_label in ((test_err, 'Testing Error'),
                                 (train_err, 'Training Error')):
        pl.plot(depths, errors, lw=2, label=legend_label)
    pl.legend()
    pl.xlabel('Maximum Depth')
    pl.ylabel('Total Error')
    pl.show()
def fit_model1(X, y):
    """Grid-search ``max_depth`` for a decision tree regressor on [X, y].

    Depths 1 to 10 are evaluated on 10 shuffled splits with 20% held out,
    scored with ``performance_metric`` wrapped by ``make_scorer``.

    Args:
        X: feature matrix (``X.shape[0]`` gives the number of samples).
        y: target values.

    Returns:
        The best estimator found by the grid search.
    """
    # Create cross-validation sets from the training data
    cv_sets = ShuffleSplit(X.shape[0], n_iter=10, test_size=0.20, random_state=0)

    # The grid search fits its own clones, so the template estimator is not
    # fitted beforehand.
    regressor = DecisionTreeRegressor()

    # Candidate values for 'max_depth': 1 to 10
    params = {'max_depth': list(range(1, 11))}

    # Transform 'performance_metric' into a scoring function
    scoring_fnc = make_scorer(performance_metric)

    grid_obj = GridSearchCV(regressor, params, scoring=scoring_fnc, cv=cv_sets)

    # Fit the grid search object to the data to compute the optimal model
    grid = grid_obj.fit(X, y)

    # Return the optimal model after fitting the data
    return grid.best_estimator_
def train_learning_model_decision_tree_ada_boost(df):
    """Compare a single depth-6 tree with an AdaBoost ensemble of such trees.

    The frame is preprocessed and split with the module's helpers, both
    models are fitted on the training part, and their mean squared errors on
    the test and the training data are printed.
    """
    # code taken from sklearn
    features, target = preprocess_data(df)
    X_train, X_test, y_train, y_test = split_data(features, target)

    single_tree = DecisionTreeRegressor(max_depth=6)
    boosted = AdaBoostRegressor(DecisionTreeRegressor(max_depth=6),
                                n_estimators=500,
                                learning_rate=0.01,
                                random_state=1)
    single_tree.fit(X_train, y_train)
    boosted.fit(X_train, y_train)

    # Held-out error of both models.
    test_pred_tree = single_tree.predict(X_test)
    test_pred_ada = boosted.predict(X_test)
    test_mse_tree = mean_squared_error(y_test, test_pred_tree)
    test_mse_ada = mean_squared_error(y_test, test_pred_ada)

    # Error on the data the models were fitted on.
    train_mse_tree = mean_squared_error(y_train, single_tree.predict(X_train))
    train_mse_ada = mean_squared_error(y_train, boosted.predict(X_train))

    print("MSE tree: %.4f " % test_mse_tree)
    print("MSE ada: %.4f " % test_mse_ada)
    print("MSE tree train: %.4f " % train_mse_tree)
    print("MSE ada train: %.4f " % train_mse_ada)
def learning_curve(depth, X_train, y_train, X_test, y_test):
    """Calculate the performance of the model after a set of training data.

    Fifty rounded, equally spaced training-set sizes between 1 and
    ``len(X_train)`` are tried.  For each size a
    ``DecisionTreeRegressor(max_depth=depth)`` is fitted on the first rows of
    the training data and ``performance_metric`` is recorded on that subset
    and on the full test set.  The curves and ``depth`` are passed to
    ``learning_curve_graph``.
    """
    # We will vary the training set size so that we have 50 different sizes
    sizes = np.round(np.linspace(1, len(X_train), 50))
    train_err = np.zeros(len(sizes))
    test_err = np.zeros(len(sizes))

    print("Decision Tree with Max Depth: ")
    print(depth)

    for i, s in enumerate(sizes):
        # np.round keeps the float dtype; slice bounds must be integers.
        n_samples = int(s)
        X_subset = X_train[:n_samples]
        y_subset = y_train[:n_samples]

        # Create and fit the decision tree regressor model
        regressor = DecisionTreeRegressor(max_depth=depth)
        regressor.fit(X_subset, y_subset)

        # Find the performance on the training and testing set
        train_err[i] = performance_metric(y_subset, regressor.predict(X_subset))
        test_err[i] = performance_metric(y_test, regressor.predict(X_test))

    # Plot learning curve graph
    learning_curve_graph(sizes, train_err, test_err, depth)
def test_bootstrap_samples():
    """Test that bootstrapping samples generate non-perfect base estimators."""
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(
        boston.data, boston.target, random_state=rng)

    single_tree = DecisionTreeRegressor().fit(X_train, y_train)

    def fitted_bagging(bootstrap):
        # Same ensemble configuration for both checks; only `bootstrap`
        # varies.  The shared `rng` is consumed in call order.
        bagging = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                                   max_samples=1.0,
                                   bootstrap=bootstrap,
                                   random_state=rng)
        return bagging.fit(X_train, y_train)

    # without bootstrap, all trees are perfect on the training set
    ensemble = fitted_bagging(False)
    assert_equal(single_tree.score(X_train, y_train),
                 ensemble.score(X_train, y_train))

    # with bootstrap, trees are no longer perfect on the training set
    ensemble = fitted_bagging(True)
    assert_greater(single_tree.score(X_train, y_train),
                   ensemble.score(X_train, y_train))
class CustomClassifier(BaseEstimator, ClassifierMixin):
    """Weak learner whose behaviour depends on how many instances exist.

    Every construction increments the module-level ``class_instance`` counter
    and every destruction decrements it.  Instances numbered 1 or 2 predict
    by returning column 6 of the input; later instances fit and use a
    ``DecisionTreeRegressor(max_depth=8)``.

    NOTE(review): the "instance <= 2" rule presumably singles out the first
    AdaBoost round - confirm against the boosting driver.
    """

    def __init__(self):
        global class_instance
        class_instance += 1
        self.instance = class_instance

    def __del__(self):
        global class_instance
        class_instance -= 1

    def fit(self, X, y, sample_weight=array([])):
        """Fit the learner; ``sample_weight`` is accepted but not used.

        Returns:
            self, on every path, as scikit-learn estimators are expected to.
        """
        if self.instance <= 2:
            # Early instance: nothing to learn, just remember the targets.
            self.y = y
        else:
            # Later instances: a depth-8 regression tree is the weak learner.
            self.regr = DecisionTreeRegressor(max_depth=8)
            self.regr.fit(X, y)
        return self

    def predict(self, X):
        """Return column 6 of ``X`` for early instances, else the tree's prediction."""
        if self.instance <= 2:
            # Column index 6 of the feature matrix (described in the original
            # comments as the current volatility).
            return X[:,6]
        else:
            return self.regr.predict(X)
def fit_predict_model(city_data):
    '''Find and tune the optimal model. Make a prediction on housing data.

    A grid search over ``max_depth`` 1..10 (scored by negated mean squared
    error) is run once and printed, then repeated 101 times.  The median of
    the selected depths is used for the final tree, which is fitted on all
    data and used to predict one hard-coded sample.

    Args:
        city_data: object exposing ``data`` (features) and ``target``.
    '''
    # Get the features and labels from the Boston housing data
    X, y = city_data.data, city_data.target
    print(X)

    # Setup a Decision Tree Regressor
    regressor = DecisionTreeRegressor()
    parameters = {'max_depth': (1, 2, 3, 4, 5, 6, 7, 8, 9, 10)}
    reg = GridSearchCV(regressor, parameters,
                       scoring=make_scorer(metrics.mean_squared_error,
                                           greater_is_better=False))
    print(reg.fit(X, y))

    # Repeat the search and collect the selected depth of every run.
    depth_values = []
    for _ in range(101):
        reg.fit(X, y)
        depth_values.append(int(reg.best_params_['max_depth']))

    median_depth = np.median(depth_values)
    print("Best model parameter: " + str(median_depth))

    # np.median returns a float, but max_depth must be an integer.  With an
    # odd number of integer samples the median is itself one of the samples,
    # so the conversion is exact.
    regressor = DecisionTreeRegressor(max_depth=int(median_depth))
    print("Final Model: ")
    print(regressor)

    # Fit the learner to the training data
    regressor.fit(X, y)

    # Use the model to predict the output of a particular sample
    x = [11.95, 0.00, 18.100, 0, 0.6590, 5.6090, 90.00, 1.385, 24, 680.0,
         20.20, 332.09, 12.13]
    # predict() expects a 2-D array: one row per sample.
    y = regressor.predict([x])
    print("House: " + str(x))
    print("Prediction: " + str(y))
def decision_tree_regressor_fit(bk_columns, bk):
    """Fit a default DecisionTreeRegressor predicting ``bk['count']``.

    ``bk_columns`` selects the feature columns of ``bk``; the fitted
    estimator is returned.
    """
    features = bk[bk_columns]
    target = bk['count']
    return DecisionTreeRegressor().fit(features, target)
def featureRelevance(self, data):
    """Print how well 'Detergents_Paper' can be predicted from the other columns.

    The column is removed from ``data`` and used as the target of a decision
    tree trained on 75% of the rows; the score on the remaining 25% is
    printed.
    """
    target_name = 'Detergents_Paper'
    predictors = data.drop([target_name], axis=1)
    target = data[[target_name]]

    X_train, X_test, y_train, y_test = train_test_split(
        predictors, target, test_size=0.25, random_state=1)

    tree = DecisionTreeRegressor(random_state=30)
    tree.fit(X_train, y_train)
    held_out_score = tree.score(X_test, y_test)

    print("feature relevance test: feature {}, score {}".format(target_name, held_out_score))
def learn(train_file, n_trees=10, learning_rate=0.1, k=10, validate=False):
    """Train a boosted ensemble of regression trees on pseudo-responses.

    The CSV at ``train_file`` is loaded (header row skipped): column 0 holds
    the scores, column 1 the query ids and columns 3 onwards the features.
    In each of ``n_trees`` rounds ``compute_lambdas`` produces the training
    labels, a depth-6 tree is fitted on them and added to the ensemble, and
    the running model output is updated by ``learning_rate`` times the tree's
    prediction.  Progress and timings are printed.

    Args:
        train_file: path of the training CSV.
        n_trees: number of boosting rounds.
        learning_rate: shrinkage applied to every tree's prediction.
        k: cut-off passed to ``compute_lambdas``.
        validate: accepted but not used.

    Returns:
        The populated ``Ensemble``.
    """
    print("Loading train file")
    train = np.loadtxt(train_file, delimiter=",", skiprows=1)
    scores = train[:, 0]
    queries = train[:, 1]
    features = train[:, 3:]

    ensemble = Ensemble(learning_rate)

    print("Training starts...")
    model_output = np.zeros(len(features))

    # time.clock() no longer exists in current Python; wall-clock time is
    # used for the progress timings instead.
    for i in range(n_trees):
        print(" Iteration: " + str(i + 1))

        # Compute pseudo responses (lambdas),
        # which act as training labels for the documents
        start = time.time()
        print(" --generating labels")
        lambdas = compute_lambdas(model_output, scores, queries, k)
        print(list(zip(lambdas, scores)))
        print(" --done " + str(time.time() - start) + " sec")

        # create tree and append it to the model
        print(" --fitting tree")
        start = time.time()
        tree = DecisionTreeRegressor(max_depth=6)
        tree.fit(features, lambdas)
        print(" ---done " + str(time.time() - start) + " sec")
        print(" --adding tree to ensemble")
        ensemble.add(tree)

        # update model score
        print(" --generating step prediction")
        prediction = tree.predict(features)
        print(" --updating full model output")
        model_output += learning_rate * prediction
        print(model_output)

        # train_score
        start = time.time()
        print(" --scoring on train")
        # NOTE(review): the cut-off here is the literal 10, not `k` - confirm
        # whether the two are meant to be the same.
        train_score = score(model_output, scores, queries, 10)
        print(" --iteration train score " + str(train_score) + ", took "
              + str(time.time() - start) + "sec to calculate")

    print("Finished sucessfully.")
    print("------------------------------------------------")
    return ensemble
class Regressor(BaseEstimator):
    """Thin wrapper around a ``DecisionTreeRegressor(max_depth=5)``."""

    def __init__(self):
        self.clf = DecisionTreeRegressor(max_depth=5)

    def fit(self, X, y):
        """Fit the wrapped tree and return ``self`` so calls can be chained."""
        self.clf.fit(X, y)
        return self

    def predict(self, X):
        """Return the wrapped tree's predictions for ``X``."""
        return self.clf.predict(X)
def tune_regtree(x, y, alpha_list, scoring):
    """Pick the ``max_depth`` with the lowest mean 5-fold cross-validation score.

    Args:
        x: feature matrix.
        y: target values.
        alpha_list: candidate ``max_depth`` values.
        scoring: forwarded to ``cross_val_score``.

    Returns:
        The element of ``alpha_list`` whose mean score is smallest (the first
        one on ties).

    NOTE(review): the *minimum* score wins, which is right for an error-like
    score but not for a greater-is-better scorer - confirm what callers pass
    as ``scoring``.
    """
    scores = []
    for alpha in alpha_list:
        # cross_val_score fits its own clones, so no separate fit is needed.
        clf = DecisionTreeRegressor(max_depth=alpha)
        scores.append(np.mean(cross_val_score(clf, x, y, cv=5, scoring=scoring)))

    best_index = scores.index(min(scores))
    print(scores)
    return alpha_list[best_index]
def train(self, max_depth, max_leaf_nodes, model_name, output_path):
    """Train a decision tree inside an MLflow run and log everything about it.

    Within a run named ``self.run_origin`` the method fits a
    DecisionTreeRegressor on ``self.X_train`` / ``self.y_train``, predicts
    ``self.X_test``, and logs the two hyper-parameters, the RMSE / MAE / R2
    metrics, several tags, the scikit-learn model (registered under
    ``model_name``), optionally an ONNX model, and a plot artifact.  When
    ``output_path`` is truthy the run id is also written to that file.

    NOTE(review): the original indentation was lost; the extent of the
    ``with`` block below is reconstructed and should be confirmed.

    Returns:
        Tuple ``(experiment_id, run_id)``.
    """
    with mlflow.start_run(run_name=self.run_origin ) as run: # NOTE: mlflow CLI ignores run_name
        run_id = run.info.run_uuid
        experiment_id = run.info.experiment_id
        print("MLflow:")
        print(" run_id:", run_id)
        print(" experiment_id:", experiment_id)
        # `client` is a module-level object not defined in this method.
        print(" experiment_name:", client.get_experiment(experiment_id).name)

        # Create model
        dt = DecisionTreeRegressor(max_depth=max_depth, max_leaf_nodes=max_leaf_nodes)
        print("Model:\n ", dt)

        # Fit and predict
        dt.fit(self.X_train, self.y_train)
        predictions = dt.predict(self.X_test)

        # MLflow params
        print("Parameters:")
        print(" max_depth:", max_depth)
        print(" max_leaf_nodes:", max_leaf_nodes)
        mlflow.log_param("max_depth", max_depth)
        mlflow.log_param("max_leaf_nodes", max_leaf_nodes)

        # MLflow metrics
        rmse = np.sqrt(mean_squared_error(self.y_test, predictions))
        mae = mean_absolute_error(self.y_test, predictions)
        r2 = r2_score(self.y_test, predictions)
        print("Metrics:")
        print(" rmse:", rmse)
        print(" mae:", mae)
        print(" r2:", r2)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)
        mlflow.log_metric("mae", mae)

        # MLflow tags
        mlflow.set_tag("mlflow.runName", self.run_origin) # mlflow CLI picks this up
        mlflow.set_tag("data_path", self.data_path)
        mlflow.set_tag("run_origin", self.run_origin)
        mlflow.set_tag("mlflow_version", mlflow.__version__)
        mlflow.set_tag("sklearn_version", sklearn.__version__)

        # MLflow log model
        mlflow.sklearn.log_model(dt, "sklearn-model", registered_model_name=model_name)

        # Convert sklearn model to ONNX and log model
        if self.log_as_onnx:
            # Imported here so the ONNX helper is only loaded when requested.
            from wine_quality import onnx_utils
            onnx_utils.log_model(dt, "onnx-model", model_name, self.X_test)

        # MLflow artifact - plot file
        plot_file = "plot.png"
        plot_utils.create_plot_file(self.y_test, predictions, plot_file)
        mlflow.log_artifact(plot_file)

        # Write run ID to file
        if (output_path):
            mlflow.set_tag("output_path", output_path)
            # The tag keeps the path as given; the file is opened at the
            # path with "dbfs:" replaced by "/dbfs".
            output_path = output_path.replace("dbfs:", "/dbfs")
            with open(output_path, "w") as f:
                f.write(run_id)
    return (experiment_id, run_id)
X = X[:, 1:] # Backwards elimination import pandas.util.testing as tm import statsmodels.tools.tools as tl X = tl.add_constant(X) import statsmodels.api as sm X = X[:, [0, 2, 3, 4, 5, 7, 8, 9, 10, 11, 13, 19, 20, 23, 24, 25, 26, 27, 28, 30]] regressor_OLS = sm.OLS(endog=Y, exog=X).fit() regressor_OLS.summary() # Splitting the dataset into the Training set and Test set from sklearn.model_selection import train_test_split X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0) # Fitting the Decision Tree Regression Model to the dataset from sklearn.tree import DecisionTreeRegressor regressor = DecisionTreeRegressor(max_depth=5) regressor.fit(X_train, Y_train) # Predicting a new result with Linear Regression Y_pred = regressor.predict(X_test) # Residuals calculation residuals = np.average(np.abs(Y_pred - Y_test)) print(residuals)
# Scatter plot of the raw data.
plt.scatter(X_adr, y_adr)
plt.show()

#### Decision tree implementation #####
from sklearn.model_selection import train_test_split
# Split the data into training and validation sets
X_train, X_test, y_train, y_test = train_test_split(X_adr, y_adr, test_size=0.2)
from sklearn.tree import DecisionTreeRegressor
# Define the algorithm to use
adr = DecisionTreeRegressor(max_depth=10)
# Train the model
adr.fit(X_train, y_train)
# Make the prediction
Y_pred = adr.predict(X_test)
# Plot the test data together with the prediction
X_grid = np.arange(min(X_test), max(X_test), 0.1) # build an array of X values in steps of 0.1
X_grid = X_grid.reshape((len(X_grid), 1)) # turn it into a column
plt.scatter(X_test, y_test)
plt.plot(X_grid, adr.predict(X_grid), color='red', linewidth=3)
plt.show()
# Compute the accuracy of the model
mean_squared_error(y_test, pred) # In[147]: rmse = np.sqrt(mean_squared_error(np.array(y_test).reshape(-1, 1), pred)) rmse # In[148]: r2_score(y_test, pred) # In[149]: from sklearn.tree import DecisionTreeRegressor model = DecisionTreeRegressor(max_leaf_nodes=10) DecisionR = model.fit(x_train, y_train) DecisionR # In[150]: from sklearn.metrics import mean_squared_error, r2_score pred = DecisionR.predict(x_test) mean_squared_error(y_test, pred) # In[151]: r2_score(y_test, pred) # In[152]:
def test_predict_proba(self):
    """Check DecisionTree.predict_proba against scikit-learn.

    Classification models must return exactly the probabilities produced by
    an identically configured DecisionTreeClassifier.  Regression models
    must raise NotImplementedError with the expected message.  Both kinds
    are exercised with sklearn.Bunch data and with pandas DataFrame data,
    always training on the first 120 rows and predicting on the rest.
    """
    # Message the wrapper is expected to raise for regression problems.
    regression_msg = 'The \'predict_proba\' method is not implemented for regression problems (this is an scikit-learn issue, not an alexandria issue!)'

    def split_bunch(data):
        # sklearn.Bunch: slice the raw arrays.
        return data.data[:120], data.target[:120], data.data[120:]

    def split_frame(data):
        # pandas.DataFrame: every column except 'target' is a feature.
        frame = data.frame
        X = frame.loc[:, frame.columns != 'target']
        y = frame['target']
        return X.iloc[:120], y.iloc[:120], X.iloc[120:]

    def check_classification(X_train, y_train, X_test, default_args):
        # All '2' variables are the baseline and what we should match up with.
        dt1 = DecisionTree(default_args=default_args)
        dt2 = DecisionTreeClassifier(**default_args)
        dt1.train(X_train, y_train, exp_type='classification')
        dt2.fit(X_train, y_train)
        preds1 = dt1.predict_proba(X_test)
        preds2 = dt2.predict_proba(X_test)
        self.assertTrue((preds1 == preds2).all())

    def check_regression(X_train, y_train, X_test, default_args):
        dt1 = DecisionTree(default_args=default_args)
        dt1.train(X_train, y_train, exp_type='regression')
        # assertRaises fails the test when nothing is raised; the former
        # try/except silently passed in that case.
        with self.assertRaises(NotImplementedError) as ctx:
            dt1.predict_proba(X_test)
        self.assertEqual(str(ctx.exception), regression_msg)

    # sklearn.Bunch objects
    check_classification(*split_bunch(load_iris()),
                         default_args={'random_state': 19})
    check_regression(*split_bunch(load_boston()),
                     default_args={'random_state': 30})
    check_regression(*split_bunch(load_diabetes()),
                     default_args={'random_state': 15})
    check_classification(*split_bunch(load_wine()),
                         default_args={'random_state': 90, 'max_depth': 3})

    # pandas.DataFrame objects
    check_classification(*split_frame(load_iris(as_frame=True)),
                         default_args={'random_state': 19, 'max_features': 'auto'})
    check_regression(*split_frame(load_diabetes(as_frame=True)),
                     default_args={'random_state': 19, 'min_samples_split': 4})
    check_classification(*split_frame(load_wine(as_frame=True)),
                         default_args={'random_state': 19, 'criterion': 'entropy'})
# --- Model 2: polynomial regression -------------------------------------
X_poly = poly_reg.fit_transform(X2_train)
lin_reg_2 = LinearRegression()
lin_reg_2.fit(X_poly, y2_train)
# The transformer was fitted on the training data above; the test data is
# only transformed, never re-fitted.
y2_pred = lin_reg_2.predict(poly_reg.transform(X2_test))
from sklearn.metrics import r2_score
# Score model 2 on its own split (this previously scored the model-1
# arrays y1_test / y1_pred, a copy-paste slip).
r2 = r2_score(y2_test, y2_pred)
print(r2)

# --- Model 3: decision tree regression ----------------------------------
# Features are every column but the last; the last column is the target.
X3 = df3.iloc[:, :-1].values
y3 = df3.iloc[:, -1].values
from sklearn.model_selection import train_test_split
X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, test_size=0.2, random_state=0)
from sklearn.tree import DecisionTreeRegressor
regressor3 = DecisionTreeRegressor(random_state=0)
regressor3.fit(X3_train, y3_train)
y3_pred = regressor3.predict(X3_test)
from sklearn.metrics import r2_score
r3 = r2_score(y3_test, y3_pred)
print(r3)

# --- Model 4: random forest regression ----------------------------------
X4 = df4.iloc[:, :-1].values
y4 = df4.iloc[:, -1].values
from sklearn.model_selection import train_test_split
X4_train, X4_test, y4_train, y4_test = train_test_split(X4, y4, test_size=0.2, random_state=0)
from sklearn.ensemble import RandomForestRegressor
regressor4 = RandomForestRegressor(n_estimators=10, random_state=0)
regressor4.fit(X4_train, y4_train)
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Importing in the data
boston = load_boston()
y = boston.target
x = boston.data

# Splitting the data into train and testsets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

# Instantiating the models with default values
linear_model = LinearRegression()
rand_forest = RandomForestRegressor()
adaboost_model = AdaBoostRegressor()
decision_model = DecisionTreeRegressor()

# Fit every model on the same training split.
linear_model.fit(x_train, y_train)
rand_forest.fit(x_train, y_train)
adaboost_model.fit(x_train, y_train)
decision_model.fit(x_train, y_train)

# Predict the held-out split with every model.
preds_rf = rand_forest.predict(x_test)
preds_linear = linear_model.predict(x_test)
preds_ada = adaboost_model.predict(x_test)
preds_dt = decision_model.predict(x_test)

# Only the random forest is scored in this part of the script.
print(r2_score(y_test, preds_rf))
print(mean_squared_error(y_test, preds_rf))
print(mean_absolute_error(y_test, preds_rf))
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

dataset = pd.read_csv("true_car_listings.csv", header=0)
# Target is the listing price; features are year, mileage and the
# one-hot encoded state and make.
y_dataset = dataset[['Price']]
dataset = dataset[["Year", "Mileage", "State", "Make"]]
dataset = pd.get_dummies(dataset, columns=["State", "Make"])

# random_state=None: the split differs on every run.
X_train, X_test, y_train, y_test = train_test_split(dataset, y_dataset.values.ravel(), test_size=0.20, random_state=None)

warnings.filterwarnings("ignore", category=DeprecationWarning)

dt = DecisionTreeRegressor()
# dt = reg = LinearRegression()
# dt = RandomForestRegressor(max_depth=2, n_estimators=100)

print("Treinamento")
print(dataset.shape)
dt_fit = dt.fit(X_train, y_train)
# 10-fold cross-validation on the training split; the scores are negated
# mean squared errors because of the chosen `scoring`.
dt_scores = cross_val_score(dt_fit, X_train, y_train, cv=10, scoring="neg_mean_squared_error")
dt_predict = dt.predict(X_test)

print("Media cross validation score: {}".format(np.mean(dt_scores)))
print("RMSE Score: ", sqrt(mean_squared_error(y_test, dt_predict)))
#Feature Selection if feat_select==1: '''Three steps: 1) Run Feature Selection 2) Get lists of selected and non-selected features 3) Filter columns from original dataset ''' print('--FEATURE SELECTION ON--', '\n') ##1) Run Feature Selection ####### #Wrapper Select via model if fs_type==2: rgr = DecisionTreeRegressor(criterion='friedman_mse', splitter='best', max_depth=None, min_samples_split=3, min_samples_leaf=1, max_features=None, random_state=rand_st) sel = SelectFromModel(rgr, prefit=False, threshold='mean', max_features=None) print ('Wrapper Select: ') fit_mod=sel.fit(data_np, target_np) sel_idx=fit_mod.get_support() ##2) Get lists of selected and non-selected features (names and indexes) ####### temp=[] temp_idx=[] temp_del=[] for i in range(len(data_np[0])): if sel_idx[i]==1: #Selected Features get added to temp header temp.append(header[i+feat_start]) temp_idx.append(i)
# In[2]: from sklearn.ensemble import AdaBoostClassifier from sklearn.ensemble import AdaBoostRegressor import numpy as np import matplotlib.pyplot as plt # Create a random dataset rng = np.random.RandomState(1) X = np.sort(5 * rng.rand(80, 1), axis=0) y = np.sin(X).ravel() y[::5] += 3 * (0.5 - rng.rand(16)) # Fit regression model from sklearn.tree import DecisionTreeRegressor clf_b = DecisionTreeRegressor(max_depth=5) clf_1 = AdaBoostRegressor(base_estimator=clf_b, n_estimators=10, random_state=0).fit(X, y) clf_2 = AdaBoostRegressor(base_estimator=clf_b, n_estimators=20, random_state=0).fit(X, y) # Predict X_test = np.arange(0.0, 5.0, 0.01)[:, np.newaxis] y_1 = clf_1.predict(X_test) y_2 = clf_2.predict(X_test) plt.plot(X, y, 'o', color='black') plt.plot(X_test, y_1, label='10 estimators') plt.plot(X_test, y_2, label='20 estimators') # Plot the resu et")
# Load the Data Set from the csv file dataSet = pd.read_csv('groupStudy.csv') # In[5]: dataSet # In[6]: # selecting the row and columns to be used hours = dataSet['Hours of study'] marks = dataSet['Marks scored'] # In[7]: # reshaping the matrix X = np.array(hours).reshape(-1, 1) y = np.array(marks).reshape(-1, 1) # In[8]: from sklearn.tree import DecisionTreeRegressor regressor = DecisionTreeRegressor() regressor.fit(X, y) y_predicted = regressor.predict(np.array(5).reshape(-1, 1)) # In[9]: y_predicted
def predict_dtr_plot(ticker, x, y, x_train, y_train, days_predict, filePath):
    """Fit a decision tree, predict the next ``days_predict`` days, plot them.

    Args:
        ticker: Symbol used for the plot title and the exported file name.
        x: Timestamps to predict on; treated as a single feature column
            (1-D input is reshaped to one column).  Timestamps for today and
            the following days are appended before predicting.
        y: Unused; kept for interface compatibility.
        x_train: Training features passed to ``DecisionTreeRegressor.fit``.
        y_train: Training targets passed to ``DecisionTreeRegressor.fit``.
        days_predict: Number of days, starting today, to predict and plot.
        filePath: Directory for the exported PNG; falsy to skip the export.

    Returns:
        Tuple ``(predictions, predict_timestamp_list)``: the predictions for
        the appended dates and their ``YYYY-MM-DD`` labels.
    """
    # Prediction dates: today plus the following days, shown to the user as
    # ISO date strings.
    base = date.today()
    dates = [base + timedelta(days=offset) for offset in range(days_predict)]
    predict_timestamp_list = [str(day) for day in dates]

    # Convert the dates to timestamps and append them to the inputs.
    # np.append/np.concatenate return a new array; the result was previously
    # discarded, so the future dates were never actually predicted.
    future = np.array(
        [int(time.mktime(datetime.strptime(label, "%Y-%m-%d").timetuple()))
         for label in predict_timestamp_list],
        dtype=float,
    ).reshape(-1, 1)
    x = np.asarray(x)
    if x.ndim == 1:
        x = x.reshape(-1, 1)
    x = np.concatenate([x, future], axis=0)

    model = DecisionTreeRegressor()  # Define model - DTR worked best for most stocks.
    model.fit(x_train, y_train)  # Fit to model
    predictions = model.predict(x)  # predict
    print(len(predictions))

    # Keep only the tail belonging to the appended dates so that values and
    # labels always line up (a fixed 90-element tail only matched when
    # days_predict was 90).  Indexing from the front keeps days_predict == 0
    # from selecting the whole array.
    predictions = predictions[len(predictions) - days_predict:]
    for label, predict in zip(predict_timestamp_list, predictions):
        print(f'Prediction ({label}) = ' + str(predict))

    # Final step - create and show the graph
    plt.cla()  # Clear old plot
    plt.clf()
    plt.figure(figsize=(20, 5))
    plt.plot(predict_timestamp_list, predictions)
    plt.title(str(ticker))
    plt.ylabel('Price', fontsize=12)
    # Slice notation thins out the ticks - a[start_index:end_index:step]
    plt.yticks(predictions[::10])
    plt.xticks(predict_timestamp_list[::10])
    plt.grid(True)

    if filePath:
        # Best-effort export: a failure is reported but does not abort.
        image_path = f"{filePath}/{ticker}.png"
        try:
            plt.savefig(image_path)
            print(f"Plot image is located at: {image_path}")
        except Exception as exc:
            print(f"There was an error exporting the plot image for {ticker}: {exc}")
    plt.show()
    return predictions, predict_timestamp_list
def test_eval(self):
    """Check DecisionTree.eval / getMetric against scikit-learn metrics.

    Covers classification and regression, each with sklearn.Bunch data and
    with pandas DataFrame data, and checks that requesting a metric that
    does not fit the problem type raises ValueError with the expected
    message.
    """

    def split_bunch(data):
        # First 120 rows train, the remainder test.
        return (data.data[:120], data.data[120:],
                data.target[:120], data.target[120:])

    def split_frame(data):
        # Every column but the last is a feature; 'target' is the label.
        data_cols = data.columns[:-1]
        target_col = 'target'
        return train_test_split(data[data_cols], data[target_col],
                                train_size=0.8, random_state=0)

    def assert_eval_raises(exp_type, metric, message):
        # Uses the X/y split currently bound in the enclosing scope.
        dt1 = DecisionTree(default_args={'random_state': 30})
        dt1.train(X_train, y_train, exp_type=exp_type)
        # assertRaises fails cleanly when nothing is raised; the former
        # `fail(self)` call referenced an undefined name.
        with self.assertRaises(ValueError) as ctx:
            dt1.eval(X_test, y_test, metrics=metric)
        self.assertEqual(str(ctx.exception), message)

    # Classification
    # sklearn.Bunch
    X_train, X_test, y_train, y_test = split_bunch(load_iris())

    # All '2' variables are the baseline and what we should match up with.
    default_args = {'random_state': 19}
    dt1 = DecisionTree(default_args=default_args)
    dt2 = DecisionTreeClassifier(**default_args)
    dt1.train(X_train, y_train, exp_type='classification')
    dt2.fit(X_train, y_train)
    dt1.eval(X_test, y_test, metrics='acc')
    actual_acc = dt1.getMetric('acc')
    preds = dt2.predict(X_test)
    expected_acc = accuracy_score(y_test, preds)
    self.assertEqual(actual_acc, expected_acc)

    # Error if r-squared is wanted in classification problem
    assert_eval_raises('classification', 'r2',
                       'cannot use R2 metric for classification problem!')

    # pandas.DataFrame
    X_train, X_test, y_train, y_test = split_frame(load_iris(as_frame=True).frame)

    default_args = {'random_state': 19}
    dt1 = DecisionTree(default_args=default_args)
    dt2 = DecisionTreeClassifier(**default_args)
    dt1.train(X_train, y_train, exp_type='classification')
    dt2.fit(X_train, y_train)
    dt1.eval(X_test, y_test, metrics=['acc', 'precision', 'recall'])
    actual_acc = dt1.getMetric('acc')
    actual_prec = dt1.getMetric('prec')
    actual_rec = dt1.getMetric('rec')
    preds = dt2.predict(X_test)
    expected_acc = accuracy_score(y_test, preds)
    expected_prec = precision_score(y_test, preds, average='weighted')
    expected_rec = recall_score(y_test, preds, average='weighted')
    self.assertEqual(actual_acc, expected_acc)
    self.assertEqual(actual_prec, expected_prec)
    self.assertEqual(actual_rec, expected_rec)

    # Error if r-squared is wanted in classification problem
    assert_eval_raises('classification', 'r2',
                       'cannot use R2 metric for classification problem!')

    # Regression
    # sklearn.Bunch
    X_train, X_test, y_train, y_test = split_bunch(load_boston())

    default_args = {'random_state': 30}
    dt1 = DecisionTree(default_args=default_args)
    dt2 = DecisionTreeRegressor(**default_args)
    dt1.train(X_train, y_train, exp_type='regression')
    dt2.fit(X_train, y_train)
    dt1.eval(X_test, y_test, metrics=['r2'])
    actual_r2 = dt1.getMetric('r2')
    preds = dt2.predict(X_test)
    expected_r2 = r2_score(y_test, preds)
    self.assertEqual(actual_r2, expected_r2)

    # Error if accuracy, recall, etc is wanted in a regression problem
    assert_eval_raises('regression', 'recall',
                       'cannot use Recall metric for regression problem!')
    assert_eval_raises('regression', 'accuracy',
                       'cannot use Accuracy metric for regression problem!')
    assert_eval_raises('regression', 'prec',
                       'cannot use Precision metric for regression problem!')

    # pandas.DataFrame
    X_train, X_test, y_train, y_test = split_frame(load_diabetes(as_frame=True).frame)

    default_args = {'random_state': 30}
    dt1 = DecisionTree(default_args=default_args)
    dt2 = DecisionTreeRegressor(**default_args)
    dt1.train(X_train, y_train, exp_type='regression')
    dt2.fit(X_train, y_train)
    dt1.eval(X_test, y_test, metrics=['r2'])
    actual_r2 = dt1.getMetric('r2')
    preds = dt2.predict(X_test)
    expected_r2 = r2_score(y_test, preds)
    self.assertEqual(actual_r2, expected_r2)
# Derive the working-time flag from the hour, then store the model's
# predictions on the validation frame.
X_valid["IsWorkingTime"] = X_valid.Hour.map(getWorkingorNonWorkingHoursOfDay)
X_valid["Prediction"] = model.predict(X_valid)
showPredictionValidation(y_train, y_test, X_test, X_valid, df_result)
print(mae(y_test, y_pred))
print(mse(y_test, y_pred))
print(r2(y_test, y_pred))

# (name, estimator) pairs to be compared.
pipelines = []
# =============================================================================
pipelines.append(('DSTR', DecisionTreeRegressor()))
pipelines.append(('GBM', GradientBoostingRegressor()))
pipelines.append(('RDMF', RandomForestRegressor()))
pipelines.append(('ADAB', AdaBoostRegressor()))
pipelines.append(('ETR', ExtraTreesRegressor()))
pipelines.append(('BAGR', BaggingRegressor()))
pipelines.append(('KNNR', KNeighborsRegressor(n_neighbors=7)))
#pipelines.append(('LR', LinearRegression()))
#pipelines.append(('Ridge', Ridge()))
#pipelines.append(('Lasso', Lasso()))
#pipelines.append(('SVR', SVR()))
## =============================================================================


# NOTE(review): the body of this function is not part of the visible source.
def apply_loocv(X_train, y_train, X_test, y_test):
def test_predict(self):
    """Verify DecisionTree.predict matches the plain scikit-learn estimator.

    Every scenario trains the wrapper and an identically configured
    scikit-learn tree on the first 120 rows of a dataset and requires the
    predictions for the remaining rows to be element-wise equal.  The first
    four scenarios use sklearn.Bunch data, the last three pandas DataFrames.
    """

    def bunch_split(loader):
        # Raw arrays straight from the Bunch.
        bunch = loader()
        return bunch.data[:120], bunch.target[:120], bunch.data[120:]

    def frame_split(loader):
        # DataFrame variant: everything except 'target' is a feature.
        frame = loader(as_frame=True).frame
        features = frame.loc[:, frame.columns != 'target']
        target = frame['target']
        return features.iloc[:120], target.iloc[:120], features.iloc[120:]

    # (splitter, dataset loader, baseline estimator, exp_type, arguments)
    scenarios = (
        (bunch_split, load_iris, DecisionTreeClassifier, 'classification',
         {'random_state': 19}),
        (bunch_split, load_boston, DecisionTreeRegressor, 'regression',
         {'random_state': 30}),
        (bunch_split, load_diabetes, DecisionTreeRegressor, 'regression',
         {'random_state': 15}),
        (bunch_split, load_wine, DecisionTreeClassifier, 'classification',
         {'random_state': 90, 'max_depth': 3}),
        (frame_split, load_iris, DecisionTreeClassifier, 'classification',
         {'random_state': 19, 'max_features': 'auto'}),
        (frame_split, load_diabetes, DecisionTreeRegressor, 'regression',
         {'random_state': 19, 'min_samples_split': 4}),
        (frame_split, load_wine, DecisionTreeClassifier, 'classification',
         {'random_state': 19, 'criterion': 'gini'}),
    )

    for splitter, loader, baseline_cls, exp_type, default_args in scenarios:
        # Datasets are loaded lazily, one scenario at a time.
        X_train, y_train, X_test = splitter(loader)
        wrapped = DecisionTree(default_args=default_args)
        baseline = baseline_cls(**default_args)
        wrapped.train(X_train, y_train, exp_type=exp_type)
        baseline.fit(X_train, y_train)
        ours = wrapped.predict(X_test)
        theirs = baseline.predict(X_test)
        self.assertTrue((ours == theirs).all())
# generate the data
import matplotlib.pyplot as plt
import random
import pandas

depth = int(input('Choose a number for tree depth: '))

# 50 random x values in [0, 10) and a noisy linear response y = 2x - 1 + noise
x = pandas.DataFrame([10 * random.random() for _ in range(50)])
y = 2 * x - 1 + pandas.DataFrame([random.random() for _ in range(50)])

# pick model
from sklearn.tree import DecisionTreeRegressor

model = DecisionTreeRegressor(max_depth=depth)
model.fit(x, y)

# plot the model together with the data
# Evaluate the fitted tree on the integers -1..11.
xfit = pandas.DataFrame(list(range(-1, 12)))
yfit = model.predict(xfit)
plt.scatter(x, y)
plt.plot(xfit, yfit)
plt.show()

# compute the R^2 score (on the training data itself)
print("R^2 score: {}".format(model.score(x,y)))
# Ridge regression with a cross-validated choice of alpha, scored first on
# the full data and then on the train/test split.
rm = RidgeCV(cv=5, alphas=[0.1, 1.0, 3, 10, 30, 100, 300, 1000, 3000])
rm.fit(X, y)
score = rm.score(X, y)
print score, rm.alpha_
print 'train / test score'
rm.fit(X_train, y_train)
score = rm.score(X_test, y_test)
print score, rm.alpha_
printProgress()
# ======================================================================================
print banner
print '7. Performing Decision Tree Regressor'
from sklearn.tree import DecisionTreeRegressor
# Same two-step scoring as above, with a decision tree regressor.
dtc = DecisionTreeRegressor(max_depth=10, min_samples_split=20)
dtc.fit(X, y)
mscore = dtc.score(X, y)
print mscore
print 'test train score'
dtc.fit(X_train, y_train)
mscore = dtc.score(X_test, y_test)
print mscore
printProgress()
# ======================================================================================
print banner
print '8. prepping the word data'
# NOTE(review): this call continues beyond the visible source.
cvec = CountVectorizer(stop_words='english', lowercase=True,
# Select the feature columns and convert features and target to arrays.
X = df[cols]
X = np.array(X)
y = np.array(y)

# Define classifiers to try: (clf, name) pairs
# (every estimator in this list is a regressor)
classifiers = [
    (LinearRegression(n_jobs=-1), 'LinearRegression'),
    (RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=0), "RandomForest"),
    (GradientBoostingRegressor(n_estimators=100, random_state=0), "GradientBoost"),
    (ExtraTreesRegressor(n_estimators=100, random_state=0), "ExtraTrees"),
    (DecisionTreeRegressor(random_state=0), "DecisionTrees"),
    (BaggingRegressor(n_estimators=100, n_jobs=-1, random_state=0), "Bagging"),
    (AdaBoostRegressor(n_estimators=100, random_state=0), "AdaBoost")  # ,
    # (XGBRegressor(n_estimators=100, n_jobs=-1, randomstate=0), "XGBoost")
]

######## SQUID Prediction
# Store all ROC curves here:
squid_rocs = []
for clf, name in classifiers:
    print("Evaluating %s classifier (squid)" % name)
    # NOTE(review): this call continues beyond the visible source.
    mae, r2 = cross_validate_and_plot(clf, X, y, cols, name + "_squid",
start: float = time.time()
for j, i in enumerate(x_axis):
    # Fresh random split and fresh random seeds on every iteration.
    [X_train, X_test, y_train, y_test] = train_test_split(X, y, test_size=.2, random_state=randint(0, 1000))
    # Shallow base learner; the min_samples_* values are fractions.
    tree_aux: DecisionTreeRegressor = DecisionTreeRegressor(
        criterion='squared_error', splitter='best', max_depth=2,
        min_samples_split=1 * 1e-3, min_samples_leaf=100 * 1e-3,
        min_weight_fraction_leaf=0, max_features='auto',
        random_state=randint(0, 1000), max_leaf_nodes=None,
        min_impurity_decrease=0, ccp_alpha=0)
    model: AdaBoostRegressor = AdaBoostRegressor(
        base_estimator=tree_aux, n_estimators=30, learning_rate=100 * 1e-3,
        loss='square', random_state=randint(0, 1000)).fit(X_train, y_train)
    # The first iteration seeds `cofs` with the feature importances.
    # NOTE(review): the `else` branch is not part of the visible source.
    if cofs is None:
        cofs = model.feature_importances_
    else:
# Project X onto the principal components and inspect the result.
principalComp = pca.fit_transform(X)
print(pca.explained_variance_ratio_)
principalDf = pd.DataFrame(data=principalComp, columns=['c1', 'c2', 'c3', 'c4'])
print(principalDf.head())

kf = KFold(20)
Xs_array = Xs.values
Y_array = Y.values
# NOTE(review): the extent of this loop was reconstructed from a flattened
# source. As laid out here only the split assignments are inside it, so the
# models below see the last fold only -- confirm against the original.
for a, b in kf.split(Xs_array):
    X_train, X_test = Xs_array[a], Xs_array[b]
    y_train, y_test = Y_array[a], Y_array[b]

lr = LinearRegression()
DT = DecisionTreeRegressor()
RF = RandomForestRegressor()
GB = GradientBoostingRegressor()
NN = MLPRegressor(hidden_layer_sizes=(100, 8), random_state=1)

inp = input("Do you want to fit the models " + asking[1])
if inp == 'y':
    model1 = lr.fit(X_train, y_train)
    model2 = DT.fit(X_train, y_train)
    model3 = RF.fit(X_train, y_train)
    model4 = GB.fit(X_train, y_train)
    # NOTE(review): unlike the other models, the MLP is fitted on the full
    # arrays rather than on X_train / y_train -- confirm this is intended.
    model5 = NN.fit(Xs_array, Y_array)
    print("Accuracy Score of Linear regression on train set", model1.score(X_train, y_train) * 100)
    print("Accuracy Score of Decision Tree on train set", model2.score(X_train, y_train) * 100)
# -*- coding: utf-8 -*-
"""
@author: user
"""
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

df = pd.read_csv("decision tree regression dataset.csv", sep=";", header=None)

# First column is the feature, second column the target; both are
# reshaped to a single column.
x = df.iloc[:, 0].values.reshape(-1, 1)
y = df.iloc[:, 1].values.reshape(-1, 1)

#%% decision tree regression
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()  # random state = 0
tree_reg.fit(x, y)

# predict() expects a 2-D array of shape (n_samples, n_features); a bare
# scalar is rejected.
tree_reg.predict([[5.5]])

# x.min()/x.max() give scalar bounds; the builtin min()/max() on a 2-D
# array return 1-element arrays instead.
x_ = np.arange(x.min(), x.max(), 0.01).reshape(-1, 1)
y_head = tree_reg.predict(x_)

# %% visualize
plt.scatter(x, y, color="red")
plt.plot(x_, y_head, color="green")
plt.xlabel("tribun level")
plt.ylabel("ucret")
plt.show()
filtered_melbourne_data = melbourne_data.dropna(axis=0) # Choose target and predictors y = filtered_melbourne_data.Price melbourne_predictors = ['Rooms', 'Bathroom', 'Landsize', 'BuildingArea', 'YearBuilt', 'Lattitude', 'Longtitude'] X = filtered_melbourne_data[melbourne_predictors] # We then create the Decision tree model with this code: # In[ ]: from sklearn.tree import DecisionTreeRegressor # Define model melbourne_model = DecisionTreeRegressor() # Fit model melbourne_model.fit(X, y) # The calculation of mean absolute error in the Melbourne data is # In[ ]: from sklearn.metrics import mean_absolute_error predicted_home_prices = melbourne_model.predict(X) mean_absolute_error(y, predicted_home_prices)
mse_bins_store = [] # Monte Carlo cross validation (MCCV) loop for rrr in range(50): # Resample validation set (uniform distribution) train_indices, test_indices = resreg.uniform_test_split(X, y, bins=bins, bin_test_size=70, verbose=False, random_state=rrr) X_train, y_train = X[train_indices, :], y[train_indices] X_test, y_test = X[test_indices, :], y[test_indices] # Unpack hyperparameters, resample training data, and fit regressors reg = DecisionTreeRegressor(random_state=rrr) if 'REBAGG' in strategy else \ RandomForestRegressor(n_estimators=10, n_jobs=-1, random_state=rrr) if strategy == 'RO': cl, ch, sample_method = param relevance = resreg.sigmoid_relevance(y_train, cl=cl, ch=ch) X_train, y_train = resreg.random_oversample(X_train, y_train, relevance, relevance_threshold=0.5, over=sample_method, random_state=rrr) reg.fit(X_train, y_train) elif strategy == 'SMOTER': cl, ch, sample_method, k = param
from scipy.sparse import csr_matrix # from polylearn import PolynomialNetworkRegressor from sklearn.neighbors import KNeighborsRegressor from sklearn.dummy import DummyRegressor from sklearn.ensemble import BaggingRegressor, AdaBoostRegressor, GradientBoostingRegressor from sklearn.tree import DecisionTreeRegressor from math import sqrt regr = LinearSVR(random_state=0) svr_lin = SVR(kernel='linear', C=1e3) fm = pylibfm.FM() knr = KNeighborsRegressor(n_neighbors=10) dr = DummyRegressor() bagrgr = BaggingRegressor() dtreergr = DecisionTreeRegressor() adabregr = AdaBoostRegressor() gradbregr = GradientBoostingRegressor() def validate(X, y): print "Starting cross validation" scores = cross_val_score(knr, X, y, scoring='neg_mean_squared_error', cv=3) return scores if __name__ == "__main__": import music train_examples = music.load_examples('data/train.pkl') # poly = PolynomialNetworkRegressor(degreex=3, n_components=2, tol=1e-3, warm_start=True, random_state=0)
# The first 5395 rows are the training set; the remaining rows are
# used as the test set below.
trainDataFinal = unigrams[:5395, :]
trainTargetFinal = traintarget[:5395]

#use test_set_all_instances.csv and bestFeaturesIndex to create testData, testTarget
#First Pass get just text for unigram extraction...
print("working on 2nd unigrams..")

#Create unigram + features initial array
# Hard-coded shape: 2847 test rows by 25559 columns.
unigramsAndFeaturesTest = np.zeros((2847, 25559))
unigramsAndFeaturesTest[:, :] = unigrams[5395:, :]
testData = unigramsAndFeaturesTest
testTarget = traintarget[5395:]

# Impute missing values, then fit a depth-5 regression tree.
estimator = Pipeline([("imputer", Imputer()), ("treeReg", DecisionTreeRegressor(max_depth=5))])
estimator.fit(trainDataFinal, trainTargetFinal)
predicted = estimator.predict(testData)
mseScore = mean_squared_error(testTarget, predicted)
print("mseScore: " + str(mseScore))

#F-scores calculations
#convert test scores to categorical values
# Below -1 is 'disagree', above 1 is 'agree', anything else 'neutral'.
testTargetCategorical = []
for val in testTarget:
    if val < -1:
        testTargetCategorical.append('disagree')
    elif val > 1:
        testTargetCategorical.append('agree')
    else:
        testTargetCategorical.append('neutral')
#Preparing result yTest = pd.DataFrame(ytest) yPred = pd.DataFrame(ypred) yTest.index = yPred.index result = pd.concat((yTest, yPred), axis=1) ################################################################################################## #2.Fitting DECISION TREE REGRESSION from sklearn.tree import DecisionTreeRegressor DT_regressor = DecisionTreeRegressor() DT_regressor.fit(xtrain, ytrain) DT_regressor.score(xtrain, ytrain) DT_ypred = DT_regressor.predict(xtest) #Converting back to original from feature scale ytest = scalery.inverse_transform(ytest) ypred = scalery.inverse_transform(DT_ypred) #Preparing result yTest = pd.DataFrame(ytest)
# Target variable: the sale price column of the training data.
train_y = train_file['SalePrice']
#test_y=test_file.SalePrice

# Feature engineering: a hand-picked subset of columns to practice how a
# decision tree model works.
features = [
    'LotArea',
    'YearBuilt',
    '1stFlrSF',
    '2ndFlrSF',
    'FullBath',
    'BedroomAbvGr',
    'TotRmsAbvGrd',
]
test_X = test_file[features]
train_X = train_file[features]
#test_file.columns

# In[22]:

# Decision tree model with a fixed seed.
model = DecisionTreeRegressor(random_state=1)
model.fit(train_X, train_y)

# In[25]:

# Predictions for the test features.
test_predictions = model.predict(test_X)
test_predictions

# In[29]:

# Data frame pairing each test Id with its predicted sale price.
output = pd.DataFrame({'Id': test_file['Id'], 'SalePrice': test_predictions})

# In[31]:
# Decision Tree Regression

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset: every column between the first and the last is a
# feature (kept 2-D by the column slice); the last column is the target.
dataset = pd.read_csv('../../data/Position_Salaries.csv')
X = dataset.iloc[:, 1:-1].values
y = dataset.iloc[:, -1].values

# Training the Decision Tree Regression model on the whole dataset
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(X, y)

# Predicting a new result
regressor.predict([[6.5]])

# Visualising the Decision Tree Regression results (higher resolution)
# X is 2-D, so the builtin min()/max() would return one-element arrays rather
# than scalars; X.min()/X.max() give plain scalar bounds for np.arange.
X_grid = np.arange(X.min(), X.max(), 0.01)
X_grid = X_grid.reshape((len(X_grid), 1))
plt.scatter(X, y, color='red')
plt.plot(X_grid, regressor.predict(X_grid), color='blue')
plt.title('Truth or Bluff (Decision Tree Regression)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()
# Predict the class for four randomly sampled rows.
newData1 = X1.sample(4)
newData1
clsModel.predict(newData1)

#%% Regression Tree
# Regression: predict mpg (a numerical value) on the basis of am, hp, wt.
fnames = ['am', 'hp', 'wt']
X2 = df[fnames]
Y2 = df[['mpg']]
np.mean(Y2)

from sklearn.tree import DecisionTreeRegressor

X2.shape
# Hold out 20% of the rows for testing.
X2_train, X2_test, Y2_train, Y2_test = train_test_split(X2, Y2, test_size=0.2)
X2_train.shape
X2_test.shape

# Model with default parameters.
regrModel = DecisionTreeRegressor()
regrModel.fit(X2_train, Y2_train)

# Visualise: text dump of the fitted tree.
text_representation = tree.export_text(regrModel)
print(text_representation)

# Full tree on a large canvas.
fig = plt.figure(figsize=(40, 30))
tree.plot_tree(regrModel, feature_names=fnames, filled=True)
plt.show()

# Top of the tree only (max_depth=2), with node ids and a larger font.
fig = plt.figure(figsize=(20, 10))
tree.plot_tree(
    regrModel,
    feature_names=fnames,
    filled=True,
    max_depth=2,
    fontsize=20,
    node_ids=True,
)
plt.show()

# Mean mpg of the training rows with hp <= 92.
Y2_train[X2_train['hp'] <= 92].agg({'mpg': np.mean})
# %%
# Elastic Net: build and fit.
regressor = fittingModel(ElasticNet(), X_train, y_train)

# %%
# Predict on both splits, then validate and display.
pred_train, pred_test = predictValues(regressor, X_train, X_test)

# %%
validatingResults(pred_train, pred_test, y_train, y_test)

# %%
displayResults(y_test, pred_test)

# %%
# Decision Tree: same steps with a fixed seed.
regressor = fittingModel(DecisionTreeRegressor(random_state=0), X_train, y_train)

# %%
pred_train, pred_test = predictValues(regressor, X_train, X_test)

# %%
validatingResults(pred_train, pred_test, y_train, y_test)

# %%
displayResults(y_test, pred_test)

# %%
# Random forest: 30 trees, fixed seed.
regressor = fittingModel(
    RandomForestRegressor(n_estimators=30, random_state=0), X_train, y_train
)