def model_complexity(X_train, y_train, X_test, y_test): """Calculate the performance of the model as model complexity increases.""" print "Model Complexity: " # We will vary the depth of decision trees from 2 to 25 max_depth = np.arange(1, 25) train_err = np.zeros(len(max_depth)) test_err = np.zeros(len(max_depth)) for i, d in enumerate(max_depth): # Setup a Decision Tree Regressor so that it learns a tree with depth d regressor = DecisionTreeRegressor(max_depth=d) # Fit the learner to the training data regressor.fit(X_train, y_train) # Find the performance on the training set train_err[i] = performance_metric(y_train, regressor.predict(X_train)) # Find the performance on the testing set test_err[i] = performance_metric(y_test, regressor.predict(X_test)) # Plot the model complexity graph model_complexity_graph(max_depth, train_err, test_err)
def test_rt():
    """Compare RegressionTree with the scikit-learn tree and smoke-test its explanations."""
    dataset = load_boston()
    X = dataset.data
    y = dataset.target

    reference = SKRT(random_state=1, max_depth=3)
    candidate = RegressionTree(feature_names=dataset.feature_names,
                               random_state=1)

    reference.fit(X, y)
    candidate.fit(X, y)

    # Both trees must produce the same predictions on the training data.
    reference_pred = reference.predict(X)
    candidate_pred = candidate.predict(X)
    assert np.allclose(reference_pred, candidate_pred)

    # Local explanations: first with labels, then without.
    for explain_args in ((X, y), (X,)):
        local_viz = candidate.explain_local(*explain_args).visualize(0)
        assert local_viz is not None

    # Global explanation.
    global_viz = candidate.explain_global().visualize()
    assert global_viz is not None
def nn_lin(self, testX, neighbors):
    """Estimate a target for ``testX`` as the mean target of its neighbors.

    Args:
        testX: The query sample. It is not used by this estimate and is
            kept only so the signature stays unchanged.
        neighbors: Index (or sequence of indices) selecting the entries of
            ``self.Y`` to average.

    Returns:
        ``np.mean(self.Y[neighbors])``.
    """
    # The earlier body constructed a DecisionTreeRegressor and carried a
    # fit/predict sequence *after* this return. That code could never run,
    # so it was removed together with the unused estimator; the returned
    # value is unchanged.
    return np.mean(self.Y[neighbors])
def test_thresholded_scorers():
    """Test scorers that take thresholds (continuous scores, not labels)."""
    # Two-class problem.
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    logistic = LogisticRegression(random_state=0)
    logistic.fit(X_train, y_train)

    # The roc_auc scorer must agree with roc_auc_score computed from either
    # the decision function or the positive-class probability.
    scorer_auc = get_scorer('roc_auc')(logistic, X_test, y_test)
    decision_auc = roc_auc_score(y_test, logistic.decision_function(X_test))
    proba_auc = roc_auc_score(y_test, logistic.predict_proba(X_test)[:, 1])
    assert_almost_equal(scorer_auc, decision_auc)
    assert_almost_equal(scorer_auc, proba_auc)

    # The log_loss scorer reports the negated loss.
    scorer_logloss = get_scorer('log_loss')(logistic, X_test, y_test)
    direct_logloss = log_loss(y_test, logistic.predict_proba(X_test))
    assert_almost_equal(-scorer_logloss, direct_logloss)

    # Same check for an estimator without decision_function.
    tree_clf = DecisionTreeClassifier()
    tree_clf.fit(X_train, y_train)
    scorer_auc = get_scorer('roc_auc')(tree_clf, X_test, y_test)
    proba_auc = roc_auc_score(y_test, tree_clf.predict_proba(X_test)[:, 1])
    assert_almost_equal(scorer_auc, proba_auc)

    # A regressor (no decision_function) is scored through predict().
    tree_reg = DecisionTreeRegressor()
    tree_reg.fit(X_train, y_train)
    scorer_auc = get_scorer('roc_auc')(tree_reg, X_test, y_test)
    predict_auc = roc_auc_score(y_test, tree_reg.predict(X_test))
    assert_almost_equal(scorer_auc, predict_auc)

    # More than two classes must raise.
    X, y = make_blobs(random_state=0, centers=3)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    tree_clf.fit(X_train, y_train)
    assert_raises(ValueError, get_scorer('roc_auc'), tree_clf, X_test, y_test)
def learning_curve(depth, X_train, y_train, X_test, y_test): """Calculate the performance of the model after a set of training data.""" # We will vary the training set size so that we have 50 different sizes sizes = np.round(np.linspace(1, len(X_train), 50)) train_err = np.zeros(len(sizes)) test_err = np.zeros(len(sizes)) sizes = [int(ii) for ii in sizes] print "Decision Tree with Max Depth: " print depth for i, s in enumerate(sizes): # Create and fit the decision tree regressor model regressor = DecisionTreeRegressor(max_depth=depth) regressor.fit(X_train[:s], y_train[:s]) # Find the performance on the training and testing set train_err[i] = performance_metric(y_train[:s], regressor.predict(X_train[:s])) test_err[i] = performance_metric(y_test, regressor.predict(X_test)) # Plot learning curve graph learning_curve_graph(sizes, train_err, test_err)
def model_complexity(X_train, y_train, X_test, y_test): """Calculate the performance of the model as model complexity increases.""" ### now we are using all the training data and seeing how the model complexity affects ### performance ### before we were holding the complexity constant and measuring performance as the training ### data increased print "Model Complexity: " # We will vary the depth of decision trees from 2 to 25 max_depth = np.arange(1, 25) train_err = np.zeros(len(max_depth)) test_err = np.zeros(len(max_depth)) x = [11.95, 0.00, 18.100, 0, 0.6590, 5.6090, 90.00, 1.385, 24, 680.0, 20.20, 332.09, 12.13] print "len of training set = ",len(y_train) for i, d in enumerate(max_depth): # Setup a Decision Tree Regressor so that it learns a tree with depth d regressor = DecisionTreeRegressor(max_depth=d,random_state = 0) # Fit the learner to the training data regressor.fit(X_train, y_train) print d,regressor.predict(x) # Find the performance on the training set train_err[i] = performance_metric(y_train, regressor.predict(X_train)) # Find the performance on the testing set test_err[i] = performance_metric(y_test, regressor.predict(X_test)) # Plot the model complexity graph model_complexity_graph(max_depth, train_err, test_err)
def learn_regression_tree_ensemble(img_features, gt_illuminants, num_trees, max_tree_depth):
    """Train ``num_trees`` groups of per-feature regression-tree pairs.

    Every sample is turned into ``[features, r, g]`` where ``r`` and ``g``
    are the first and second illuminant components divided by the component
    sum (plus a small epsilon). Samples are sorted by ``r``. Each of the
    ``num_trees`` models emphasises one overlapping window of that ordering
    by repeating the window's samples ``num_trees`` times before appending
    all remaining samples. For every feature index, one tree is fitted to
    ``r`` and one to ``g``.

    Returns:
        A list with one entry per window; each entry is a list of
        ``[tree_r, tree_g]`` pairs, one pair per feature index.
    """
    eps = 0.001

    # One record per sample: [features, r chromaticity, g chromaticity].
    samples = []
    for idx, feats in enumerate(img_features):
        illum = gt_illuminants[idx]
        r_chroma = illum[0] / (sum(illum) + eps)
        g_chroma = illum[1] / (sum(illum) + eps)
        samples.append([feats, r_chroma, g_chroma])

    # Sort by r chromaticity.
    samples.sort(key=lambda rec: rec[1])

    n_samples = len(samples)
    stride = int(np.ceil(n_samples / float(num_trees + 1)))
    window = 2 * stride

    ensemble = []
    for tree_idx in range(num_trees):
        start = tree_idx * stride
        stop = min(start + window, n_samples)

        # The local window is weighted by repeating it num_trees times;
        # every sample outside the window is added once afterwards.
        inside = [samples[pos] for pos in range(start, stop)]
        outside = [rec for pos, rec in enumerate(samples)
                   if not (start <= pos < stop)]
        weighted = num_trees * inside + outside

        X = [rec[0] for rec in weighted]
        y_r = [rec[1] for rec in weighted]
        y_g = [rec[2] for rec in weighted]

        local_model = []
        for feature_idx in range(len(X[0])):
            # Training input for this feature: element 0 of the feature entry.
            column = [el[feature_idx][0] for el in X]

            tree_r = DecisionTreeRegressor(max_depth=max_tree_depth, random_state=1234)
            tree_r.fit(column, y_r)

            tree_g = DecisionTreeRegressor(max_depth=max_tree_depth, random_state=1234)
            tree_g.fit(column, y_g)

            local_model.append([tree_r, tree_g])

        ensemble.append(local_model)

    return ensemble
def learning_curve(depth, X_train, y_train, X_test, y_test): """Calculate the performance of the model after a set of training data.""" # We will vary the training set size so that we have 50 different sizes sizes = np.round(np.linspace(1, len(X_train), 50)) train_err = np.zeros(len(sizes)) test_err = np.zeros(len(sizes)) print "Decision Tree with Max Depth: " print depth for i, s in enumerate(sizes): # Create and fit the decision tree regressor model regressor = DecisionTreeRegressor(max_depth=depth) regressor.fit(X_train[:s], y_train[:s]) # Find the performance on the training and testing set train_err[i] = performance_metric(y_train[:s], regressor.predict(X_train[:s])) test_err[i] = performance_metric(y_test, regressor.predict(X_test)) # if depth >= 4 and depth <= 6: # pl.figure() # pl.plot(y_test, 'bo') # pl.plot(regressor.predict(X_test), color='red') # pl.savefig("test_data_depth_" + str(depth)) # Plot learning curve graph learning_curve_graph(sizes, train_err, test_err, depth)
def learning_curve(depth, X_train, y_train, X_test, y_test): """Calculate the performance of the model after a set of training data for a given depth or complexity.""" # We will vary the training set size so that we have 50 different sizes sizes = np.round(np.linspace(1, len(X_train), 50)) train_err = np.zeros(len(sizes)) test_err = np.zeros(len(sizes)) x = [11.95, 0.00, 18.100, 0, 0.6590, 5.6090, 90.00, 1.385, 24, 680.0, 20.20, 332.09, 12.13] print "Decision Tree with Max Depth: " # print depth for i, s in enumerate(sizes): # iterate thru 50 different training set sizes # Create and fit the decision tree regressor model for the size of the training # set given and the depth of the decision tree (I assume the depth of the tree indicates # the complexity of the model, i.e as the depth grows the complexity grows) regressor = DecisionTreeRegressor(max_depth=depth,random_state=0) regressor.fit(X_train[:s], y_train[:s]) # Find the performance on the training and testing set train_err[i] = performance_metric(y_train[:s], regressor.predict(X_train[:s])) test_err[i] = performance_metric(y_test, regressor.predict(X_test)) #print out the prediction for the full training set for that depth print depth,s,regressor.predict(x) # Plot learning curve graph learning_curve_graph(sizes, train_err, test_err)
def fit_model1(X, y):
    """Grid-search ``max_depth`` for a decision tree regressor on ``[X, y]``.

    Depths 1 through 10 are evaluated with ``performance_metric`` (wrapped
    by ``make_scorer``) over 10 shuffled splits that each hold out 20% of
    the rows. The best estimator found by the search is returned.
    """
    # The regressor is fitted once on all the data before being handed to
    # the grid search, exactly as the original code did.
    tree = DecisionTreeRegressor()
    tree.fit(X, y)

    search = GridSearchCV(
        tree,
        {'max_depth': tuple(range(1, 11))},
        scoring=make_scorer(performance_metric),
        cv=ShuffleSplit(X.shape[0], n_iter=10, test_size=0.20, random_state=0),
    )

    # Fitting the search computes the optimal model; fit() returns the
    # search object itself.
    fitted_search = search.fit(X, y)
    return fitted_search.best_estimator_
def train_learning_model_decision_tree_ada_boost(df):
    """Compare a single depth-6 tree with an AdaBoost ensemble of such trees.

    ``df`` is run through ``preprocess_data`` and ``split_data``; both
    models are fitted on the training split and their mean squared error on
    the test and training splits is printed.
    """
    X_all, y_all = preprocess_data(df)
    X_train, X_test, y_train, y_test = split_data(X_all, y_all)

    single_tree = DecisionTreeRegressor(max_depth=6)
    boosted = AdaBoostRegressor(
        DecisionTreeRegressor(max_depth=6),
        n_estimators=500,
        learning_rate=0.01,
        random_state=1,
    )
    for model in (single_tree, boosted):
        model.fit(X_train, y_train)

    # Test-split error first, then training-split error, tree before ensemble.
    test_mse_tree = mean_squared_error(y_test, single_tree.predict(X_test))
    test_mse_ada = mean_squared_error(y_test, boosted.predict(X_test))
    train_mse_tree = mean_squared_error(y_train, single_tree.predict(X_train))
    train_mse_ada = mean_squared_error(y_train, boosted.predict(X_train))

    print ("MSE tree: %.4f " % test_mse_tree)
    print ("MSE ada: %.4f " % test_mse_ada)
    print ("MSE tree train: %.4f " % train_mse_tree)
    print ("MSE ada train: %.4f " % train_mse_ada)
def CART(self):
    """Fit a CART regression tree on ``self.train`` and predict ``self.test``.

    When ``self.smoteit`` is set, ``self.train`` is first replaced by the
    output of ``SMOTE``. With ``self.tuning`` unset a default tree
    (``random_state=1``) is used; otherwise the tree is configured from
    ``self.tunings``:

        tunings[0] -> max_depth
        tunings[1] -> min_samples_split
        tunings[2] -> min_samples_leaf
        tunings[3] -> max_features, given as a percentage
        tunings[4] -> max_leaf_nodes

    All columns except the last two are used as features and the
    second-to-last column is the target.

    Returns:
        The predictions for ``self.test`` as a plain list.
    """
    if self.smoteit:
        self.train = SMOTE(
            self.train, atleast=50, atmost=101, resample=self.duplicate)

    if not self.tuning:
        clf = DecisionTreeRegressor(random_state=1)
    else:
        # 'entropy' is a classification criterion and is not accepted by
        # DecisionTreeRegressor, so the regressor's default criterion is used.
        clf = DecisionTreeRegressor(
            max_depth=int(self.tunings[0]),
            min_samples_split=int(self.tunings[1]),
            min_samples_leaf=int(self.tunings[2]),
            # Convert to float *before* dividing so an integer percentage is
            # not floored to 0 by integer division.
            max_features=float(self.tunings[3]) / 100,
            max_leaf_nodes=int(self.tunings[4]),
            random_state=1)

    features = self.train.columns[:-2]
    klass = self.train[self.train.columns[-2]]
    clf.fit(self.train[features].astype('float32'), klass.astype('float32'))

    preds = clf.predict(
        self.test[self.test.columns[:-2]].astype('float32')).tolist()
    return preds
def fit_predict_model(city_data): '''Find and tune the optimal model. Make a prediction on housing data.''' # Get the features and labels from the Boston housing data X, y = city_data.data, city_data.target print X # Setup a Decision Tree Regressor regressor = DecisionTreeRegressor() parameters = {'max_depth':(1,2,3,4,5,6,7,8,9,10)} reg = GridSearchCV(regressor, parameters,scoring=make_scorer(metrics.mean_squared_error,greater_is_better=False)) print reg.fit(X, y) depth_values= list() for i in xrange(101): reg.fit(X,y) depth_values.append(int(reg.best_params_['max_depth'])) print "Best model parameter: " + str(np.median(depth_values)) # Fit the learner to the training data # Use the model to predict the output of a particular sample regressor = DecisionTreeRegressor(max_depth=np.median(depth_values)) print "Final Model: " print regressor regressor.fit(X, y) x = [11.95, 0.00, 18.100, 0, 0.6590, 5.6090, 90.00, 1.385, 24, 680.0, 20.20, 332.09, 12.13] y = regressor.predict(x) print "House: " + str(x) print "Prediction: " + str(y)
def test_decision_tree_regression(filename): start_time = time.time() scores = [] from sklearn.tree import DecisionTreeRegressor df = pd.read_csv(filename) h_indep = df.columns[:-1] h_dep = df.columns[-1] for _ in xrange(10): # print "- ", sys.stdout.flush() msk = np.random.rand(len(df)) < 0.4 train_data = df[msk] test_data = df[~msk] # print len(train_data), len(test_data) assert (len(train_data) + len(test_data) == len(df)), "Something is wrong" train_indep = train_data[h_indep] train_dep = train_data[h_dep] test_indep = test_data[h_indep] test_dep = test_data[h_dep] dt = DecisionTreeRegressor() dt.fit(train_indep, [i for i in train_dep.values.tolist()]) prediction = dt.predict(test_indep) from sklearn.metrics import mean_absolute_error scores.append(mean_absolute_error(test_dep, prediction)) # print len(confusion_matrices), extract_name = filename.split("/")[-1].split(".")[0] + ".p" # import pickle # pickle.dump(confusion_matrices, open("./Results_RF_Classification/CM_" + extract_name, "wb")) print round(np.mean(scores), 3), round(time.time() - start_time, 3), "sec"
def arbolesRegresion(caract): clf = DecisionTreeRegressor(min_samples_leaf=10, min_samples_split=15, max_depth=13, compute_importances=True) importancias = [0,0,0,0,0,0,0,0,0,0,0,0,0] mae=mse=r2=0 kf = KFold(len(boston_Y), n_folds=10, indices=True) for train, test in kf: trainX, testX, trainY, testY=boston_X[train], boston_X[test], boston_Y[train], boston_Y[test] nCar=len(caract) train=np.zeros((len(trainX), nCar)) test=np.zeros((len(testX), nCar)) trainYNuevo=trainY for i in range(nCar): for j in range(len(trainX)): train[j][i]=trainX[j][caract[i]] for k in range(len(testX)): test[k][i]=testX[k][caract[i]] trainYNuevo=np.reshape(trainYNuevo, (len(trainY), -1)) clf.fit(train, trainYNuevo) prediccion=clf.predict(test) # clf.fit(trainX, trainY) # prediccion=clf.predict(testX) mae+=metrics.mean_absolute_error(testY, prediccion) mse+=metrics.mean_squared_error(testY, prediccion) r2+=metrics.r2_score(testY, prediccion) feature_importance = clf.feature_importances_ feature_importance = 100.0 * (feature_importance / feature_importance.max()) for i in range(13): importancias[i] = importancias[i] + feature_importance[i] print 'Error abs: ', mae/len(kf), 'Error cuadratico: ', mse/len(kf), 'R cuadrado: ', r2/len(kf) for i in range(13): importancias[i] = importancias[i]/10 sorted_idx = np.argsort(importancias) pos = np.arange(sorted_idx.shape[0]) + .5 importancias = np.reshape(importancias, (len(importancias), -1)) boston = datasets.load_boston() pl.barh(pos, importancias[sorted_idx], align='center') pl.yticks(pos, boston.feature_names[sorted_idx]) pl.xlabel('Importancia relativa') pl.show() import StringIO, pydot dot_data = StringIO.StringIO() tree.export_graphviz(clf, out_file=dot_data) graph = pydot.graph_from_dot_data(dot_data.getvalue()) graph.write_pdf("bostonTree.pdf")
def learning_curve(depth, X_train, y_train, X_test, y_test): """Calculate the performance improvement of the model, as training size increases.""" # create 50 equally spaced markers for the the graph's X axis sizes = np.round(np.linspace(1, len(X_train), 50)) # create 50 open bins to fill in the training and test errors train_err = np.zeros(len(sizes)) test_err = np.zeros(len(sizes)) print "Decision Tree with Max Depth: " print depth for i, s in enumerate(sizes): # train classifier and test on each level of depth complexity regressor = DecisionTreeRegressor(max_depth=depth) regressor.fit(X_train[:s], y_train[:s]) # fill in the training and test error train_err[i] = performance_metric(y_train[:s], regressor.predict(X_train[:s])) test_err[i] = performance_metric(y_test, regressor.predict(X_test)) # create the learning curve graph, using the calculated information learning_curve_graph(sizes, train_err, test_err) return test_err[-1]
def learning_curve(depth, X_train, y_train, X_test, y_test, iteration=None): """Calculate the performance of the model after a set of training data.""" # We will vary the training set size so that we have 50 different sizes sizes = np.linspace(1, len(X_train), 50) train_err = np.zeros(len(sizes)) test_err = np.zeros(len(sizes)) print "Decision Tree with Max Depth: " print depth for i, s in enumerate(sizes): # Create and fit the decision tree regressor model regressor = DecisionTreeRegressor(max_depth=depth) regressor.fit(X_train[:s], y_train[:s]) # Find the performance on the training and testing set train_err[i] = performance_metric(y_train[:s], regressor.predict(X_train[:s])) test_err[i] = performance_metric(y_test, regressor.predict(X_test)) # Plot learning curve graph learning_curve_graph(depth, sizes, train_err, test_err) # added to produce figure 2 if iteration is not None: print "Final error at max_depth={}: {}".format(depth, test_err[-1]) fully_trained_error[depth - 1][iteration] = test_err[-1]
def test(): labeled_data, unlabeled_data = load_all_data() bestData = load_data(BEST_SUBMISSION_PATH) bestY = bestData.y[1:].astype(np.float) #bestY = scale(bestY) #labeled_data.y = scale(labeled_data.y) for x in xrange(5, 6): clf = DecisionTreeRegressor() clf.fit(labeled_data.X, labeled_data.y) newData = Data() newData.X = np.append(labeled_data.X, unlabeled_data[0:95000], axis=0) newData.y = np.append(labeled_data.y, bestY[0:95000]) for i in xrange(x): clf = RandomForestRegressor() clf.fit(newData.X, newData.y) n, d = unlabeled_data.shape indices = np.random.choice(np.array(range(n)), size=5000, replace=False) labels = clf.predict(unlabeled_data[indices]) newData.X = np.append(labeled_data.X, unlabeled_data[indices], axis=0) newData.y = np.append(labeled_data.y, labels) print rmse(bestY, clf.predict(unlabeled_data)) saveRevenues(clf.predict(unlabeled_data))
def plot_curve(): # Defining our regression algorithm reg = DecisionTreeRegressor() # Fit our model using X and y reg.fit(X, y) print "Regressor score: {:.4f}".format(reg.score(X,y)) # TODO: Use learning_curve imported above to create learning curves for both the # training data and testing data. You'll need reg, X, y, cv and score from above. # Note: Because i didnt use all the parameters in order of function definition for learning_curve fn, # I have to explicitly assign values to the parameters. e.g, from learning_curve fn, after 'y' # comes 'train_sizes'. But since it is optional and I am not using that parameter, for all other parameters # that come after, i have to explicitly assign values to the parameter (e.g cv=cv, scoring=score) # else error train_sizes, train_scores, test_scores = learning_curve(reg, X, y, cv=cv, scoring=score) # Taking the mean of the test and training scores train_scores_mean = np.mean(train_scores,axis=1) test_scores_mean = np.mean(test_scores,axis=1) # Plotting the training curves and the testing curves using train_scores_mean and test_scores_mean plt.plot(train_sizes ,train_scores_mean,'-o',color='b',label="train_scores_mean") plt.plot(train_sizes,test_scores_mean ,'-o',color='r',label="test_scores_mean") # Plot aesthetics plt.ylim(-0.1, 1.1) plt.ylabel("Curve Score") plt.xlabel("Training Points") plt.legend(bbox_to_anchor=(1.1, 1.1)) plt.show()
def train_decision_tree(sizes, depth, X_test, X_train, y_test, y_train):
    """Score a depth-limited decision tree for each training-set size.

    For every entry of ``sizes`` a ``DecisionTreeRegressor`` with
    ``max_depth=depth`` is fitted on the first ``int(size)`` training rows
    and scored with ``performance_metric``.

    Args:
        sizes (Numpy array): Array of training sample sizes to train on.
        depth (int): The maximum depth of the DecisionTreeRegressor
        X_test (Numpy array): Test set features
        X_train (Numpy array): Training set features
        y_test (Numpy array): Test set target variable
        y_train (Numpy array): Training set target variable

    Returns:
        test_err (Numpy array): ``performance_metric`` on the test set,
            one value per size.
        train_err (Numpy array): ``performance_metric`` on the training
            prefix, one value per size.
    """
    train_err = np.zeros(len(sizes))
    test_err = np.zeros(len(sizes))

    for i, s in enumerate(sizes):
        # Slice bounds must be integers. Previously only the fit() slices
        # were cast, while the scoring slices still used the raw value.
        n_rows = int(s)

        # Create and fit the decision tree regressor model
        regressor = DecisionTreeRegressor(max_depth=depth)
        regressor.fit(X_train[:n_rows], y_train[:n_rows])

        # Find the performance on the training and testing set
        train_err[i] = performance_metric(
            y_train[:n_rows], regressor.predict(X_train[:n_rows]))
        test_err[i] = performance_metric(y_test, regressor.predict(X_test))

    return test_err, train_err
def test_boston(self):
    """Check the oblivious tree's MSE is within 50% of scikit-learn's tree.

    Both models are refitted on five random train/test splits of the
    Boston data; the mean test MSE of each model is compared.
    """
    from sklearn.tree import DecisionTreeRegressor as DecisionTreeRegressorSklearn

    model = DecisionTreeRegressor(tree_type='oblivious', max_n_splits=3)
    model_sklearn = DecisionTreeRegressorSklearn()
    dataset = load_boston()

    mse = []
    mse_sklearn = []
    for _ in range(5):
        X_train, X_test, y_train, y_test = train_test_split(
            dataset.data, dataset.target, test_size=0.33)

        # Same split for both models: ours first, then scikit-learn's.
        for estimator, errors in ((model, mse), (model_sklearn, mse_sklearn)):
            estimator.fit(X_train, y_train)
            predicted = estimator.predict(X_test)
            errors.append(mean_squared_error(predicted, y_test))

    mean_mse = np.mean(mse)
    mean_mse_sklearn = np.mean(mse_sklearn)
    print(mean_mse, mean_mse_sklearn)

    # Check that our model differs in MSE no worse than 50%
    relative_gap = np.abs(mean_mse - mean_mse_sklearn) / mean_mse_sklearn
    self.assertTrue(relative_gap < 0.5)
def train_decision_tree(time_regression_df, test_size, random_state, max_depth, export_testset):
    """Train, time and persist a regression tree that predicts ``trip_time``.

    The frame is split into train and test parts; the first six columns
    are used as features and ``trip_time`` as the target. The fitting time
    is printed, metadata is exported via ``export_meta_data`` and the model
    is written with ``dump_model`` under ``../treelib/``.

    Args:
        time_regression_df: DataFrame holding the features and ``trip_time``.
        test_size: Passed to ``train_test_split``.
        random_state: Seed for both the split and the tree.
        max_depth: Depth limit for the tree.
        export_testset: When truthy, the test features and target are
            written to ``../data/<filename_prefix>_testset.csv``.

    Returns:
        The fitted ``DecisionTreeRegressor``.
    """
    time_regression_df_train, time_regression_df_test = cv.train_test_split(
        time_regression_df, test_size=test_size, random_state=random_state)

    # `.ix` has been removed from pandas; `.iloc` performs the positional
    # selection of the first six columns.
    y_train = time_regression_df_train['trip_time']
    x_train = time_regression_df_train.iloc[:, 0:6]
    y_test = time_regression_df_test['trip_time']
    x_test = time_regression_df_test.iloc[:, 0:6]

    if export_testset:
        xy_test = pd.concat([x_test, y_test], axis=1)
        xy_test.to_csv('../data/' + filename_prefix + '_testset.csv')

    # Time only the fitting step.
    tic = time.time()
    regtree = DecisionTreeRegressor(max_depth=max_depth,
                                    min_samples_split=3,
                                    random_state=random_state)
    regtree.fit(x_train, y_train)
    elapsed = time.time() - tic
    print(elapsed)

    export_meta_data(regtree, x_test, y_test, elapsed)

    # The file name carries the depth the fitted tree actually reached.
    target_location = ('../treelib/' + filename_prefix + '_tree_depth_'
                       + str(regtree.tree_.max_depth))
    dump_model(regtree, target_location)
    return regtree
class TestDecisionTreeRegressorConverter(TestCase):
    """Tests for converting a fitted DecisionTreeRegressor with DecisionTreeConverter."""

    def setUp(self):
        """Fit a depth-2 tree on four two-feature samples and build its converter."""
        np.random.seed(1)

        samples = [
            [0, 0],
            [0, 1],
            [1, 0],
            [1, 1],
        ]
        targets = [0, 1, 1, 1]
        self.est = DecisionTreeRegressor(max_depth=2)
        self.est.fit(samples, targets)

        def feature_pair():
            # A fresh pair of feature descriptors for each schema section.
            return [
                IntegerNumericFeature('x1'),
                StringCategoricalFeature('x2', ['zero', 'one']),
            ]

        self.ctx = TransformationContext(
            input=feature_pair(),
            model=feature_pair(),
            derived=[],
            output=[IntegerNumericFeature('output')]
        )
        self.converter = DecisionTreeConverter(
            estimator=self.est,
            context=self.ctx,
            mode=DecisionTreeConverter.MODE_REGRESSION
        )

    def test_transform(self):
        """The generated PMML has a mining schema and a well-formed root node."""
        tree_model = self.converter.pmml().TreeModel[0]

        schema = tree_model.MiningSchema
        assert schema is not None, 'Missing mining schema'
        assert len(schema.MiningField) == 3, 'Wrong number of mining fields'

        root = tree_model.Node
        assert root is not None, 'Missing root node'
        assert root.recordCount == 4
        assert root.True_ is not None, 'Root condition should always be True'
def main():
    """Fit depth-2 and depth-5 regression trees to a noisy sine curve and plot both."""
    # Synthetic data: 80 sorted points in [0, 5), with every fifth target
    # perturbed by noise.
    rng = np.random.RandomState(1)
    X = np.sort(5 * rng.rand(80, 1), axis=0)
    y = np.sin(X).ravel()
    y[::5] += 3 * (0.5 - rng.rand(16))

    # Two trees that differ only in their depth limit.
    shallow = DecisionTreeRegressor(max_depth=2)
    deep = DecisionTreeRegressor(max_depth=5)
    for model in (shallow, deep):
        model.fit(X, y)

    # Predict on a dense grid over the same interval.
    grid = np.arange(0.0, 5.0, 0.01)[:, np.newaxis]
    shallow_pred = shallow.predict(grid)
    deep_pred = deep.predict(grid)

    # Plot the data and both fitted curves.
    plt.figure()
    plt.scatter(X, y, c="k", label="data")
    plt.plot(grid, shallow_pred, c="g", label="max_depth=2", linewidth=2)
    plt.plot(grid, deep_pred, c="r", label="max_depth=5", linewidth=2)
    plt.xlabel("data")
    plt.ylabel("target")
    plt.title("Decision Tree Regression")
    plt.legend()
    plt.show()
class CustomClassifier(BaseEstimator, ClassifierMixin):
    """Weak learner whose behaviour depends on the order it was created in.

    Every new instance increments the module-level counter
    ``class_instance`` and stores the resulting value in ``self.instance``;
    deleting an instance decrements the counter. Instances numbered 1 or 2
    predict column 6 of the feature matrix unchanged. Later instances fit
    and predict with a ``DecisionTreeRegressor(max_depth=8)``.
    """

    def __init__(self):
        global class_instance
        class_instance += 1
        self.instance = class_instance

    def __del__(self):
        global class_instance
        class_instance -= 1

    def fit(self, X, y, sample_weight=array([])):
        """Fit the learner and return ``self``.

        ``sample_weight`` is accepted for interface compatibility and is
        not used.
        """
        if self.instance <= 2:
            # Early instances only remember the targets; predict() passes a
            # feature column through.
            self.y = y
        else:
            # Later instances use a depth-limited regression tree.
            self.regr = DecisionTreeRegressor(max_depth=8)
            self.regr.fit(X, y)
        # Return self from both branches; the tree branch previously
        # returned None.
        return self

    def predict(self, X):
        """Return column 6 of ``X`` for early instances, tree predictions otherwise."""
        if self.instance <= 2:
            # Column index 6 of the feature matrix (the "current volatility"
            # feature according to the original author's comment).
            return X[:, 6]
        return self.regr.predict(X)
def decision_tree_regressor(X, y, labels): regressor = DecisionTreeRegressor(max_depth=3) regressor.fit(X, y) estimates_z = regressor.predict(X) leaves = regressor.apply(X) leaves_hash = np.zeros(np.max(leaves) + 1) for i in range(len(y)): if (estimates_z[i] - y[i]) > 0.05 and estimates_z[i] > 0.6 and y[i] > 0: # print estimates_z[i] # print y[i] # print estimates_z[i]-y[i] # print ((estimates_z[i]-y[i])>0.1 and estimates_z[i]>0 and y[i]>0) # print leaves[i] leaves_hash[leaves[i]] += 1 # print leaves_hash[leaves[i]] else: leaves_hash[-1] += 1 # print regressor.tree_.decision_path(X) print regressor.tree_.feature print regressor.tree_.threshold print leaves_hash print regressor.feature_importances_ visualize_tree(regressor.tree_, labels) return estimates_z
def model_complexity(X_train, y_train, X_test, y_test): """ Calculates the performance of the model as model complexity increases. The learning and testing errors rates are then plotted. """ print "Creating a model complexity graph. . . " # We will vary the max_depth of a decision tree model from 1 to 14 max_depth = np.arange(1, 14) train_err = np.zeros(len(max_depth)) test_err = np.zeros(len(max_depth)) for i, d in enumerate(max_depth): # Setup a Decision Tree Regressor so that it learns a tree with depth d regressor = DecisionTreeRegressor(max_depth = d) # Fit the learner to the training data regressor.fit(X_train, y_train) # Find the performance on the training set train_err[i] = performance_metric(y_train, regressor.predict(X_train)) # Find the performance on the testing set test_err[i] = performance_metric(y_test, regressor.predict(X_test)) # Plot the model complexity graph pl.figure(figsize=(7, 5)) pl.title('Decision Tree Regressor Complexity Performance') pl.plot(max_depth, test_err, lw=2, label = 'Testing Error') pl.plot(max_depth, train_err, lw=2, label = 'Training Error') pl.legend() pl.xlabel('Maximum Depth') pl.ylabel('Total Error') pl.show()
def get_imp(X,y):
    """Rank the columns of ``X`` by decision-tree feature importance.

    Args:
        X: Feature DataFrame; its ``columns`` label the importances.
        y: Target values.

    Returns:
        A DataFrame with the columns ``variable`` and ``imp``, sorted by
        ``imp`` in descending order.
    """
    rf = DecisionTreeRegressor(random_state=9)
    rf.fit(X, y)
    imp_var = rf.feature_importances_
    imp_var = pd.DataFrame({'variable': X.columns, 'imp': imp_var})
    # DataFrame.sort() was removed from pandas; sort_values() is its
    # replacement.
    imp_var = imp_var.sort_values('imp', ascending=False)
    return(imp_var)
def fit_predict_model(city_data): """Find and tune the optimal model. Make a prediction on housing data.""" # Get the features and labels from the Boston housing data X, y = city_data.data, city_data.target # Setup a Decision Tree Regressor regressor = DecisionTreeRegressor() parameters = {'max_depth':(1,2,3,4,5,6,7,8,9,10)} ################################### ### Step 4. YOUR CODE GOES HERE ### ################################### # 1. Find the best performance metric # should be the same as your performance_metric procedure # http://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html scorer = make_scorer(mean_squared_error, greater_is_better=False) # 2. Use gridearch to fine tune the Decision Tree Regressor and find the best model # http://scikit-learn.org/stable/modules/generated/sklearn.grid_search.GridSearchCV.html#sklearn.grid_search.GridSearchCV grid_search = GridSearchCV(regressor, parameters, scoring=scorer) grid_search.fit(X, y) tuned_params = grid_search.best_params_ print "Tuned Parameters: " print tuned_params regressor.set_params(**tuned_params) # Fit the learner to the training data print "Final Model: " print regressor.fit(X, y) print "R^2 of prediction: " print regressor.score(X, y) # Use the model to predict the output of a particular sample x = [[11.95, 0.00, 18.100, 0, 0.6590, 5.6090, 90.00, 1.385, 24, 680.0, 20.20, 332.09, 12.13]] y = regressor.predict(x) print "House: " + str(x) print "Prediction: " + str(y) # Get the price of similar houses and calculate the mean, for comparison with out prediction nbrs = NearestNeighbors(n_neighbors=10, algorithm='ball_tree').fit(X) distances, indices = nbrs.kneighbors(x) sum_prices = [] for i in indices: sum_prices.append(city_data.target[i]) neighbor_avg = np.mean(sum_prices) print "Avg. Price of similar houses:" print neighbor_avg
def fit(self,X,y): if self.loss == 'deviance': classes_ = list(set(y)) self.classes_ = classes_ self.loss_function = MultinomialClass(classes_) #each class has a series of trees self.trees = [[] for k in classes_] #fx is a N*K matrix fx = [[0 for k in classes_] for i in y] #number of samples n_samples = len(X) for m in xrange(self.n_estimators): print 'epoch {0}'.format(m) sys.stdout.flush() #subsample_index = self._subsampling_index(n_samples) #sub_X = self._subsampling_A(X,subsample_index) rm = self.loss_function.negative_gradient(y,fx) for k in range(len(classes_)): rmk = map(lambda a:a[k],rm) #tree = RegressionTree(self.max_depth) tree = DecisionTreeRegressor(max_depth=self.max_depth) #sub_rm = self._subsampling_A(rmk,subsample_index) tree.fit(X,rmk) #tree.fit(sub_X,sub_rm) self.trees[k].append(tree) print 'fit {0} trees done'.format(k) sys.stdout.flush() gamma_mk = tree.predict(X) #gamma_mk = [tree.predict(x) for x in X] fxk = [fxi[k]+self.learning_rate*gamma_imk for fxi,gamma_imk in zip(fx,gamma_mk)] for fxi,next_fxik in zip(fx,fxk): fxi[k] = next_fxik else: fx = [0 for yi in y] for m in xrange(self.n_estimators): Loss = self.loss_function.loss(y,fx) print 'epoch {0} ,loss:{1}'.format(m,Loss) rm = self.loss_function.negative_gradient(y,fx) tree = RegressionTree(self.max_depth) sub_X,sub_rm = self._subsampling(X,rm) tree.fit(sub_X,sub_rm) self.trees.append(tree) gamma_m = [tree.predict(x) for x in X] fx = [fxi+self.learning_rate*gamma_im for fxi,gamma_im in zip(fx,gamma_m)]
# LOGISTIC REGRESSION
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)
# Note: this score is computed on the training data.
acc_log = round(logreg.score(X_train, Y_train) * 100, 2)

# RANDOM FOREST
# Fitting the random forest regression model to the dataset
from sklearn.ensemble import RandomForestRegressor

max_depths = np.linspace(1, 10, 10, endpoint=True)
for max_depth in max_depths:
    # np.linspace yields floats (1.0 ... 10.0) but max_depth must be an
    # integer, so convert before building the forest.
    regressor = RandomForestRegressor(n_estimators=512, random_state=0,
                                      max_depth=int(max_depth))
    regressor.fit(X_train, Y_train)

# Only the last forest fitted (the deepest) is used from here on.
# Predicting new result
y_pred = regressor.predict(X_test)
# Note: this score is computed on the training data.
acc_regressor = round(regressor.score(X_train, Y_train) * 100, 2)

# DECISION TREE
# Fitting Decision Tree Regression to the dataset
from sklearn.tree import DecisionTreeRegressor

regressor1 = DecisionTreeRegressor(random_state=0)
regressor1.fit(X_train, Y_train)

# Predicting a new result
y_pred1 = regressor1.predict(X_test)
acc_regressor1 = round(regressor1.score(X_train, Y_train) * 100, 2)
# Training fixtures: features start at column 7; columns 5 and 6 hold the
# home and away targets.
dataset_train = pd.read_csv('fixture.csv')
X_train = dataset_train.iloc[:, 7:].values
y_home_train = dataset_train.iloc[:, 5].values
y_away_train = dataset_train.iloc[:, 6]

# Test fixtures use the same column layout.
dataset = pd.read_csv('test_fixture_edited.csv')
X_test = dataset.iloc[:, 7:].values
y_home_test = dataset.iloc[:, 5].values
y_away_test = dataset.iloc[:, 6]

# For Home
from sklearn.tree import DecisionTreeRegressor

regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(X_train, y_home_train)
y_pred_home = regressor.predict(X_test)

# Dense grid between the smallest and largest test value, in steps of 0.01.
# NOTE(review): min()/max() over X_test and the single-column reshape look
# like they assume exactly one feature column - confirm against the data.
X_grid = np.arange(min(X_test), max(X_test), 0.01)
X_grid = X_grid.reshape((len(X_grid), 1))

# Predicted home scores as points, the fitted step function as a line.
plt.scatter(X_test, y_pred_home, color='red')
plt.plot(X_grid, regressor.predict(X_grid), color='blue')
plt.title('Overall Power Difference or Home Score (Decision Tree Regression)')
plt.xlabel('Overall Power Difference')
plt.ylabel('Home Score')
plt.show()

# For Away
from sklearn.tree import DecisionTreeRegressor
# Path of the file to read
iowa_file_path = '../input/home-data-for-ml-course/train.csv'
home_data = pd.read_csv(iowa_file_path)

# Create target object and call it y
y = home_data.SalePrice

# Create X
features = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath',
            'BedroomAbvGr', 'TotRmsAbvGrd']
X = home_data[features]

# Split into validation and training data
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

# Specify Model
iowa_model = DecisionTreeRegressor(random_state=1)

# Fit model
iowa_model.fit(train_X, train_y)

# Make validation predictions and calculate mean absolute error
val_predictions = iowa_model.predict(val_X)
val_mae = mean_absolute_error(val_predictions, val_y)
# The previous format spec "{:, .0f}" had a space between the thousands
# separator and the precision, which is not a valid spec and raises
# ValueError. "{:,.0f}" prints the value with thousands separators and no
# decimals.
print("Validation MAE: {:,.0f}".format(val_mae))

# Setup code checking
from learntools.core import binder
binder.bind(globals())
from learntools.machine_learning.ex5 import *
print("\nSetup complete")


def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Fit a tree limited to ``max_leaf_nodes`` leaves on the training data.

    As written here the function only fits the model. It does not use
    ``val_X`` or ``val_y`` and returns nothing.
    """
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(train_X, train_y)
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Return the validation mean absolute error of a size-limited decision tree.

    A ``DecisionTreeRegressor`` restricted to ``max_leaf_nodes`` leaves
    (``random_state=0``) is fitted on ``train_X`` / ``train_y``; its
    predictions for ``val_X`` are scored against ``val_y``.
    """
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    # The original referenced the undefined name `pred_val` here (NameError).
    return mean_absolute_error(val_y, preds_val)
# NOTE(review): this first statement looks like the tail of a
# `for i in range(result_row):` loop that starts before this excerpt.
sumsum = sumsum + (y[i] - y_test[i]) * (y[i] - y_test[i])
# Mean squared error and R^2 of the previous model on the PCA features.
rank_result['ARD_pca'] = sumsum / float(result_row)
rs_score['ARD_pca'] = r2_score(y_test, y)

# --- ARD regression on the standardized features ---
ARDModel = ARDRegression()
ARDModel.fit(X_train_std, y_train)
# `y` holds the predictions from here on (compared against y_test below).
y = ARDModel.predict(X_test_std)
# Unpacking fails unless the prediction array is one-dimensional.
[result_row] = y.shape
sumsum = 0
#print y
# Accumulate the sum of squared errors, element by element.
for i in range(result_row):
    sumsum = sumsum + (y[i] - y_test[i]) * (y[i] - y_test[i])
rank_result['ARD_std'] = sumsum / float(result_row)
rs_score['ARD_std'] = r2_score(y_test, y)

# --- Depth-2 decision tree on the PCA features ---
DTRModel = DecisionTreeRegressor(max_depth=2)
DTRModel.fit(X_train_pca, y_train)
y = DTRModel.predict(X_test_pca)
[result_row] = y.shape
sumsum = 0
#print y
for i in range(result_row):
    sumsum = sumsum + (y[i] - y_test[i]) * (y[i] - y_test[i])
rank_result['DTR2_pca'] = sumsum / float(result_row)
rs_score['DTR2_pca'] = r2_score(y_test, y)

# --- Depth-2 decision tree on the standardized features ---
DTRModel = DecisionTreeRegressor(max_depth=2)
DTRModel.fit(X_train_std, y_train)
y = DTRModel.predict(X_test_std)
[result_row] = y.shape
sumsum = 0
#print y
# The body of this loop lies beyond the end of this excerpt.
for i in range(result_row):
# Python 2 script fragment (print statements).
# Dependent variable: column 6 of `ph`.
dv = ph.iloc[:, 6].values
from sklearn.preprocessing import LabelEncoder
le1 = LabelEncoder()
le2 = LabelEncoder()
# Columns 1, 3, 4 and 5 of `iv` are label-encoded in place.
# NOTE(review): the same encoder `le1` is refitted for each column, so after
# these lines it only remembers the classes of column 5.
iv[:, 1] = le1.fit_transform(iv[:, 1])
iv[:, 3] = le1.fit_transform(iv[:, 3])
iv[:, 4] = le1.fit_transform(iv[:, 4])
iv[:, 5] = le1.fit_transform(iv[:, 5])
dv = le2.fit_transform(dv)
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(iv, dv)
# score() is evaluated on the same data the tree was fitted on.
print "Accuracy of Prediction:", regressor.score(iv, dv)
from sklearn.ensemble import RandomForestRegressor
new_regressor = RandomForestRegressor(n_estimators=10, random_state=0)
new_regressor.fit(iv, dv)
import numpy as np
# Predict two hand-written, already-encoded rows with six features each.
print new_regressor.predict(np.array([10, 1, 4, 0, 1, 0]).reshape(1, -1))
print new_regressor.predict(np.array([10, 1, 4, 1, 0, 1]).reshape(1, -1))
"""
test1 = np.array([[10, 'Y', 4, 'BS', 'Y', 'N']], dtype=object).reshape(1, -1)
test1[:,1] = le1.transform(test1[:,1])
# NOTE(review): these first statements use `clf`, `modelname` and `score`,
# which are set before this excerpt - presumably inside a loop over candidate
# models; confirm the intended indentation against the full file.
print()
# Per-parameter-set cross-validation mean and standard deviation of `clf`.
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print(mean.round(3), std.round(3), params)
print()
print()
# Record (best CV score, model name, scoring label, best parameters).
best_models.append(
    (clf.best_score_.round(3), modelname, score, clf.best_params_))

# Refit each recorded model with its best parameters and collect its
# predictions for x_test as (model name, predictions) pairs.
predictionlist = list()
for modeldata in best_models:
    sme, model, error, params = modeldata
    if model == 'tree':
        treemodel = DecisionTreeRegressor(**params)
        treemodel.fit(x_train, y_train)
        tree_predict = treemodel.predict(x_test)
        predictionlist.append((model, tree_predict))
    if model == 'forest':
        forestmodel = RandomForestRegressor(**params)
        forestmodel.fit(x_train, y_train)
        forest_predict = forestmodel.predict(x_test)
        predictionlist.append((model, forest_predict))
    if model == 'xgb':
        xgbmodel = XGBRegressor(**params)
        xgbmodel.fit(x_train, y_train)
        xgb_predict = xgbmodel.predict(x_test)
        predictionlist.append((model, xgb_predict))

# Plain linear regression fitted on the same training data.
lin_reg = LinearRegression()
lin_reg.fit(x_train, y_train)
# Compare AdaBoost, decision-tree and KNN regressors on the Boston housing
# data by mean squared error on a 25% hold-out split.
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.datasets import load_boston
from sklearn.ensemble import AdaBoostRegressor
# Load the data
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
data=load_boston()
# Split the data
train_x, test_x, train_y, test_y = train_test_split(data.data, data.target, test_size=0.25, random_state=33)
# Use the AdaBoost regression model
regressor=AdaBoostRegressor()
regressor.fit(train_x,train_y)
pred_y = regressor.predict(test_x)
mse = mean_squared_error(test_y, pred_y)
# Printed labels: "house price predictions" / "mean squared error".
print("房价预测结果 ", pred_y)
print("均方误差 = ",round(mse,2))
# Use the decision tree regression model
dec_regressor=DecisionTreeRegressor()
dec_regressor.fit(train_x,train_y)
pred_y = dec_regressor.predict(test_x)
mse = mean_squared_error(test_y, pred_y)
print("决策树均方误差 = ",round(mse,2))
# Use the KNN regression model
knn_regressor=KNeighborsRegressor()
knn_regressor.fit(train_x,train_y)
pred_y = knn_regressor.predict(test_x)
mse = mean_squared_error(test_y, pred_y)
print("KNN均方误差 = ",round(mse,2))
def flash_fair_LSR(biased_col, n_obj):
    """Search logistic-regression hyperparameters with a tree surrogate model.

    Twenty configurations are sampled at random and evaluated with
    ``measure_scores`` on the validation split. A ``DecisionTreeRegressor``
    is then repeatedly fitted on the evaluated configurations; a random
    untested candidate is only evaluated when the surrogate predicts a score
    below the median of the scores seen so far. Each rejected candidate costs
    one of 20 "lives"; the search stops at 200 evaluated configurations or
    when the lives run out.

    Args:
        biased_col: protected attribute passed to ``measure_scores``
            ("sex" or "race").
        n_obj: which objectives are summed into the score to minimise:
            "ABCD" (values 0-3), "AB" (values 0-1) or "CD" (values 2-3) of
            the sequence returned by ``measure_scores``.

    Returns:
        The configuration tuple ``(C, penalty_code, solver_code, max_iter)``
        with the lowest score; code 1 means 'l1' / 'liblinear', anything else
        'l2' / 'saga'.
    """
    dataset_orig_train, dataset_orig_vt = train_test_split(dataset_orig, test_size=0.3)
    # The test half of this split is not used here.
    dataset_orig_valid, dataset_orig_test = train_test_split(dataset_orig_vt, test_size=0.5)

    X_train = dataset_orig_train.loc[:, dataset_orig_train.columns != 'Probability']
    y_train = dataset_orig_train['Probability']
    X_valid = dataset_orig_valid.loc[:, dataset_orig_valid.columns != 'Probability']
    y_valid = dataset_orig_valid['Probability']

    def convert_lsr(index):
        # Decode a flat index in [0, 12000) into 30 x 2 x 2 x 100 settings:
        # a in 1..30, b in 1..2, c in 1..2, d in 10..109.
        a = int(index / 400 + 1)
        b = int(index % 400 / 200 + 1)
        c = int(index % 200 / 100 + 1)
        d = int(index % 100 + 10)
        return a, b, c, d

    def build_model(config):
        # Turn a decoded configuration into an (unfitted) LogisticRegression.
        c_value, penalty_code, solver_code, max_iter = config
        return LogisticRegression(C=c_value,
                                  penalty='l1' if penalty_code == 1 else 'l2',
                                  solver='liblinear' if solver_code == 1 else 'saga',
                                  max_iter=max_iter)

    def objective(values):
        # Sum the objectives selected by n_obj; None for an unknown n_obj.
        if n_obj == "ABCD":
            return values[0] + values[1] + values[2] + values[3]
        if n_obj == "AB":
            return values[0] + values[1]
        if n_obj == "CD":
            return values[2] + values[3]
        return None

    # random.sample needs a sequence: sampling from a set raises TypeError on
    # Python 3.11+, so sample from the range itself.
    all_case = range(12000)
    modeling_pool = random.sample(all_case, 20)

    List_X = []
    List_Y = []
    for index in modeling_pool:
        config = convert_lsr(index)
        List_X.append(config)
        all_value = measure_scores(X_train, y_train, X_valid, y_valid,
                                   dataset_orig_valid, biased_col, build_model(config))
        score = objective(all_value)
        if score is None:
            print("Wrong number of objects")
        else:
            List_Y.append(score)

    remain_pool = set(all_case) - set(modeling_pool)
    test_list = [convert_lsr(i) for i in remain_pool]

    upper_model = DecisionTreeRegressor()
    life = 20
    while len(List_X) < 200 and life > 0:
        upper_model.fit(List_X, List_Y)
        candidate = random.sample(test_list, 1)
        test_list.remove(candidate[0])
        candi_pred_value = upper_model.predict(candidate)
        if candi_pred_value < np.median(List_Y):
            # Promising according to the surrogate: evaluate it for real.
            List_X.append(candidate[0])
            candi_value = measure_scores(X_train, y_train, X_valid, y_valid,
                                         dataset_orig_valid, biased_col,
                                         build_model(candidate[0]))
            score = objective(candi_value)
            if score is not None:
                List_Y.append(score)
        else:
            life -= 1

    min_index = int(np.argmin(List_Y))
    return List_X[min_index]
class Model_Finder:
    """
    This class shall be used to find the model with best accuracy and AUC score.

    Every method logs its progress through ``logger_object.log(file_object, message)``.
    On failure a method logs the error and raises ``Exception`` with a
    descriptive message, chained to the original error.

    Version: 1.0
    Revisions: None
    """

    def __init__(self, file_object, logger_object):
        self.file_object = file_object
        self.logger_object = logger_object
        self.clf = RandomForestClassifier()
        self.DecisionTreeReg = DecisionTreeRegressor()
        self.score = PerformanceEvaluation.performance(self.file_object,
                                                       self.logger_object)

    def _log(self, message):
        # Single place for the (file_object, message) logging convention.
        self.logger_object.log(self.file_object, message)

    def get_best_params_for_random_forest(self, train_x, train_y):
        """
        Method Name: get_best_params_for_random_forest
        Description: grid-search the Random Forest classifier parameters with
                     5-fold cross-validation, then fit a new classifier with
                     the best parameters on the full training data.
        Output: The fitted model with the best parameters
        On Failure: Raise Exception

        Version: 1.0
        Revisions: None
        """
        self._log('Entered the get_best_params_for_random_forest method of the Model_Finder class')
        try:
            # initializing with different combination of parameters
            self.param_grid = {
                "n_estimators": [10, 50, 100, 130],
                "criterion": ['gini', 'entropy'],
                "max_depth": range(2, 4, 1),
                "max_features": ['auto', 'log2']
            }
            # Creating an object of the Grid Search class
            self.grid = GridSearchCV(estimator=self.clf,
                                     param_grid=self.param_grid,
                                     cv=5,
                                     verbose=3)
            # finding the best parameters
            self.grid.fit(train_x, train_y)

            # extracting the best parameters
            self.criterion = self.grid.best_params_['criterion']
            self.max_depth = self.grid.best_params_['max_depth']
            self.max_features = self.grid.best_params_['max_features']
            self.n_estimators = self.grid.best_params_['n_estimators']

            # creating a new model with the best parameters
            self.clf = RandomForestClassifier(n_estimators=self.n_estimators,
                                              criterion=self.criterion,
                                              max_depth=self.max_depth,
                                              max_features=self.max_features)
            # training the new model
            self.clf.fit(train_x, train_y)
            self._log('Random Forest best params: ' + str(self.grid.best_params_) +
                      '. Exited the get_best_params_for_random_forest method of the Model_Finder class')
            return self.clf
        except Exception as e:
            self._log('Exception occured in get_best_params_for_random_forest method of the Model_Finder class. Exception message: ' + str(e))
            self._log('Random Forest Parameter tuning failed. Exited the get_best_params_for_random_forest method of the Model_Finder class')
            # Raise with a message and the cause attached; an empty Exception()
            # left callers (and their logs) with no explanation.
            raise Exception('Random Forest parameter tuning failed: ' + str(e)) from e

    def get_best_params_for_DecisionTreeRegressor(self, train_x, train_y):
        """
        Method Name: get_best_params_for_DecisionTreeRegressor
        Description: grid-search the DecisionTreeRegressor parameters with
                     5-fold cross-validation, then fit a new regressor with
                     the best parameters on the full training data.
        Output: The fitted model with the best parameters
        On Failure: Raise Exception

        Version: 1.0
        Revisions: None
        """
        self._log('Entered the get_best_params_for_DecisionTreeRegressor method of the Model_Finder class')
        try:
            # initializing with different combination of parameters
            self.param_grid_decisionTree = {
                "criterion": ["mse", "friedman_mse", "mae"],
                "splitter": ["best", "random"],
                "max_features": ["auto", "sqrt", "log2"],
                'max_depth': range(2, 16, 2),
                'min_samples_split': range(2, 16, 2)
            }
            # Creating an object of the Grid Search class
            self.grid = GridSearchCV(self.DecisionTreeReg,
                                     self.param_grid_decisionTree,
                                     verbose=3,
                                     cv=5)
            # finding the best parameters
            self.grid.fit(train_x, train_y)

            # extracting the best parameters
            self.criterion = self.grid.best_params_['criterion']
            self.splitter = self.grid.best_params_['splitter']
            self.max_features = self.grid.best_params_['max_features']
            self.max_depth = self.grid.best_params_['max_depth']
            self.min_samples_split = self.grid.best_params_['min_samples_split']

            # creating a new model with the best parameters
            # (stored as `decisionTreeReg`; the untuned estimator created in
            # __init__ stays available as `DecisionTreeReg`)
            self.decisionTreeReg = DecisionTreeRegressor(
                criterion=self.criterion,
                splitter=self.splitter,
                max_features=self.max_features,
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split)
            # training the new model
            self.decisionTreeReg.fit(train_x, train_y)
            self._log('Decision-Tree Regressor best params: ' + str(self.grid.best_params_) +
                      '. Exited method of the Model fit')
            return self.decisionTreeReg
        except Exception as e:
            self._log('Exception occured in Decision-Tree Regressor method of the Model Fit. Exception message: ' + str(e))
            self._log('Grid search Parameter tuning failed. Exited the Decision-Tree Regressor method of the Model Fit')
            raise Exception('Decision-Tree Regressor parameter tuning failed: ' + str(e)) from e

    def get_best_params_for_xgboost(self, train_x, train_y):
        """
        Method Name: get_best_params_for_xgboost
        Description: grid-search the XGBoost regressor parameters with 5-fold
                     cross-validation, then fit a new regressor with the best
                     parameters on the full training data.
        Output: The fitted model with the best parameters
        On Failure: Raise Exception

        Version: 1.0
        Revisions: None
        """
        self._log('Entered the get_best_params_for_xgboost method of the Model_Finder class')
        try:
            # initializing with different combination of parameters
            self.param_grid_xgboost = {
                'learning_rate': [0.5, 0.1, 0.01, 0.001],
                'max_depth': [3, 5, 10, 20],
                'n_estimators': [10, 50, 100, 200]
            }
            # Creating an object of the Grid Search class
            self.grid = GridSearchCV(XGBRegressor(objective='reg:linear'),
                                     self.param_grid_xgboost,
                                     verbose=3,
                                     cv=5)
            # finding the best parameters
            self.grid.fit(train_x, train_y)

            # extracting the best parameters
            self.learning_rate = self.grid.best_params_['learning_rate']
            self.max_depth = self.grid.best_params_['max_depth']
            self.n_estimators = self.grid.best_params_['n_estimators']

            # creating a new model with the best parameters
            self.xgb = XGBRegressor(objective='reg:linear',
                                    learning_rate=self.learning_rate,
                                    max_depth=self.max_depth,
                                    n_estimators=self.n_estimators)
            # training the new model
            self.xgb.fit(train_x, train_y)
            self._log('XGBoost best params: ' + str(self.grid.best_params_) +
                      '. Exited the get_best_params_for_xgboost method of the Model_Finder class')
            return self.xgb
        except Exception as e:
            self._log('Exception occured in get_best_params_for_xgboost method of the Model_Finder class. Exception message: ' + str(e))
            self._log('XGBoost Parameter tuning failed. Exited the get_best_params_for_xgboost method of the Model_Finder class')
            raise Exception('XGBoost parameter tuning failed: ' + str(e)) from e

    def xgb_classifier(self, train_x, train_y):
        """
        Method Name: xgb_classifier
        Description: fit an XGBoost classifier (objective 'binary:logistic')
                     on the training data. No parameter search is done.
        Output: The fitted classifier
        On Failure: Raise Exception

        Version: 1.0
        Revisions: None
        """
        self._log('Entered the get_best_params_for_xgboost method of the Model_Finder class')
        try:
            # creating the model
            self.xgbc = XGBClassifier(objective='binary:logistic')
            # training the model
            self.xgbc.fit(train_x, train_y)
            self._log('XGBoost model train method done of the Model_Finder class')
            return self.xgbc
        except Exception as e:
            self._log('Exception occured in get_best_params_for_xgboost method of the Model_Finder class. Exception message: ' + str(e))
            self._log('XGBoost Parameter tuning failed. Exited the get_best_params_for_xgboost method of the Model_Finder class')
            raise Exception('XGBoost classifier training failed: ' + str(e)) from e

    def lgb_classifier(self, train_x, train_y):
        """
        Method Name: lgb_classifier
        Description: fit a LightGBM classifier with default parameters on the
                     training data. No parameter search is done.
        Output: The fitted classifier
        On Failure: Raise Exception

        Version: 1.0
        Revisions: None
        """
        self._log('Entered the lgb_classifier class')
        try:
            # creating the model
            self.lgbc = lgboost.LGBMClassifier()
            # training the model
            self.lgbc.fit(train_x, train_y)
            self._log('LGBoost model train method done of the Model_Finder class')
            return self.lgbc
        except Exception as e:
            self._log('Exception occured in get_best_params_for_LGBboost method of the Model_Finder class. Exception message: ' + str(e))
            self._log('LGBoost Parameter tuning failed. Exited the get_best_params_for_LGboost method of the Model_Finder class')
            raise Exception('LightGBM classifier training failed: ' + str(e)) from e

    def catb_classifier(self, train_x, train_y):
        """
        Method Name: catb_classifier
        Description: fit a CatBoost classifier with a fixed set of parameters
                     on the training data. No parameter search is done.
        Output: The fitted classifier
        On Failure: Raise Exception

        Version: 1.0
        Revisions: None
        """
        self._log('Entered the get_best_params_for_xgboost method of the Model_Finder class')
        try:
            # creating the model
            self.cbc = cboost.CatBoostClassifier(iterations=2000,
                                                 learning_rate=0.1,
                                                 depth=8,
                                                 eval_metric='Accuracy',
                                                 random_seed=0,
                                                 bagging_temperature=0.2,
                                                 od_type='Iter',
                                                 metric_period=75,
                                                 od_wait=100)
            # training the model
            self.cbc.fit(train_x, train_y)
            self._log('CatBoost model train method done of the Model_Finder class')
            return self.cbc
        except Exception as e:
            self._log('Exception occured in get_best_params_for_CatBoost method of the Model_Finder class. Exception message: ' + str(e))
            self._log('CatBoost Parameter tuning failed. Exited the get_best_params_for_CatBoost method of the Model_Finder class')
            raise Exception('CatBoost classifier training failed: ' + str(e)) from e

    def get_best_model(self, train_x, train_y, test_x, test_y, cls):
        """
        Method Name: get_best_model
        Description: train the XGBoost, LightGBM and CatBoost classifiers,
                     report their test and training scores through
                     ``self.score.all_score`` and pick the one with the
                     highest ROC AUC on the test predictions.
        Output: The best model name and the model object
        On Failure: Raise Exception

        Version: 1.0
        Revisions: None
        """
        self._log('Entered the get_best_model method of the Model_Finder class')
        try:
            # XGBoost
            self.xgb_class = self.xgb_classifier(train_x, train_y)
            self.prediction_xgb_class = self.xgb_class.predict(test_x)  # Predictions using the XGB Model
            self.prediction_xgb_auc = roc_auc_score(test_y, self.prediction_xgb_class)
            self.score.all_score(test_y, self.prediction_xgb_class, cls,
                                 title="XGB Testing Score")
            self.prediction_xgb_class_train = self.xgb_class.predict(train_x)
            self.score.all_score(train_y, self.prediction_xgb_class_train, cls,
                                 title="XGB Training Score")

            # LightGBM
            self.lgb_class = self.lgb_classifier(train_x, train_y)
            self.prediction_lgb_class = self.lgb_class.predict(test_x)  # Predictions using the LGB Model
            self.prediction_lgb_auc = roc_auc_score(test_y, self.prediction_lgb_class)
            self.score.all_score(test_y, self.prediction_lgb_class, cls,
                                 title="LGB Testing Score")
            self.prediction_lgb_class_train = self.lgb_class.predict(train_x)
            self.score.all_score(train_y, self.prediction_lgb_class_train, cls,
                                 title="LGB Training Score")

            # CatBoost
            self.cb_class = self.catb_classifier(train_x, train_y)
            self.prediction_cb_class = self.cb_class.predict(test_x)  # Predictions using the CatBoost Model
            self.prediction_cb_auc = roc_auc_score(test_y, self.prediction_cb_class)
            self.score.all_score(test_y, self.prediction_cb_class, cls,
                                 title="Catboost Testing Score")
            self.prediction_cb_class_train = self.cb_class.predict(train_x)
            self.score.all_score(train_y, self.prediction_cb_class_train, cls,
                                 title="Catboost Training Score")

            # comparing the three models; ties go to the earlier entry
            self.lst = [
                self.prediction_xgb_auc, self.prediction_lgb_auc,
                self.prediction_cb_auc
            ]
            self.best_nm = np.argmax(self.lst)
            if self.best_nm == 0:
                return 'XGBoost', self.xgb_class
            elif self.best_nm == 1:
                return 'LGBoost', self.lgb_class
            else:
                return 'CatBoost', self.cb_class
        except Exception as e:
            self._log('Exception occured in get_best_model method of the Model_Finder class. Exception message: ' + str(e))
            self._log('Model Selection Failed. Exited the get_best_model method of the Model_Finder class')
            raise Exception('Model selection failed: ' + str(e)) from e
X_test, y_test = X[test_indices, :], y[test_indices] # Unpack hyperparameters, resample training data, and fit regressors reg = DecisionTreeRegressor(random_state=rrr) if 'REBAGG' in strategy else \ RandomForestRegressor(n_estimators=10, n_jobs=-1, random_state=rrr) if strategy == 'RO': cl, ch, sample_method = param relevance = resreg.sigmoid_relevance(y_train, cl=cl, ch=ch) X_train, y_train = resreg.random_oversample(X_train, y_train, relevance, relevance_threshold=0.5, over=sample_method, random_state=rrr) reg.fit(X_train, y_train) elif strategy == 'SMOTER': cl, ch, sample_method, k = param relevance = resreg.sigmoid_relevance(y_train, cl=cl, ch=ch) X_train, y_train = resreg.smoter(X_train, y_train, relevance, relevance_threshold=0.5, k=k, over=sample_method, random_state=rrr) reg.fit(X_train, y_train) elif strategy == 'GN': cl, ch, sample_method, delta = param
# NOTE(review): the first line is the tail of a call (presumably a plot
# title) that begins before this excerpt.
'Price Distribution Plot of Handphones Whose Screen Material is TFT or IPS '
)
plt.show()
# Share of rows that have all of ROM, RAM, brand and price present.
print(
    data.dropna(subset=['ROM', 'RAM', 'brand', 'price']).shape[0] /
    data.shape[0])
print(data.isnull().sum().sort_values(ascending=False))  # missing-value counts for every column
# Model price from rear camera, brand and weight; rows with gaps are dropped.
df = data.loc[:, ['price', 'rear camera', 'brand', 'weight']].dropna()
# One-hot encoding creates the brand_<name> indicator columns used below.
to_model = pd.get_dummies(df)
x = to_model.iloc[:, 1:].values
y = to_model.iloc[:, 0].values
model = DecisionTreeRegressor()
model.fit(x, y)
# Per-brand error on the same rows the tree was fitted on.
error_list = []
for each in df['brand'].value_counts().index:
    to_fill = 'brand_{}'.format(each)
    x_data = to_model[to_model[to_fill] == 1].iloc[:, 1:].values
    y_data = to_model[to_model[to_fill] == 1].iloc[:, 0].values
    test_result = model.predict(x_data)
    # `mae` is defined outside this excerpt.
    merror = mae(y_data.reshape(len(y_data), 1), test_result.flatten())
    # Mean absolute percentage error for this brand.
    error = (np.abs(test_result - y_data) / y_data).mean()
    print(each, end=' : ')
    print(np.round(merror, 2), end=', ')
    print(str(np.round(error * 100, 3)) + '%')
    error_list.append([each, merror, error])
# Replace every categorical column's labels with integer codes 1..n (in the
# order of the column's pandas categories) and remember each mapping.
map_cat = {}
for x in list_cat:
    labels = X_cat[x].astype('category').cat.categories.tolist()
    replace_map_comp = {
        x: {label: code for code, label in enumerate(labels, start=1)}
    }
    X_cat.replace(replace_map_comp, inplace=True)
    map_cat[x] = replace_map_comp

# Replacing the missing values
X_cat.fillna(X_cat.mean(), inplace=True)

regressor = DecisionTreeRegressor(min_samples_leaf=10, max_depth=5)
regression = regressor.fit(X_cat, y)
y_1 = regressor.predict(X_cat)
# Bare expressions below only display a value in an interactive session.
regression.get_depth()
regression.get_n_leaves()
regression.get_params()
#regression.decision_path(X_cat).todense()
import numpy as np
# Root mean squared error: square the residuals BEFORE averaging. The
# original squared the mean residual, which gives |mean error| instead.
RMSE = np.sqrt(np.mean((y_1 - y) ** 2))
regression.score(X_cat, y)
from sklearn.externals.six import StringIO
# Parse whitespace-separated numeric rows: the last column is the target,
# all preceding columns are the features.
mat = dataset.readlines()
time = []
for row in mat:
    pres = row.split()
    pres = [float(ro) for ro in pres]
    time.append(pres)
time = np.array(time)
X = time[:, :-1]
y = time[:, -1]
# Splitting the dataset into the Training set and Test set
"""from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)"""
# Feature Scaling
"""
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X= sc_X.fit_transform(X)
sc_y = StandardScaler()
y = sc_y.fit_transform(y)"""
# Fitting the Regression Model to the dataset
# Create your regressor here
from sklearn.tree import DecisionTreeRegressor
# NOTE(review): `parameters` is passed positionally, i.e. as the first
# constructor argument of DecisionTreeRegressor - confirm this is intended.
regressor = DecisionTreeRegressor(parameters)
regressor.fit(X, y)
# Persist the fitted model; the context manager guarantees the file is
# flushed and closed even if pickling fails.
with open(hoemdir + "/attributes.p", "wb") as model_file:
    pickle.dump(regressor, model_file, protocol=2)
# In[35]: print("Model coefficients:\n") for i in range(X_tr.shape[1]): print(X_tr.columns[i], "=", clf.coef_[i].round(4)) # ### Decision tree # In[36]: dt = DecisionTreeRegressor(max_depth=3, criterion="mae") dt.fit(X_tr, y_tr) # In[37]: print("Decision Tree Results") # In[38]: print("Train ", mean_absolute_error(dt.predict(X_tr), y_tr)) # In[39]:
print("单个决策树的分类效果:{}".format(accuracy_score(y_true, one_tree_pre))) # GBDT分类树的构建过程 models = [] algo = DecisionTreeClassifier(max_depth=1) # 基模型为分类决策树 # 模型迭代次数n n = 2 for i in range(n): k_model = [] for k in range(2): # 定义下一次迭代所用的决策树 model = DecisionTreeRegressor() model.fit(x, y_label[k]) # 预测结果并将其进行指数转换为概率的形式 y_pre = model.predict(x) dy = np.exp(y_pre) / np.sum(np.exp(y_label), axis=0) # 更新对应类别的y标签为概率的残差值d y_label[k] = y_label[k] - dy # 异常将每一个类别构建好的决策树添加到列表中 k_model.append(model) # 将每一次迭代的k个模型列表添加到最终的融合模型中 models.append(k_model) print("模型构建完成!") print("开始预测:")
# Record the selected hyperparameters; the context manager closes the file
# even if one of the writes fails.
with open("output_MD.txt", "w") as outF:
    print('best_criterion = ', best_criterion, file=outF)
    print('best_splitter = ', best_splitter, file=outF)
    print('best_max_features = ', best_max_features, file=outF)

# NOTE(review): the regressor below uses fixed settings rather than the
# best_* values written above - confirm this is intended.
regr = DecisionTreeRegressor(criterion='mse',
                             splitter='best',
                             max_features='auto',
                             random_state=69)
# One independent tree per output column.
regr = MultiOutputRegressor(estimator=regr, n_jobs=n_jobs)

# Time the fit.
t0 = time.time()
regr.fit(x_train, y_train)
regr_fit = time.time() - t0
print("Complexity and bandwidth selected and model fitted in %.6f s" % regr_fit)

# Time the prediction.
t0 = time.time()
y_regr = regr.predict(x_test)
regr_predict = time.time() - t0
print("Prediction for %d inputs in %.6f s" % (x_test.shape[0], regr_predict))

# Undo the feature / target scaling before plotting.
x_test_dim = sc_x.inverse_transform(x_test)
y_test_dim = sc_y.inverse_transform(y_test)
y_regr_dim = sc_y.inverse_transform(y_regr)

plt.scatter(x_test_dim[:,0], y_test_dim[:,0], s=5, c='k', marker='o', label='KAPPA')
# The predictions come from the decision tree above; the legend previously
# called them 'k-Nearest Neighbour'.
plt.scatter(x_test_dim[:,0], y_regr_dim[:,0], s=5, c='r', marker='d', label='Decision Tree')
#plt.scatter(x_test_dim[:,0], y_test_dim[:,1], s=5, c='k', marker='o', label='KAPPA')
['Open-World', 16500, 30000], ['MMOFPS', 25000, 46000], ['MMORPG', 30000, 80000]]) # select all rows by : and column 1 # by 1:2 representing features X = dataset[:, 1:2].astype(int) #covert to integer #print(X) # select all rows by : and column 2 # by 2 to Y representing labels y = dataset[:, 2].astype(int) #print(y) reg = DecisionTreeRegressor(random_state=0) #print(reg) reg.fit(X, y) pred_case = reg.predict([[3750]]) # print the predicted price print("Predicted price: % d\n" % pred_case) #Visualize results X_grid = np.arange(min(X), max(X), 0.01) X_grid = X_grid.reshape((len(X_grid), 1)) plt.scatter(X, y, color='red') #plot predicted data plt.plot(X_grid, reg.predict(X_grid), color='blue') # specify title plt.title('Profit to Production Cost (Decision Tree Regression)') # specify X axis label plt.xlabel('Production Cost')
# Separate the train and test frames into features (first 11 columns) and
# the target column "Item_Outlet_Sales".
X_train = train_data.iloc[:, :11]
X_train
Y_train = train_data["Item_Outlet_Sales"]
Y_train
X_test = test_data.iloc[:, :11]
X_test

# Fit a decision tree regressor with default settings
from sklearn.tree import DecisionTreeRegressor
dt = DecisionTreeRegressor()
dt.fit(X_train, Y_train)
dt.tree_.node_count
dt.tree_.max_depth
Y_pred_train = dt.predict(X_train)
Y_pred_test = dt.predict(X_test)
print("Decision tree has {} nodes with maximum depth covered up to {}".format(
    dt.tree_.node_count, dt.tree_.max_depth))

# The tree still needs pruning: a grid search with cv = 10 over the depth
# levels below is meant to pick the best max depth.
from sklearn.model_selection import GridSearchCV
levels = {'max_depth': list(range(1, 38))}
# Append the `label` frame to `data`, then drop the column named 'label'.
data= pd.concat([data,label],axis=1)
data.drop('label', axis=1,inplace=True)
# NOTE: despite their names, `train` holds the first four columns (model
# inputs) and `test` holds the remaining columns (model targets).
train=data.iloc[:, 0:4].values
test=data.iloc[: ,4:].values
X_train,X_test,y_train,y_test=train_test_split(train,test,test_size=0.3)
from sklearn.preprocessing import StandardScaler
# Scale with statistics from the training split only.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
from sklearn.tree import DecisionTreeRegressor
clf=DecisionTreeRegressor()
clf.fit(X_train,y_train)
pred=clf.predict(X_test)
# Announce each hard-coded sensor reading through `engine`, then store it.
engine.say("so lets get started, from the Node m c u kit we got the following data")
engine.say("the air humidity is 40 around you")
engine.runAndWait()
ah = 40 #AIR HUMIDITY
engine.say("the temperature around you is eighty-one degrees fahrenheit")
engine.runAndWait()
atemp = 81 #AIR TEMPERATURE
engine.say("The rainsfall in your area accordig to last year data is 119 milimeteres")
engine.runAndWait()
rain = 119 #RAINFALL
engine.say(" The pH of your soil is five")
engine.runAndWait()
# Target: first column of the dataset (`features` is built before this excerpt).
labels = dataset.iloc[:, 0].values
from sklearn.model_selection import train_test_split
# 80/20 split with a fixed seed.
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.2, random_state=0)
from sklearn.preprocessing import StandardScaler
# Scale with statistics from the training split only.
scaler = StandardScaler()
features_train = scaler.fit_transform(features_train)
features_test = scaler.transform(features_test)
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
# Single decision tree, scored on the held-out split.
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(features_train, labels_train)
pred1 = regressor.predict(features_test)
score1 = regressor.score(features_test, labels_test)
# 10-tree random forest, scored on the same split.
ran_for_regressor = RandomForestRegressor(n_estimators=10, random_state=0)
ran_for_regressor.fit(features_train, labels_train)
pred2 = ran_for_regressor.predict(features_test)
score2 = ran_for_regressor.score(features_test, labels_test)
# Predict one hand-written sample; it goes through the same scaler first.
pred3 = ran_for_regressor.predict(
    scaler.transform(
        np.array([6, 215, 100, 2630, 22.2, 80, 3]).reshape(1, -1)))
"""
import statsmodels.formula.api as sm
#to prec=dict the housing price y = housing_data.price #the predictors housing_predictors = [ 'id', 'bathrooms', 'floors', 'bedrooms', 'yr_built', 'yr_renovated', 'sqft_lot' ] x = housing_data[housing_predictors] #setting the model housing_model = DecisionTreeRegressor() #fit the data i.e.., x and y housing_model.fit(x, y) # In[ ]: import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) from sklearn.tree import DecisionTreeRegressor # Input data files are available in the "../input/" directory. # For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory from subprocess import check_output #print(check_output(["ls", "../input"]).decode("utf8")) # Any results you write to the current directory are saved as output. housing_file_path = '../input/kc_house_data.csv'
from sklearn.tree import DecisionTreeRegressor # the decision tree is used to predict simultaneously the noisy x and y observations of a circle given a single underlying feature # as a result, it learns local linear regressions approximating the circle # Create a random dataset: rng = np.random.RandomState(1) X = np.sort(200 * rng.rand(100, 1) - 100, axis=0) Y = np.array([np.pi * np.sin(X).ravel(), np.pi * np.cos(X).ravel()]).T Y[::5, :] += (0.5 - rng.rand(20, 2)) # Fit regression model: regr_1 = DecisionTreeRegressor(max_depth=2) regr_2 = DecisionTreeRegressor(max_depth=5) regr_3 = DecisionTreeRegressor(max_depth=8) regr_1.fit(X, Y) regr_2.fit(X, Y) regr_3.fit(X, Y) # Predict: x_test = np.arange(-100.0, 100.0, 0.01)[:, np.newaxis] y_1 = regr_1.predict(x_test) y_2 = regr_2.predict(x_test) y_3 = regr_3.predict(x_test) # Plot the results plt.figure() s = 50 plt.scatter(Y[:, 0], Y[:, 1], c="navy", s=s, label="Data") plt.scatter(y_1[:, 0], y_1[:, 1], c="cornflowerblue", s=s, label="max_depth=2") plt.scatter(y_2[:, 0], y_2[:, 1], c="c", s=s, label="max_depth=5")
class Admission_Predictor:
    """Explore the admission dataset and compare two regressors on it.

    A decision-tree regressor and a linear regression are trained on the
    first 75% of the rows and scored on the remaining 25%.

    Typical call order: ``model_decision()`` (trains and scores), then
    ``plot_data(test_y)``, ``error_calc(test_y)`` and ``output_results()``,
    all of which read attributes that ``model_decision()`` sets.
    """

    # Directory used when the caller does not pass one.
    DEFAULT_DATA_DIR = '/Users/pragya/PycharmProjects/AI LAB/venv/Admission_Prediction/Data/'
    # Name of the target column; the trailing space is part of the CSV header.
    TARGET = 'Chance of Admit '

    def __init__(self, data_dir=None):
        """Load ``Admission_Predict.csv`` and print a summary of it.

        Args:
            data_dir: Directory holding the CSV. Defaults to
                ``DEFAULT_DATA_DIR``. The process working directory is
                changed to it, so the plots and the ``.dot`` file written
                by the other methods land there as well.
        """
        if data_dir is None:
            data_dir = self.DEFAULT_DATA_DIR
        os.chdir(data_dir)
        self.data = pd.read_csv("Admission_Predict.csv")
        print(self.data.describe().transpose())
        # Fraction of non-null entries per column (1.0 means nothing missing).
        print((self.data.notnull().sum() / len(self.data)).sort_values(ascending=False))

    def plot_data(self, test_y):
        """Draw and save the exploratory and model-comparison plots.

        Reads ``self.score1``, ``self.score2`` and ``self.predicted_2``, so
        ``model_decision()`` must have been run first.

        Args:
            test_y: True target values of the test split, used for the
                linear-regression residual plot.
        """
        cols = self.data.columns
        features = cols[1:-1]
        target_values = self.data[self.TARGET]

        # Scatter of every feature against the target.
        plt.figure(figsize=(20, 20))
        for position, name in enumerate(features, start=1):
            plt.subplot(3, 3, position)
            plt.scatter(self.data[name], target_values)
            plt.title(name)
        plt.savefig('features.pdf')
        plt.show()

        # Mean and median student chances.
        features2 = cols[[3, 4, 5, 7]]
        print('features2', features2)
        mean = target_values.mean()
        median = target_values.median()
        print("Mean student chances", mean)
        print("Median student chances", median)

        # Features with the strongest correlation to the target.
        main_features = ['CGPA', 'GRE Score', 'TOEFL Score']
        for name in main_features:
            print(name.upper())
            print(linregress(self.data[name], target_values))
            plt.figure(figsize=(20, 6))
            plt.subplot(1, 2, 1)
            sns.distplot(self.data[name], kde=False)
            plt.title('Distributed ' + name + ' of Applicants')
            plt.subplot(1, 2, 2)
            # Keyword arguments: seaborn >= 0.12 rejects positional x/y.
            sns.regplot(x=self.data[name], y=target_values)
            plt.title(name + ' vs Chance of Admit')
            plt.savefig(name + '.pdf')

        # Bar plots: mean chance per distinct value of each discrete feature,
        # with the overall median drawn as a dashed reference line.
        plt.figure(figsize=(20, 10))
        for position, name in enumerate(features2, start=1):
            plt.subplot(2, 2, position)
            # groupby returns the per-value means already sorted by value.
            ser = self.data.groupby(name)[self.TARGET].mean()
            plt.bar(ser.index, ser.values, width=0.3)
            plt.title(name)
            plt.plot([0, len(ser)], [median, median], 'k-', lw=1, dashes=[2, 2])
        plt.savefig('featuresVsMedian.pdf')
        plt.show()

        # Algorithm comparison plot.
        Methods = ['Decision Tree Regression', 'Linear Regression']
        Scores = np.array([self.score1, self.score2])
        plt.subplots(figsize=(8, 6))
        sns.barplot(x=Methods, y=Scores)
        plt.title('Algorithm Prediction Accuracies')
        plt.ylabel('Accuracy')
        plt.savefig("Algorithm_Prediction_Accuracies.pdf")
        plt.show()

        # Residual plot for the linear regression.
        plt.scatter(self.predicted_2, self.predicted_2 - test_y, c='g')
        plt.hlines(y=0, xmin=0.4, xmax=1)
        plt.title('Residual plot')
        plt.ylabel('Residual')
        plt.savefig("Residual_LR.pdf")
        plt.show()

    def model_decision(self):
        """Train both models, score them and export the tree.

        Sets ``model``, ``model2``, ``predicted``, ``predicted_full``,
        ``predicted_2``, ``predicted_full_2``, ``score1`` and ``score2``,
        and writes ``classifier.dot`` to the working directory.

        Returns:
            The tuple ``(train_X, test_X, train_y, test_y)``.
        """
        features = self.data.columns[1:-1]
        X = self.data[features]
        y = self.data[self.TARGET]

        # Ordered 75/25 split (no shuffling): first rows train, last rows test.
        split = int(0.75 * len(X))
        train_X, test_X = X.iloc[:split], X.iloc[split:]
        train_y, test_y = y.iloc[:split], y.iloc[split:]

        # Fit before publishing on self, so that an existing self.model is
        # always a trained model (predict() relies on that).
        tree_model = DecisionTreeRegressor(max_leaf_nodes=10)
        tree_model.fit(train_X, train_y)
        linear = linear_model.LinearRegression()
        linear.fit(train_X, train_y)
        self.model = tree_model
        self.model2 = linear
        print(self.model)
        print(self.model2)

        # Graphviz export of the fitted tree. class_names is omitted because
        # it only applies to classifiers; feature names are passed as a list.
        with open("classifier.dot", "w") as dot_file:
            tree.export_graphviz(self.model, feature_names=list(features),
                                 out_file=dot_file)

        # Predictions on the test split and on the full data set.
        self.predicted = self.model.predict(test_X)
        self.predicted_full = self.model.predict(X)
        self.score1 = self.model.score(test_X, test_y)
        print("Score1: ", self.score1)

        self.predicted_2 = self.model2.predict(test_X)
        self.predicted_full_2 = self.model2.predict(X)
        self.score2 = self.model2.score(test_X, test_y)
        print("Score2: ", self.score2)
        print("Coefficients-----------", self.model2.coef_)
        print("Intercept-----------", self.model2.intercept_)
        return train_X, test_X, train_y, test_y

    def predict(self, df):
        """Predict the chance of admission with the decision tree.

        The models are trained on first use only; later calls reuse the
        fitted tree instead of retraining and rewriting ``classifier.dot``
        on every prediction. Call ``model_decision()`` to retrain.

        Args:
            df: Feature rows in the same column order as the training data.

        Returns:
            Whatever ``self.model.predict(df)`` returns.
        """
        if getattr(self, "model", None) is None:
            self.model_decision()
        return self.model.predict(df)

    def error_calc(self, test):
        """Print MAE, MSE and R2 of both models against the true values.

        Args:
            test: True target values of the test split.
        """
        mae_DR = mean_absolute_error(test, self.predicted)
        mse_DR = mean_squared_error(test, self.predicted)
        r2_DR = r2_score(test, self.predicted)
        mae_LR = mean_absolute_error(test, self.predicted_2)
        mse_LR = mean_squared_error(test, self.predicted_2)
        r2_LR = r2_score(test, self.predicted_2)
        print("Errors - Linear Regression: \n Mean Absolute Error: {} \n Mean Squared Error: {} \n R2 Score: {}".format(mae_LR, mse_LR, r2_LR))
        print("Errors - Decision Tree Regression: \n Mean Absolute Error: {} \n Mean Squared Error: {} \n R2 Score: {}".format(mae_DR, mse_DR, r2_DR))

    def output_results(self):
        """Print the last five rows with each model's full-data predictions.

        Yes/No decision columns (threshold 0.80) are added to the merged
        frame for the true value and for both models' predictions.
        """
        df1 = pd.DataFrame()
        df1['predictions_DR'] = self.predicted_full
        df1['predictions_LR'] = self.predicted_full_2
        final_df = pd.merge(left=self.data, right=df1, left_index=True, right_index=True)

        # Admission decisions at a fixed 0.80 threshold.
        decision_sources = {
            'True_Decision': self.TARGET,
            'Decision_DR': 'predictions_DR',
            'Decision_LR': 'predictions_LR',
        }
        for decision_col, source_col in decision_sources.items():
            final_df[decision_col] = np.where(final_df[source_col] > 0.80, "Yes", "No")

        shown = ['GRE Score', 'TOEFL Score', 'University Rating', 'SOP', 'LOR ',
                 'CGPA', 'Research', self.TARGET]
        print(final_df[shown + ['predictions_LR']].tail(5))
        print(final_df[shown + ['predictions_DR']].tail(5))
# Multi-output decision tree regression (y has several columns).
if __name__ == "__main__":
    N = 300
    x = np.random.rand(N) * 8 - 4  # uniform samples in [-4, 4)
    x.sort()
    # Alternative target pairs to experiment with:
    # y1 = np.sin(x) + 3 + np.random.randn(N) * 0.1
    # y2 = np.cos(0.3 * x) + np.random.randn(N) * 0.01
    y1 = np.sin(x) + np.random.randn(N) * 0.05
    y2 = np.cos(x) + np.random.randn(N) * 0.1
    # y1 = 16 * np.sin(x) ** 3 + np.random.randn(N)
    # y2 = 13 * np.cos(x) - 5 * np.cos(2 * x) - 2 * np.cos(3 * x) - np.cos(4 * x) + np.random.randn(N) * 0.1

    # Stack the two targets and transpose so each row is one sample: (N, 2).
    y = np.vstack((y1, y2)).T
    # Column vector: N samples, each with a single feature.
    x = x.reshape(-1, 1)

    deep = 3
    # The default criterion is the squared-error criterion formerly spelled
    # 'mse'; that spelling was removed in scikit-learn 1.2, so relying on
    # the default keeps this working on both old and new releases.
    reg = DecisionTreeRegressor(max_depth=deep)
    dt = reg.fit(x, y)

    x_test = np.linspace(-4, 4, num=1000).reshape(-1, 1)
    print(x_test)
    y_hat = dt.predict(x_test)
    print(y_hat)

    # Plot true and predicted targets against each other in the (y1, y2) plane.
    plt.scatter(y[:, 0], y[:, 1], c='r', s=40, label='Actual')
    plt.scatter(y_hat[:, 0], y_hat[:, 1], c='g', marker='s', s=100,
                label='Depth=%d' % deep, alpha=1)
    plt.legend(loc='upper left')
    plt.xlabel('y1')
    plt.ylabel('y2')
    plt.grid()
    plt.show()
# :class:`~sklearn.tree.DecisionTreeRegressor` now supports a new `'poisson'`
# splitting criterion. Setting `criterion="poisson"` might be a good choice
# if your target is a count or a frequency.

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
import numpy as np

n_samples = 1000
n_features = 20
rng = np.random.RandomState(0)
X = rng.standard_normal(size=(n_samples, n_features))
# positive integer target correlated with X[:, 5] with many zeros:
y = rng.poisson(lam=np.exp(X[:, 5]) / 2)
split = train_test_split(X, y, random_state=rng)
X_train, X_test, y_train, y_test = split
regressor = DecisionTreeRegressor(criterion="poisson", random_state=0).fit(
    X_train, y_train
)

##############################################################################
# New documentation improvements
# ------------------------------
#
# New examples and documentation pages have been added, in a continuous effort
# to improve the understanding of machine learning practices:
#
# - a new section about :ref:`common pitfalls and recommended
#   practices <common_pitfalls>`,
# - an example illustrating how to :ref:`statistically compare the performance of
#   models <sphx_glr_auto_examples_model_selection_plot_grid_search_stats.py>`
#   evaluated using :class:`~sklearn.model_selection.GridSearchCV`,
# - an example on how to :ref:`interpret coefficients of linear models
#   <sphx_glr_auto_examples_inspection_plot_linear_model_coefficient_interpretation.py>`,
# Columns of `data` used as model inputs.
features_names = [
    'year', 'weekofyear', 'reanalysis_dew_point_temp_k',
    'reanalysis_min_air_temp_k', 'station_diur_temp_rng_c',
    'reanalysis_tdtr_k', 'reanalysis_specific_humidity_g_per_kg',
    'station_avg_temp_c', 'reanalysis_relative_humidity_percent',
    'precipitation_amt_mm', 'reanalysis_precip_amt_kg_per_m2',
]
features = data[features_names]
labels = data['total_cases']

# Cross validation analysis: mean absolute error of a decision tree for each
# maximum depth, estimated with 10-fold cross validation.
from sklearn.model_selection import cross_val_score

depths = range(2, 30)
total_scores = []
for i in depths:
    # The default criterion is the squared-error criterion formerly spelled
    # 'mse' (a spelling removed in scikit-learn 1.2). No explicit fit() is
    # needed here: cross_val_score fits a fresh clone on every fold.
    regressor = DecisionTreeRegressor(max_depth=i)
    # The scorer returns negated errors; flip the sign to get the MAE.
    scores = -cross_val_score(
        regressor, features, labels,
        scoring='neg_mean_absolute_error', cv=10)
    total_scores.append(scores.mean())

plt.plot(depths, total_scores, marker='o')
plt.xlabel('max_depth')
plt.ylabel('cv score')
plt.show()

# Print feature relevancies of a shallow tree fitted on all the data.
# print() with a single argument behaves the same on Python 2 and Python 3.
print('Feature Relevancies')
regressor = DecisionTreeRegressor(max_depth=3, random_state=0)
regressor.fit(features, labels)
# Iterating a DataFrame yields its column names; materialise the pairs so
# tabulate receives a concrete list of rows rather than a one-shot iterator.
list1 = list(zip(features, regressor.feature_importances_))
print(tabulate(list1, headers=['Feature', 'Relevance']))
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from dtreeviz.trees import *

# Load the table from the file named "tmp"; one feature column (COUNTS) is
# used to predict one target column (BYTES).
df_cars = pd.read_csv("tmp")
X = df_cars[['COUNTS']]
y = df_cars['BYTES']

# Depth-3 regression tree using the absolute-error criterion.
# NOTE(review): scikit-learn renamed "mae" to "absolute_error" in 1.0 and
# removed the old spelling in 1.2 -- confirm the pinned version.
dt = DecisionTreeRegressor(max_depth=3, criterion="mae").fit(X, y)

# Draw the univariate regression-tree view on a single axes and show it.
fig, ax = plt.subplots()
rtreeviz_univar(dt, X, y, 'COUNTS', 'BYTES', ax=ax)
plt.show()
# Summarise two of the columns (the heating-related ones) with describe().
heatData = ['Heating', 'HeatingQC']
heat_summary = house_data[heatData].describe()
print(heat_summary)

print("================================= modeling ===================================")
# Prediction target (y): the sale price column.
y = house_data['SalePrice']

# Predictors (X): the columns the model learns from.
price_predictors = [
    'LotArea',
    'YearBuilt',
    '1stFlrSF',
    '2ndFlrSF',
    'FullBath',
    'BedroomAbvGr',
]
X = house_data[price_predictors]

# Create a decision-tree regressor and fit it on X and y in one step
# (``fit`` returns the estimator itself).
my_model = DecisionTreeRegressor().fit(X, y)
print(my_model)

print("================================= prediction ===================================")
print("Making predictions for the following 5 houses:")
first_rows = X.head()
print(first_rows)
print("The predictions are")
first_predictions = my_model.predict(first_rows)
print(first_predictions)
""" test """ seed(10) X = randint(0, 100, 100) Y = uniform(low=0.5, high=13.3, size=(100, )) max_split, max_gain = max_split_gain(X, Y) print("%.2f" % info(Y)) print("%.2f" % condition_info(X, Y, 89)) print("%d" % max_split, "%.2f" % max_gain) from sklearn.tree import DecisionTreeRegressor X2 = np.array([[ele] for ele in X]) clf = DecisionTreeRegressor(criterion="mse", max_depth=1) # “mse” for the mean squared error, which is equal to variance reduction as # feature selection criterion and minimizes the L2 loss using the mean of # each terminal node # variance reduction is equivalent to standard deviation reduction # 对于x轴是一维的连续值,max_depth的增加可以在X轴上增加更多的分割点 # http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html#sklearn.tree.DecisionTreeRegressor # http://scikit-learn.org/stable/auto_examples/tree/plot_tree_regression.html clf.fit(X2, Y) values = clf.predict([[87], [88], [89], [90]]) print(values)