Example #1
def model_complexity(X_train, y_train, X_test, y_test):
    """Calculate the performance of the model as model complexity increases."""

    print "Model Complexity: "

    # We will vary the depth of decision trees from 1 to 24
    max_depth = np.arange(1, 25)
    train_err = np.zeros(len(max_depth))
    test_err = np.zeros(len(max_depth))

    for i, d in enumerate(max_depth):
        # Setup a Decision Tree Regressor so that it learns a tree with depth d
        regressor = DecisionTreeRegressor(max_depth=d)

        # Fit the learner to the training data
        regressor.fit(X_train, y_train)

        # Find the performance on the training set
        train_err[i] = performance_metric(y_train, regressor.predict(X_train))

        # Find the performance on the testing set
        test_err[i] = performance_metric(y_test, regressor.predict(X_test))

    # Plot the model complexity graph
    model_complexity_graph(max_depth, train_err, test_err)
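
Several of the learning-curve and model-complexity snippets in this collection call helper functions (performance_metric, model_complexity_graph, learning_curve_graph) that are defined elsewhere in the original projects, and whose exact signatures vary slightly between snippets. A minimal sketch of what they might look like, assuming mean squared error as the metric and matplotlib for plotting:

import matplotlib.pyplot as pl
from sklearn.metrics import mean_squared_error

def performance_metric(y_true, y_pred):
    # Assumed metric: mean squared error (lower is better).
    return mean_squared_error(y_true, y_pred)

def model_complexity_graph(max_depth, train_err, test_err):
    # Plot training/testing error against maximum tree depth.
    pl.figure()
    pl.plot(max_depth, train_err, lw=2, label='Training Error')
    pl.plot(max_depth, test_err, lw=2, label='Testing Error')
    pl.xlabel('Maximum Depth')
    pl.ylabel('Error')
    pl.legend()
    pl.show()

def learning_curve_graph(sizes, train_err, test_err):
    # Plot training/testing error against training set size.
    pl.figure()
    pl.plot(sizes, train_err, lw=2, label='Training Error')
    pl.plot(sizes, test_err, lw=2, label='Testing Error')
    pl.xlabel('Training Set Size')
    pl.ylabel('Error')
    pl.legend()
    pl.show()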
Example #2
def test_rt():
    boston = load_boston()
    X, y = boston.data, boston.target
    feature_names = boston.feature_names

    sk_dt = SKRT(random_state=1, max_depth=3)
    our_dt = RegressionTree(feature_names=feature_names, random_state=1)

    sk_dt.fit(X, y)
    our_dt.fit(X, y)

    sk_pred = sk_dt.predict(X)
    our_pred = our_dt.predict(X)
    assert np.allclose(sk_pred, our_pred)

    # With labels
    local_expl = our_dt.explain_local(X, y)
    local_viz = local_expl.visualize(0)
    assert local_viz is not None

    # Without labels
    local_expl = our_dt.explain_local(X)
    local_viz = local_expl.visualize(0)
    assert local_viz is not None

    global_expl = our_dt.explain_global()
    global_viz = global_expl.visualize()
    assert global_viz is not None
    def nn_lin(self, testX, neighbors):
        # Fit a local decision tree on the neighbors and predict for testX.
        # (A simpler baseline would be to return np.mean(self.Y[neighbors]).)
        l = DecisionTreeRegressor()
        l.fit(self.X[neighbors], self.Y[neighbors])
        return l.predict([testX])[0]
def test_thresholded_scorers():
    # Test scorers that take thresholds.
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = LogisticRegression(random_state=0)
    clf.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.decision_function(X_test))
    score3 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    assert_almost_equal(score1, score2)
    assert_almost_equal(score1, score3)

    logscore = get_scorer('log_loss')(clf, X_test, y_test)
    logloss = log_loss(y_test, clf.predict_proba(X_test))
    assert_almost_equal(-logscore, logloss)

    # same for an estimator without decision_function
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    assert_almost_equal(score1, score2)

    # test with a regressor (no decision_function)
    reg = DecisionTreeRegressor()
    reg.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(reg, X_test, y_test)
    score2 = roc_auc_score(y_test, reg.predict(X_test))
    assert_almost_equal(score1, score2)

    # Test that an exception is raised on more than two classes
    X, y = make_blobs(random_state=0, centers=3)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf.fit(X_train, y_train)
    assert_raises(ValueError, get_scorer('roc_auc'), clf, X_test, y_test)
def learning_curve(depth, X_train, y_train, X_test, y_test):
    """Calculate the performance of the model after a set of training data."""

    # We will vary the training set size so that we have 50 different sizes
    sizes = np.round(np.linspace(1, len(X_train), 50))
    train_err = np.zeros(len(sizes))
    test_err = np.zeros(len(sizes))
    sizes = [int(ii) for ii in sizes]
    
    print "Decision Tree with Max Depth: "
    print depth

    for i, s in enumerate(sizes):

        # Create and fit the decision tree regressor model
        regressor = DecisionTreeRegressor(max_depth=depth)
        regressor.fit(X_train[:s], y_train[:s])

        # Find the performance on the training and testing set
        train_err[i] = performance_metric(y_train[:s], regressor.predict(X_train[:s]))
        test_err[i] = performance_metric(y_test, regressor.predict(X_test))


    # Plot learning curve graph
    learning_curve_graph(sizes, train_err, test_err)
def model_complexity(X_train, y_train, X_test, y_test):
    """Calculate the performance of the model as model complexity increases."""
    ### now we are using all the training data and seeing how the model complexity affects
    ### performance
    ### before we were holding the complexity constant and measuring performance as the training 
    ### data increased

    print "Model Complexity: "

    # We will vary the depth of decision trees from 2 to 25
    max_depth = np.arange(1, 25)
    train_err = np.zeros(len(max_depth))
    test_err = np.zeros(len(max_depth))
    x = [11.95, 0.00, 18.100, 0, 0.6590, 5.6090, 90.00, 1.385, 24, 680.0, 20.20, 332.09, 12.13]
    print "len of training set = ",len(y_train)
    for i, d in enumerate(max_depth):
        # Setup a Decision Tree Regressor so that it learns a tree with depth d
        regressor = DecisionTreeRegressor(max_depth=d,random_state = 0)

        # Fit the learner to the training data
        regressor.fit(X_train, y_train)
        
        print(d, regressor.predict([x]))  # predict expects a 2D input

        # Find the performance on the training set
        train_err[i] = performance_metric(y_train, regressor.predict(X_train))

        # Find the performance on the testing set
        test_err[i] = performance_metric(y_test, regressor.predict(X_test))

    # Plot the model complexity graph
    model_complexity_graph(max_depth, train_err, test_err)
def learn_regression_tree_ensemble(img_features, gt_illuminants, num_trees, max_tree_depth):
    eps = 0.001
    inst = [[img_features[i], gt_illuminants[i][0] / (sum(gt_illuminants[i]) + eps),
                              gt_illuminants[i][1] / (sum(gt_illuminants[i]) + eps)] for i in range(len(img_features))]

    inst.sort(key = lambda obj: obj[1]) #sort by r chromaticity
    stride = int(np.ceil(len(inst) / float(num_trees+1)))
    sz = 2*stride
    dst_model = []
    for tree_idx in range(num_trees):
        #local group in the training data is additionally weighted by num_trees
        local_group_range = range(tree_idx*stride, min(tree_idx*stride+sz, len(inst)))
        X = num_trees * [inst[i][0] for i in local_group_range]
        y_r = num_trees * [inst[i][1] for i in local_group_range]
        y_g = num_trees * [inst[i][2] for i in local_group_range]

        #add the rest of the training data:
        X = X + [inst[i][0] for i in range(len(inst)) if i not in local_group_range]
        y_r = y_r + [inst[i][1] for i in range(len(inst)) if i not in local_group_range]
        y_g = y_g + [inst[i][2] for i in range(len(inst)) if i not in local_group_range]

        local_model = []
        for feature_idx in range(len(X[0])):
            tree_r = DecisionTreeRegressor(max_depth = max_tree_depth, random_state = 1234)
            tree_r.fit([el[feature_idx][0] for el in X], y_r)
            tree_g = DecisionTreeRegressor(max_depth = max_tree_depth, random_state = 1234)
            tree_g.fit([el[feature_idx][0] for el in X], y_g)
            local_model.append([tree_r, tree_g])
        dst_model.append(local_model)
    return dst_model
def learning_curve(depth, X_train, y_train, X_test, y_test):
    """Calculate the performance of the model after a set of training data."""

    # We will vary the training set size so that we have 50 different sizes
    sizes = np.round(np.linspace(1, len(X_train), 50)).astype(int)  # integer sizes for slicing
    train_err = np.zeros(len(sizes))
    test_err = np.zeros(len(sizes))

    print "Decision Tree with Max Depth: "
    print depth

    for i, s in enumerate(sizes):

        # Create and fit the decision tree regressor model
        regressor = DecisionTreeRegressor(max_depth=depth)
        regressor.fit(X_train[:s], y_train[:s])

        # Find the performance on the training and testing set
        train_err[i] = performance_metric(y_train[:s], regressor.predict(X_train[:s]))
        test_err[i] = performance_metric(y_test, regressor.predict(X_test))

    # if depth >= 4 and depth <= 6:
    #     pl.figure()
    #     pl.plot(y_test, 'bo')
    #     pl.plot(regressor.predict(X_test), color='red')
    #     pl.savefig("test_data_depth_" + str(depth))

    # Plot learning curve graph
    learning_curve_graph(sizes, train_err, test_err, depth)
def learning_curve(depth, X_train, y_train, X_test, y_test):
    """Calculate the performance of the model after a set of training data for a given depth
    or complexity."""

    # We will vary the training set size so that we have 50 different sizes
    sizes = np.round(np.linspace(1, len(X_train), 50)).astype(int)  # integer sizes for slicing
    train_err = np.zeros(len(sizes))
    test_err = np.zeros(len(sizes))
    x = [11.95, 0.00, 18.100, 0, 0.6590, 5.6090, 90.00, 1.385, 24, 680.0, 20.20, 332.09, 12.13]
    print "Decision Tree with Max Depth: "
 #   print depth

    for i, s in enumerate(sizes):  # iterate through 50 different training set sizes

        # Create and fit the decision tree regressor model for the size of the training
        # set given and the depth of the decision tree (I assume the depth of the tree indicates
        # the complexity of the model, i.e as the depth grows the complexity grows)
        regressor = DecisionTreeRegressor(max_depth=depth,random_state=0)
        regressor.fit(X_train[:s], y_train[:s])
        
        # Find the performance on the training and testing set
        train_err[i] = performance_metric(y_train[:s], regressor.predict(X_train[:s]))
        test_err[i] = performance_metric(y_test, regressor.predict(X_test))


    # print the prediction for the full training set at this depth
    print(depth, s, regressor.predict([x]))


    # Plot learning curve graph
    learning_curve_graph(sizes, train_err, test_err)
def fit_model1(X, y):
    """ Performs grid search over the 'max_depth' parameter for a
        decision tree regressor trained on the input data [X, y]. """

    # Create cross-validation sets from the training data (sklearn >= 0.18 API)
    cv_sets = ShuffleSplit(n_splits=10, test_size=0.20, random_state=0)

    # TODO: Create a decision tree regressor object
    regressor = DecisionTreeRegressor()
    regressor.fit(X, y)
    # TODO: Create a dictionary for the parameter 'max_depth' with a range from 1 to 10
    params = {'max_depth': (1, 2, 3, 4, 5, 6, 7, 8, 9, 10)}

    # TODO: Transform 'performance_metric' into a scoring function using 'make_scorer'
    scoring_fnc = make_scorer(performance_metric)

    #print(regressor.predict(X))
    ##scoring_fnc = make_scorer(mean_squared_error)

    # TODO: Create the grid search object
    grid_obj = GridSearchCV(regressor, params, scoring=scoring_fnc, cv=cv_sets)

    # Fit the grid search object to the data to compute the optimal model
    grid = grid_obj.fit(X, y)

    # Return the optimal model after fitting the data
    return grid.best_estimator_
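
A hypothetical usage sketch for fit_model1; X and y holding the housing features and prices loaded elsewhere is an assumption:

# Hypothetical usage: X, y come from the housing dataset loaded elsewhere.
best_reg = fit_model1(X, y)
print("Best max_depth:", best_reg.get_params()['max_depth'])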
def train_learning_model_decision_tree_ada_boost(df):
    #code taken from sklearn
    X_all, y_all = preprocess_data(df)
    X_train, X_test, y_train, y_test = split_data(X_all, y_all)

    tree_regressor = DecisionTreeRegressor(max_depth = 6)
    ada_regressor = AdaBoostRegressor(DecisionTreeRegressor(max_depth=6), n_estimators = 500, learning_rate = 0.01, random_state = 1)

    tree_regressor.fit(X_train, y_train)
    ada_regressor.fit(X_train, y_train)

    y_pred_tree = tree_regressor.predict(X_test)
    y_pred_ada = ada_regressor.predict(X_test)
    
    mse_tree = mean_squared_error(y_test, y_pred_tree)
    mse_ada = mean_squared_error(y_test, y_pred_ada)

    mse_tree_train = mean_squared_error(y_train, tree_regressor.predict(X_train))
    mse_ada_train = mean_squared_error(y_train, ada_regressor.predict(X_train))
    
    print ("MSE tree: %.4f " %mse_tree)
    print ("MSE ada: %.4f " %mse_ada)

    print ("MSE tree train: %.4f " %mse_tree_train)
    print ("MSE ada train: %.4f " %mse_ada_train)
Example #12
  def CART(self):
    "  CART"
    # Apply random forest Classifier to predict the number of bugs.
    if self.smoteit:
      self.train = SMOTE(
          self.train,
          atleast=50,
          atmost=101,
          resample=self.duplicate)

    if not self.tuning:
      clf = DecisionTreeRegressor(random_state=1)
    else:
      clf = DecisionTreeRegressor(max_depth=int(self.tunings[0]),
                                  min_samples_split=int(self.tunings[1]),
                                  min_samples_leaf=int(self.tunings[2]),
                                  max_features=float(self.tunings[3] / 100),
                                  max_leaf_nodes=int(self.tunings[4]),
                                  criterion='mse', random_state=1)  # 'entropy' is classification-only; regression trees use 'mse'
    features = self.train.columns[:-2]
    klass = self.train[self.train.columns[-2]]
    # set_trace()
    clf.fit(self.train[features].astype('float32'), klass.astype('float32'))
    preds = clf.predict(
        self.test[self.test.columns[:-2]].astype('float32')).tolist()
    return preds
Example #13
def fit_predict_model(city_data):
    '''Find and tune the optimal model. Make a prediction on housing data.'''

    # Get the features and labels from the Boston housing data
    X, y = city_data.data, city_data.target
    print(X)
    # Setup a Decision Tree Regressor
    regressor = DecisionTreeRegressor()

    parameters = {'max_depth': (1, 2, 3, 4, 5, 6, 7, 8, 9, 10)}

    reg = GridSearchCV(regressor, parameters,
                       scoring=make_scorer(metrics.mean_squared_error, greater_is_better=False))
    print(reg.fit(X, y))

    # Repeat the search and take the median best depth (the tree itself is not deterministic)
    depth_values = list()
    for i in range(101):
        reg.fit(X, y)
        depth_values.append(int(reg.best_params_['max_depth']))

    print("Best model parameter:  " + str(np.median(depth_values)))

    # Fit the learner to the training data; max_depth must be an integer
    regressor = DecisionTreeRegressor(max_depth=int(np.median(depth_values)))
    print("Final Model: ")
    print(regressor)

    regressor.fit(X, y)

    # Use the model to predict the output of a particular sample
    x = [11.95, 0.00, 18.100, 0, 0.6590, 5.6090, 90.00, 1.385, 24, 680.0, 20.20, 332.09, 12.13]
    y = regressor.predict([x])  # predict expects a 2D input
    print("House: " + str(x))
    print("Prediction: " + str(y))
def test_decision_tree_regression(filename):
    start_time = time.time()
    scores = []
    from sklearn.tree import DecisionTreeRegressor
    df = pd.read_csv(filename)
    h_indep = df.columns[:-1]
    h_dep = df.columns[-1]
    for _ in range(10):
            # print "- ",
            sys.stdout.flush()
            msk = np.random.rand(len(df)) < 0.4
            train_data = df[msk]
            test_data = df[~msk]

            # print len(train_data), len(test_data)
            assert (len(train_data) + len(test_data) == len(df)), "Something is wrong"
            train_indep = train_data[h_indep]
            train_dep = train_data[h_dep]

            test_indep = test_data[h_indep]
            test_dep = test_data[h_dep]
            dt = DecisionTreeRegressor()
            dt.fit(train_indep, train_dep.values.tolist())
            prediction = dt.predict(test_indep)
            from sklearn.metrics import mean_absolute_error

            scores.append(mean_absolute_error(test_dep, prediction))
            # print len(confusion_matrices),

    extract_name = filename.split("/")[-1].split(".")[0] + ".p"
    # import pickle
    # pickle.dump(confusion_matrices, open("./Results_RF_Classification/CM_" + extract_name, "wb"))
    print(round(np.mean(scores), 3), round(time.time() - start_time, 3), "sec")
Example #15
def arbolesRegresion(caract):
    
    clf = DecisionTreeRegressor(min_samples_leaf=10, min_samples_split=15, max_depth=13)  # compute_importances was removed from sklearn; importances are always available
    
    importancias = [0,0,0,0,0,0,0,0,0,0,0,0,0]    
    mae=mse=r2=0
    
    kf = KFold(len(boston_Y), n_folds=10)  # the 'indices' argument was removed from sklearn
    for train, test in kf:
        trainX, testX, trainY, testY=boston_X[train], boston_X[test], boston_Y[train], boston_Y[test]
            
        nCar=len(caract)
        train=np.zeros((len(trainX), nCar))
        test=np.zeros((len(testX), nCar))
        trainYNuevo=trainY
        
        for i in range(nCar):
            for j in range(len(trainX)):
                train[j][i]=trainX[j][caract[i]]
                
            for k in range(len(testX)):
                test[k][i]=testX[k][caract[i]]
        
        trainYNuevo=np.reshape(trainYNuevo, (len(trainY), -1))
        
        clf.fit(train, trainYNuevo)
        prediccion=clf.predict(test)            
        
#        clf.fit(trainX, trainY)
#        prediccion=clf.predict(testX)
            
        mae+=metrics.mean_absolute_error(testY, prediccion)
        mse+=metrics.mean_squared_error(testY, prediccion)
        r2+=metrics.r2_score(testY, prediccion)
        
        feature_importance = clf.feature_importances_
        feature_importance = 100.0 * (feature_importance / feature_importance.max())
        for i in range(13):
            importancias[i] = importancias[i] + feature_importance[i]
        
    print('Mean absolute error:', mae / len(kf), 'Mean squared error:', mse / len(kf), 'R squared:', r2 / len(kf))
    
    for i in range(13):
        importancias[i] = importancias[i]/10
        
    sorted_idx = np.argsort(importancias)
    pos = np.arange(sorted_idx.shape[0]) + .5
    importancias = np.reshape(importancias, (len(importancias), -1))

    boston = datasets.load_boston()
    pl.barh(pos, importancias[sorted_idx], align='center')
    pl.yticks(pos, boston.feature_names[sorted_idx])
    pl.xlabel('Relative importance')
    pl.show()    
    
    from io import StringIO
    import pydot
    dot_data = StringIO()
    tree.export_graphviz(clf, out_file=dot_data)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())[0]  # newer pydot returns a list of graphs
    graph.write_pdf("bostonTree.pdf")
def learning_curve(depth, X_train, y_train, X_test, y_test):
    """Calculate the performance improvement of the model, as training size increases."""
    
    # create 50 equally spaced markers for the graph's X axis
    sizes = np.round(np.linspace(1, len(X_train), 50)).astype(int)
    # create 50 open bins to fill in the training and test errors
    train_err = np.zeros(len(sizes))
    test_err = np.zeros(len(sizes))

    print("Decision Tree with Max Depth: ")
    print(depth)

    for i, s in enumerate(sizes):
        
        # train the regressor on each training set size
        regressor = DecisionTreeRegressor(max_depth=depth)
        regressor.fit(X_train[:s], y_train[:s])
        
        # fill in the training and test error 
        train_err[i] = performance_metric(y_train[:s], regressor.predict(X_train[:s]))
        test_err[i] = performance_metric(y_test, regressor.predict(X_test))

    # create the learning curve graph, using the calculated information
    learning_curve_graph(sizes, train_err, test_err)
    
    return test_err[-1]
Example #17
def learning_curve(depth, X_train, y_train, X_test, y_test, iteration=None):
    """Calculate the performance of the model after a set of training data."""

    # We will vary the training set size so that we have 50 different sizes
    sizes = np.round(np.linspace(1, len(X_train), 50)).astype(int)
    train_err = np.zeros(len(sizes))
    test_err = np.zeros(len(sizes))

    print("Decision Tree with Max Depth: ")
    print(depth)

    for i, s in enumerate(sizes):

        # Create and fit the decision tree regressor model
        regressor = DecisionTreeRegressor(max_depth=depth)
        regressor.fit(X_train[:s], y_train[:s])

        # Find the performance on the training and testing set
        train_err[i] = performance_metric(y_train[:s], regressor.predict(X_train[:s]))
        test_err[i] = performance_metric(y_test, regressor.predict(X_test))

    # Plot learning curve graph
    learning_curve_graph(depth, sizes, train_err, test_err)

    # added to produce figure 2
    if iteration is not None:
        print "Final error at max_depth={}: {}".format(depth, test_err[-1])
        fully_trained_error[depth - 1][iteration] = test_err[-1]
def test():

    labeled_data, unlabeled_data = load_all_data()
    bestData = load_data(BEST_SUBMISSION_PATH)
    bestY = bestData.y[1:].astype(np.float)

    #bestY = scale(bestY)
    #labeled_data.y = scale(labeled_data.y)

    for x in range(5, 6):

        clf = DecisionTreeRegressor()
        clf.fit(labeled_data.X, labeled_data.y)

        newData = Data()
        newData.X = np.append(labeled_data.X, unlabeled_data[0:95000], axis=0)
        newData.y = np.append(labeled_data.y, bestY[0:95000])

        for i in range(x):
            clf = RandomForestRegressor()
            clf.fit(newData.X, newData.y)

            n, d = unlabeled_data.shape

            indices = np.random.choice(np.array(range(n)), size=5000, replace=False)
            labels = clf.predict(unlabeled_data[indices])
            newData.X = np.append(labeled_data.X, unlabeled_data[indices], axis=0)
            newData.y = np.append(labeled_data.y, labels)
        
            print(rmse(bestY, clf.predict(unlabeled_data)))

        saveRevenues(clf.predict(unlabeled_data))
Example #19
def plot_curve():
    # Defining our regression algorithm
    reg = DecisionTreeRegressor()
    # Fit our model using X and y
    reg.fit(X, y)
    print "Regressor score: {:.4f}".format(reg.score(X,y))

    # TODO: Use learning_curve imported above to create learning curves for both the
    # training data and testing data. You'll need reg, X, y, cv and score from above.
    # Note: Because i didnt use all the parameters in order of function definition for learning_curve fn,
    #       I have to explicitly assign values to the parameters. e.g, from learning_curve fn, after 'y'
    #       comes 'train_sizes'. But since it is optional and I am not using that parameter, for all other parameters
    #       that come after, i have to explicitly assign values to the parameter (e.g cv=cv, scoring=score)
    #       else error
    train_sizes, train_scores, test_scores = learning_curve(reg, X, y, cv=cv, scoring=score)


    # Taking the mean of the test and training scores
    train_scores_mean = np.mean(train_scores,axis=1)
    test_scores_mean = np.mean(test_scores,axis=1)

    # Plotting the training curves and the testing curves using train_scores_mean and test_scores_mean
    plt.plot(train_sizes, train_scores_mean, '-o', color='b', label="train_scores_mean")
    plt.plot(train_sizes, test_scores_mean, '-o', color='r', label="test_scores_mean")

    # Plot aesthetics
    plt.ylim(-0.1, 1.1)
    plt.ylabel("Curve Score")
    plt.xlabel("Training Points")
    plt.legend(bbox_to_anchor=(1.1, 1.1))
    plt.show()
def train_decision_tree(sizes, depth, X_test, X_train, y_test, y_train):
    """
    Args:
        sizes   (Numpy array): Array of training sample sizes to train on.
        depth   (int): The maximum depth of the DecisionTreeRegressor
        X_test  (Numpy array): Test set features
        X_train (Numpy array): Training set features
        y_test  (Numpy array): Test set target variable
        y_train (Numpy array): Training set target variable

    Returns:
        test_err  (Numpy array): Test set errors for each training size.
        train_err (Numpy array): Training set errors for each training size.
    """

    train_err = np.zeros(len(sizes))
    test_err = np.zeros(len(sizes))

    for i, s in enumerate(sizes):
        # Create and fit the decision tree regressor model
        regressor = DecisionTreeRegressor(max_depth=depth)

        # Cast to int to avoid DeprecationWarning from numpy 1.8
        s = int(s)
        regressor.fit(X_train[:s], y_train[:s])

        # Find the performance on the training and testing set
        train_err[i] = performance_metric(y_train[:s], regressor.predict(X_train[:s]))
        test_err[i] = performance_metric(y_test, regressor.predict(X_test))

    return test_err, train_err
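
A hypothetical call, assuming the train/test split already exists; sizes must be integer sample counts:

# Hypothetical usage with 50 training-set sizes.
sizes = np.round(np.linspace(1, len(X_train), 50)).astype(int)
test_err, train_err = train_decision_tree(sizes, 5, X_test, X_train, y_test, y_train)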
Example #21
    def test_boston(self):
        from sklearn.tree import DecisionTreeRegressor as DecisionTreeRegressorSklearn
        model = DecisionTreeRegressor(tree_type='oblivious', max_n_splits=3)
        model_sklearn = DecisionTreeRegressorSklearn()

        dataset = load_boston()
        mse = []
        mse_sklearn = []

        for fold in range(5):
            X_train, X_test, y_train, y_test = train_test_split(
                dataset.data, dataset.target, test_size=0.33)

            model.fit(X_train, y_train)
            y = model.predict(X_test)
            mse.append(mean_squared_error(y, y_test))

            model_sklearn.fit(X_train, y_train)
            y = model_sklearn.predict(X_test)
            mse_sklearn.append(mean_squared_error(y, y_test))

        mean_mse = np.mean(mse)
        mean_mse_sklearn = np.mean(mse_sklearn)
        print(mean_mse, mean_mse_sklearn)
        # Check that our model differs in MSE no worse than 50%
        self.assertTrue(np.abs(mean_mse - mean_mse_sklearn) / mean_mse_sklearn < 0.5)
Example #22
def train_decision_tree(time_regression_df, test_size, random_state, max_depth, export_testset):
    time_regression_df_train, time_regression_df_test = cv.train_test_split(time_regression_df, test_size=test_size, random_state=random_state)
    y_train = time_regression_df_train['trip_time']
    x_train = time_regression_df_train.iloc[:, 0:6]  # .ix was removed from pandas; positional selection uses .iloc
    y_test = time_regression_df_test['trip_time']
    x_test = time_regression_df_test.iloc[:, 0:6]
    
    if export_testset:
        xy_test = pd.concat([x_test, y_test], axis=1)
        xy_test.to_csv('../data/' + filename_prefix + '_testset.csv')

    tic = time.time()

    regtree = DecisionTreeRegressor(max_depth=max_depth, min_samples_split=3, random_state=random_state)
    regtree.fit(x_train, y_train)
    elapsed = time.time() - tic
    print(elapsed)


    export_meta_data(regtree, x_test, y_test, elapsed)

    target_location = ('../treelib/' + filename_prefix + '_tree_depth_' + str(regtree.tree_.max_depth))

    dump_model(regtree, target_location)
    return regtree
class TestDecisionTreeRegressorConverter(TestCase):
    def setUp(self):
        np.random.seed(1)
        self.est = DecisionTreeRegressor(max_depth=2)
        self.est.fit([
            [0, 0],
            [0, 1],
            [1, 0],
            [1, 1],
        ], [0, 1, 1, 1])
        self.ctx = TransformationContext(
            input=[IntegerNumericFeature('x1'), StringCategoricalFeature('x2', ['zero', 'one'])],
            model=[IntegerNumericFeature('x1'), StringCategoricalFeature('x2', ['zero', 'one'])],
            derived=[],
            output=[IntegerNumericFeature('output')]
        )
        self.converter = DecisionTreeConverter(
            estimator=self.est,
            context=self.ctx,
            mode=DecisionTreeConverter.MODE_REGRESSION
        )

    def test_transform(self):
        p = self.converter.pmml()
        tm = p.TreeModel[0]
        assert tm.MiningSchema is not None, 'Missing mining schema'
        assert len(tm.MiningSchema.MiningField) == 3, 'Wrong number of mining fields'
        assert tm.Node is not None, 'Missing root node'
        assert tm.Node.recordCount == 4
        assert tm.Node.True_ is not None, 'Root condition should always be True'
Example #24
def main():
    # Create a random dataset
    rng = np.random.RandomState(1)
    X = np.sort(5 * rng.rand(80, 1), axis=0)
    y = np.sin(X).ravel()
    y[::5] += 3 * (0.5 - rng.rand(16))

    clf_1 = DecisionTreeRegressor(max_depth=2)
    clf_2 = DecisionTreeRegressor(max_depth=5)
    clf_1.fit(X, y)
    clf_2.fit(X, y)

    # Predict
    X_test = np.arange(0.0, 5.0, 0.01)[:, np.newaxis]
    y_1 = clf_1.predict(X_test)
    y_2 = clf_2.predict(X_test)

    # Plot the results
    plt.figure()
    plt.scatter(X, y, c="k", label="data")
    plt.plot(X_test, y_1, c="g", label="max_depth=2", linewidth=2)
    plt.plot(X_test, y_2, c="r", label="max_depth=5", linewidth=2)
    plt.xlabel("data")
    plt.ylabel("target")
    plt.title("Decision Tree Regression")
    plt.legend()
    plt.show()
class CustomClassifier(BaseEstimator, ClassifierMixin):
    """Predicts the majority class of its training data."""
    def __init__(self):
        global class_instance
        class_instance += 1
        self.instance = class_instance
        # print("instance:", self.instance)

    def __del__(self):
        global class_instance
        class_instance -= 1

    def fit(self, X, y, sample_weight=None):
        # 1st Adaboost iteration: just return the current volatility
        if self.instance <= 2:
            self.y = y
            return self
        # 2+ Adaboost iterations: use a decision tree regressor as the weak learner
        else:
            self.regr = DecisionTreeRegressor(max_depth=8)
            # self.regr = linear_model.Lasso(alpha=0.01, fit_intercept=False, normalize=False, max_iter=10000000)  # they call lambda alpha
            self.regr.fit(X, y)
            return self

    def predict(self, X):
        # 1st Adaboost iteration: just return the current volatility
        if self.instance <= 2:
            return X[:, 6]  # 6th element of the feature vector is the current volatility
        # 2+ Adaboost iterations: delegate to the fitted decision tree
        else:
            return self.regr.predict(X)
def decision_tree_regressor(X, y, labels):

    regressor = DecisionTreeRegressor(max_depth=3)
    regressor.fit(X, y)

    estimates_z = regressor.predict(X)
    leaves = regressor.apply(X)

    leaves_hash = np.zeros(np.max(leaves) + 1)
    for i in range(len(y)):
        if (estimates_z[i] - y[i]) > 0.05 and estimates_z[i] > 0.6 and y[i] > 0:
            # print estimates_z[i]
            # print y[i]
            # print estimates_z[i]-y[i]
            # print ((estimates_z[i]-y[i])>0.1 and estimates_z[i]>0 and y[i]>0)
            # print leaves[i]
            leaves_hash[leaves[i]] += 1
            # print leaves_hash[leaves[i]]
        else:
            leaves_hash[-1] += 1

    # print(regressor.tree_.decision_path(X))
    print(regressor.tree_.feature)
    print(regressor.tree_.threshold)
    print(leaves_hash)
    print(regressor.feature_importances_)

    visualize_tree(regressor.tree_, labels)
    return estimates_z
def model_complexity(X_train, y_train, X_test, y_test):
    """ Calculates the performance of the model as model complexity increases.
        The learning and testing errors rates are then plotted. """
    
    print "Creating a model complexity graph. . . "

    # We will vary the max_depth of a decision tree model from 1 to 14
    max_depth = np.arange(1, 14)
    train_err = np.zeros(len(max_depth))
    test_err = np.zeros(len(max_depth))

    for i, d in enumerate(max_depth):
        # Setup a Decision Tree Regressor so that it learns a tree with depth d
        regressor = DecisionTreeRegressor(max_depth = d)

        # Fit the learner to the training data
        regressor.fit(X_train, y_train)

        # Find the performance on the training set
        train_err[i] = performance_metric(y_train, regressor.predict(X_train))

        # Find the performance on the testing set
        test_err[i] = performance_metric(y_test, regressor.predict(X_test))

    # Plot the model complexity graph
    pl.figure(figsize=(7, 5))
    pl.title('Decision Tree Regressor Complexity Performance')
    pl.plot(max_depth, test_err, lw=2, label = 'Testing Error')
    pl.plot(max_depth, train_err, lw=2, label = 'Training Error')
    pl.legend()
    pl.xlabel('Maximum Depth')
    pl.ylabel('Total Error')
    pl.show()
def get_imp(X,y):
    #rf = RandomForestClassifier()
    rf = DecisionTreeRegressor(random_state=9)
    rf.fit(X, y)
    imp_var = rf.feature_importances_
    imp_var = pd.DataFrame({'variable': X.columns, 'imp': imp_var}).sort_values('imp', ascending=False)  # DataFrame.sort was removed from pandas
    return(imp_var)
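
A hypothetical call; X being a pandas DataFrame of features and y the target column are assumptions:

# Hypothetical usage: rank features by importance.
imp_var = get_imp(X, y)
print(imp_var.head(10))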
Example #29
def fit_predict_model(city_data):
    """Find and tune the optimal model. Make a prediction on housing data."""

    # Get the features and labels from the Boston housing data
    X, y = city_data.data, city_data.target

    # Setup a Decision Tree Regressor
    regressor = DecisionTreeRegressor()

    parameters = {'max_depth':(1,2,3,4,5,6,7,8,9,10)}

    ###################################
    ### Step 4. YOUR CODE GOES HERE ###
    ###################################

    # 1. Find the best performance metric
    # should be the same as your performance_metric procedure
    # http://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html

    scorer = make_scorer(mean_squared_error, greater_is_better=False)

    # 2. Use gridearch to fine tune the Decision Tree Regressor and find the best model
    # http://scikit-learn.org/stable/modules/generated/sklearn.grid_search.GridSearchCV.html#sklearn.grid_search.GridSearchCV

    grid_search = GridSearchCV(regressor, parameters, scoring=scorer)
    grid_search.fit(X, y)

    tuned_params = grid_search.best_params_

    print "Tuned Parameters: "
    print tuned_params

    regressor.set_params(**tuned_params)

    # Fit the learner to the training data
    print "Final Model: "
    print regressor.fit(X, y)

    print "R^2 of prediction: "
    print regressor.score(X, y)
    
    # Use the model to predict the output of a particular sample
    x = [[11.95, 0.00, 18.100, 0, 0.6590, 5.6090, 90.00, 1.385, 24, 680.0, 20.20, 332.09, 12.13]]
    y = regressor.predict(x)
    print "House: " + str(x)
    print "Prediction: " + str(y)

    # Get the price of similar houses and calculate the mean, for comparison with our prediction
    nbrs = NearestNeighbors(n_neighbors=10, algorithm='ball_tree').fit(X)
    distances, indices = nbrs.kneighbors(x)

    sum_prices = []
    for i in indices:
        sum_prices.append(city_data.target[i])

    neighbor_avg = np.mean(sum_prices)

    print "Avg. Price of similar houses:"
    print neighbor_avg
Example #30
	def fit(self,X,y):
		if self.loss == 'deviance':
			classes_ = list(set(y))
			self.classes_ = classes_
			self.loss_function = MultinomialClass(classes_)

			#each class has a series of trees
			self.trees = [[] for k in classes_]

			#fx is a N*K matrix
			fx = [[0 for k in classes_] for i in y]

			#number of samples 
			n_samples = len(X)

			for m in range(self.n_estimators):
				print('epoch {0}'.format(m))
				sys.stdout.flush()

				#subsample_index = self._subsampling_index(n_samples)
				#sub_X = self._subsampling_A(X,subsample_index)

				rm = self.loss_function.negative_gradient(y,fx)

				for k in range(len(classes_)):
					rmk = map(lambda a:a[k],rm)
					#tree = RegressionTree(self.max_depth)
					tree = DecisionTreeRegressor(max_depth=self.max_depth)
					#sub_rm = self._subsampling_A(rmk,subsample_index)

					tree.fit(X,rmk)
					#tree.fit(sub_X,sub_rm)
					self.trees[k].append(tree)
					print('fit {0} trees done'.format(k))
					sys.stdout.flush()					

					gamma_mk = tree.predict(X)
					#gamma_mk = [tree.predict(x) for x in X]
					fxk = [fxi[k]+self.learning_rate*gamma_imk for fxi,gamma_imk in zip(fx,gamma_mk)]
					for fxi,next_fxik in zip(fx,fxk):
						fxi[k] = next_fxik


		else:
			fx = [0 for yi in y]

			for m in range(self.n_estimators):
				Loss = self.loss_function.loss(y,fx)
				print('epoch {0}, loss: {1}'.format(m, Loss))

				rm = self.loss_function.negative_gradient(y,fx)
				tree = RegressionTree(self.max_depth)
				sub_X,sub_rm = self._subsampling(X,rm)
				tree.fit(sub_X,sub_rm)
				self.trees.append(tree)

				gamma_m = [tree.predict(x) for x in X]
				fx = [fxi+self.learning_rate*gamma_im for fxi,gamma_im in zip(fx,gamma_m)]
Example #31
File: Titanic.py  Project: DeepshriSS/DSS
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)
acc_log = round(logreg.score(X_train, Y_train) * 100, 2)

#RANDOM FOREST
#fitting the random forest regression model to the dataset
from sklearn.ensemble import RandomForestRegressor

max_depths = np.linspace(1, 10, 10, endpoint=True)
for max_depth in max_depths:
    regressor = RandomForestRegressor(n_estimators=512,
                                      random_state=0,
                                      max_depth=max_depth)
    regressor.fit(X_train, Y_train)
#predicting new result
y_pred = regressor.predict(X_test)
regressor.score(X_train, Y_train)
acc_regressor = round(regressor.score(X_train, Y_train) * 100, 2)

#DECISION TREE:
# Fitting Decision Tree Regression to the dataset
from sklearn.tree import DecisionTreeRegressor

regressor1 = DecisionTreeRegressor(random_state=0)
regressor1.fit(X_train, Y_train)
# Predicting a new result
y_pred1 = regressor1.predict(X_test)
acc_regressor1 = round(regressor1.score(X_train, Y_train) * 100, 2)
Example #32
dataset_train = pd.read_csv('fixture.csv')
dataset = pd.read_csv('test_fixture_edited.csv')

X_train = dataset_train.iloc[:, 7:].values
y_home_train = dataset_train.iloc[:, 5].values
y_away_train = dataset_train.iloc[:, 6]

X_test = dataset.iloc[:, 7:].values
y_home_test = dataset.iloc[:, 5].values
y_away_test = dataset.iloc[:, 6]

#For Home

from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(X_train, y_home_train)

y_pred_home = regressor.predict(X_test)

X_grid = np.arange(min(X_test), max(X_test), 0.01)
X_grid = X_grid.reshape((len(X_grid), 1))
plt.scatter(X_test, y_pred_home, color='red')
plt.plot(X_grid, regressor.predict(X_grid), color='blue')
plt.title('Overall Power Difference or Home Score (Decision Tree Regression)')
plt.xlabel('Overall Power Difference')
plt.ylabel('Home Score')
plt.show()

#For Away

from sklearn.tree import DecisionTreeRegressor
Example #33
iowa_file_path='../input/home-data-for-ml-course/train.csv'
home_data=pd.read_csv(iowa_file_path)

#Create target object  and call it y
y=home_data.SalePrice
#Create X
features=['LotArea','YearBuilt','1stFlrSF','2ndFlrSF','FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']
X=home_data[features]

#Split into validation and training data
train_X,val_X,train_y,val_y=train_test_split(X,y,random_state=1)

#Specify Model
iowa_model=DecisionTreeRegressor(random_state=1)
#Fit model
iowa_model.fit(train_X,train_y)

#Make validation predictions and calculate mean absolute error
val_predictions=iowa_model.predict(val_X)
val_mae=mean_absolute_error(val_predictions, val_y)
print("Validation MAE: {:, .0f}".format(val_mae))

#Setup code checking
from learntools.core import binder
binder.bind(globals())
from learntools.machine_learning.ex5 import *
print("\nSetup complete")

def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
  model=DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
  model.fit(train_X, train_y)
Example #34
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
  model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
  model.fit(train_X, train_y)
  preds_val = model.predict(val_X)
  mae = mean_absolute_error(val_y, preds_val)
  return mae
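
get_mae is typically used to sweep candidate max_leaf_nodes values and keep the one with the lowest validation MAE; the candidate list below is an assumption:

# Hypothetical sweep over candidate tree sizes.
candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500]
scores = {n: get_mae(n, train_X, val_X, train_y, val_y)
          for n in candidate_max_leaf_nodes}
best_tree_size = min(scores, key=scores.get)
print("Best max_leaf_nodes:", best_tree_size)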
Example #35
        sumsum = sumsum + (y[i] - y_test[i]) * (y[i] - y_test[i])
    rank_result['ARD_pca'] = sumsum / float(result_row)
    rs_score['ARD_pca'] = r2_score(y_test, y)
    ARDModel = ARDRegression()
    ARDModel.fit(X_train_std, y_train)
    y = ARDModel.predict(X_test_std)
    [result_row] = y.shape
    sumsum = 0
    #print y
    for i in range(result_row):
        sumsum = sumsum + (y[i] - y_test[i]) * (y[i] - y_test[i])
    rank_result['ARD_std'] = sumsum / float(result_row)
    rs_score['ARD_std'] = r2_score(y_test, y)

    DTRModel = DecisionTreeRegressor(max_depth=2)
    DTRModel.fit(X_train_pca, y_train)
    y = DTRModel.predict(X_test_pca)
    [result_row] = y.shape
    sumsum = 0
    #print y
    for i in range(result_row):
        sumsum = sumsum + (y[i] - y_test[i]) * (y[i] - y_test[i])
    rank_result['DTR2_pca'] = sumsum / float(result_row)
    rs_score['DTR2_pca'] = r2_score(y_test, y)
    DTRModel = DecisionTreeRegressor(max_depth=2)
    DTRModel.fit(X_train_std, y_train)
    y = DTRModel.predict(X_test_std)
    [result_row] = y.shape
    sumsum = 0
    #print y
    for i in range(result_row):
Example #36
dv = ph.iloc[:, 6].values

from sklearn.preprocessing import LabelEncoder

le1 = LabelEncoder()
le2 = LabelEncoder()
iv[:, 1] = le1.fit_transform(iv[:, 1])
iv[:, 3] = le1.fit_transform(iv[:, 3])
iv[:, 4] = le1.fit_transform(iv[:, 4])
iv[:, 5] = le1.fit_transform(iv[:, 5])
dv = le2.fit_transform(dv)

from sklearn.tree import DecisionTreeRegressor

regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(iv, dv)

print "Accuracy of Prediction:", regressor.score(iv, dv)

from sklearn.ensemble import RandomForestRegressor

new_regressor = RandomForestRegressor(n_estimators=10, random_state=0)
new_regressor.fit(iv, dv)

import numpy as np

print(new_regressor.predict(np.array([10, 1, 4, 0, 1, 0]).reshape(1, -1)))
print(new_regressor.predict(np.array([10, 1, 4, 1, 0, 1]).reshape(1, -1)))
"""
test1 = np.array([[10, 'Y', 4, 'BS', 'Y', 'N']], dtype=object).reshape(1, -1)
test1[:,1] = le1.transform(test1[:,1])
Example #37
        print()
        means = clf.cv_results_['mean_test_score']
        stds = clf.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds, clf.cv_results_['params']):
            print(mean.round(3), std.round(3), params)
        print()
        print()
        best_models.append(
            (clf.best_score_.round(3), modelname, score, clf.best_params_))

predictionlist = list()
for modeldata in best_models:
    sme, model, error, params = modeldata
    if model == 'tree':
        treemodel = DecisionTreeRegressor(**params)
        treemodel.fit(x_train, y_train)
        tree_predict = treemodel.predict(x_test)
        predictionlist.append((model, tree_predict))
    if model == 'forest':
        forestmodel = RandomForestRegressor(**params)
        forestmodel.fit(x_train, y_train)
        forest_predict = forestmodel.predict(x_test)
        predictionlist.append((model, forest_predict))
    if model == 'xgb':
        xgbmodel = XGBRegressor(**params)
        xgbmodel.fit(x_train, y_train)
        xgb_predict = xgbmodel.predict(x_test)
        predictionlist.append((model, xgb_predict))

lin_reg = LinearRegression()
lin_reg.fit(x_train, y_train)
Example #38
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.datasets import load_boston
from sklearn.ensemble import AdaBoostRegressor
# load the data
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor

data=load_boston()
# split the data
train_x, test_x, train_y, test_y = train_test_split(data.data, data.target, test_size=0.25, random_state=33)
# use an AdaBoost regression model
regressor=AdaBoostRegressor()
regressor.fit(train_x,train_y)
pred_y = regressor.predict(test_x)
mse = mean_squared_error(test_y, pred_y)
print("房价预测结果 ", pred_y)
print("均方误差 = ",round(mse,2))

# use a decision tree regression model
dec_regressor=DecisionTreeRegressor()
dec_regressor.fit(train_x,train_y)
pred_y = dec_regressor.predict(test_x)
mse = mean_squared_error(test_y, pred_y)
print("决策树均方误差 = ",round(mse,2))
# 使用KNN回归模型
knn_regressor=KNeighborsRegressor()
knn_regressor.fit(train_x,train_y)
pred_y = knn_regressor.predict(test_x)
mse = mean_squared_error(test_y, pred_y)
print("KNN均方误差 = ",round(mse,2))
Example #39
def flash_fair_LSR(biased_col, n_obj):  # biased_col can be "sex" or "race", n_obj can be "ABCD" or "AB" or "CD"

    dataset_orig_train, dataset_orig_vt = train_test_split(dataset_orig, test_size=0.3)
    dataset_orig_valid, dataset_orig_test = train_test_split(dataset_orig_vt, test_size=0.5)

    X_train, y_train = dataset_orig_train.loc[:, dataset_orig_train.columns != 'Probability'], dataset_orig_train[
        'Probability']
    X_valid, y_valid = dataset_orig_valid.loc[:, dataset_orig_valid.columns != 'Probability'], dataset_orig_valid[
        'Probability']
    X_test, y_test = dataset_orig_test.loc[:, dataset_orig_test.columns != 'Probability'], dataset_orig_test[
        'Probability']

    def convert_lsr(index):  # 30 2 2 100
        a = int(index / 400 + 1)
        b = int(index % 400 / 200 + 1)
        c = int(index % 200 / 100 + 1)
        d = int(index % 100 + 10)
        return a, b, c, d

    all_case = set(range(0, 12000))
    modeling_pool = random.sample(sorted(all_case), 20)  # random.sample requires a sequence, not a set

    List_X = []
    List_Y = []

    for i in range(len(modeling_pool)):
        temp = convert_lsr(modeling_pool[i])
        List_X.append(temp)
        p1 = temp[0]
        if temp[1] == 1:
            p2 = 'l1'
        else:
            p2 = 'l2'
        if temp[2] == 1:
            p3 = 'liblinear'
        else:
            p3 = 'saga'
        p4 = temp[3]
        model = LogisticRegression(C=p1, penalty=p2, solver=p3, max_iter=p4)

        all_value = measure_scores(X_train, y_train, X_valid, y_valid, dataset_orig_valid, biased_col, model)
        four_goal = all_value[0] + all_value[1] + all_value[2] + all_value[3]
        two_goal_recall_far = all_value[0] + all_value[1]
        two_goal_aod_eod = all_value[2] + all_value[3]
        if n_obj == "ABCD":
            List_Y.append(four_goal)
        elif n_obj == "AB":
            List_Y.append(two_goal_recall_far)
        elif n_obj == "CD":
            List_Y.append(two_goal_aod_eod)
        else:
            print("Wrong number of objects")

    remain_pool = all_case - set(modeling_pool)
    test_list = []
    for i in list(remain_pool):
        test_list.append(convert_lsr(i))

    upper_model = DecisionTreeRegressor()
    life = 20

    while len(List_X) < 200 and life > 0:
        upper_model.fit(List_X, List_Y)
        candidate = random.sample(test_list, 1)
        test_list.remove(candidate[0])
        candi_pred_value = upper_model.predict(candidate)
        if candi_pred_value < np.median(List_Y):
            List_X.append(candidate[0])
            candi_config = candidate[0]

            pp1 = candi_config[0]
            if candi_config[1] == 1:
                pp2 = 'l1'
            else:
                pp2 = 'l2'
            if candi_config[2] == 1:
                pp3 = 'liblinear'
            else:
                pp3 = 'saga'
            pp4 = candi_config[3]

            candi_model = LogisticRegression(C=pp1, penalty=pp2, solver=pp3, max_iter=pp4)
            candi_value = measure_scores(X_train, y_train, X_valid, y_valid, dataset_orig_valid, biased_col,
                                         candi_model)
            candi_four_goal = candi_value[0] + candi_value[1] + candi_value[2] + candi_value[3]
            candi_two_goal_recall_far = candi_value[0] + candi_value[1]
            candi_two_goal_aod_eod = candi_value[2] + candi_value[3]
            if n_obj == "ABCD":
                List_Y.append(candi_four_goal)
            elif n_obj == "AB":
                List_Y.append(candi_two_goal_recall_far)
            elif n_obj == "CD":
                List_Y.append(candi_two_goal_aod_eod)
        else:
            life -= 1

    min_index = int(np.argmin(List_Y))

    return List_X[min_index]
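
A hypothetical call; dataset_orig and measure_scores being defined in the surrounding module are assumptions:

# Hypothetical usage: search for a fair logistic-regression configuration.
best_config = flash_fair_LSR("sex", "ABCD")
print("Chosen (C, penalty, solver, max_iter) encoding:", best_config)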
Example #40
class Model_Finder:
    """
                This class shall  be used to find the model with best accuracy and AUC score.
                Version: 1.0
                Revisions: None
    """
    def __init__(self, file_object, logger_object):
        self.file_object = file_object
        self.logger_object = logger_object
        self.clf = RandomForestClassifier()
        self.DecisionTreeReg = DecisionTreeRegressor()
        self.score = PerformanceEvaluation.performance(self.file_object,
                                                       self.logger_object)

    def get_best_params_for_random_forest(self, train_x, train_y):
        """
                                Method Name: get_best_params_for_random_forest
                                Description: get the parameters for Random Forest Algorithm which give the best accuracy.
                                             Use Hyper Parameter Tuning.
                                Output: The model with the best parameters
                                On Failure: Raise Exception

                                Version: 1.0
                                Revisions: None

                        """
        self.logger_object.log(
            self.file_object,
            'Entered the get_best_params_for_random_forest method of the Model_Finder class'
        )
        try:
            # initializing with different combination of parameters
            self.param_grid = {
                "n_estimators": [10, 50, 100, 130],
                "criterion": ['gini', 'entropy'],
                "max_depth": range(2, 4, 1),
                "max_features": ['auto', 'log2']
            }

            #Creating an object of the Grid Search class
            self.grid = GridSearchCV(estimator=self.clf,
                                     param_grid=self.param_grid,
                                     cv=5,
                                     verbose=3)

            #finding the best parameters
            self.grid.fit(train_x, train_y)

            #extracting the best parameters
            self.criterion = self.grid.best_params_['criterion']
            self.max_depth = self.grid.best_params_['max_depth']
            self.max_features = self.grid.best_params_['max_features']
            self.n_estimators = self.grid.best_params_['n_estimators']

            #creating a new model with the best parameters
            self.clf = RandomForestClassifier(n_estimators=self.n_estimators,
                                              criterion=self.criterion,
                                              max_depth=self.max_depth,
                                              max_features=self.max_features)
            # training the new model
            self.clf.fit(train_x, train_y)
            self.logger_object.log(
                self.file_object,
                'Random Forest best params: ' + str(self.grid.best_params_) +
                '. Exited the get_best_params_for_random_forest method of the Model_Finder class'
            )

            return self.clf
        except Exception as e:
            self.logger_object.log(
                self.file_object,
                'Exception occurred in get_best_params_for_random_forest method of the Model_Finder class. Exception message:  '
                + str(e))
            self.logger_object.log(
                self.file_object,
                'Random Forest Parameter tuning  failed. Exited the get_best_params_for_random_forest method of the Model_Finder class'
            )
            raise Exception()

    def get_best_params_for_DecisionTreeRegressor(self, train_x, train_y):
        """
                                                Method Name: get_best_params_for_DecisionTreeRegressor
                                                Description: get the parameters for DecisionTreeRegressor Algorithm which give the best accuracy.
                                                             Use Hyper Parameter Tuning.
                                                Output: The model with the best parameters
                                                On Failure: Raise Exception

                                                Version: 1.0
                                                Revisions: None

                                        """
        self.logger_object.log(
            self.file_object,
            'Entered the get_best_params_for_DecisionTreeRegressor method of the Model_Finder class'
        )
        try:
            # initializing with different combination of parameters
            self.param_grid_decisionTree = {
                "criterion": ["mse", "friedman_mse", "mae"],
                "splitter": ["best", "random"],
                "max_features": ["auto", "sqrt", "log2"],
                'max_depth': range(2, 16, 2),
                'min_samples_split': range(2, 16, 2)
            }

            # Creating an object of the Grid Search class
            self.grid = GridSearchCV(self.DecisionTreeReg,
                                     self.param_grid_decisionTree,
                                     verbose=3,
                                     cv=5)
            # finding the best parameters
            self.grid.fit(train_x, train_y)

            # extracting the best parameters
            self.criterion = self.grid.best_params_['criterion']
            self.splitter = self.grid.best_params_['splitter']
            self.max_features = self.grid.best_params_['max_features']
            self.max_depth = self.grid.best_params_['max_depth']
            self.min_samples_split = self.grid.best_params_[
                'min_samples_split']

            # creating a new model with the best parameters
            self.decisionTreeReg = DecisionTreeRegressor(
                criterion=self.criterion,
                splitter=self.splitter,
                max_features=self.max_features,
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split)
            # training the new model
            self.decisionTreeReg.fit(train_x, train_y)
            self.logger_object.log(
                self.file_object, 'Decision-Tree Regressor best params: ' +
                str(self.grid.best_params_) +
                '. Exited the get_best_params_for_DecisionTreeRegressor method of the Model_Finder class')
            return self.decisionTreeReg
        except Exception as e:
            self.logger_object.log(
                self.file_object,
                'Exception occurred in get_best_params_for_DecisionTreeRegressor method of the Model_Finder class. Exception message:  '
                + str(e))
            self.logger_object.log(
                self.file_object,
                'Grid search parameter tuning failed. Exited the get_best_params_for_DecisionTreeRegressor method of the Model_Finder class'
            )
            raise Exception()

    def get_best_params_for_xgboost(self, train_x, train_y):
        """
                                        Method Name: get_best_params_for_xgboost
                                        Description: get the parameters for XGBoost Algorithm which give the best accuracy.
                                                     Use Hyper Parameter Tuning.
                                        Output: The model with the best parameters
                                        On Failure: Raise Exception

                                        Version: 1.0
                                        Revisions: None

                                """
        self.logger_object.log(
            self.file_object,
            'Entered the get_best_params_for_xgboost method of the Model_Finder class'
        )
        try:
            # initializing with different combination of parameters
            self.param_grid_xgboost = {
                'learning_rate': [0.5, 0.1, 0.01, 0.001],
                'max_depth': [3, 5, 10, 20],
                'n_estimators': [10, 50, 100, 200]
            }
            # Creating an object of the Grid Search class
            self.grid = GridSearchCV(XGBRegressor(objective='reg:linear'),
                                     self.param_grid_xgboost,
                                     verbose=3,
                                     cv=5)
            # finding the best parameters
            self.grid.fit(train_x, train_y)

            # extracting the best parameters
            self.learning_rate = self.grid.best_params_['learning_rate']
            self.max_depth = self.grid.best_params_['max_depth']
            self.n_estimators = self.grid.best_params_['n_estimators']

            # creating a new model with the best parameters
            self.xgb = XGBRegressor(objective='reg:linear',
                                    learning_rate=self.learning_rate,
                                    max_depth=self.max_depth,
                                    n_estimators=self.n_estimators)
            # training the new model
            self.xgb.fit(train_x, train_y)
            self.logger_object.log(
                self.file_object,
                'XGBoost best params: ' + str(self.grid.best_params_) +
                '. Exited the get_best_params_for_xgboost method of the Model_Finder class'
            )
            return self.xgb
        except Exception as e:
            self.logger_object.log(
                self.file_object,
                'Exception occurred in get_best_params_for_xgboost method of the Model_Finder class. Exception message:  '
                + str(e))
            self.logger_object.log(
                self.file_object,
                'XGBoost Parameter tuning  failed. Exited the get_best_params_for_xgboost method of the Model_Finder class'
            )
            raise Exception()

    def xgb_classifier(self, train_x, train_y):
        """
                                        Method Name: get_best_params_for_xgboost
                                        Description: get the parameters for XGBoost Algorithm which give the best accuracy.

                                        Output: The model with the best parameters
                                        On Failure: Raise Exception

                                        Version: 1.0
                                        Revisions: None

                                """
        self.logger_object.log(
            self.file_object,
            'Entered the xgb_classifier method of the Model_Finder class')
        try:
            # creating an XGBoost classifier with default parameters
            self.xgbc = XGBClassifier(objective='binary:logistic')
            # training the new model
            self.xgbc.fit(train_x, train_y)
            self.logger_object.log(
                self.file_object,
                'XGBoost model train method done of the Model_Finder class')
            return self.xgbc
        except Exception as e:
            self.logger_object.log(
                self.file_object,
                'Exception occurred in xgb_classifier method of the Model_Finder class. Exception message: '
                + str(e))
            self.logger_object.log(
                self.file_object,
                'XGBoost model training failed. Exited the xgb_classifier method of the Model_Finder class'
            )
            raise Exception()

    def lgb_classifier(self, train_x, train_y):
        """
                                        Method Name: get_best_params_for_xgboost
                                        Description: get the parameters for XGBoost Algorithm which give the best accuracy.

                                        Output: The model with the best parameters
                                        On Failure: Raise Exception

                                        Version: 1.0
                                        Revisions: None

                                """
        self.logger_object.log(
            self.file_object,
            'Entered the lgb_classifier method of the Model_Finder class')
        try:
            # creating a LightGBM classifier with default parameters
            self.lgbc = lgboost.LGBMClassifier()
            # training the new model
            self.lgbc.fit(train_x, train_y)
            self.logger_object.log(
                self.file_object,
                'LGBoost model train method done of the Model_Finder class')
            return self.lgbc
        except Exception as e:
            self.logger_object.log(
                self.file_object,
                'Exception occurred in lgb_classifier method of the Model_Finder class. Exception message: '
                + str(e))
            self.logger_object.log(
                self.file_object,
                'LightGBM model training failed. Exited the lgb_classifier method of the Model_Finder class'
            )
            raise Exception()

    def catb_classifier(self, train_x, train_y):
        """
                                        Method Name: get_best_params_for_xgboost
                                        Description: get the parameters for XGBoost Algorithm which give the best accuracy.

                                        Output: The model with the best parameters
                                        On Failure: Raise Exception

                                        Version: 1.0
                                        Revisions: None
                                """
        self.logger_object.log(
            self.file_object,
            'Entered the catb_classifier method of the Model_Finder class')
        try:
            # creating a CatBoost classifier with preset parameters
            self.cbc = cboost.CatBoostClassifier(iterations=2000,
                                                 learning_rate=0.1,
                                                 depth=8,
                                                 eval_metric='Accuracy',
                                                 random_seed=0,
                                                 bagging_temperature=0.2,
                                                 od_type='Iter',
                                                 metric_period=75,
                                                 od_wait=100)
            # training the new model
            self.cbc.fit(train_x, train_y)
            self.logger_object.log(
                self.file_object,
                'CatBoost model train method done of the Model_Finder class')
            return self.cbc
        except Exception as e:
            self.logger_object.log(
                self.file_object,
                'Exception occurred in catb_classifier method of the Model_Finder class. Exception message: '
                + str(e))
            self.logger_object.log(
                self.file_object,
                'CatBoost model training failed. Exited the catb_classifier method of the Model_Finder class'
            )
            raise Exception()

    def get_best_model(self, train_x, train_y, test_x, test_y, cls):
        """
                                                Method Name: get_best_model
                                                Description: Find out the Model which has the best AUC score.
                                                Output: The best model name and the model object
                                                On Failure: Raise Exception

                                                Version: 1.0
                                                Revisions: None

                                        """
        self.logger_object.log(
            self.file_object,
            'Entered the get_best_model method of the Model_Finder class')
        # train the three boosting classifiers and compare their AUC scores
        try:

            self.xgb_class = self.xgb_classifier(train_x, train_y)
            self.prediction_xgb_class = self.xgb_class.predict(
                test_x)  # Predictions using the XGB Model
            self.prediction_xgb_auc = roc_auc_score(test_y,
                                                    self.prediction_xgb_class)
            self.score.all_score(test_y,
                                 self.prediction_xgb_class,
                                 cls,
                                 title="XGB Testing Score")
            self.prediction_xgb_class_train = self.xgb_class.predict(train_x)
            self.score.all_score(train_y,
                                 self.prediction_xgb_class_train,
                                 cls,
                                 title="XGB Training Score")

            self.lgb_class = self.lgb_classifier(train_x, train_y)
            self.prediction_lgb_class = self.lgb_class.predict(
                test_x)  # Predictions using the LGB Model
            self.prediction_lgb_auc = roc_auc_score(test_y,
                                                    self.prediction_lgb_class)
            self.score.all_score(test_y,
                                 self.prediction_lgb_class,
                                 cls,
                                 title="LGB Testing Score")
            self.prediction_lgb_class_train = self.lgb_class.predict(train_x)
            self.score.all_score(train_y,
                                 self.prediction_lgb_class_train,
                                 cls,
                                 title="LGB Training Score")

            self.cb_class = self.catb_classifier(train_x, train_y)
            self.prediction_cb_class = self.cb_class.predict(
                test_x)  # Predictions using the CatBoost Model
            self.prediction_cb_auc = roc_auc_score(test_y,
                                                   self.prediction_cb_class)
            self.score.all_score(test_y,
                                 self.prediction_cb_class,
                                 cls,
                                 title="Catboost Testing Score")
            self.prediction_cb_class_train = self.cb_class.predict(train_x)
            self.score.all_score(train_y,
                                 self.prediction_cb_class_train,
                                 cls,
                                 title="Catboost Training Score")

            # comparing the three models
            self.lst = [
                self.prediction_xgb_auc, self.prediction_lgb_auc,
                self.prediction_cb_auc
            ]
            self.best_nm = np.argmax(self.lst)
            if self.best_nm == 0:
                return 'XGBoost', self.xgb_class
            elif self.best_nm == 1:
                return 'LGBoost', self.lgb_class
            else:
                return 'CatBoost', self.cb_class

        except Exception as e:
            self.logger_object.log(
                self.file_object,
                'Exception occurred in get_best_model method of the Model_Finder class. Exception message: '
                + str(e))
            self.logger_object.log(
                self.file_object,
                'Model selection failed. Exited the get_best_model method of the Model_Finder class'
            )
            raise Exception()
Example #41
    X_test, y_test = X[test_indices, :], y[test_indices]

    # Unpack hyperparameters, resample training data, and fit regressors
    reg = DecisionTreeRegressor(random_state=rrr) if 'REBAGG' in strategy else \
              RandomForestRegressor(n_estimators=10, n_jobs=-1, random_state=rrr)

    if strategy == 'RO':
        cl, ch, sample_method = param
        relevance = resreg.sigmoid_relevance(y_train, cl=cl, ch=ch)
        X_train, y_train = resreg.random_oversample(X_train,
                                                    y_train,
                                                    relevance,
                                                    relevance_threshold=0.5,
                                                    over=sample_method,
                                                    random_state=rrr)
        reg.fit(X_train, y_train)

    elif strategy == 'SMOTER':
        cl, ch, sample_method, k = param
        relevance = resreg.sigmoid_relevance(y_train, cl=cl, ch=ch)
        X_train, y_train = resreg.smoter(X_train,
                                         y_train,
                                         relevance,
                                         relevance_threshold=0.5,
                                         k=k,
                                         over=sample_method,
                                         random_state=rrr)
        reg.fit(X_train, y_train)

    elif strategy == 'GN':
        cl, ch, sample_method, delta = param
Example #42
    'Price Distribution Plot of Handphones Whose Screen Material is TFT or IPS '
)
plt.show()

print(
    data.dropna(subset=['ROM', 'RAM', 'brand', 'price']).shape[0] /
    data.shape[0])

print(data.isnull().sum().sort_values(ascending=False))  # missing-value counts for every column

df = data.loc[:, ['price', 'rear camera', 'brand', 'weight']].dropna()
to_model = pd.get_dummies(df)
x = to_model.iloc[:, 1:].values
y = to_model.iloc[:, 0].values
model = DecisionTreeRegressor()
model.fit(x, y)

error_list = []
for each in df['brand'].value_counts().index:
    to_fill = 'brand_{}'.format(each)
    x_data = to_model[to_model[to_fill] == 1].iloc[:, 1:].values
    y_data = to_model[to_model[to_fill] == 1].iloc[:, 0].values

    # note: the model was fit on all rows, so these are in-sample predictions
    test_result = model.predict(x_data)
    merror = mae(y_data, test_result)
    error = (np.abs(test_result - y_data) / y_data).mean()
    print(each, end=' : ')
    print(np.round(merror, 2), end=', ')
    print(str(np.round(error * 100, 3)) + '%')
    error_list.append([each, merror, error])
Example #43
map_cat = {}
for x in list_cat:
    labels = X_cat[x].astype('category').cat.categories.tolist()
    replace_map_comp = {
        x: {k: v
            for k, v in zip(labels, list(range(1,
                                               len(labels) + 1)))}
    }
    X_cat.replace(replace_map_comp, inplace=True)
    map_cat[x] = replace_map_comp

# Replacing the missing values
X_cat.fillna(X_cat.mean(), inplace=True)

regressor = DecisionTreeRegressor(min_samples_leaf=10, max_depth=5)
regression = regressor.fit(X_cat, y)

y_1 = regressor.predict(X_cat)

regression.get_depth()
regression.get_n_leaves()
regression.get_params()
#regression.decision_path(X_cat).todense()

import numpy as np

RMSE = np.sqrt(np.mean((y_1 - y)**2))
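
# Cross-check (a sketch, not from the original source): sklearn's
# mean_squared_error should agree with the manual RMSE computation above.
from sklearn.metrics import mean_squared_error
assert np.isclose(RMSE, np.sqrt(mean_squared_error(y, y_1)))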

regression.score(X_cat, y)

from io import StringIO  # sklearn.externals.six was removed from modern scikit-learn
mat = dataset.readlines()
time = []
for row in mat:
    pres = row.split()
    pres = [float(ro) for ro in pres]
    time.append(pres)

time = np.array(time)

X = time[:, :-1]
y = time[:, -1]

# Splitting the dataset into the Training set and Test set
"""from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)"""

# Feature Scaling
"""
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X= sc_X.fit_transform(X)
sc_y = StandardScaler()
y = sc_y.fit_transform(y)"""

# Fitting the Regression Model to the dataset
# Create your regressor here
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor()  # substitute tuned hyperparameters here
regressor.fit(X, y)

pickle.dump(regressor, open(hoemdir + "/attributes.p", "wb"), protocol=2)
Example #45
File: Models.py  Project: cmougan/knee
# In[35]:


print("Model coefficients:\n")
for i in range(X_tr.shape[1]):
    print(X_tr.columns[i], "=", clf.coef_[i].round(4))


# ### Decision tree

# In[36]:


dt = DecisionTreeRegressor(max_depth=3, criterion="mae")
dt.fit(X_tr, y_tr)


# In[37]:


print("Decision Tree Results")


# In[38]:


print("Train ", mean_absolute_error(dt.predict(X_tr), y_tr))


# In[39]:
print("单个决策树的分类效果:{}".format(accuracy_score(y_true, one_tree_pre)))

# GBDT分类树的构建过程
models = []
algo = DecisionTreeClassifier(max_depth=1)  # 基模型为分类决策树
# 模型迭代次数n
n = 2

for i in range(n):
    k_model = []
    for k in range(2):
        # 定义下一次迭代所用的决策树
        model = DecisionTreeRegressor()

        model.fit(x, y_label[k])

        # 预测结果并将其进行指数转换为概率的形式
        y_pre = model.predict(x)
        dy = np.exp(y_pre) / np.sum(np.exp(y_label), axis=0)
        # 更新对应类别的y标签为概率的残差值d
        y_label[k] = y_label[k] - dy

        # 异常将每一个类别构建好的决策树添加到列表中
        k_model.append(model)

    # 将每一次迭代的k个模型列表添加到最终的融合模型中
    models.append(k_model)
print("模型构建完成!")
print("开始预测:")
Example #47
outF = open("output_MD.txt", "w")
print('best_criterion = ', best_criterion, file=outF)
print('best_splitter = ', best_splitter, file=outF)
print('best_max_features = ', best_max_features, file=outF)
outF.close()

regr = DecisionTreeRegressor(criterion='mse',
                             splitter='best',
                             max_features='auto',
                             random_state=69)

regr = MultiOutputRegressor(estimator=regr, n_jobs=n_jobs)

t0 = time.time()
regr.fit(x_train, y_train)
regr_fit = time.time() - t0
print("Complexity and bandwidth selected and model fitted in %.6f s" % regr_fit)

t0 = time.time()
y_regr = regr.predict(x_test)
regr_predict = time.time() - t0
print("Prediction for %d inputs in %.6f s" % (x_test.shape[0], regr_predict))

x_test_dim = sc_x.inverse_transform(x_test)
y_test_dim = sc_y.inverse_transform(y_test)
y_regr_dim = sc_y.inverse_transform(y_regr)

plt.scatter(x_test_dim[:,0], y_test_dim[:,0], s=5, c='k', marker='o', label='KAPPA')
plt.scatter(x_test_dim[:,0], y_regr_dim[:,0], s=5, c='r', marker='d', label='Decision Tree')
#plt.scatter(x_test_dim[:,0], y_test_dim[:,1], s=5, c='k', marker='o', label='KAPPA')
                    ['Open-World', 16500, 30000], ['MMOFPS', 25000, 46000],
                    ['MMORPG', 30000, 80000]])

# select all rows by : and column 1
# by 1:2 representing features
X = dataset[:, 1:2].astype(int)  # convert to integer
#print(X)

# select all rows by : and column 2
# by 2 to Y representing labels
y = dataset[:, 2].astype(int)
#print(y)

reg = DecisionTreeRegressor(random_state=0)
#print(reg)
reg.fit(X, y)

pred_case = reg.predict([[3750]])
# print the predicted price
print("Predicted price: %d\n" % pred_case[0])
#Visualize results
X_grid = np.arange(min(X), max(X), 0.01)
X_grid = X_grid.reshape((len(X_grid), 1))
plt.scatter(X, y, color='red')
#plot predicted data
plt.plot(X_grid, reg.predict(X_grid), color='blue')

# specify title
plt.title('Profit to Production Cost (Decision Tree Regression)')
# specify X axis label
plt.xlabel('Production Cost')
#splitting train and test variables into X and Y variables 
X_train=train_data.iloc[:,0:11]
X_train

Y_train=train_data["Item_Outlet_Sales"]
Y_train

X_test=test_data.iloc[:,0:11]
X_test

# Implementing the decision tree regression model (regression trees split on squared error, not the gini index)
from sklearn.tree import DecisionTreeRegressor
dt = DecisionTreeRegressor()

dt.fit(X_train, Y_train)

dt.tree_.node_count
dt.tree_.max_depth

Y_pred_train = dt.predict(X_train)
Y_pred_test = dt.predict(X_test)

print(f"Decision tree has {dt.tree_.node_count} nodes with maximum depth covered up to {dt.tree_.max_depth}")

# Further tuning is required to decide about max depth value ie, by pruning
# apply grid search cv method and pass levels with cv = 10 and look
# out for the best depth at this place
from sklearn.model_selection import GridSearchCV
levels = {'max_depth': list(range(1, 38))}
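
# A sketch completing the tuning described above: 10-fold grid search over
# max_depth, using the training split from earlier.
grid = GridSearchCV(DecisionTreeRegressor(), levels, cv=10,
                    scoring='neg_mean_squared_error')
grid.fit(X_train, Y_train)
print("Best depth found:", grid.best_params_['max_depth'])
dt_best = grid.best_estimator_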
Example #50
data = pd.concat([data, label], axis=1)
data.drop('label', axis=1, inplace=True)
features = data.iloc[:, 0:4].values  # predictor columns
targets = data.iloc[:, 4:].values    # label columns

X_train, X_test, y_train, y_test = train_test_split(features, targets, test_size=0.3)

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

from sklearn.tree import DecisionTreeRegressor
clf=DecisionTreeRegressor()
clf.fit(X_train,y_train)
pred=clf.predict(X_test)


engine.say("so lets get started, from the Node m c u kit we got the following data")
engine.say("the air humidity is 40 around you")
engine.runAndWait()
ah = 40    #AIR HUMIDITY
engine.say("the temperature around you is eighty-one degrees fahrenheit")
engine.runAndWait()
atemp = 81 #AIR TEMPERATURE
engine.say("The rainsfall in your area accordig to last year data is 119 milimeteres")
engine.runAndWait()
rain = 119  #RAINFALL
engine.say(" The pH of your soil is five")
engine.runAndWait()
Example #51
labels = dataset.iloc[:, 0].values

from sklearn.model_selection import train_test_split
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.2, random_state=0)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
features_train = scaler.fit_transform(features_train)
features_test = scaler.transform(features_test)

from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(features_train, labels_train)

pred1 = regressor.predict(features_test)
score1 = regressor.score(features_test, labels_test)

ran_for_regressor = RandomForestRegressor(n_estimators=10, random_state=0)
ran_for_regressor.fit(features_train, labels_train)

pred2 = ran_for_regressor.predict(features_test)
score2 = ran_for_regressor.score(features_test, labels_test)

pred3 = ran_for_regressor.predict(
    scaler.transform(
        np.array([6, 215, 100, 2630, 22.2, 80, 3]).reshape(1, -1)))
"""
import statsmodels.formula.api as sm
Example #52
# to predict the housing price
y = housing_data.price

#the predictors
housing_predictors = [
    'id', 'bathrooms', 'floors', 'bedrooms', 'yr_built', 'yr_renovated',
    'sqft_lot'
]
x = housing_data[housing_predictors]

#setting the model
housing_model = DecisionTreeRegressor()

# fit the model to the data, i.e., x and y
housing_model.fit(x, y)

# In[ ]:

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.tree import DecisionTreeRegressor

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
#print(check_output(["ls", "../input"]).decode("utf8"))

# Any results you write to the current directory are saved as output.
housing_file_path = '../input/kc_house_data.csv'
from sklearn.tree import DecisionTreeRegressor

# The decision tree is used to predict simultaneously the noisy x and y observations of a circle
# given a single underlying feature; as a result, it learns local linear regressions approximating the circle.

# Create a random dataset:
rng = np.random.RandomState(1)
X = np.sort(200 * rng.rand(100, 1) - 100, axis=0)
Y = np.array([np.pi * np.sin(X).ravel(), np.pi * np.cos(X).ravel()]).T
Y[::5, :] += (0.5 - rng.rand(20, 2))

# Fit regression model:
regr_1 = DecisionTreeRegressor(max_depth=2)
regr_2 = DecisionTreeRegressor(max_depth=5)
regr_3 = DecisionTreeRegressor(max_depth=8)
regr_1.fit(X, Y)
regr_2.fit(X, Y)
regr_3.fit(X, Y)

# Predict:
x_test = np.arange(-100.0, 100.0, 0.01)[:, np.newaxis]
y_1 = regr_1.predict(x_test)
y_2 = regr_2.predict(x_test)
y_3 = regr_3.predict(x_test)

# Plot the results
plt.figure()
s = 50
plt.scatter(Y[:, 0], Y[:, 1], c="navy", s=s, label="Data")
plt.scatter(y_1[:, 0], y_1[:, 1], c="cornflowerblue", s=s, label="max_depth=2")
plt.scatter(y_2[:, 0], y_2[:, 1], c="c", s=s, label="max_depth=5")
class Admission_Predictor:
    def __init__(self):
        path = '/Users/pragya/PycharmProjects/AI LAB/venv/Admission_Prediction/Data/'
        os.chdir(path)
        self.data = pd.read_csv("Admission_Predict.csv")
        describe = self.data.describe().transpose()
        print(describe)

        ### Check missing values (fraction of non-null entries per column)
        print((self.data.notnull().sum() / len(self.data)).sort_values(ascending=False))

    def plot_data(self, test_y):
        cols = self.data.columns
        # print(cols)
        features = cols[1:-1]
        target = cols[-1]
        # print("Features: ", features)

        #Plots
        plt.figure(figsize=(20, 20))
        for i in range(len(features)):
            plt.subplot(3, 3, i + 1)
            # print(features[i])
            plt.scatter(self.data[features[i]], self.data['Chance of Admit '])
            plt.title(features[i])

        plt.savefig('features.pdf')
        plt.show()

        # Median student chances
        features2 = cols[[3, 4, 5, 7]]
        print('features2',features2)
        means = self.data['Chance of Admit '].mean()
        median = self.data['Chance of Admit '].median()
        print("Mean student chances", means)
        print("Median student chances",median)

        # Considering the features with the strongest correlations
        main_features = ['CGPA', 'GRE Score', 'TOEFL Score']
        for i in range(len(main_features)):
            print(main_features[i].upper())
            print(linregress(self.data[main_features[i]], self.data['Chance of Admit ']))

            plt.figure(figsize=(20, 6))
            plt.subplot(1, 2, 1)
            sns.distplot(self.data[main_features[i]], kde = False)
            plt.title('Distributed ' + main_features[i] + ' of Applicants')

            plt.subplot(1, 2, 2)
            sns.regplot(self.data[main_features[i]], self.data['Chance of Admit '])
            plt.title(main_features[i] + ' vs Chance of Admit')
            plt.savefig(main_features[i] +'.pdf')

        # Bar Plots
        df = self.data
        plt.figure(figsize=(20, 10))
        for j in range(len(features2)):
            plt.subplot(2, 2, j + 1)
            values = df[features2[j]].unique()
            ser = pd.Series(range(len(values)), index=values, dtype='float64')
            for i in range(len(values)):
                ser[values[i]] = df[df[features2[j]] == values[i]]['Chance of Admit '].mean()
            ser = ser.sort_index()

            plt.bar(ser.index, ser.values, width=0.3)
            plt.title(features2[j])
            plt.plot([0, len(values)], [median, median], 'k-', lw=1, dashes=[2, 2])

        plt.savefig('featuresVsMedian.pdf')
        plt.show()

        # Algo comparision plots
        Methods = ['Decision Tree Regression', 'Linear Regression']
        Scores = np.array([self.score1, self.score2])

        fig, ax = plt.subplots(figsize=(8, 6))
        sns.barplot(Methods, Scores)
        plt.title('Algorithm Prediction Accuracies')
        plt.ylabel('Accuracy')
        plt.savefig("Algorithm_Prediction_Accuracies.pdf")
        plt.show()

        #Residual Plot
        plt.scatter(self.predicted_2, self.predicted_2 - test_y, c='g')
        plt.hlines(y = 0, xmin=0.4, xmax=1)
        plt.title('Residual plot')
        plt.ylabel('Residual')
        plt.savefig("Residual_LR.pdf")
        plt.show()


    def model_decision(self):

        # data Pre - processing
        cols = self.data.columns
        features = cols[1:-1]
        target = cols[-1]

        X = self.data[features]
        y = self.data['Chance of Admit ']

        # train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.25)
        train_X = X[:int(0.75*len(X))]
        test_X = X[int(0.75*len(X)):]
        train_y = y[:int(0.75*len(X))]
        test_y = y[int(0.75*len(X)):]


        # Decision tree Regression
        self.model = DecisionTreeRegressor(max_leaf_nodes=10)
        self.model.fit(train_X, train_y)


        #Linear Model
        self.model2 = linear_model.LinearRegression()
        self.model2.fit(train_X, train_y)


        print(self.model)
        print(self.model2)
        # print(self.model3)

        # Visualization
        with open("classifier.dot", "w") as f:
            f = tree.export_graphviz(self.model, feature_names=features, class_names=target, out_file=f)


        # Test data Prediction
        self.predicted = self.model.predict(test_X)
        self.predicted_full = self.model.predict(X)
        self.score1 = self.model.score(test_X,test_y)
        print("Score1: ", self.score1)

        self.predicted_2 = self.model2.predict(test_X)
        self.predicted_full_2 = self.model2.predict(X)
        self.score2 = self.model2.score(test_X, test_y)
        print("Score2: ", self.score2)
        print("Coefficients-----------", self.model2.coef_)
        print("Intercept-----------", self.model2.intercept_)


        return train_X, test_X, train_y, test_y

    # sample prediction
    def predict(self,df):
        train_X, test_X, train_y, test_y = self.model_decision()
        pred = self.model.predict(df)
        return pred
        # predicted = model.predict(train_X)

    # Actual - predicted for test/train data
    def error_calc(self, test):
        mae_DR = mean_absolute_error(test, self.predicted)
        mse_DR = mean_squared_error(test, self.predicted)
        r2_DR  = r2_score(test, self.predicted)

        mae_LR = mean_absolute_error(test, self.predicted_2)
        mse_LR = mean_squared_error(test, self.predicted_2)
        r2_LR = r2_score(test, self.predicted_2)


        print("Errors - Linear Regression: \n Mean Absolute Error: {} \n  Mean Squared Error: {} \n R2 Score: {}".format(mae_LR, mse_LR, r2_LR))
        print("Errors - Decision Tree Regression: \n Mean Absolute Error: {} \n  Mean Squared Error: {} \n R2 Score: {}".format(mae_DR, mse_DR, r2_DR))
        

    def output_results(self):
        df1 = pd.DataFrame()
        df1['predictions_DR'] = self.predicted_full
        df1['predictions_LR'] = self.predicted_full_2
        final_df = pd.merge(left=self.data, right=df1, left_index=True, right_index=True)
        final_df['True_Decision'] = final_df['Chance of Admit '].apply(lambda x: "Yes" if x > 0.80 else "No")
        final_df['Decision_DR'] = final_df['predictions_DR'].apply(lambda x: "Yes" if x > 0.80 else "No")
        final_df['Decision_LR'] = final_df['predictions_LR'].apply(lambda x: "Yes" if x > 0.80 else "No")
        print(final_df[['GRE Score','TOEFL Score','University Rating','SOP','LOR ','CGPA','Research','Chance of Admit ','predictions_LR']].tail(5))
        print(final_df[['GRE Score', 'TOEFL Score', 'University Rating', 'SOP', 'LOR ', 'CGPA', 'Research',
                        'Chance of Admit ', 'predictions_DR']].tail(5))
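
# A minimal driver for the class above (a sketch; assumes the CSV path
# hard-coded in __init__ exists on disk).
if __name__ == "__main__":
    predictor = Admission_Predictor()
    train_X, test_X, train_y, test_y = predictor.model_decision()
    predictor.error_calc(test_y)
    predictor.output_results()
    predictor.plot_data(test_y)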
# Multi-output decision tree regression (y has multiple columns)
if __name__ == "__main__":
    N = 300
    x = np.random.rand(N) * 8 - 4  # [-4,4)
    x.sort()
    # y1 = np.sin(x) + 3 + np.random.randn(N) * 0.1
    # y2 = np.cos(0.3 * x) + np.random.randn(N) * 0.01
    y1 = np.sin(x) + np.random.randn(N) * 0.05
    y2 = np.cos(x) + np.random.randn(N) * 0.1
    # y1 = 16 * np.sin(x) ** 3 + np.random.randn(N)
    # y2 = 13 * np.cos(x) - 5 * np.cos(2 * x) - 2 * np.cos(3 * x) - np.cos(4 * x) + np.random.randn(N) * 0.1
    y = np.vstack((y1, y2)).T  # transpose so that rows are samples
    x = x.reshape(-1, 1)  # reshape into N samples, each with a single feature

    deep = 3
    reg = DecisionTreeRegressor(criterion='mse', max_depth=deep)
    dt = reg.fit(x, y)

    x_test = np.linspace(-4, 4, num=1000).reshape(-1, 1)
    print(x_test)
    y_hat = dt.predict(x_test)
    print(y_hat)
    plt.scatter(y[:, 0], y[:, 1], c='r', s=40, label='Actual')
    plt.scatter(y_hat[:, 0], y_hat[:, 1], c='g', marker='s', s=100, label='Depth=%d' % deep, alpha=1)
    plt.legend(loc='upper left')
    plt.xlabel('y1')
    plt.ylabel('y2')
    plt.grid()
    plt.show()
Example #56
# :class:`~sklearn.tree.DecisionTreeRegressor` now supports a new `'poisson'`
# splitting criterion. Setting `criterion="poisson"` might be a good choice
# if your target is a count or a frequency.

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
import numpy as np

n_samples, n_features = 1000, 20
rng = np.random.RandomState(0)
X = rng.randn(n_samples, n_features)
# positive integer target correlated with X[:, 5] with many zeros:
y = rng.poisson(lam=np.exp(X[:, 5]) / 2)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
regressor = DecisionTreeRegressor(criterion="poisson", random_state=0)
regressor.fit(X_train, y_train)
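
# A quick check (a sketch, not part of the original highlight): compare the
# Poisson tree with a default squared-error tree on held-out Poisson deviance,
# the quantity the new criterion targets. Predictions are clipped because the
# deviance requires strictly positive values.
from sklearn.metrics import mean_poisson_deviance

baseline = DecisionTreeRegressor(random_state=0)  # default squared-error splits
baseline.fit(X_train, y_train)
pred_poisson = np.clip(regressor.predict(X_test), 1e-6, None)
pred_baseline = np.clip(baseline.predict(X_test), 1e-6, None)
print("Poisson tree deviance:  ", mean_poisson_deviance(y_test, pred_poisson))
print("Squared-error deviance: ", mean_poisson_deviance(y_test, pred_baseline))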

##############################################################################
# New documentation improvements
# ------------------------------
#
# New examples and documentation pages have been added, in a continuous effort
# to improve the understanding of machine learning practices:
#
# - a new section about :ref:`common pitfalls and recommended
#   practices <common_pitfalls>`,
# - an example illustrating how to :ref:`statistically compare the performance of
#   models <sphx_glr_auto_examples_model_selection_plot_grid_search_stats.py>`
#   evaluated using :class:`~sklearn.model_selection.GridSearchCV`,
# - an example on how to :ref:`interpret coefficients of linear models
#   <sphx_glr_auto_examples_inspection_plot_linear_model_coefficient_interpretation.py>`,
Example #57
features_names = [
    'year', 'weekofyear', 'reanalysis_dew_point_temp_k',
    'reanalysis_min_air_temp_k', 'station_diur_temp_rng_c',
    'reanalysis_tdtr_k', 'reanalysis_specific_humidity_g_per_kg',
    'station_avg_temp_c', 'reanalysis_relative_humidity_percent',
    'precipitation_amt_mm', 'reanalysis_precip_amt_kg_per_m2'
]
features = data[features_names]
labels = data['total_cases']

# Cross validation analysis
from sklearn.model_selection import cross_val_score
total_scores = []
for i in range(2, 30):
    regressor = DecisionTreeRegressor(criterion='mse', max_depth=i)
    regressor.fit(features, labels)
    scores = -cross_val_score(
        regressor, features, labels, scoring='neg_mean_absolute_error', cv=10)
    total_scores.append(scores.mean())

plt.plot(range(2, 30), total_scores, marker='o')
plt.xlabel('max_depth')
plt.ylabel('cv score')
plt.show()
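
# Pick the depth with the lowest mean CV error from the scores above
# (a sketch; the original selects the depth by inspecting the plot).
best_depth = min(zip(range(2, 30), total_scores), key=lambda t: t[1])[0]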

# Print features relevancies
print 'Feature Relevancies'
regressor = DecisionTreeRegressor(criterion='mse', max_depth=3, random_state=0)
regressor.fit(features, labels)
list1 = zip(features, regressor.feature_importances_)
print tabulate(list1, headers=['Feature', 'Relevance'])
Example #58
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from dtreeviz.trees import *

df_cars = pd.read_csv("tmp")
X, y = df_cars[['COUNTS']], df_cars['BYTES']

dt = DecisionTreeRegressor(max_depth=3, criterion="mae")
dt.fit(X, y)

fig = plt.figure()
ax = fig.gca()
rtreeviz_univar(dt, X, y, 'COUNTS', 'BYTES', ax=ax)
plt.show()
# Pick any two variables, store them in a new DataFrame, and use describe() to summarize them
heatData = ['Heating', 'HeatingQC']
print(house_data[heatData].describe())

print("================================= modeling ===================================")
# prediction target==y
y = house_data.SalePrice

# choosing predictors X
price_predictors = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr']

X = house_data[price_predictors]

# define model - What type of model will it be
my_model = DecisionTreeRegressor()

# fit_model - Capture patterns from provided X_Train
my_model.fit(X, y)

print(my_model)

print("================================= prediction ===================================")
print("Making predictions for the following 5 houses:")
print(X.head())
print("The predictions are")
print(my_model.predict(X.head()))




"""
test
"""
seed(10)
X = randint(0, 100, 100)
Y = uniform(low=0.5, high=13.3, size=(100, ))

max_split, max_gain = max_split_gain(X, Y)
print("%.2f" % info(Y))
print("%.2f" % condition_info(X, Y, 89))
print("%d" % max_split, "%.2f" % max_gain)

from sklearn.tree import DecisionTreeRegressor

X2 = np.array([[ele] for ele in X])

clf = DecisionTreeRegressor(criterion="mse", max_depth=1)
# "mse" stands for mean squared error, which is equal to variance reduction as
# the feature-selection criterion and minimizes the L2 loss using the mean of
# each terminal node; variance reduction is equivalent to standard deviation
# reduction.
# For a 1-D continuous x-axis, increasing max_depth adds more split points along the X axis.
# http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html#sklearn.tree.DecisionTreeRegressor
# http://scikit-learn.org/stable/auto_examples/tree/plot_tree_regression.html

clf.fit(X2, Y)
values = clf.predict([[87], [88], [89], [90]])
print(values)
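
# A quick numerical check of the comment above (a sketch, not from the
# original source): the root split threshold chosen by the depth-1 tree should
# maximize the variance reduction
#     N*Var(Y) - N_left*Var(Y_left) - N_right*Var(Y_right).
threshold = clf.tree_.threshold[0]  # the root node's split point
left, right = Y[X <= threshold], Y[X > threshold]
reduction = len(Y) * Y.var() - len(left) * left.var() - len(right) * right.var()
print("root threshold: %.2f, variance reduction: %.2f" % (threshold, reduction))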