Exemplo n.º 1
0
def model_complexity(X_train, y_train, X_test, y_test):
    """Score decision trees of increasing depth on train and test data.

    One ``DecisionTreeRegressor`` is fitted per candidate depth, both data
    sets are scored with ``performance_metric`` and the two error arrays are
    passed to ``model_complexity_graph``.
    """

    print("Model Complexity: ")

    # Candidate depths are 1 through 24 inclusive (arange excludes the stop).
    max_depth = np.arange(1, 25)
    n_depths = len(max_depth)
    train_err = np.zeros(n_depths)
    test_err = np.zeros(n_depths)

    for idx in range(n_depths):
        # A fresh tree limited to the current depth, trained on all training data
        tree = DecisionTreeRegressor(max_depth=max_depth[idx])
        tree.fit(X_train, y_train)

        # Error on the data the tree has seen, then on the held-out data
        train_err[idx] = performance_metric(y_train, tree.predict(X_train))
        test_err[idx] = performance_metric(y_test, tree.predict(X_test))

    # Hand the curves to the plotting helper
    model_complexity_graph(max_depth, train_err, test_err)
Exemplo n.º 2
0
def test_rt():
    """Compare ``RegressionTree`` with the reference ``SKRT`` model on Boston data.

    Both models are fitted on the full data set and must give (almost) equal
    predictions; the local and global explanations must produce a non-None
    visualization.
    """
    boston = load_boston()
    X = boston.data
    y = boston.target
    feature_names = boston.feature_names

    sk_dt = SKRT(random_state=1, max_depth=3)
    our_dt = RegressionTree(feature_names=feature_names, random_state=1)
    sk_dt.fit(X, y)
    our_dt.fit(X, y)

    assert np.allclose(sk_dt.predict(X), our_dt.predict(X))

    # Local explanations: first with labels, then without
    for explain_args in ((X, y), (X,)):
        local_expl = our_dt.explain_local(*explain_args)
        assert local_expl.visualize(0) is not None

    # Global explanation
    assert our_dt.explain_global().visualize() is not None
Exemplo n.º 3
0
def learning_curve(depth, X_train, y_train, X_test, y_test, iteration=None):
    """Measure train/test error of a depth-limited tree as the training set grows.

    A ``DecisionTreeRegressor(max_depth=depth)`` is fitted on 50 growing
    prefixes of the training data. Each fit is scored with
    ``performance_metric`` on the prefix and on the full test set, and the
    curves are passed to ``learning_curve_graph``.

    Args:
        depth: Maximum depth of every tree.
        X_train, y_train: Training features/targets; prefixes are taken by slicing.
        X_test, y_test: Held-out data, scored in full for every prefix size.
        iteration: Optional index. When given, the last test error is printed
            and stored in the module-level ``fully_trained_error`` table at
            ``[depth - 1][iteration]``.
    """

    # 50 evenly spaced training-set sizes (floats) from 1 to the full set
    sizes = np.linspace(1, len(X_train), 50)
    train_err = np.zeros(len(sizes))
    test_err = np.zeros(len(sizes))

    print("Decision Tree with Max Depth: ")
    print(depth)

    for i, s in enumerate(sizes):
        # linspace yields floats, but slice bounds must be integers; truncating
        # matches what older NumPy versions did implicitly.
        n_samples = int(s)

        # Create and fit the decision tree regressor model
        regressor = DecisionTreeRegressor(max_depth=depth)
        regressor.fit(X_train[:n_samples], y_train[:n_samples])

        # Find the performance on the training and testing set
        train_err[i] = performance_metric(
            y_train[:n_samples], regressor.predict(X_train[:n_samples]))
        test_err[i] = performance_metric(y_test, regressor.predict(X_test))

    # Plot learning curve graph
    learning_curve_graph(depth, sizes, train_err, test_err)

    # added to produce figure 2
    if iteration is not None:
        print("Final error at max_depth={}: {}".format(depth, test_err[-1]))
        fully_trained_error[depth - 1][iteration] = test_err[-1]
Exemplo n.º 4
0
def test_decision_tree_regression(filename):
    """Time ten random-split evaluations of a default decision tree on a CSV.

    The last column of the CSV is the target and all other columns are the
    features. Each round trains on a random ~40% mask of the rows and scores
    the remaining rows with mean absolute error. The mean score and the
    elapsed wall-clock time are printed.
    """
    start_time = time.time()
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.metrics import mean_absolute_error

    df = pd.read_csv(filename)
    feature_cols = df.columns[:-1]
    target_col = df.columns[-1]

    scores = []
    for _ in range(10):
        sys.stdout.flush()

        # Rows where the random draw is below 0.4 form the training set
        in_train = np.random.rand(len(df)) < 0.4
        train_data = df[in_train]
        test_data = df[~in_train]
        assert (len(train_data) + len(test_data) == len(df)), "Something is wrong"

        dt = DecisionTreeRegressor()
        dt.fit(train_data[feature_cols], train_data[target_col].values.tolist())
        prediction = dt.predict(test_data[feature_cols])

        scores.append(mean_absolute_error(test_data[target_col], prediction))

    # Result-file name derived from the input path; currently not used further.
    extract_name = filename.split("/")[-1].split(".")[0] + ".p"
    mean_score = round(np.mean(scores), 3)
    elapsed = round(time.time() - start_time, 3)
    print("%s %s sec" % (mean_score, elapsed))
def learning_curve(depth, X_train, y_train, X_test, y_test):
    """Calculate the performance improvement of the model, as training size increases.

    A ``DecisionTreeRegressor(max_depth=depth)`` is fitted on 50 growing
    prefixes of the training data and scored with ``performance_metric`` on
    the prefix and on the full test set. The curves are passed to
    ``learning_curve_graph``.

    Returns:
        The test error of the last (largest) training size.
    """

    # create 50 equally spaced markers for the graph's X axis
    sizes = np.round(np.linspace(1, len(X_train), 50))
    # create 50 open bins to fill in the training and test errors
    train_err = np.zeros(len(sizes))
    test_err = np.zeros(len(sizes))

    print("Decision Tree with Max Depth: ")
    print(depth)

    for i, s in enumerate(sizes):
        # np.round keeps the float dtype, but slice bounds must be integers
        n_samples = int(s)

        # train a regressor of the requested depth on the first n_samples rows
        regressor = DecisionTreeRegressor(max_depth=depth)
        regressor.fit(X_train[:n_samples], y_train[:n_samples])

        # fill in the training and test error
        train_err[i] = performance_metric(
            y_train[:n_samples], regressor.predict(X_train[:n_samples]))
        test_err[i] = performance_metric(y_test, regressor.predict(X_test))

    # create the learning curve graph, using the calculated information
    learning_curve_graph(sizes, train_err, test_err)

    return test_err[-1]
Exemplo n.º 6
0
def arbolesRegresion(caract):
    """Cross-validate a regression tree on selected Boston features and report it.

    Uses the module-level ``boston_X`` / ``boston_Y`` arrays, restricted to
    the column indices listed in ``caract``. Prints the fold-averaged MAE,
    MSE and R2, shows a bar chart of averaged relative feature importances
    and writes the tree (as left after the last fold's fit) to
    ``bostonTree.pdf``.

    NOTE(review): the importance bookkeeping is hard-coded to 13 entries,
    which presumably requires ``len(caract) == 13`` -- TODO confirm.
    """
    
    clf = DecisionTreeRegressor(min_samples_leaf=10, min_samples_split=15, max_depth=13, compute_importances=True)
    
    # Running sums over folds: one importance slot per feature, plus the metrics
    importancias = [0,0,0,0,0,0,0,0,0,0,0,0,0]    
    mae=mse=r2=0
    
    kf = KFold(len(boston_Y), n_folds=10, indices=True)
    for train, test in kf:
        trainX, testX, trainY, testY=boston_X[train], boston_X[test], boston_Y[train], boston_Y[test]
            
        # The fold index arrays `train` / `test` are replaced here by
        # feature matrices holding only the selected columns.
        nCar=len(caract)
        train=np.zeros((len(trainX), nCar))
        test=np.zeros((len(testX), nCar))
        trainYNuevo=trainY
        
        # Copy column caract[i] of every sample into column i
        for i in range(nCar):
            for j in range(len(trainX)):
                train[j][i]=trainX[j][caract[i]]
                
            for k in range(len(testX)):
                test[k][i]=testX[k][caract[i]]
        
        # Targets reshaped to have len(trainY) rows
        trainYNuevo=np.reshape(trainYNuevo, (len(trainY), -1))
        
        clf.fit(train, trainYNuevo)
        prediccion=clf.predict(test)            
        
#        clf.fit(trainX, trainY)
#        prediccion=clf.predict(testX)
            
        mae+=metrics.mean_absolute_error(testY, prediccion)
        mse+=metrics.mean_squared_error(testY, prediccion)
        r2+=metrics.r2_score(testY, prediccion)
        
        # Scale importances so the largest one in this fold is 100, then accumulate
        feature_importance = clf.feature_importances_
        feature_importance = 100.0 * (feature_importance / feature_importance.max())
        for i in range(13):
            importancias[i] = importancias[i] + feature_importance[i]
        
    print 'Error abs: ', mae/len(kf), 'Error cuadratico: ', mse/len(kf), 'R cuadrado: ', r2/len(kf)
    
    # Average over the folds (10 matches n_folds above)
    for i in range(13):
        importancias[i] = importancias[i]/10
        
    sorted_idx = np.argsort(importancias)
    pos = np.arange(sorted_idx.shape[0]) + .5
    importancias = np.reshape(importancias, (len(importancias), -1))

    # Horizontal bar chart, features ordered by ascending averaged importance
    boston = datasets.load_boston()
    pl.barh(pos, importancias[sorted_idx], align='center')
    pl.yticks(pos, boston.feature_names[sorted_idx])
    pl.xlabel('Importancia relativa')
    pl.show()    
    
    # Export the fitted tree through Graphviz dot data to a PDF file
    import StringIO, pydot 
    dot_data = StringIO.StringIO() 
    tree.export_graphviz(clf, out_file=dot_data) 
    graph = pydot.graph_from_dot_data(dot_data.getvalue()) 
    graph.write_pdf("bostonTree.pdf") 
 def nn_lin(self, testX, neighbors):
     """Return the mean target value of the selected neighbours.

     Args:
         testX: Query sample. Unused; kept for interface compatibility.
         neighbors: Index (list or array) into ``self.Y`` selecting the
             neighbours to average.

     Returns:
         ``np.mean(self.Y[neighbors])``.
     """
     # The original body also built a DecisionTreeRegressor and had a
     # fit/predict path after this return. That code could never run, so it
     # was removed; the returned value is unchanged.
     return np.mean(self.Y[neighbors])
def test_thresholded_scorers():
    """Check that threshold-based scorers agree with the underlying metrics.

    The 'roc_auc' and 'log_loss' scorers from ``get_scorer`` are compared
    with direct metric calls on a two-blob problem, for a logistic
    regression, a tree classifier and a tree regressor. Finally 'roc_auc'
    must raise ``ValueError`` on a three-blob problem.
    """
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # Estimator offering both decision_function and predict_proba
    clf = LogisticRegression(random_state=0)
    clf.fit(X_train, y_train)
    scorer_auc = get_scorer('roc_auc')(clf, X_test, y_test)
    auc_from_decision = roc_auc_score(y_test, clf.decision_function(X_test))
    auc_from_proba = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    assert_almost_equal(scorer_auc, auc_from_decision)
    assert_almost_equal(scorer_auc, auc_from_proba)

    # The log_loss scorer is compared after negation
    scorer_logloss = get_scorer('log_loss')(clf, X_test, y_test)
    direct_logloss = log_loss(y_test, clf.predict_proba(X_test))
    assert_almost_equal(-scorer_logloss, direct_logloss)

    # Same check for an estimator without decision_function
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    scorer_auc = get_scorer('roc_auc')(clf, X_test, y_test)
    auc_from_proba = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    assert_almost_equal(scorer_auc, auc_from_proba)

    # A regressor (no decision_function): plain predictions are the scores
    reg = DecisionTreeRegressor()
    reg.fit(X_train, y_train)
    scorer_auc = get_scorer('roc_auc')(reg, X_test, y_test)
    auc_from_predict = roc_auc_score(y_test, reg.predict(X_test))
    assert_almost_equal(scorer_auc, auc_from_predict)

    # More than two classes must be rejected
    X, y = make_blobs(random_state=0, centers=3)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf.fit(X_train, y_train)
    assert_raises(ValueError, get_scorer('roc_auc'), clf, X_test, y_test)
Exemplo n.º 9
0
def plot_curve():
    """Fit a default decision tree on the module-level data and plot learning curves.

    Reads ``X``, ``y``, ``cv`` and ``score`` from the enclosing module,
    prints the regressor's score on its own training data and plots the mean
    train/test scores returned by ``learning_curve`` against training size.
    """
    reg = DecisionTreeRegressor()
    reg.fit(X, y)
    print("Regressor score: {:.4f}".format(reg.score(X,y)))

    # cv and scoring are passed by keyword because the optional positional
    # parameters that precede them in learning_curve are left at their defaults.
    train_sizes, train_scores, test_scores = learning_curve(reg, X, y, cv=cv, scoring=score)

    # One averaged curve per data set (mean along axis 1), training curve first
    curves = (
        (np.mean(train_scores, axis=1), 'b', "train_scores_mean"),
        (np.mean(test_scores, axis=1), 'r', "test_scores_mean"),
    )
    for mean_scores, colour, legend_label in curves:
        plt.plot(train_sizes, mean_scores, '-o', color=colour, label=legend_label)

    # Plot aesthetics
    plt.ylim(-0.1, 1.1)
    plt.ylabel("Curve Score")
    plt.xlabel("Training Points")
    plt.legend(bbox_to_anchor=(1.1, 1.1))
    plt.show()
def train_decision_tree(sizes, depth, X_test, X_train, y_test, y_train):
    """Fit one depth-limited tree per training-set size and record its errors.

    Args:
        sizes   (Numpy array): Array of training sample sizes to train on.
        depth   (int): The maximum depth of the DecisionTreeRegressor
        X_test  (Numpy array): Test set features
        X_train (Numpy array): Training set features
        y_test  (Numpy array): Test set target variable
        y_train (Numpy array): Training set target variable

    Returns:
        test_err  (Numpy array): ``performance_metric`` on the full test set,
            one entry per size.
        train_err (Numpy array): ``performance_metric`` on the training
            prefix that was used for fitting, one entry per size.
    """

    train_err = np.zeros(len(sizes))
    test_err = np.zeros(len(sizes))

    for i, s in enumerate(sizes):
        # Slice bounds must be integers. Cast once so that fitting and
        # scoring use exactly the same training prefix.
        n_samples = int(s)
        X_subset = X_train[:n_samples]
        y_subset = y_train[:n_samples]

        # Create and fit the decision tree regressor model
        regressor = DecisionTreeRegressor(max_depth=depth)
        regressor.fit(X_subset, y_subset)

        # Find the performance on the training and testing set
        train_err[i] = performance_metric(y_subset, regressor.predict(X_subset))
        test_err[i] = performance_metric(y_test, regressor.predict(X_test))

    return test_err, train_err
Exemplo n.º 11
0
    def test_boston(self):
        """Compare the oblivious tree's test MSE with sklearn's tree on Boston data.

        Both models are refitted on five random 67/33 splits; the relative
        difference of the mean MSEs must stay below 50%.
        """
        from sklearn.tree import DecisionTreeRegressor as DecisionTreeRegressorSklearn
        model = DecisionTreeRegressor(tree_type='oblivious', max_n_splits=3)
        model_sklearn = DecisionTreeRegressorSklearn()

        dataset = load_boston()
        mse = []
        mse_sklearn = []

        for _ in range(5):
            X_train, X_test, y_train, y_test = train_test_split(
                dataset.data, dataset.target, test_size=0.33)

            # Our model first, then the sklearn reference, on the same split
            for estimator, errors in ((model, mse), (model_sklearn, mse_sklearn)):
                estimator.fit(X_train, y_train)
                errors.append(mean_squared_error(estimator.predict(X_test), y_test))

        mean_mse = np.mean(mse)
        mean_mse_sklearn = np.mean(mse_sklearn)
        print(mean_mse, mean_mse_sklearn)

        # The two mean errors may differ by less than 50% of the sklearn value
        relative_gap = np.abs(mean_mse - mean_mse_sklearn) / mean_mse_sklearn
        self.assertTrue(relative_gap < 0.5)
Exemplo n.º 12
0
def train_decision_tree(time_regression_df, test_size, random_state, max_depth, export_testset):
    """Train, time and persist a regression tree for the 'trip_time' column.

    The frame is split with ``cv.train_test_split``. The first six columns
    are the features and 'trip_time' is the target. The fit duration is
    printed, metadata is exported via ``export_meta_data`` and the model is
    dumped under ``../treelib/``. When ``export_testset`` is truthy the test
    split is also written to ``../data/``. Returns the fitted tree.
    """
    train_df, test_df = cv.train_test_split(
        time_regression_df, test_size=test_size, random_state=random_state)
    y_train = train_df['trip_time']
    x_train = train_df.ix[:, 0:6]
    y_test = test_df['trip_time']
    x_test = test_df.ix[:, 0:6]

    if export_testset:
        # Features and target side by side in a single CSV
        pd.concat([x_test, y_test], axis=1).to_csv(
            '../data/' + filename_prefix + '_testset.csv')

    # Time the construction and fitting of the tree
    tic = time.time()
    regtree = DecisionTreeRegressor(
        max_depth=max_depth, min_samples_split=3, random_state=random_state)
    regtree.fit(x_train, y_train)
    elapsed = time.time() - tic
    print(elapsed)

    export_meta_data(regtree, x_test, y_test, elapsed)

    # File name carries the depth the fitted tree actually reached
    reached_depth = str(regtree.tree_.max_depth)
    target_location = ('../treelib/' + filename_prefix + '_tree_depth_' + reached_depth)

    dump_model(regtree, target_location)
    return regtree
class TestDecisionTreeRegressorConverter(TestCase):
    """Checks the PMML produced by DecisionTreeConverter in regression mode."""

    def setUp(self):
        """Fit a depth-2 tree on four samples and wrap it in a converter."""
        np.random.seed(1)

        samples = [
            [0, 0],
            [0, 1],
            [1, 0],
            [1, 1],
        ]
        targets = [0, 1, 1, 1]
        self.est = DecisionTreeRegressor(max_depth=2)
        self.est.fit(samples, targets)

        def feature_schema():
            # A fresh list per call: input and model do not share objects
            return [
                IntegerNumericFeature('x1'),
                StringCategoricalFeature('x2', ['zero', 'one']),
            ]

        self.ctx = TransformationContext(
            input=feature_schema(),
            model=feature_schema(),
            derived=[],
            output=[IntegerNumericFeature('output')]
        )
        self.converter = DecisionTreeConverter(
            estimator=self.est,
            context=self.ctx,
            mode=DecisionTreeConverter.MODE_REGRESSION
        )

    def test_transform(self):
        """The first tree model must carry a mining schema and a root node."""
        tree_model = self.converter.pmml().TreeModel[0]
        schema = tree_model.MiningSchema
        root = tree_model.Node
        assert schema is not None, 'Missing mining schema'
        assert len(schema.MiningField) == 3, 'Wrong number of mining fields'
        assert root is not None, 'Missing root node'
        assert root.recordCount == 4
        assert root.True_ is not None, 'Root condition should always be True'
Exemplo n.º 14
0
def learning_curve(depth, X_train, y_train, X_test, y_test):
    """Track train and test error of a depth-limited tree as training data grows.

    Fits ``DecisionTreeRegressor(max_depth=depth)`` on 50 growing prefixes
    of the training data, scores each fit with ``performance_metric`` and
    passes the curves to ``learning_curve_graph``.
    """

    # 50 integer training-set sizes between 1 and the full training set
    sizes = [int(value) for value in np.round(np.linspace(1, len(X_train), 50))]
    n_sizes = len(sizes)
    train_err = np.zeros(n_sizes)
    test_err = np.zeros(n_sizes)

    print("Decision Tree with Max Depth: ")
    print(depth)

    for pos in range(n_sizes):
        # Training prefix for this step
        X_part = X_train[:sizes[pos]]
        y_part = y_train[:sizes[pos]]

        tree = DecisionTreeRegressor(max_depth=depth)
        tree.fit(X_part, y_part)

        # Error on the prefix that was trained on, then on the full test set
        train_err[pos] = performance_metric(y_part, tree.predict(X_part))
        test_err[pos] = performance_metric(y_test, tree.predict(X_test))

    # Hand the curves to the plotting helper
    learning_curve_graph(sizes, train_err, test_err)
class SimpleGB(BaseEstimator):
    """Skeleton of a gradient-boosting classifier built from regression trees.

    The boosting step is still a template: the gradient, the per-iteration
    training target and the prediction update are marked TODO.
    """

    def __init__(self, tree_params_dict, iters, tau):
        # Keyword arguments for the base tree, number of boosting rounds,
        # and the weight applied to every boosted tree in predict()
        self.tree_params_dict = tree_params_dict
        self.iters = iters
        self.tau = tau

    def fit(self, X_data, y_data):
        """Fit the base tree and ``iters`` additional trees; return ``self``."""
        base_tree = DecisionTreeRegressor(**self.tree_params_dict)
        self.base_algo = base_tree.fit(X_data, y_data)
        self.estimators = []
        curr_pred = self.base_algo.predict(X_data)
        for _ in range(self.iters):
            # The gradient of the loss function has to be computed here
            grad = 0. # TODO
            # A DecisionTreeRegressor has to be trained to predict the anti-gradient
            # Do not forget about self.tree_params_dict
            stage = DecisionTreeRegressor().fit(X_data, y_data) # TODO

            self.estimators.append(stage)
            # Update the predictions at every point
            curr_pred += 0. # TODO
        return self

    def predict(self, X_data):
        """Return a boolean array: base prediction plus weighted stages, thresholded at 0."""
        # Prediction on the data
        margin = self.base_algo.predict(X_data)
        for stage in self.estimators:
            margin += self.tau * stage.predict(X_data)
        # This is a classification task, so the output has to be 0 and 1
        return margin > 0.
Exemplo n.º 16
0
def get_imp(X,y):
    """Rank the columns of ``X`` by decision-tree feature importance.

    Args:
        X: Feature frame; its ``columns`` label the result rows.
        y: Target values.

    Returns:
        A DataFrame with columns 'variable' and 'imp', sorted by 'imp' in
        descending order.
    """
    tree = DecisionTreeRegressor(random_state=9)
    tree.fit(X, y)
    imp_var = pd.DataFrame({'variable':X.columns, 'imp':tree.feature_importances_})
    # DataFrame.sort() was removed from pandas; sort_values is its replacement.
    return imp_var.sort_values('imp', ascending=False)
def decision_tree_regressor(X, y, labels):
    """Fit a depth-3 regression tree, count over-estimates per leaf and show the tree.

    A sample is counted in its leaf's bucket when the estimate exceeds the
    target by more than 0.05, the estimate is above 0.6 and the target is
    positive. Every other sample is counted in the last bucket. Prints the
    tree's features, thresholds, the counts and the feature importances,
    then calls ``visualize_tree``. Returns the in-sample estimates.
    """

    regressor = DecisionTreeRegressor(max_depth=3)
    regressor.fit(X, y)

    estimates_z = regressor.predict(X)
    leaves = regressor.apply(X)

    # One counter per leaf id, up to the largest id seen.
    # NOTE(review): index -1 is the same slot as the largest leaf id, so the
    # catch-all count shares a bucket with that leaf -- confirm this is intended.
    leaves_hash = np.zeros(np.max(leaves) + 1)
    for i in range(len(y)):
        overshoots = (
            (estimates_z[i] - y[i]) > 0.05
            and estimates_z[i] > 0.6
            and y[i] > 0
        )
        bucket = leaves[i] if overshoots else -1
        leaves_hash[bucket] += 1

    print(regressor.tree_.feature)
    print(regressor.tree_.threshold)
    print(leaves_hash)
    print(regressor.feature_importances_)

    visualize_tree(regressor.tree_, labels)
    return estimates_z
Exemplo n.º 18
0
  def CART(self):
    "  CART"
    # The docstring above is kept byte-for-byte in case it is read at
    # runtime as a display label -- TODO confirm.
    #
    # Fits a decision tree regressor on self.train and returns its
    # predictions for self.test as a list. The last two columns of each
    # frame are excluded from the features; the second-to-last column of
    # the training frame is the target.
    if self.smoteit:
      self.train = SMOTE(
          self.train,
          atleast=50,
          atmost=101,
          resample=self.duplicate)

    if not self.tuning:
      clf = DecisionTreeRegressor(random_state=1)
    else:
      # 'entropy' is a classification criterion that DecisionTreeRegressor
      # rejects, so the regressor's default criterion is used instead.
      # tunings[3] is a percentage: convert to float before dividing so
      # Python 2 integer division cannot truncate it to 0.
      clf = DecisionTreeRegressor(max_depth=int(self.tunings[0]),
                                  min_samples_split=int(self.tunings[1]),
                                  min_samples_leaf=int(self.tunings[2]),
                                  max_features=float(self.tunings[3]) / 100,
                                  max_leaf_nodes=int(self.tunings[4]),
                                  random_state=1)
    features = self.train.columns[:-2]
    klass = self.train[self.train.columns[-2]]
    clf.fit(self.train[features].astype('float32'), klass.astype('float32'))
    preds = clf.predict(
        self.test[self.test.columns[:-2]].astype('float32')).tolist()
    return preds
def model_complexity(X_train, y_train, X_test, y_test):
    """Plot training and testing error of decision trees against their depth.

    One ``DecisionTreeRegressor`` is fitted per candidate depth and scored
    with ``performance_metric`` on both data sets; the two curves are drawn
    in a single figure.
    """

    print("Creating a model complexity graph. . . ")

    # Candidate depths are 1 through 13 inclusive (arange excludes the stop).
    max_depth = np.arange(1, 14)
    n_depths = len(max_depth)
    train_err = np.zeros(n_depths)
    test_err = np.zeros(n_depths)

    for idx in range(n_depths):
        # A fresh tree limited to the current depth, trained on all training data
        tree = DecisionTreeRegressor(max_depth=max_depth[idx])
        tree.fit(X_train, y_train)

        # Error on the data the tree has seen, then on the held-out data
        train_err[idx] = performance_metric(y_train, tree.predict(X_train))
        test_err[idx] = performance_metric(y_test, tree.predict(X_test))

    # Draw both curves, testing error first
    pl.figure(figsize=(7, 5))
    pl.title('Decision Tree Regressor Complexity Performance')
    for errors, legend_label in ((test_err, 'Testing Error'), (train_err, 'Training Error')):
        pl.plot(max_depth, errors, lw=2, label=legend_label)
    pl.legend()
    pl.xlabel('Maximum Depth')
    pl.ylabel('Total Error')
    pl.show()
Exemplo n.º 20
0
def fit_model1(X, y):
    """Grid-search ``max_depth`` (1..10) for a decision tree regressor on [X, y].

    Candidates are scored with ``performance_metric`` (wrapped by
    ``make_scorer``) over 10 shuffle-split folds that each hold out 20% of
    the data. Returns the best estimator found.
    """

    # Cross-validation sets drawn from the training data
    cv_sets = ShuffleSplit(X.shape[0], n_iter=10, test_size=0.20, random_state=0)

    # Base regressor, fitted once on the full data before the search
    regressor = DecisionTreeRegressor()
    regressor.fit(X, y)

    # Depths 1 through 10
    params = {'max_depth': tuple(range(1, 11))}

    # Scoring function built from the project's metric
    scoring_fnc = make_scorer(performance_metric)

    # Run the search and return the optimal model
    grid_obj = GridSearchCV(regressor, params, scoring=scoring_fnc, cv=cv_sets)
    return grid_obj.fit(X, y).best_estimator_
Exemplo n.º 21
0
def train_learning_model_decision_tree_ada_boost(df):
    """Compare a single depth-6 tree with AdaBoost over depth-6 trees.

    The frame is preprocessed and split with the project helpers, both
    models are fitted on the training part, and their test and training
    mean squared errors are printed.
    """
    X_all, y_all = preprocess_data(df)
    X_train, X_test, y_train, y_test = split_data(X_all, y_all)

    tree_regressor = DecisionTreeRegressor(max_depth = 6)
    ada_regressor = AdaBoostRegressor(
        DecisionTreeRegressor(max_depth=6),
        n_estimators = 500,
        learning_rate = 0.01,
        random_state = 1,
    )

    tree_regressor.fit(X_train, y_train)
    ada_regressor.fit(X_train, y_train)

    # Held-out error, tree first
    mse_tree = mean_squared_error(y_test, tree_regressor.predict(X_test))
    mse_ada = mean_squared_error(y_test, ada_regressor.predict(X_test))

    # In-sample error, tree first
    mse_tree_train = mean_squared_error(y_train, tree_regressor.predict(X_train))
    mse_ada_train = mean_squared_error(y_train, ada_regressor.predict(X_train))

    report = (
        ("MSE tree: %.4f ", mse_tree),
        ("MSE ada: %.4f ", mse_ada),
        ("MSE tree train: %.4f ", mse_tree_train),
        ("MSE ada train: %.4f ", mse_ada_train),
    )
    for template, value in report:
        print(template % value)
def learning_curve(depth, X_train, y_train, X_test, y_test):
    """Calculate the performance of the model after a set of training data.

    A ``DecisionTreeRegressor(max_depth=depth)`` is fitted on 50 growing
    prefixes of the training data and scored with ``performance_metric`` on
    the prefix and on the full test set. The curves and ``depth`` are passed
    to ``learning_curve_graph``.
    """

    # We will vary the training set size so that we have 50 different sizes
    sizes = np.round(np.linspace(1, len(X_train), 50))
    train_err = np.zeros(len(sizes))
    test_err = np.zeros(len(sizes))

    print("Decision Tree with Max Depth: ")
    print(depth)

    for i, s in enumerate(sizes):
        # np.round keeps the float dtype, but slice bounds must be integers
        n_samples = int(s)

        # Create and fit the decision tree regressor model
        regressor = DecisionTreeRegressor(max_depth=depth)
        regressor.fit(X_train[:n_samples], y_train[:n_samples])

        # Find the performance on the training and testing set
        train_err[i] = performance_metric(
            y_train[:n_samples], regressor.predict(X_train[:n_samples]))
        test_err[i] = performance_metric(y_test, regressor.predict(X_test))

    # Plot learning curve graph
    learning_curve_graph(sizes, train_err, test_err, depth)
Exemplo n.º 23
0
def test_bootstrap_samples():
    """Test that bootstrapping samples generates non-perfect base estimators."""
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(boston.data,
                                                        boston.target,
                                                        random_state=rng)

    def fit_bagging(bootstrap):
        # Bagging over default trees using every sample; only the bootstrap
        # flag varies. The shared rng is consumed in call order.
        bagger = BaggingRegressor(base_estimator=DecisionTreeRegressor(),
                                  max_samples=1.0,
                                  bootstrap=bootstrap,
                                  random_state=rng)
        return bagger.fit(X_train, y_train)

    base_estimator = DecisionTreeRegressor().fit(X_train, y_train)

    # Without bootstrap the ensemble scores the same on the training set as
    # a single fully grown tree
    ensemble = fit_bagging(False)
    assert_equal(base_estimator.score(X_train, y_train),
                 ensemble.score(X_train, y_train))

    # With bootstrap the ensemble's training score must be lower than the
    # single tree's
    ensemble = fit_bagging(True)
    assert_greater(base_estimator.score(X_train, y_train),
                   ensemble.score(X_train, y_train))
class CustomClassifier(BaseEstimator, ClassifierMixin):
     """Estimator whose behaviour depends on how many instances are alive.

     Every construction increments the module-level ``class_instance``
     counter and every finalisation decrements it. An instance numbered 1 or
     2 predicts column 6 of the input unchanged; later instances fit and use
     a depth-8 ``DecisionTreeRegressor``.
     """
     def __init__(self):
         global class_instance
         class_instance += 1
         self.instance = class_instance

     def __del__(self):
         global class_instance
         class_instance -= 1

     def fit(self, X, y, sample_weight=array([])):
         """Fit on ``X``/``y`` and return ``self``.

         ``sample_weight`` is accepted for interface compatibility but is
         not used by either branch.
         """
         if self.instance <= 2:
             # Early instances: no model is trained, only the targets are kept
             self.y = y
         else:
             # Later instances: a decision tree regressor is the weak learner
             self.regr = DecisionTreeRegressor(max_depth=8)
             self.regr.fit(X, y)
         # Return self from both branches so callers can rely on the result
         # (the tree branch used to return None).
         return self

     def predict(self, X):
         """Return column 6 of ``X`` for early instances, else the tree's prediction."""
         if self.instance <= 2:
             # Column 6 of the feature matrix is the current volatility
             # according to the original author's note
             return X[:,6]
         return self.regr.predict(X)
Exemplo n.º 25
0
def fit_predict_model(city_data):
    '''Find and tune the optimal model. Make a prediction on housing data.

    Grid-searches ``max_depth`` over 1..10 with negated mean squared error
    as the score, repeats the search 101 times and uses the median best
    depth for the final tree. That tree is fitted on all data and used to
    predict one hard-coded sample. Progress and results are printed.
    '''

    # Get the features and labels from the Boston housing data
    X, y = city_data.data, city_data.target
    print(X)

    # Setup a Decision Tree Regressor and the depths to try
    regressor = DecisionTreeRegressor()
    parameters = {'max_depth': (1, 2, 3, 4, 5, 6, 7, 8, 9, 10)}

    scorer = make_scorer(metrics.mean_squared_error, greater_is_better=False)
    reg = GridSearchCV(regressor, parameters, scoring=scorer)
    print(reg.fit(X, y))

    # Repeat the search and keep each run's best depth
    depth_values = []
    for _ in range(101):
        reg.fit(X, y)
        depth_values.append(int(reg.best_params_['max_depth']))

    median_depth = np.median(depth_values)
    print("Best model parameter:  " + str(median_depth))

    # max_depth must be an integer. The median of an odd number of integers
    # is itself integral, so the cast loses nothing.
    regressor = DecisionTreeRegressor(max_depth=int(median_depth))
    print("Final Model: ")
    print(regressor)

    regressor.fit(X, y)

    # Use the model to predict the output of a particular sample
    x = [11.95, 0.00, 18.100, 0, 0.6590, 5.6090, 90.00, 1.385, 24, 680.0, 20.20, 332.09, 12.13]
    # predict expects a 2-D array of samples, so wrap the single sample in a list
    prediction = regressor.predict([x])
    print("House: " + str(x))
    print("Prediction: " + str(prediction))
Exemplo n.º 26
0
def decision_tree_regressor_fit(bk_columns, bk):
    """Fit a default decision tree regressor predicting ``bk['count']``.

    ``bk_columns`` selects the feature columns of ``bk``. Returns whatever
    the regressor's ``fit`` call returns.
    """
    features = bk[bk_columns]
    target = bk['count']
    return DecisionTreeRegressor().fit(features, target)
Exemplo n.º 27
0
 def featureRelevance(self, data):
     """Print how well 'Detergents_Paper' is predicted from the other columns.

     The column is dropped from the features and used as the target. A
     decision tree regressor is fitted on a 75% training split and its score
     on the remaining 25% is printed.
     """
     target_name = 'Detergents_Paper'
     predictors = data.drop([target_name], axis = 1)
     target = data[[target_name]]
     X_train, X_test, y_train, y_test = train_test_split(
         predictors, target, test_size=0.25, random_state=1)

     tree = DecisionTreeRegressor(random_state=30)
     fitted = tree.fit(X_train, y_train)
     score = fitted.score(X_test, y_test)
     print("feature relevance test: feature {}, score {}".format(target_name, score))
Exemplo n.º 28
0
def learn(train_file, n_trees=10, learning_rate=0.1, k=10, validate=False):
    """Train an ensemble of depth-6 regression trees on lambda pseudo-labels.

    The comma-separated training file (one header row) supplies relevance
    scores in column 0, query ids in column 1 and features from column 3
    onwards. Each iteration fits a tree to the lambdas returned by
    ``compute_lambdas``, adds it to the ensemble and updates the running
    model output by ``learning_rate`` times the tree's prediction. Progress
    and timings are printed. ``validate`` is accepted but not used. The
    train score is always computed with a cutoff of 10. Returns the ensemble.
    """
    print("Loading train file")
    data = np.loadtxt(train_file, delimiter=",", skiprows=1)
    scores = data[:, 0]
    queries = data[:, 1]
    features = data[:, 3:]

    ensemble = Ensemble(learning_rate)

    print("Training starts...")
    model_output = np.zeros(len(features))

    time.clock()
    for round_no in range(1, n_trees + 1):
        print(" Iteration: " + str(round_no))

        # Pseudo responses (lambdas) act as the training label of each document
        started = time.clock()
        print("  --generating labels")
        lambdas = compute_lambdas(model_output, scores, queries, k)

        print(zip(lambdas, scores))
        print("  --done " + str(time.clock() - started) + " sec")

        # Fit this round's tree
        print("  --fitting tree")
        started = time.clock()
        stage_tree = DecisionTreeRegressor(max_depth=6)
        stage_tree.fit(features, lambdas)

        print("  ---done " + str(time.clock() - started) + " sec")
        print("  --adding tree to ensemble")
        ensemble.add(stage_tree)

        # Fold the new tree's prediction into the running model output
        print("  --generating step prediction")
        prediction = stage_tree.predict(features)

        print("  --updating full model output")
        model_output += learning_rate * prediction
        print(model_output)

        # Score the current model on the training data
        started = time.clock()
        print("  --scoring on train")
        train_score = score(model_output, scores, queries, 10)
        print("  --iteration train score " + str(train_score) + ", took " + str(time.clock() - started) + "sec to calculate")

    print("Finished sucessfully.")
    print("------------------------------------------------")
    return ensemble
Exemplo n.º 29
0
class Regressor(BaseEstimator):
    """Thin wrapper around a depth-5 ``DecisionTreeRegressor``."""

    def __init__(self):
        self.clf = DecisionTreeRegressor(max_depth=5)

    def fit(self, X, y):
        """Fit the wrapped tree on ``X``/``y`` and return ``self``."""
        self.clf.fit(X, y)
        # Returning self follows the estimator convention and allows
        # chaining; callers that ignored the old None result are unaffected.
        return self

    def predict(self, X):
        """Return the wrapped tree's predictions for ``X``."""
        return self.clf.predict(X)
Exemplo n.º 30
0
def tune_regtree(x, y, alpha_list, scoring):
    """Return the ``max_depth`` from ``alpha_list`` with the best CV score.

    For every candidate depth a ``DecisionTreeRegressor`` is scored with
    5-fold ``cross_val_score`` and the mean fold score is recorded. The list
    of mean scores is printed.

    Args:
        x: Feature matrix passed to ``cross_val_score``.
        y: Target values passed to ``cross_val_score``.
        alpha_list: Sequence of candidate ``max_depth`` values.
        scoring: ``scoring`` argument forwarded to ``cross_val_score``.

    Returns:
        The element of ``alpha_list`` whose mean cross-validated score is
        highest (first one on ties).

    Raises:
        ValueError: If ``alpha_list`` is empty.
    """
    scores = []
    for alpha in alpha_list:
        clf = DecisionTreeRegressor(max_depth=alpha)
        # No explicit clf.fit() here: cross_val_score fits its own clones of
        # the estimator on each fold, so a prior fit on the full data is
        # wasted work.
        scores.append(np.mean(cross_val_score(clf, x, y, cv=5, scoring=scoring)))
    # Parenthesised single-argument print works on both Python 2 and 3.
    print(scores)
    # scikit-learn scorers follow "greater is better" (error metrics are
    # negated), so the best candidate is the one with the maximum score.
    best_index = scores.index(max(scores))
    return alpha_list[best_index]
Exemplo n.º 31
0
    def train(self, max_depth, max_leaf_nodes, model_name, output_path):
        """Train a decision tree regressor inside an MLflow run and log it.

        The model is fitted on ``self.X_train``/``self.y_train`` and evaluated
        on ``self.X_test``/``self.y_test``. Parameters, metrics (rmse, mae,
        r2), tags, the sklearn model, an optional ONNX model and a plot file
        are logged to the active MLflow run.

        Args:
            max_depth: Passed to ``DecisionTreeRegressor`` and logged as a
                param.
            max_leaf_nodes: Passed to ``DecisionTreeRegressor`` and logged as
                a param.
            model_name: Used as ``registered_model_name`` when logging the
                sklearn model and forwarded to ``onnx_utils.log_model``.
            output_path: If truthy, the run ID is written to this path after
                replacing ``"dbfs:"`` with ``"/dbfs"``.

        Returns:
            Tuple ``(experiment_id, run_id)`` of the MLflow run.
        """
        with mlflow.start_run(run_name=self.run_origin
                              ) as run:  # NOTE: mlflow CLI ignores run_name
            run_id = run.info.run_uuid
            experiment_id = run.info.experiment_id
            print("MLflow:")
            print("  run_id:", run_id)
            print("  experiment_id:", experiment_id)
            # `client` is not defined in this method; it must come from the
            # enclosing module.
            print("  experiment_name:",
                  client.get_experiment(experiment_id).name)

            # Create model
            dt = DecisionTreeRegressor(max_depth=max_depth,
                                       max_leaf_nodes=max_leaf_nodes)
            print("Model:\n ", dt)

            # Fit and predict
            dt.fit(self.X_train, self.y_train)
            predictions = dt.predict(self.X_test)

            # MLflow params
            print("Parameters:")
            print("  max_depth:", max_depth)
            print("  max_leaf_nodes:", max_leaf_nodes)
            mlflow.log_param("max_depth", max_depth)
            mlflow.log_param("max_leaf_nodes", max_leaf_nodes)

            # MLflow metrics (all computed on the test split)
            rmse = np.sqrt(mean_squared_error(self.y_test, predictions))
            mae = mean_absolute_error(self.y_test, predictions)
            r2 = r2_score(self.y_test, predictions)
            print("Metrics:")
            print("  rmse:", rmse)
            print("  mae:", mae)
            print("  r2:", r2)
            mlflow.log_metric("rmse", rmse)
            mlflow.log_metric("r2", r2)
            mlflow.log_metric("mae", mae)

            # MLflow tags
            mlflow.set_tag("mlflow.runName",
                           self.run_origin)  # mlflow CLI picks this up
            mlflow.set_tag("data_path", self.data_path)
            mlflow.set_tag("run_origin", self.run_origin)
            mlflow.set_tag("mlflow_version", mlflow.__version__)
            mlflow.set_tag("sklearn_version", sklearn.__version__)

            # MLflow log model
            mlflow.sklearn.log_model(dt,
                                     "sklearn-model",
                                     registered_model_name=model_name)

            # Convert sklearn model to ONNX and log model
            if self.log_as_onnx:
                # Imported lazily, only when ONNX logging is requested
                from wine_quality import onnx_utils
                onnx_utils.log_model(dt, "onnx-model", model_name, self.X_test)

            # MLflow artifact - plot file
            # The plot is written to "plot.png" (relative path) before being
            # logged as an artifact.
            plot_file = "plot.png"
            plot_utils.create_plot_file(self.y_test, predictions, plot_file)
            mlflow.log_artifact(plot_file)

            # Write run ID to file
            if (output_path):
                # The tag records the path as given; the file is opened at
                # the path with "dbfs:" rewritten to "/dbfs".
                mlflow.set_tag("output_path", output_path)
                output_path = output_path.replace("dbfs:", "/dbfs")
                with open(output_path, "w") as f:
                    f.write(run_id)

        return (experiment_id, run_id)
Exemplo n.º 32
0
# Drop the first column of X
X = X[:, 1:]

# Backwards elimination
# NOTE(review): `tm` is not used anywhere in the visible part of this script.
import pandas.util.testing as tm
import statsmodels.tools.tools as tl
# Add a constant column to X via statsmodels' add_constant
X = tl.add_constant(X)
import statsmodels.api as sm
# Keep only the listed column indices
X = X[:,
      [0, 2, 3, 4, 5, 7, 8, 9, 10, 11, 13, 19, 20, 23, 24, 25, 26, 27, 28, 30]]
regressor_OLS = sm.OLS(endog=Y, exog=X).fit()
# The summary is built but not printed or stored here
regressor_OLS.summary()

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=0)

# Fitting the Decision Tree Regression Model to the dataset
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(max_depth=5)
regressor.fit(X_train, Y_train)

# Predicting the test set results with the fitted decision tree
Y_pred = regressor.predict(X_test)

# Mean absolute error of the test predictions (stored under the name
# `residuals`)
residuals = np.average(np.abs(Y_pred - Y_test))
print(residuals)
Exemplo n.º 33
0
# Scatter plot of the full data set
plt.scatter(X_adr, y_adr)
plt.show()

#### Decision tree implementation #####

from sklearn.model_selection import train_test_split

# Split the data into training and validation sets (no fixed random_state,
# so the split differs between runs)
X_train, X_test, y_train, y_test = train_test_split(X_adr,
                                                    y_adr,
                                                    test_size=0.2)

from sklearn.tree import DecisionTreeRegressor

# Define the algorithm to use
adr = DecisionTreeRegressor(max_depth=10)

# Train the model
adr.fit(X_train, y_train)

# Make the prediction
Y_pred = adr.predict(X_test)

# Plot the test data together with the prediction
X_grid = np.arange(min(X_test), max(X_test), 0.1)  # build an array of X values
X_grid = X_grid.reshape((len(X_grid), 1))  # reshape it into a column
plt.scatter(X_test, y_test)
plt.plot(X_grid, adr.predict(X_grid), color='red', linewidth=3)
plt.show()

# Compute the model's accuracy
Exemplo n.º 34
0
# MSE of the predictions from the preceding (not shown) model; as a bare
# expression its value is only displayed when run as a notebook cell
mean_squared_error(y_test, pred)

# In[147]:

# Root mean squared error, with y_test reshaped into a single column
rmse = np.sqrt(mean_squared_error(np.array(y_test).reshape(-1, 1), pred))
rmse

# In[148]:

r2_score(y_test, pred)

# In[149]:

from sklearn.tree import DecisionTreeRegressor

# Decision tree limited to at most 10 leaf nodes
model = DecisionTreeRegressor(max_leaf_nodes=10)
DecisionR = model.fit(x_train, y_train)
DecisionR

# In[150]:

from sklearn.metrics import mean_squared_error, r2_score

# From here on `pred` holds the decision tree's test predictions
pred = DecisionR.predict(x_test)
mean_squared_error(y_test, pred)

# In[151]:

r2_score(y_test, pred)

# In[152]:
    def test_predict_proba(self):
        """Check ``DecisionTree.predict_proba`` against scikit-learn.

        Classification scenarios compare the probabilities of ``DecisionTree``
        with those of ``DecisionTreeClassifier`` built from the same
        arguments. Regression scenarios require ``predict_proba`` to raise
        ``NotImplementedError`` with the expected message; a call that
        returns normally now fails the test instead of passing silently.

        Each data set is used both as an ``sklearn.Bunch`` and as a pandas
        frame, always with the first 120 rows for training and the rest for
        testing.
        """
        regression_error = 'The \'predict_proba\' method is not implemented for regression problems (this is an scikit-learn issue, not an alexandria issue!)'

        def split_bunch(data):
            # sklearn.Bunch input: first 120 rows train, remainder test
            return data.data[:120], data.target[:120], data.data[120:]

        def split_frame(data):
            # pandas input: every column except 'target' is a feature
            frame = data.frame
            X = frame.loc[:, frame.columns != 'target']
            y = frame['target']
            return X.iloc[:120], y.iloc[:120], X.iloc[120:]

        def check_classification(X_train, y_train, X_test, default_args):
            # All '2' variables are the baseline test and what we should match up with
            dt1 = DecisionTree(default_args=default_args)
            dt2 = DecisionTreeClassifier(**default_args)

            dt1.train(X_train, y_train, exp_type='classification')
            dt2.fit(X_train, y_train)

            preds1 = dt1.predict_proba(X_test)
            preds2 = dt2.predict_proba(X_test)

            self.assertTrue((preds1 == preds2).all())

        def check_regression(X_train, y_train, X_test, default_args):
            dt1 = DecisionTree(default_args=default_args)
            dt2 = DecisionTreeRegressor(**default_args)

            dt1.train(X_train, y_train, exp_type='regression')
            # The scikit-learn baseline is still fitted, as in the other
            # scenarios, although it has no predict_proba to compare against.
            dt2.fit(X_train, y_train)

            with self.assertRaises(NotImplementedError) as ctx:
                dt1.predict_proba(X_test)
            self.assertEqual(str(ctx.exception), regression_error)

        # sklearn.Bunch objects
        check_classification(*split_bunch(load_iris()),
                             default_args={'random_state': 19})
        check_regression(*split_bunch(load_boston()),
                         default_args={'random_state': 30})
        check_regression(*split_bunch(load_diabetes()),
                         default_args={'random_state': 15})
        check_classification(*split_bunch(load_wine()),
                             default_args={'random_state': 90, 'max_depth': 3})

        # pandas objects
        check_classification(*split_frame(load_iris(as_frame=True)),
                             default_args={'random_state': 19,
                                           'max_features': 'auto'})
        check_regression(*split_frame(load_diabetes(as_frame=True)),
                         default_args={'random_state': 19,
                                       'min_samples_split': 4})
        check_classification(*split_frame(load_wine(as_frame=True)),
                             default_args={'random_state': 19,
                                           'criterion': 'entropy'})
Exemplo n.º 36
0
 # Model 2: polynomial regression on the second data set
 X_poly = poly_reg.fit_transform(X2_train)
 lin_reg_2 = LinearRegression()
 lin_reg_2.fit(X_poly, y2_train)
 # Only transform the test features; the transformer was already fitted on
 # the training features above and must not be refitted on test data.
 y2_pred = lin_reg_2.predict(poly_reg.transform(X2_test))
 from sklearn.metrics import r2_score
 # Score model 2 on its own test split. The original scored y1_test/y1_pred
 # here, which repeated model 1's score instead of evaluating this model.
 r2 = r2_score(y2_test, y2_pred)
 print(r2)

 # Model 3: decision tree regression; last column of df3 is the target
 X3 = df3.iloc[:, :-1].values
 y3 = df3.iloc[:, -1].values
 from sklearn.model_selection import train_test_split
 X3_train, X3_test, y3_train, y3_test = train_test_split(X3,
                                                         y3,
                                                         test_size=0.2,
                                                         random_state=0)
 from sklearn.tree import DecisionTreeRegressor
 regressor3 = DecisionTreeRegressor(random_state=0)
 regressor3.fit(X3_train, y3_train)
 y3_pred = regressor3.predict(X3_test)
 from sklearn.metrics import r2_score
 r3 = r2_score(y3_test, y3_pred)
 print(r3)

 # Model 4: random forest regression; last column of df4 is the target
 X4 = df4.iloc[:, :-1].values
 y4 = df4.iloc[:, -1].values
 from sklearn.model_selection import train_test_split
 X4_train, X4_test, y4_train, y4_test = train_test_split(X4,
                                                         y4,
                                                         test_size=0.2,
                                                         random_state=0)
 from sklearn.ensemble import RandomForestRegressor
 regressor4 = RandomForestRegressor(n_estimators=10, random_state=0)
 regressor4.fit(X4_train, y4_train)
Exemplo n.º 37
0
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# Importing in the data
boston = load_boston()
y = boston.target
x = boston.data
# Splitting the data into train and test sets (33% held out, fixed seed)
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=42)
# Instantiating the models with default values
linear_model = LinearRegression()
rand_forest = RandomForestRegressor()
adaboost_model = AdaBoostRegressor()
decision_model = DecisionTreeRegressor()

# Fit every model on the training split
linear_model.fit(x_train, y_train)
rand_forest.fit(x_train, y_train)
adaboost_model.fit(x_train, y_train)
decision_model.fit(x_train, y_train)

# Predict on the held-out test split with each model
preds_rf = rand_forest.predict(x_test)
preds_linear = linear_model.predict(x_test)
preds_ada = adaboost_model.predict(x_test)
preds_dt = decision_model.predict(x_test)

# Report R^2, MSE and MAE for the random forest only; the other models'
# predictions are computed above but not scored in this part of the script
print(r2_score(y_test, preds_rf))
print(mean_squared_error(y_test, preds_rf))
print(mean_absolute_error(y_test, preds_rf))
Exemplo n.º 38
0
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

# Load the car listings; the first row of the CSV is the header
dataset = pd.read_csv("true_car_listings.csv", header=0)

# Target is the Price column; features are Year, Mileage and the one-hot
# encoded State and Make columns
y_dataset = dataset[['Price']]
dataset = dataset[["Year", "Mileage", "State", "Make"]]
dataset = pd.get_dummies(dataset, columns=["State", "Make"])

# 80/20 split; random_state=None means the split is not reproducible
X_train, X_test, y_train, y_test = train_test_split(dataset,
                                                    y_dataset.values.ravel(),
                                                    test_size=0.20,
                                                    random_state=None)
warnings.filterwarnings("ignore", category=DeprecationWarning)

dt = DecisionTreeRegressor()
# dt = reg = LinearRegression()
# dt = RandomForestRegressor(max_depth=2, n_estimators=100)

print("Treinamento")
print(dataset.shape)
# fit() returns the estimator itself, so dt_fit and dt are the same object
dt_fit = dt.fit(X_train, y_train)

# 10-fold cross-validation on the training split; with
# "neg_mean_squared_error" the scores are negated MSE values
dt_scores = cross_val_score(dt_fit,
                            X_train,
                            y_train,
                            cv=10,
                            scoring="neg_mean_squared_error")
dt_predict = dt.predict(X_test)
print("Media cross validation score: {}".format(np.mean(dt_scores)))
print("RMSE Score: ", sqrt(mean_squared_error(y_test, dt_predict)))
Exemplo n.º 39
0
#Feature Selection
# Runs only when the feat_select flag equals 1
if feat_select==1:
    '''Three steps:
       1) Run Feature Selection
       2) Get lists of selected and non-selected features
       3) Filter columns from original dataset
       '''

    print('--FEATURE SELECTION ON--', '\n')

    ##1) Run Feature Selection #######
    #Wrapper Select via model

    if fs_type==2:
        rgr = DecisionTreeRegressor(criterion='friedman_mse', splitter='best', max_depth=None, min_samples_split=3, min_samples_leaf=1, max_features=None, random_state=rand_st)
        # Keep features whose importance is at least the mean importance
        sel = SelectFromModel(rgr, prefit=False, threshold='mean', max_features=None)
        print ('Wrapper Select: ')

        fit_mod=sel.fit(data_np, target_np)
        # Boolean mask with one entry per feature column
        sel_idx=fit_mod.get_support()


    ##2) Get lists of selected and non-selected features (names and indexes) #######
    # NOTE(review): in the visible code sel_idx is only assigned when
    # fs_type==2; confirm other fs_type values are handled elsewhere.
    temp=[]
    temp_idx=[]
    temp_del=[]
    for i in range(len(data_np[0])):
        if sel_idx[i]==1:                                                           #Selected Features get added to temp header
            # header is offset by feat_start relative to the columns of data_np
            temp.append(header[i+feat_start])
            temp_idx.append(i)
Exemplo n.º 40
0
# In[2]:

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import AdaBoostRegressor
import numpy as np
import matplotlib.pyplot as plt
# Create a random dataset: 80 sorted points in [0, 5) with y = sin(x), and
# noise added to every 5th target
rng = np.random.RandomState(1)
X = np.sort(5 * rng.rand(80, 1), axis=0)
y = np.sin(X).ravel()
y[::5] += 3 * (0.5 - rng.rand(16))

# Fit regression model
from sklearn.tree import DecisionTreeRegressor
# The same depth-5 tree is the base estimator for both boosted ensembles,
# which differ only in the number of estimators (10 vs 20)
clf_b = DecisionTreeRegressor(max_depth=5)
clf_1 = AdaBoostRegressor(base_estimator=clf_b,
                          n_estimators=10,
                          random_state=0).fit(X, y)
clf_2 = AdaBoostRegressor(base_estimator=clf_b,
                          n_estimators=20,
                          random_state=0).fit(X, y)
# Predict on a dense grid over [0, 5)
X_test = np.arange(0.0, 5.0, 0.01)[:, np.newaxis]
y_1 = clf_1.predict(X_test)
y_2 = clf_2.predict(X_test)

# Training points as black dots, one curve per ensemble
plt.plot(X, y, 'o', color='black')
plt.plot(X_test, y_1, label='10 estimators')
plt.plot(X_test, y_2, label='20 estimators')
# Plot the results
# NOTE(review): the original comment here was garbled and the remaining
# plotting code (labels, legend, show) is missing from this snippet.
Exemplo n.º 41
0
# Load the Data Set from the csv file
dataSet = pd.read_csv('groupStudy.csv')

# In[5]:

# Bare expression: displayed only when run as a notebook cell
dataSet

# In[6]:

# selecting the columns to be used: study hours as the feature and marks as
# the target
hours = dataSet['Hours of study']
marks = dataSet['Marks scored']

# In[7]:

# reshaping both series into single-column 2-D arrays
X = np.array(hours).reshape(-1, 1)
y = np.array(marks).reshape(-1, 1)

# In[8]:

from sklearn.tree import DecisionTreeRegressor

# Fit an unconstrained decision tree on the whole data set and predict the
# marks for a single sample with feature value 5
regressor = DecisionTreeRegressor()
regressor.fit(X, y)
y_predicted = regressor.predict(np.array(5).reshape(-1, 1))

# In[9]:

y_predicted
Exemplo n.º 42
0
def predict_dtr_plot(ticker, x, y, x_train, y_train, days_predict, filePath):
    """Fit a decision tree, predict ``days_predict`` future days and plot them.

    A ``DecisionTreeRegressor`` is fitted on ``x_train``/``y_train``. The
    timestamps of the next ``days_predict`` calendar days (starting today) are
    appended to ``x`` and the model predicts over the extended input. The last
    ``days_predict`` predictions are printed, plotted against their dates and
    returned.

    Args:
        ticker: Label used as plot title and image file name.
        x: Feature values to predict for. A 1-D array or a single-column 2-D
            array is extended with the future timestamps; inputs with several
            feature columns are used unchanged because the other columns
            cannot be synthesised here.
        y: Unused; kept so existing callers keep working.
        x_train: Training features.
        y_train: Training targets.
        days_predict: Number of days, starting today, to predict.
        filePath: Directory for the exported plot image; falsy skips export.

    Returns:
        Tuple ``(predictions, predict_timestamp_list)``: the last
        ``days_predict`` predictions and the matching ``YYYY-MM-DD`` strings.
    """
    # Dates to predict for, starting today
    base = date.today()
    future_days = [base + timedelta(days=offset) for offset in range(days_predict)]
    # Used to display the date of each prediction to the user
    predict_timestamp_list = [str(day) for day in future_days]

    # Convert the dates to integer timestamps and append them to the features.
    # np.append returns a new array, so the result has to be kept; the
    # original discarded it and therefore never predicted the future days.
    future_stamps = [int(time.mktime(day.timetuple())) for day in future_days]
    features = x
    if future_stamps:
        as_array = np.asarray(x)
        if as_array.ndim == 1:
            features = np.append(as_array, future_stamps)
        elif as_array.ndim == 2 and as_array.shape[1] == 1:
            features = np.vstack([as_array, np.reshape(future_stamps, (-1, 1))])

    model = DecisionTreeRegressor()
    model.fit(x_train, y_train)
    predictions = model.predict(features)

    total = len(predictions)
    print(total)

    # Only the trailing days_predict predictions belong to the listed dates
    tail_start = max(total - days_predict, 0)
    for label, value in zip(predict_timestamp_list, predictions[tail_start:]):
        print(f'Prediction ({label}) = ' + str(value))

    # Final step - create and show the graph
    plt.cla()  # Clear old plot
    plt.clf()
    # Keep exactly the predictions that pair with predict_timestamp_list
    # (previously hard-coded to the last 90, which only matched the date list
    # when days_predict was 90).
    predictions = predictions[tail_start:]
    plt.figure(figsize=(20, 5))
    plt.plot(predict_timestamp_list, predictions)
    plt.title(str(ticker))
    plt.ylabel('Price', fontsize=12)

    # Slice notation for the ticks: every 10th value, a[start:end:step]
    plt.yticks(predictions[::10])
    plt.xticks(predict_timestamp_list[::10])
    plt.grid(True)

    if filePath:
        # Export is best-effort: a failure is reported but does not abort
        plot_path = f"{filePath}/{ticker}.png"
        try:
            plt.savefig(plot_path)
        except Exception as err:
            print(f"There was an error exporting the plot image for {ticker}: {err}")
        else:
            # Report the path that was actually written
            print(f"Plot image is located at: {plot_path}")
    plt.show()

    return predictions, predict_timestamp_list
    def test_eval(self):
        """Check ``DecisionTree.eval`` metrics against scikit-learn baselines.

        Covers classification and regression, each with ``sklearn.Bunch`` and
        pandas input. Metrics reported by ``DecisionTree`` must equal the
        scikit-learn metric computed from a baseline tree built with the same
        arguments, and requesting a metric that does not fit the problem type
        must raise ``ValueError`` with the expected message.

        The error scenarios use ``assertRaises``; the earlier ``fail(self)``
        guard was not a unittest call.
        """

        def split_bunch(data):
            # First 120 rows train, the rest test; same order as
            # train_test_split: X_train, X_test, y_train, y_test
            return (data.data[:120], data.data[120:],
                    data.target[:120], data.target[120:])

        def split_frame(frame):
            # Every column but the last is a feature
            data_cols = frame.columns[:-1]
            target_col = 'target'
            return train_test_split(frame[data_cols],
                                    frame[target_col],
                                    train_size=0.8,
                                    random_state=0)

        def fit_pair(baseline_cls, split, exp_type, random_state):
            # Build and fit the model under test and its scikit-learn baseline
            X_train, _, y_train, _ = split
            default_args = {'random_state': random_state}
            dt1 = DecisionTree(default_args=default_args)
            dt2 = baseline_cls(**default_args)

            dt1.train(X_train, y_train, exp_type=exp_type)
            dt2.fit(X_train, y_train)
            return dt1, dt2

        def assert_eval_rejects(split, exp_type, metric, message):
            # eval() must refuse a metric that does not suit the problem type
            dt1, _ = fit_pair(DecisionTreeRegressor, split, exp_type, 30)
            _, X_test, _, y_test = split
            with self.assertRaises(ValueError) as ctx:
                dt1.eval(X_test, y_test, metrics=metric)
            self.assertEqual(str(ctx.exception), message)

        def assert_r2_matches(split):
            dt1, dt2 = fit_pair(DecisionTreeRegressor, split, 'regression', 30)
            _, X_test, _, y_test = split
            dt1.eval(X_test, y_test, metrics=['r2'])
            actual_r2 = dt1.getMetric('r2')
            preds = dt2.predict(X_test)
            expected_r2 = r2_score(y_test, preds)
            self.assertEqual(actual_r2, expected_r2)

        # Classification
        #  sklearn.Bunch
        split = split_bunch(load_iris())
        _, X_test, _, y_test = split
        dt1, dt2 = fit_pair(DecisionTreeClassifier, split, 'classification', 19)

        dt1.eval(X_test, y_test, metrics='acc')
        actual_acc = dt1.getMetric('acc')
        preds = dt2.predict(X_test)
        expected_acc = accuracy_score(y_test, preds)
        self.assertEqual(actual_acc, expected_acc)

        # Error if r-squared is wanted in classification problem
        assert_eval_rejects(split, 'classification', 'r2',
                            'cannot use R2 metric for classification problem!')

        #   pandas.DataFrame
        split = split_frame(load_iris(as_frame=True).frame)
        _, X_test, _, y_test = split
        dt1, dt2 = fit_pair(DecisionTreeClassifier, split, 'classification', 19)

        dt1.eval(X_test, y_test, metrics=['acc', 'precision', 'recall'])
        actual_acc = dt1.getMetric('acc')
        actual_prec = dt1.getMetric('prec')
        actual_rec = dt1.getMetric('rec')
        preds = dt2.predict(X_test)
        expected_acc = accuracy_score(y_test, preds)
        expected_prec = precision_score(y_test, preds, average='weighted')
        expected_rec = recall_score(y_test, preds, average='weighted')
        self.assertEqual(actual_acc, expected_acc)
        self.assertEqual(actual_prec, expected_prec)
        self.assertEqual(actual_rec, expected_rec)

        # Error if r-squared is wanted in classification problem
        assert_eval_rejects(split, 'classification', 'r2',
                            'cannot use R2 metric for classification problem!')

        # Regression
        #  sklearn.Bunch
        split = split_bunch(load_boston())
        assert_r2_matches(split)

        # Error if accuracy, recall, etc is wanted in a regression problem
        # Recall
        assert_eval_rejects(split, 'regression', 'recall',
                            'cannot use Recall metric for regression problem!')

        # Accuracy
        assert_eval_rejects(split, 'regression', 'accuracy',
                            'cannot use Accuracy metric for regression problem!')

        # Precision
        assert_eval_rejects(split, 'regression', 'prec',
                            'cannot use Precision metric for regression problem!')

        #   pandas.DataFrame
        split = split_frame(load_diabetes(as_frame=True).frame)
        assert_r2_matches(split)
# Derive a working-time flag from the Hour column of the validation frame
X_valid["IsWorkingTime"] = X_valid.Hour.map(getWorkingorNonWorkingHoursOfDay)

# Store the model's predictions on the validation frame.
# NOTE(review): the prediction input includes the new "IsWorkingTime" column;
# confirm the model was trained with that feature.
X_valid["Prediction"] = model.predict(X_valid)

showPredictionValidation(y_train, y_test, X_test, X_valid, df_result)

# Error metrics on the test predictions. `mae`, `mse` and `r2` are defined
# outside this snippet — presumably MAE, MSE and R^2; verify at the import.
print(mae(y_test, y_pred))

print(mse(y_test, y_pred))

print(r2(y_test, y_pred))

# =============================================================================
# Candidate regressors as (short name, estimator) pairs, in evaluation order.
pipelines = [
    ('DSTR', DecisionTreeRegressor()),
    ('GBM', GradientBoostingRegressor()),
    ('RDMF', RandomForestRegressor()),
    ('ADAB', AdaBoostRegressor()),
    ('ETR', ExtraTreesRegressor()),
    ('BAGR', BaggingRegressor()),
    ('KNNR', KNeighborsRegressor(n_neighbors=7)),
    # Currently disabled candidates:
    # ('LR', LinearRegression()),
    # ('Ridge', Ridge()),
    # ('Lasso', Lasso()),
    # ('SVR', SVR()),
]

## =============================================================================


def apply_loocv(X_train, y_train, X_test, y_test):
    def test_predict(self):
        """Check that ``DecisionTree.predict`` matches scikit-learn.

        Every scenario trains ``DecisionTree`` and the corresponding
        scikit-learn tree with the same arguments on the first 120 rows of a
        data set and requires identical predictions on the remaining rows.
        Scenarios run in order, first with ``sklearn.Bunch`` input and then
        with pandas input.
        """
        # scikit-learn baseline for each experiment type
        baselines = {
            'classification': DecisionTreeClassifier,
            'regression': DecisionTreeRegressor,
        }

        # (loader, use pandas frame?, constructor arguments, experiment type)
        scenarios = [
            (load_iris, False, {'random_state': 19}, 'classification'),
            (load_boston, False, {'random_state': 30}, 'regression'),
            (load_diabetes, False, {'random_state': 15}, 'regression'),
            (load_wine, False, {'random_state': 90, 'max_depth': 3},
             'classification'),
            (load_iris, True, {'random_state': 19, 'max_features': 'auto'},
             'classification'),
            (load_diabetes, True,
             {'random_state': 19, 'min_samples_split': 4}, 'regression'),
            (load_wine, True, {'random_state': 19, 'criterion': 'gini'},
             'classification'),
        ]

        for loader, use_frame, default_args, exp_type in scenarios:
            # Data is loaded lazily so scenarios run strictly one after another
            if use_frame:
                frame = loader(as_frame=True).frame
                features = frame.loc[:, frame.columns != 'target']
                target = frame['target']
                X_train, y_train = features.iloc[:120], target.iloc[:120]
                X_test = features.iloc[120:]
            else:
                bunch = loader()
                X_train, y_train = bunch.data[:120], bunch.target[:120]
                X_test = bunch.data[120:]

            # The scikit-learn model is the baseline we should match up with
            candidate = DecisionTree(default_args=default_args)
            baseline = baselines[exp_type](**default_args)

            candidate.train(X_train, y_train, exp_type=exp_type)
            baseline.fit(X_train, y_train)

            candidate_preds = candidate.predict(X_test)
            baseline_preds = baseline.predict(X_test)

            self.assertTrue((candidate_preds == baseline_preds).all())
Exemplo n.º 46
0
# generate the data
import matplotlib.pyplot as plt
import random
import pandas
# Ask the user for the maximum depth of the regression tree.
depth  = int(input('Choose a number for tree depth: '))

# 50 x-values drawn uniformly from [0, 10); targets follow y = 2x - 1
# plus uniform noise in [0, 1). Both are single-column DataFrames.
x = pandas.DataFrame([10 * random.random() for __ in range(50)])
y = 2 * x - 1 + pandas.DataFrame([random.random() for __ in range(50)])

# pick model: a regression tree limited to the user-chosen depth
from sklearn.tree import DecisionTreeRegressor
model = DecisionTreeRegressor(max_depth=depth)
model.fit(x, y)

# plot the model together with the data
# Predictions are evaluated on the integer grid -1..11, which extends
# one unit beyond the [0, 10) range the training x-values were drawn from.
xfit = pandas.DataFrame([i for i in range(-1, 12)])
yfit = model.predict(xfit)
plt.scatter(x, y)
plt.plot(xfit, yfit)
plt.show()

# compute the R^2 score
# (evaluated on the same data the model was trained on, i.e. in-sample)
print("R^2 score: {}".format(model.score(x,y)))


# Ridge regression whose regularisation strength is picked by 5-fold
# cross-validation from the candidate alphas listed below.
rm = RidgeCV(cv=5, alphas=[0.1, 1.0, 3, 10, 30, 100, 300, 1000, 3000])
rm.fit(X, y)
# Scored on the same data it was fitted on, so this is an in-sample score.
score = rm.score(X, y)
print score, rm.alpha_

print 'train / test score'
# Refit on the training split only and score on the held-out test split.
rm.fit(X_train, y_train)
score = rm.score(X_test, y_test)
print score, rm.alpha_

printProgress()
# ======================================================================================
print banner
print '7. Performing Decision Tree Regressor'
from sklearn.tree import DecisionTreeRegressor
# Regression tree capped at depth 10; a node needs at least 20 samples to be split.
dtc = DecisionTreeRegressor(max_depth=10, min_samples_split=20)
dtc.fit(X, y)
# In-sample score (fitted and scored on the full X, y).
mscore = dtc.score(X, y)
print mscore

print 'test train score'
# Same model refitted on the training split and scored on the test split.
dtc.fit(X_train, y_train)
mscore = dtc.score(X_test, y_test)
print mscore

printProgress()
# ======================================================================================
print banner
print '8. prepping the word data'
cvec = CountVectorizer(stop_words='english',
                       lowercase=True,
Exemplo n.º 48
0
    X = df[cols]

    X = np.array(X)

    y = np.array(y)

    # Define classifiers to try: (clf, name) pairs

    classifiers = [
        (LinearRegression(n_jobs=-1), 'LinearRegression'),
        (RandomForestRegressor(n_estimators=100, n_jobs=-1,
                               random_state=0), "RandomForest"),
        (GradientBoostingRegressor(n_estimators=100,
                                   random_state=0), "GradientBoost"),
        (ExtraTreesRegressor(n_estimators=100, random_state=0), "ExtraTrees"),
        (DecisionTreeRegressor(random_state=0), "DecisionTrees"),
        (BaggingRegressor(n_estimators=100, n_jobs=-1,
                          random_state=0), "Bagging"),
        (AdaBoostRegressor(n_estimators=100, random_state=0), "AdaBoost")
        # ,
        #            (XGBRegressor(n_estimators=100, n_jobs=-1, randomstate=0), "XGBoost")
    ]

    ######## SQUID Prediction

    # Store all ROC curves here:
    squid_rocs = []

    for clf, name in classifiers:
        print("Evaluating %s classifier (squid)" % name)
        mae, r2 = cross_validate_and_plot(clf, X, y, cols, name + "_squid",
Exemplo n.º 49
0
start: float = time.time()

for j, i in enumerate(x_axis):

    [X_train, X_test, y_train,
     y_test] = train_test_split(X,
                                y,
                                test_size=.2,
                                random_state=randint(0, 1000))

    tree_aux: DecisionTreeRegressor = DecisionTreeRegressor(
        criterion='squared_error',
        splitter='best',
        max_depth=2,
        min_samples_split=1 * 1e-3,
        min_samples_leaf=100 * 1e-3,
        min_weight_fraction_leaf=0,
        max_features='auto',
        random_state=randint(0, 1000),
        max_leaf_nodes=None,
        min_impurity_decrease=0,
        ccp_alpha=0)
    model: AdaBoostRegressor = AdaBoostRegressor(
        base_estimator=tree_aux,
        n_estimators=30,
        learning_rate=100 * 1e-3,
        loss='square',
        random_state=randint(0, 1000)).fit(X_train, y_train)

    if cofs is None:
        cofs = model.feature_importances_
    else:
Exemplo n.º 50
0
# Project X onto the principal components and report the share of variance
# explained by each one.
principalComp = pca.fit_transform(X)
print(pca.explained_variance_ratio_)
principalDf = pd.DataFrame(data=principalComp,
                           columns=['c1', 'c2', 'c3', 'c4'])
print(principalDf.head())

kf = KFold(20)

Xs_array = Xs.values
Y_array = Y.values
# NOTE: every iteration overwrites the previous split, so after the loop only
# the LAST of the 20 folds is kept as the train/test partition used below.
for a, b in kf.split(Xs_array):
    X_train, X_test = Xs_array[a], Xs_array[b]
    y_train, y_test = Y_array[a], Y_array[b]

# One instance of each regressor to compare.
lr = LinearRegression()
DT = DecisionTreeRegressor()
RF = RandomForestRegressor()
GB = GradientBoostingRegressor()
NN = MLPRegressor(hidden_layer_sizes=(100, 8), random_state=1)
inp = input("Do you want to fit the models " + asking[1])
if inp == 'y':
    model1 = lr.fit(X_train, y_train)
    model2 = DT.fit(X_train, y_train)
    model3 = RF.fit(X_train, y_train)
    model4 = GB.fit(X_train, y_train)
    # Fit the network on the training split like every other model; fitting it
    # on the full data would leak the held-out fold into training.
    model5 = NN.fit(X_train, y_train)

    # score() is evaluated on the training split and scaled to a percentage.
    print("Accuracy Score of Linear regression on train set",
          model1.score(X_train, y_train) * 100)
    print("Accuracy Score of Decision Tree on train set",
          model2.score(X_train, y_train) * 100)
# -*- coding: utf-8 -*-
"""
@author: user
"""

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Load the semicolon-separated dataset; the file has no header row.
df = pd.read_csv("decision tree regression dataset.csv", sep=";", header=None)

# First column is the feature, second the target; both are reshaped into
# column vectors because scikit-learn expects 2-D inputs.
x = df.iloc[:, 0].values.reshape(-1, 1)
y = df.iloc[:, 1].values.reshape(-1, 1)

#%%  decision tree regression
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor()  # random_state is not set; pass e.g. random_state=0 to fix it
tree_reg.fit(x, y)

# predict() needs a 2-D array of shape (n_samples, n_features);
# a bare scalar such as 5.5 is rejected.
tree_reg.predict([[5.5]])
# Dense grid over the observed feature range. x.min()/x.max() yield scalars,
# whereas the builtin min()/max() on a 2-D array return 1-element arrays.
x_ = np.arange(x.min(), x.max(), 0.01).reshape(-1, 1)
y_head = tree_reg.predict(x_)
# %% visualize
plt.scatter(x, y, color="red")
plt.plot(x_, y_head, color="green")
plt.xlabel("tribun level")
plt.ylabel("ucret")
plt.show()
Exemplo n.º 52
0
# Drop every row that contains at least one missing value.
filtered_melbourne_data = melbourne_data.dropna(axis=0)
# Choose target and predictors
y = filtered_melbourne_data.Price
# Column names are used verbatim as spelled in the data set.
melbourne_predictors = ['Rooms', 'Bathroom', 'Landsize', 'BuildingArea', 
                        'YearBuilt', 'Lattitude', 'Longtitude']
X = filtered_melbourne_data[melbourne_predictors]


# We then create the Decision tree model with this code:

# In[ ]:


from sklearn.tree import DecisionTreeRegressor
# Define model (default hyper-parameters, no random_state fixed)
melbourne_model = DecisionTreeRegressor()
# Fit model
melbourne_model.fit(X, y)


# The calculation of mean absolute error in the Melbourne data is

# In[ ]:


from sklearn.metrics import mean_absolute_error

# In-sample error: predictions are made on the same X the model was fitted on.
predicted_home_prices = melbourne_model.predict(X)
# The result is not stored or printed; it is only visible as notebook cell output.
mean_absolute_error(y, predicted_home_prices)

Exemplo n.º 53
0
mse_bins_store = []

# Monte Carlo cross validation (MCCV) loop
for rrr in range(50):
    # Resample validation set (uniform distribution)
    train_indices, test_indices = resreg.uniform_test_split(X,
                                                            y,
                                                            bins=bins,
                                                            bin_test_size=70,
                                                            verbose=False,
                                                            random_state=rrr)
    X_train, y_train = X[train_indices, :], y[train_indices]
    X_test, y_test = X[test_indices, :], y[test_indices]

    # Unpack hyperparameters, resample training data, and fit regressors
    reg = DecisionTreeRegressor(random_state=rrr) if 'REBAGG' in strategy else \
              RandomForestRegressor(n_estimators=10, n_jobs=-1, random_state=rrr)

    if strategy == 'RO':
        cl, ch, sample_method = param
        relevance = resreg.sigmoid_relevance(y_train, cl=cl, ch=ch)
        X_train, y_train = resreg.random_oversample(X_train,
                                                    y_train,
                                                    relevance,
                                                    relevance_threshold=0.5,
                                                    over=sample_method,
                                                    random_state=rrr)
        reg.fit(X_train, y_train)

    elif strategy == 'SMOTER':
        cl, ch, sample_method, k = param
Exemplo n.º 54
0
from scipy.sparse import csr_matrix
# from polylearn import PolynomialNetworkRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import BaggingRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor

from math import sqrt

# Candidate regressors, instantiated once at module import time.
regr = LinearSVR(random_state=0)
svr_lin = SVR(kernel='linear', C=1e3)
fm = pylibfm.FM()
knr = KNeighborsRegressor(n_neighbors=10)  # the estimator validate() cross-validates
dr = DummyRegressor()
bagrgr = BaggingRegressor()
dtreergr = DecisionTreeRegressor()
adabregr = AdaBoostRegressor()
gradbregr = GradientBoostingRegressor()


def validate(X, y):
    """Cross-validate the module-level ``knr`` regressor on ``(X, y)``.

    Runs 3-fold cross-validation scored with negated mean squared error and
    returns the per-fold scores produced by ``cross_val_score``.
    """
    # Parenthesised single-argument form prints identically on Python 2 and 3.
    print("Starting cross validation")
    scores = cross_val_score(knr, X, y, scoring='neg_mean_squared_error', cv=3)
    return scores


if __name__ == "__main__":
    import music

    train_examples = music.load_examples('data/train.pkl')
    # poly = PolynomialNetworkRegressor(degreex=3, n_components=2, tol=1e-3, warm_start=True, random_state=0)
# Training split: the first 5395 rows of the unigram matrix and of the targets.
trainTargetFinal = traintarget[:5395]
trainDataFinal = unigrams[:5395, :]

# Test split: every remaining row. The features are copied into a freshly
# allocated float array of fixed shape (2847 rows x 25559 columns).
print("working on 2nd unigrams..")
unigramsAndFeaturesTest = np.zeros((2847, 25559))
unigramsAndFeaturesTest[:, :] = unigrams[5395:, :]

testTarget = traintarget[5395:]
testData = unigramsAndFeaturesTest

# Impute missing values, then fit a depth-5 regression tree; report test MSE.
estimator = Pipeline([
    ("imputer", Imputer()),
    ("treeReg", DecisionTreeRegressor(max_depth=5)),
])
estimator.fit(trainDataFinal, trainTargetFinal)
predicted = estimator.predict(testData)
mseScore = mean_squared_error(testTarget, predicted)
print("mseScore: " + str(mseScore))

# Bucket the numeric test targets into three labels for the F-score step:
# below -1 -> 'disagree', above 1 -> 'agree', anything else -> 'neutral'.
testTargetCategorical = []
for val in testTarget:
    label = 'neutral'
    if val < -1:
        label = 'disagree'
    elif val > 1:
        label = 'agree'
    testTargetCategorical.append(label)
Exemplo n.º 56
0
#Preparing result
# Put the true and the predicted targets side by side in one DataFrame.
yTest = pd.DataFrame(ytest)

yPred = pd.DataFrame(ypred)

# Give both frames the same row labels so concat pairs the rows one-to-one.
yTest.index = yPred.index

result = pd.concat((yTest, yPred), axis=1)

##################################################################################################

#2.Fitting DECISION TREE REGRESSION
from sklearn.tree import DecisionTreeRegressor

DT_regressor = DecisionTreeRegressor()

DT_regressor.fit(xtrain, ytrain)

# Training-set score; the return value is neither stored nor printed.
DT_regressor.score(xtrain, ytrain)

DT_ypred = DT_regressor.predict(xtest)

#Converting back to original from feature scale
# NOTE(review): ytest is overwritten with its inverse-transformed values here.
# If an earlier section already did this, the inverse transform is applied
# twice - confirm against the code above this fragment.
ytest = scalery.inverse_transform(ytest)

# NOTE(review): DT_ypred comes straight from predict(); presumably scalery
# expects a 2-D array - verify the shape it is given here.
ypred = scalery.inverse_transform(DT_ypred)

#Preparing result
yTest = pd.DataFrame(ytest)
Exemplo n.º 57
0
# Use the sale price as the target variable.
train_y = train_file.SalePrice
#test_y=test_file.SalePrice
# Feature selection: a small hand-picked subset of columns, chosen to practise
# how a decision tree model works.
features = [
    'LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr',
    'TotRmsAbvGrd'
]
# The same feature columns are taken from both the training and the test file.
train_X = train_file[features]
test_X = test_file[features]
#test_file.columns

# In[22]:

# Decision tree model; random_state=1 is fixed for the estimator.
model = DecisionTreeRegressor(random_state=1)
model.fit(train_X, train_y)

# In[25]:

# Predict sale prices for the test rows.
test_predictions = model.predict(test_X)
# Bare expression: only displayed as notebook cell output, no effect in a script.
test_predictions

# In[29]:

# Build a DataFrame pairing each test Id with its predicted sale price.
output = pd.DataFrame({'Id': test_file.Id, 'SalePrice': test_predictions})

# In[31]:
Exemplo n.º 58
0
# Decision Tree Regression

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
# X: every column except the first and the last, kept 2-D; y: the last column.
dataset = pd.read_csv('../../data/Position_Salaries.csv')
X = dataset.iloc[:, 1:-1].values
y = dataset.iloc[:, -1].values

# Training the Decision Tree Regression model on the whole dataset
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(X, y)

# Predicting a new result
# (the return value is not stored or printed)
regressor.predict([[6.5]])

# Visualising the Decision Tree Regression results (higher resolution)
# X.min()/X.max() give scalar bounds; the builtin min()/max() on the 2-D X
# return 1-element arrays, which NumPy deprecates as arange() bounds.
X_grid = np.arange(X.min(), X.max(), 0.01)
X_grid = X_grid.reshape((len(X_grid), 1))
plt.scatter(X, y, color='red')
plt.plot(X_grid, regressor.predict(X_grid), color='blue')
plt.title('Truth or Bluff (Decision Tree Regression)')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()
Exemplo n.º 59
0
# Draw 4 random rows from X1 and run them through the previously built clsModel.
newData1 = X1.sample(4)
newData1
clsModel.predict(newData1)

#%% Regression Tree
#regression
# Predict mpg (a numerical value) from am, hp and wt.
X2 = df[['am','hp','wt']]
Y2 = df[['mpg']]
# Bare expressions such as the next line only show a value in an interactive session.
np.mean(Y2)
from sklearn.tree import DecisionTreeRegressor
X2.shape
# 80/20 train/test split; no random_state is given, so the split differs between runs.
X2_train, X2_test, Y2_train, Y2_test = train_test_split(X2, Y2, test_size=.20)
X2_train.shape
X2_test.shape
regrModel = DecisionTreeRegressor()  # default hyper-parameters
regrModel.fit(X2_train, Y2_train)

#visualise
# Text dump of the fitted tree, followed by a plot of the full tree.
text_representation = tree.export_text(regrModel)
print(text_representation)
fnames = ['am','hp','wt']
fig = plt.figure(figsize=(40,30))
tree.plot_tree(regrModel, feature_names=fnames, filled=True)
plt.show();

# Second plot: same tree, drawn only down to depth 2, with node ids shown.
fig = plt.figure(figsize=(20,10))
tree.plot_tree(regrModel, feature_names=['am','hp','wt'], filled=True, max_depth=2, fontsize=20, node_ids=True)
plt.show();

# Mean mpg of the training rows with hp <= 92
# (presumably a manual check of one node's value in the plotted tree - confirm).
Y2_train[X2_train['hp'] <= 92].aggregate({'mpg':np.mean})
Exemplo n.º 60
0
# %%

# Each model below goes through the same four helper calls:
# fittingModel -> predictValues -> validatingResults -> displayResults.
# The helpers are defined elsewhere; presumably they fit the estimator, predict
# on both splits, compute metrics and show the results - confirm against them.

# Elastic Net
regressor = ElasticNet()
# %%
regressor = fittingModel(regressor, X_train, y_train)
# %%
pred_train, pred_test = predictValues(regressor, X_train, X_test)
# %%
validatingResults(pred_train, pred_test, y_train, y_test)
# %%
displayResults(y_test, pred_test)
# %%

# Decision Tree (random_state fixed at 0)
regressor = DecisionTreeRegressor(random_state=0)
# %%
regressor = fittingModel(regressor, X_train, y_train)
# %%
pred_train, pred_test = predictValues(regressor, X_train, X_test)
# %%
validatingResults(pred_train, pred_test, y_train, y_test)
# %%
displayResults(y_test, pred_test)

# %%

# Random forest with 30 trees (random_state fixed at 0)
regressor = RandomForestRegressor(n_estimators=30, random_state=0)
# %%
regressor = fittingModel(regressor, X_train, y_train)