def Decisiontreeregressor(self):
        """Fit a default decision tree, then plot and export its summed predictions.

        Trains ``DecisionTreeRegressor()`` on ``self.x_train`` / ``self.y_train``,
        predicts for ``self.x_test``, sums the predicted ``num_orders`` per
        ``week`` (or per ``date`` when there is no ``week`` column), draws the
        result in red over ``self.ts_tot_orders`` in blue, hands the summed
        frame to ``ideaLib.py2idea`` under the database name
        ``'ts_tot_pred_dt'``, and packs the new canvas into ``self.ml``.

        Reads ``self.is_canvas_ml`` to decide whether a previous canvas has to
        be hidden first, and sets it to 1 once the new canvas is in place.
        ``client`` is a name from the enclosing module that is not defined in
        this method.
        """
        dt = DecisionTreeRegressor()
        dt.fit(self.x_train, self.y_train)

        # reset_index() gives the frame a fresh positional index (the old index
        # becomes a regular column), so the prediction array lines up row by row.
        predictions = self.x_test.reset_index()
        predictions['num_orders'] = dt.predict(self.x_test)
        print("predictions\n", predictions)

        # Aggregate on whichever time column is present. Both cases now yield a
        # one-column DataFrame; previously the 'date' case left a bare Series
        # even though the result is passed on as `dataframe=` below.
        time_column = 'week' if 'week' in predictions else 'date'
        ts_tot_pred = (predictions.groupby([time_column])['num_orders']
                       .sum()
                       .to_frame())
        print("ts_tot_pred\n", ts_tot_pred)

        # Hide the canvas left over from an earlier run before drawing anew.
        if self.is_canvas_ml == 1:
            self.canvas.get_tk_widget().pack_forget()

        fig = Figure(figsize=(5, 5), dpi=100)
        # Create the axes once and draw both series on it; asking the figure
        # for subplot 111 a second time is not guaranteed to return the same
        # axes across Matplotlib versions.
        axes = fig.add_subplot(111)
        axes.plot(self.ts_tot_orders, color='Blue')
        axes.plot(ts_tot_pred, color='Red')
        ideaLib.py2idea(dataframe=ts_tot_pred,
                        databaseName='ts_tot_pred_dt',
                        client=client)

        self.canvas = FigureCanvasTkAgg(fig,
                                        master=self.ml)  # A tk.DrawingArea.
        self.canvas.get_tk_widget().pack(side=RIGHT)
        self.canvas.draw()
        self.is_canvas_ml = 1
    def FitDecisionTree(self,
                        train_predictors,
                        test_predictors,
                        train_target,
                        test_target,
                        params=None):
        """Fit a decision tree regressor and evaluate it on the test split.

        Args:
            train_predictors: Feature data passed to ``fit``.
            test_predictors: Feature data forwarded to ``self.evaluateModel``.
            train_target: Training target; its ``.values.ravel()`` is passed
                to ``fit``.
            test_target: Test target forwarded to ``self.evaluateModel``.
            params: Optional mapping with the keys ``"max_depth"``,
                ``"max_leaf_nodes"`` and ``"min_samples_leaf"``. When it is
                ``None`` or empty the estimator keeps its defaults for those
                settings.

        Returns:
            A tuple ``([min_samples_leaf, max_depth, max_leaf_nodes, rmse],
            predictions)``. The three settings are read back from the fitted
            estimator; ``rmse`` and ``predictions`` are the two values
            returned by ``self.evaluateModel``.

        Raises:
            KeyError: If ``params`` is non-empty but lacks one of the three
                keys above.
        """
        # `params` defaults to None rather than a shared mutable {}; an empty
        # or missing mapping both select the default-parameter path.
        if params:
            print("Fitting with max_depth = {}, max_leaf_nodes = {}, "
                  "min_samples_leaf = {} ...".format(
                      params["max_depth"],
                      params["max_leaf_nodes"],
                      params["min_samples_leaf"]))
            dt = DecisionTreeRegressor(
                random_state=42,
                max_depth=params["max_depth"],
                max_leaf_nodes=params["max_leaf_nodes"],
                min_samples_leaf=params["min_samples_leaf"])
        else:
            print("Fitting with default parameters...")
            dt = DecisionTreeRegressor(random_state=42)

        dt_model = dt.fit(train_predictors, train_target.values.ravel())

        dt_rmse, dt_predictions = self.evaluateModel(
            model=dt_model,
            test_predictors=test_predictors,
            test_target=test_target,
            modelName='Decision Tree')

        dt_paramMap = dt_model.get_params()

        # With no explicit params, report the settings the estimator actually
        # used, in the order get_params() lists them.
        if not params:
            reported = ('min_samples_leaf', 'max_depth', 'max_leaf_nodes')
            for key in dt_paramMap:
                if key in reported:
                    print(key, dt_paramMap[key])

        return [dt_paramMap['min_samples_leaf'],
                dt_paramMap['max_depth'],
                dt_paramMap['max_leaf_nodes'],
                dt_rmse], dt_predictions
Пример #3
0
plt.title('LogReg Precision, Recall, and fbeta Curves')
sns.despine()

# Pair every feature name with its logistic-regression coefficient.
lr_coefs = list(zip(X.columns, logreg.coef_[0]))
lr_coefs_df = pd.DataFrame(lr_coefs)
# Keep only coefficients whose magnitude exceeds 0.07, largest first.
lr_top_coefs = [x for x in lr_coefs if np.abs(x[1]) > .07]
lr_top_coefs = sorted(lr_top_coefs, key=(lambda x: x[1]), reverse=True)
lr_top_coefs_df = pd.DataFrame(lr_top_coefs)

plt.barh([x[0] for x in lr_top_coefs], width=[x[1] for x in lr_top_coefs])
plt.title('LogOdds')
# Pass the flag positionally: the keyword was called `b` in older Matplotlib
# and `visible` in newer releases, so the positional form works in both.
plt.grid(False)
sns.despine()

dt = DecisionTreeClassifier(max_depth=5)
dt.fit(X_train, y_train)

# Calculate fbeta for decision tree
all_fbeta_dt, best_fbeta_dt = fbeta(dt,
                                    X_test=X_test)  # not scaled data for dt

# Calculate ROC Score and AUC for decision tree.
# Column 1 of predict_proba is taken as the score; compute it once and reuse
# it for both the ROC curve and the AUC.
dt_scores = dt.predict_proba(X_test)[:, 1]  # not scaled data for dt
fpr_dt, tpr_dt, thresholds_dt = roc_curve(y_test, dt_scores)
auc_dt = roc_auc_score(y_test, dt_scores)

# This allows us to make a decision tree real fast directly in the notebook!
dot_data = StringIO()
export_graphviz(dt,
                out_file=dot_data,
Пример #4
0
# Linear Regression model

lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_val)
# scikit-learn metrics take (y_true, y_pred). RMSE is symmetric, but R^2 is
# not, so the argument order matters for r2_score.
lr_rmse_score = np.sqrt(mean_squared_error(y_val, y_pred))
lr_r2_score = r2_score(y_val, y_pred)
print("Root Mean Squared Error :", lr_rmse_score)
print("R2Score                 :", lr_r2_score)

# In[86]:

# Decision tree

dt = DecisionTreeRegressor()
dt_model = dt.fit(X_train, y_train)
y_pred_dtone = dt_model.predict(X_val)

## calculate RMSE

rms_dt = np.sqrt(mean_squared_error(y_val, y_pred_dtone))
r2_dt = r2_score(y_val, y_pred_dtone)
print('RMSE of Decision Tree Regression:', rms_dt)
print('R-Squared value:', r2_dt)
# Adjusted R^2 for the decision tree: reuse the tree's own R^2 (the original
# recomputed it from the linear-regression predictions) and take n from the
# validation set, which is the data that R^2 was measured on.
R2 = r2_dt
n = X_val.shape[0]
p = len(X_train.columns)
Adj_r2 = 1 - (1 - R2) * (n - 1) / (n - p - 1)
print('Adjusted R-Square is : ', Adj_r2)

# In[ ]:
Пример #5
0
sc = students.join(courses, "CO_CURSO")
sci = sc.join(institutions,
              "CO_IES").drop("CO_ALUNO_SITUACAO", "CO_OCDE_AREA_GERAL",
                             "CO_UF_IES", "CO_IES", "CO_CURSO")

todas = ["EVASOR", "IN_RESERVA_ENSINO_PUBLICO", "IN_RESERVA_RENDA_FAMILIAR"]

# for i in range(0,len(todas)-1):
features = todas
i = 0
varx = features.pop(0)
assembler = VectorAssembler(inputCols=features, outputCol="features")
dataFinal = assembler.transform(sci)
dt = DecisionTreeClassifier(labelCol=varx, featuresCol='features', maxDepth=5)
(treinamento, teste) = dataFinal.randomSplit([0.8, 0.2])
model = dt.fit(treinamento)
predictions = model.transform(teste)
# print model.toDebugString
total = predictions.count()
missed = predictions.where(str(varx) + " != prediction").count()
_00 = predictions.where(varx + "=0 and prediction = 0").count()
_01 = predictions.where(varx + "=0 and prediction = 1").count()
_10 = predictions.where(varx + "=1 and prediction = 0").count()
_11 = predictions.where(varx + "=1 and prediction = 1").count()
print sys.argv[1]
print "-----\n"
print total, "Erradas: ", missed, "Erro(%): ", float(missed) / float(
    total) * 100
print "0\t", _00, "\t|\t", _01
print "1\t", _10, "\t|\t", _11