Example #1
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from yellowbrick.regressor import ResidualsPlot
import matplotlib.pyplot as plt


def uniRegression(p, xLabel, yLabel):
    global image_num
    # Randomly shuffle rows
    p = p.sample(frac=1).reset_index(drop=True)
    # Split train and test
    twentyPercent = -1 * round(p.shape[0] * 0.2)
    xCol = p[xLabel].values.reshape(-1, 1)
    X_train = xCol[:twentyPercent]
    X_test = xCol[twentyPercent:]
    y_train = p[yLabel][:twentyPercent].values.reshape(-1, 1)
    y_test = p[yLabel][twentyPercent:].values.reshape(-1, 1)
    # Fit linear regression model
    lr = linear_model.LinearRegression()
    lr.fit(X_train, y_train)
    # Make predictions
    predicted = lr.predict(X_test)
    r2 = r2_score(y_test, predicted)
    mse = mean_squared_error(y_test, predicted)
    # Plot expected vs. predicted
    plt.scatter(X_test, y_test, color='black')
    plt.plot(X_test, predicted, color='blue', linewidth=2)
    plt.xlabel(xLabel)
    plt.ylabel(yLabel)
    plt.savefig(image_path.format(image_num), bbox_inches='tight')
    plt.show()  # save before showing, since show() may clear the figure
    image_num += 1
    print("R2 = ", r2)
    print("MSE = ", mse)
    visualizer = ResidualsPlot(lr)
    # Plot residuals
    visualizer.fit(X_train, y_train)  # Fit the training data to the visualizer
    visualizer.score(X_test, y_test)  # Evaluate the model on the test data
    visualizer.show()  # Finalize and render the figure
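A usage sketch for the function above; the file name, column names, and the two module-level globals it expects are assumptions, not part of the original:

import pandas as pd

image_num = 0                          # consumed via the 'global image_num' above
image_path = "figs/unireg_{}.png"      # hypothetical save-path template
df = pd.read_csv("housing.csv")        # assumed CSV with numeric columns
uniRegression(df, "sqft", "price")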
Example #2
def train_model(rf, healed_data, target_string):
    #rf.fit(healed_data["train_features"], healed_data["train_target"])
    visualizer = ResidualsPlot(rf)
    try:
        visualizer.fit(healed_data["train_features"],
                       healed_data["train_target"])
    except Exception as e:
        st.error("Fit error: " + str(e))

    try:
        visualizer.score(healed_data["test_features"],
                         healed_data["test_target"])
    except Exception as e:
        st.error("Score error: " + str(e))

    visualizer.finalize()  # render the plot without blocking so it can be saved
    plt.savefig("models/rf_reg_eval_" + target_string + ".png")
    st.pyplot(plt.gcf())  # hand the rendered figure to Streamlit
    # save model output
    model_output_loc = "models/rf_reg_" + target_string + "_rf_reg_model.pkl"
    with open(model_output_loc, "wb") as model_output:
        pickle.dump(rf, model_output)
    print("saving model to: " + model_output_loc)
    return
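A usage sketch for train_model; the RandomForestRegressor and the split variables are assumptions:

from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=100, random_state=0)
healed_data = {                      # assumed dict of pre-split data
    "train_features": X_train, "train_target": y_train,
    "test_features": X_test, "test_target": y_test,
}
train_model(rf, healed_data, "price")  # hypothetical target name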
Example #3
def linregress(*args):
    """Fit a logistic regression (despite the name) and plot its residuals."""
    #import dependencies
    from sklearn.linear_model import LogisticRegression
    model = LogisticRegression()
    import statsmodels.api as sm
    from patsy import dmatrices
    import numpy as np

    #define arguments
    dataframe = args[0]
    y = args[1]
    xvars = []
    for i in range(2, len(args)):
        xvars.append(args[i])
    x = dataframe[xvars]
    y = dataframe[y]
    #fit the model
    model.fit(x, y)

    #Generate Fit Statistics
    ##prep data for patsy
    parts = []
    for item in xvars:
        parts.append(f' + {item}')
    newstring = "".join(parts)[3:]  # drop the leading ' + '

    ind = args[1]
    ind = ind.strip('"')

    ##Fit the Model
    Y, X = dmatrices(f"{ind} ~ {newstring}",
                     data=dataframe,
                     return_type="dataframe")
    logit = sm.Logit(Y, X)
    logit_result = logit.fit()

    #Print Log Odds
    print("LOG ODDS")
    print(logit_result.summary())
    print(np.exp(logit_result.params))

    #Plot the Residuals
    print("\n Residual Plot")
    from sklearn.linear_model import Ridge
    from yellowbrick.regressor import ResidualsPlot

    model = Ridge()  # residuals are plotted for a Ridge refit, not the logistic model

    visualizer = ResidualsPlot(model, hist=True)
    y2 = y.values.reshape(-1, 1)
    visualizer.fit(x, y2)  # Fit the training data to the visualizer
    visualizer.score(x, y2)  # Evaluate the model on the test data
    visualizer.show()  # Finalize and render the figure
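A usage sketch; the file and column names are hypothetical:

import pandas as pd

df = pd.read_csv("patients.csv")  # assumed CSV with a binary target column
# first arg: DataFrame; second: target column; rest: predictor columns
linregress(df, "outcome", "age", "bmi", "glucose")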
Example #4
    def residual_plot(lin_model, x_train, y_train, x_test, y_test):
        fig = plt.figure(figsize=(16, 12))
        ax = fig.add_subplot(111)
        visualizer = ResidualsPlot(lin_model, ax=ax)

        visualizer.fit(x_train, y_train)  # Fit the training data to the visualizer
        visualizer.score(x_test, y_test)  # Evaluate the model on the test data
        visualizer.show()
Example #5
def plotResidualsAgainstHoldout(df, holdOut_df, task, seed, schema):
    X_train = df[COLUMNS.get(task)].values
    X_test = holdOut_df[COLUMNS.get(task)].values
    y_train = df[TARGETS.get(task)].values
    y_test = holdOut_df[TARGETS.get(task)].values

    # Instantiate the linear model and visualizer
    wrapped_model = LinearRegression()
    visualizer = ResidualsPlot(wrapped_model, title="Residuals for schema {}".format(schema))

    visualizer.fit(X_train, y_train)
    visualizer.score(X_test, y_test)  # Evaluate the model on the test data
    visualizer.show(outpath="figs/residuals_{}_seed{}_{}.png".format(task, seed, schema))
    plt.close()
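A usage sketch; COLUMNS and TARGETS are module-level dicts in the original, so the values below are stand-ins:

COLUMNS = {"flow": ["precip_mm", "temp_c"]}   # hypothetical feature mapping
TARGETS = {"flow": "streamflow"}              # hypothetical target mapping
# train_df and holdout_df are assumed DataFrames prepared upstream
plotResidualsAgainstHoldout(train_df, holdout_df, "flow", seed=42, schema="v1")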
Example #6
    def residuals_plot(self) -> None:
        """Plot the difference between the observed value of the target variable (y)
        and the predicted value (ŷ), i.e. the error of the prediction"""

        visualizer = ResidualsPlot(self.trained_model)
        visualizer.fit(self.X_train,
                       self.y_train)  # Fit the training data to the visualizer
        visualizer.score(self.X_test,
                         self.y_test)  # Evaluate the model on the test data
        save_path = f"{self.plots_dir}/residuals_plot_{self.model_id}.png"
        visualizer.show(outpath=save_path)
        if not LOCAL:
            upload_to_s3(save_path,
                         f'plots/residuals_plot_{self.model_id}.png',
                         bucket=S3_BUCKET_NAME)
        plt.clf()
Example #7
def ridge_regression(X_train, y_train, X_test, y_test, plot):
    """
    Perform a ridge regression with built-in CV and plot the feature importance
    """
    # Fit the ridge regression
    reg = RidgeCV()
    reg.fit(X_train, y_train)
    print("Best alpha using built-in RidgeCV: %f" % reg.alpha_)
    print("Best score using built-in RidgeCV: %f" % reg.score(X_train, y_train))
    coef = pd.Series(reg.coef_, index=X_train.columns)
    print(
        "Ridge picked "
        + str(sum(coef != 0))
        + " variables and eliminated the other "
        + str(sum(coef == 0))
        + " variables"
    )
    # Extract the feature importance
    imp_coef = coef.sort_values()
    # Plot the feature importance
    if plot:
        plt.rcParams["figure.figsize"] = (8.0, 10.0)
        imp_coef.plot(kind="barh")
        plt.title("Feature importance using Ridge Model")
        plt.show()
        # Visualizing the regression
        visualizer = ResidualsPlot(reg, size=(1080, 720))
        visualizer.fit(X_train, y_train)  # Fit the training data to the visualizer
        visualizer.score(X_test, y_test)  # Evaluate the model on the test data
        visualizer.show()                 # Finalize and render the figure
    # Using the test data to calculate a score
    y_pred = reg.predict(X_test)
    # Return metrics
    return {
        "name": "Ridge Regression",
        "R squared": reg.score(X_test, y_test),
        "R squared training": reg.score(X_train, y_train),
        "RMSE": rmse(y_test, y_pred),
        "MAE": mean_absolute_error(y_test, y_pred),
    }
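A usage sketch; it assumes X_train and X_test are DataFrames (coef_ is indexed by their columns) and that the rmse and mean_absolute_error helpers are importable as in the original module:

metrics = ridge_regression(X_train, y_train, X_test, y_test, plot=True)
print(metrics["name"], metrics["R squared"], metrics["RMSE"])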
Example #8
#Predicting the price of a 150m2 living area
model.predict([[150]])

#Predicting the price of a 200m2 living area
model.predict([[200]])

from sklearn.linear_model import Ridge
from yellowbrick.regressor import ResidualsPlot

#Create a new model and plot the residuals for the regression
model = Ridge()
visualizer = ResidualsPlot(model)

#Fit the training data to the visualizer
visualizer.fit(LIVING_AREA, Selling_price)
visualizer.show()

#Creating a multiple variable regression
df1 = df[df['Land_size'].notna()]
df1 = df1[df1['Rooms'].notna()]
X = df1[['Living_area', 'Rooms', 'Land_size', 'Age']]
y = df1['Selling_price']
regr = LinearRegression()
regr.fit(X, y)

#The intercept and coefficients of the new model
print('Intercept: \n', regr.intercept_)
print('Coefficients: \n', regr.coef_)

#Plot the residuals for the model based on multiple variables
model = Ridge()
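The source cuts off after model = Ridge(); a plausible completion that mirrors the single-variable case above (an assumption, not original code):

visualizer = ResidualsPlot(model)
visualizer.fit(X, y)  # assumed continuation: residuals for the multi-variable model
visualizer.show()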
Example #9
#load data
visualizations = load_dataset(file_name=config.TRAINING_DATA_FILE)

#set X and y
#adjust X based on feature set to use from config.py (TOP5_FEATURES or FEATURES)
X = visualizations[config.TOP5_FEATURES]
y = visualizations[config.TARGET]

#train test split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=42)

#yellowbrick ResidualsPlotVisualization visual
visualizer = ResidualsPlot(config.BEST_MODEL)
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.show(outpath="visualizations/ResidualsPlotVisualization.pdf")
visualizer.show(outpath="visualizations/ResidualsPlotVisualization.png")
visualizer.show()

#yellowbrick prediction error visual
visualizer = PredictionError(config.BEST_MODEL)
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.show(outpath="visualizations/PredictionErrorVisualization.pdf")
visualizer.show(outpath="visualizations/PredictionErrorVisualization.png")
visualizer.show()
Example #10
def show_residuals_plot(model, X_train, y_train, X_valid, y_valid):
    residuals_plot = ResidualsPlot(model)
    residuals_plot.fit(X_train, y_train)
    residuals_plot.score(X_valid, y_valid)
    residuals_plot.show()
Example #11
#fitting lasso regression and making predictions
lasso = Lasso(alpha=14)
lasso.fit(X_train, y_train)
predictions3 = lasso.predict(X_test)
score3 = lasso.score(X_test, y_test)

#assessing performance of lasso
mae3 = MAE(y_test, predictions3)
mse3 = MSE(y_test, predictions3)
rmse3 = mse3**(1 / 2)

#feature importance (refit on the full data to get coefficients)
lasso_coef = lasso.fit(X, y).coef_

#visualizing regression model
visualizer = PredictionError(lasso)
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.show()

#visualizing regression residuals plot
visualizer2 = ResidualsPlot(lasso)
visualizer2.fit(X_train, y_train)
visualizer2.score(X_test, y_test)
visualizer2.show()

print('Linear Regression Score: ', score1)
print('Ridge Regression Score: ', score2)
print('Lasso Regression Score: ', score3)
Example #12
# The residuals plot shows how the model is injecting error: the bold
# horizontal line at residuals = 0 means no error, and any point above or
# below that line indicates the magnitude of the error.
# (https://www.scikit-yb.org/en/latest/quickstart.html#installation)

# Load a regression dataset
X, y = load_concrete()

# Create training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

visualizer = ResidualsPlot(LinearRegression())
visualizer.fit(X_train, y_train)  # Fit the training data to the visualizer
visualizer.score(X_test, y_test)  # Evaluate the model on the test data
visualizer.show()  # Finalize and render the figure

# Xenia: Saving my plots
fig = plt.gcf()  # grab the current figure before the backend clears it
fig.set_size_inches(7, 5)
fig.savefig("6._Residuals_Plot.png")

# %%
# New Plots with Temperature & Precipitation

# Time series of flow values with the x axis range limited
fig, ax = plt.subplots()
ax.plot(flow_weekly['flow'], label='Streamflow', color='black', linewidth=1)
ax.plot(data_weekly['Precipitation'],
        'r:', label='Precipitation')  # assumed label; the call is truncated in the source

Example #13
# The start of this example is lost; model_inputs is assumed to be an
# iterable of argument pairs for the models() helper used below
results = []
for i in model_inputs:
    x = models(i[0], i[1])
    results.append(x)

#%%
results = pd.DataFrame(results)

#%%
sns.barplot(x = results['Model'], y = results['MSE'])

#%%
from yellowbrick.regressor import ResidualsPlot

visual_LR = ResidualsPlot(LR, hist = True)
visual_LR.fit(x_train, y_train)
visual_LR.score(x_test, y_test)
visual_LR.show()


#%%
visual_RF = ResidualsPlot(RF_Base, hist = True)
visual_RF.fit(x_train, y_train)
visual_RF.score(x_test, y_test)
visual_RF.show()

#%%
visual_SGD = ResidualsPlot(SGD_Base, hist = True)
visual_SGD.fit(x_train, y_train)
visual_SGD.score(x_test, y_test)
visual_SGD.show()

Example #14
#Hc. What is the regression equation as per the model?
print("Hc. Regression equation: y=",linReg.coef_,"x + ", linReg.intercept_)
print("Hc. Regression equation: y = -0.06x + 35.17")

#Hd. For your model, does the predicted value for mpg increase or decrease as the displacement increases?
print("Hd. The predicted value for mpg decreases as the displacement increases")

#He. Given a car with a displacement value of 220, what would your model predict its mpg to be?
newframe = pd.DataFrame([[220]]) #New dataframe to predict
pred = linReg.predict(newframe)
print("He. Displacement is 220, MPG predicted: ", pred)

#Hf. Display a scatterplot of the actual mpg vs displacement and superimpose the linear regression line.
plt.scatter(X,y, marker=".") #Actual mpg vs displacement
#Formatting
plt.xlabel("Displacement")
plt.ylabel("Actual MPG")
plt.suptitle("Hf. Scatterplot and Regression of Actual MPG vs Displacement\nKuehler_Danielle_FinalProject_Q4")
plt.plot(X,y_pred) #Regression line
plt.show() #Display

#Hg. Plot the residuals
ridge = Ridge() #From sklearn
visualizer = ResidualsPlot(ridge)
visualizer.fit(X,y) #Fit to data
#Formatting
plt.suptitle("Hg. Plot of Residuals\nKuehler_Danielle_FinalProject_Q4")
visualizer.show() #Display

Example #15
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from yellowbrick.regressor import ResidualsPlot
from sklearn.linear_model import LinearRegression

scaler = StandardScaler()
# neigh = KNeighborsRegressor(n_neighbors=5)
# regression_visualizers = [ResidualsPlot(neigh), PredictionError(neigh)]
features = [
    "longitude", "latitude", "peak_load", "off-grid", "avg_peak_winter",
    "avg_peak_spring", "avg_peak_summer", "avg_peak_autumn", "avg_base_winter",
    "avg_base_spring", "avg_base_summer", "avg_base_autumn"
]

case_name = "mg_sizing_dataset_with_loc"
df = pd.read_csv("results/" + case_name + ".csv",
                 sep=";|,",
                 engine="python",
                 index_col='index')
X = df[features]
scaler.fit(X)
X = scaler.transform(X)
targets = ["PV", "BAT", "RBAT", "INV", "GEN", "NPV"]
y = df[targets[0]]

model = LinearRegression()
visualizer_residuals = ResidualsPlot(model)
visualizer_residuals.fit(X, y)
visualizer_residuals.show()
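A small extension sketch (assumed, not in the original): the file defines six targets but only plots the first, so the same pattern could cover the rest:

# Hypothetical extension: one residuals plot per target column
for target in targets:
    visualizer = ResidualsPlot(LinearRegression())
    visualizer.fit(X, df[target])
    visualizer.show(outpath="figs/residuals_{}.png".format(target))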
Example #16
def show_residuals(model, train_tup, test_tup):
    resPlot = ResidualsPlot(model)
    resPlot.fit(*train_tup)    # unpack (X_train, y_train)
    resPlot.score(*test_tup)   # unpack (X_test, y_test)
    resPlot.show()
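A one-line usage sketch; the split variables are assumed to exist:

from sklearn.linear_model import LinearRegression

show_residuals(LinearRegression(), (X_train, y_train), (X_test, y_test))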