def azureml_main(frame1):
    """Fit a linear regression to milk production and plot fit diagnostics.

    Regresses ``Milk.Prod`` on ``Month.Count`` and ``monthNumCubed`` with
    scikit-learn's ``LinearRegression``, stores the fitted values and the
    residuals in the frame, and draws three charts on figure 1: actual vs.
    predicted values, residuals over the index, and a box plot of residuals
    grouped by ``Month.Number`` / ``Month``.

    Args:
        frame1: pandas DataFrame with at least the columns ``Month.Count``,
            ``monthNumCubed``, ``Milk.Prod``, ``Month.Number`` and ``Month``.
            The columns ``Predicted`` and ``Resid`` are added to it in place.

    Returns:
        The value returned by ``milkutilities.add_time_index`` when applied
        to the frame after ``Predicted`` and ``Resid`` have been added.
    """
    # Set backend
    import matplotlib
    matplotlib.use('agg')
    import matplotlib.pyplot as plt
    import milkutilities as mu
    from sklearn import linear_model

    # Figures are written to PNG files only when this flag is True.
    Azure = False

    # ``.values`` replaces ``as_matrix()``, which was deprecated in
    # pandas 0.23 and removed in pandas 1.0.
    X = frame1[['Month.Count', 'monthNumCubed']].values
    Y = frame1['Milk.Prod'].values

    regr = linear_model.LinearRegression()
    regr.fit(X, Y)

    # In-sample predictions and their residuals.
    frame1['Predicted'] = regr.predict(X).tolist()
    frame1['Resid'] = frame1['Milk.Prod'] - frame1['Predicted']

    ## Add a time index to the data frame
    frame1 = mu.add_time_index(frame1)

    # Plot 1: actual vs. predicted values.
    fig1 = plt.figure(1, figsize=(12, 9))
    ax = fig1.gca()
    frame1[['Milk.Prod', 'Predicted']].plot(ax=ax)
    plt.xlabel("Date")
    plt.ylabel("Log CA milk production")
    plt.title("Log of milk produciton vs. date")
    plt.show()
    if Azure:
        fig1.savefig('scatter1.png')

    # Plot 2: residuals against the index. Figure 1 is reused, so it is
    # cleared before drawing.
    fig2 = plt.figure(1, figsize=(12, 9))
    fig2.clf()
    ax = fig2.gca()
    frame1['Resid'].plot(ax=ax)
    plt.xlabel("Date")
    plt.ylabel("Residuals of linear model")
    plt.title("Residuals of linear model vs. date")
    plt.show()
    if Azure:
        fig2.savefig('scatter2.png')

    # Plot 3: box plot of the residuals grouped by month.
    fig3 = plt.figure(1, figsize=(12, 9))
    fig3.clf()
    ax = fig3.gca()
    frame1.boxplot(column=['Resid'], ax=ax, by=['Month.Number', 'Month'])
    plt.xlabel("Month of year")
    plt.ylabel("Residuals of linear model")
    plt.title("Residuals of linear model by Month")
    plt.show()
    if Azure:
        fig3.savefig('scatter3.png')

    return frame1
def azureml_main(frame1):
    """Fit a linear model of milk production and chart the fit and residuals.

    A scikit-learn ``LinearRegression`` is fitted with ``Month.Count`` and
    ``monthNumCubed`` as features and ``Milk.Prod`` as the label. The fitted
    values and residuals are written back to the frame, and three charts are
    drawn on figure 1 (fit vs. actual, residuals over the index, residuals
    boxed by month).

    Args:
        frame1: pandas DataFrame holding the columns ``Month.Count``,
            ``monthNumCubed``, ``Milk.Prod``, ``Month.Number`` and ``Month``.
            ``Predicted`` and ``Resid`` columns are added to it in place.

    Returns:
        Whatever ``milkutilities.add_time_index`` returns for the frame once
        the ``Predicted`` and ``Resid`` columns have been added.
    """
    # Set backend
    import matplotlib
    matplotlib.use('agg')
    import matplotlib.pyplot as plt
    import milkutilities as mu
    from sklearn import linear_model

    # PNG files are only written when this flag is True.
    Azure = False

    def _finish_plot(fig, x_label, y_label, title, file_name):
        """Label the current axes, show the plot and save it when on Azure."""
        plt.xlabel(x_label)
        plt.ylabel(y_label)
        plt.title(title)
        plt.show()
        if Azure:
            fig.savefig(file_name)

    # ``as_matrix()`` was deprecated in pandas 0.23 and removed in 1.0;
    # ``.values`` yields the same NumPy array on old and new versions.
    X = frame1[['Month.Count', 'monthNumCubed']].values
    Y = frame1['Milk.Prod'].values

    regr = linear_model.LinearRegression()
    regr.fit(X, Y)
    frame1['Predicted'] = regr.predict(X).tolist()
    frame1['Resid'] = frame1['Milk.Prod'] - frame1['Predicted']

    ## Add a time index to the data frame
    frame1 = mu.add_time_index(frame1)

    # Plot 1: actual vs. predicted values.
    fig1 = plt.figure(1, figsize=(12, 9))
    ax = fig1.gca()
    frame1[['Milk.Prod', 'Predicted']].plot(ax=ax)
    _finish_plot(fig1, "Date", "Log CA milk production",
                 "Log of milk produciton vs. date", 'scatter1.png')

    # Plot 2: residuals against the index (figure 1 is reused, so clear it).
    fig2 = plt.figure(1, figsize=(12, 9))
    fig2.clf()
    ax = fig2.gca()
    frame1['Resid'].plot(ax=ax)
    _finish_plot(fig2, "Date", "Residuals of linear model",
                 "Residuals of linear model vs. date", 'scatter2.png')

    # Plot 3: residuals boxed by month.
    fig3 = plt.figure(1, figsize=(12, 9))
    fig3.clf()
    ax = fig3.gca()
    frame1.boxplot(column=['Resid'], ax=ax, by=['Month.Number', 'Month'])
    _finish_plot(fig3, "Month of year", "Residuals of linear model",
                 "Residuals of linear model by Month", 'scatter3.png')

    return frame1
def azureml_main(frame1):
    """Prepare the milk data: clean month codes and add month-count features.

    Outside Azure (``Azure`` is False) the incoming frame is ignored and the
    data is read from a local CSV file instead. The ``Month`` codes are then
    trimmed to three characters, a time index is added through
    ``milkutilities.add_time_index``, and three columns are appended:
    ``Month.Count``, ``monthNumSqred`` and ``monthNumCubed``.

    Args:
        frame1: pandas DataFrame with ``Month``, ``Month.Number`` and
            ``Year`` columns. Replaced by the CSV contents when ``Azure`` is
            False.

    Returns:
        The time-indexed frame with the three new columns.
    """
    import pandas as pd
    import numpy as np
    import milkutilities as mu
    import os.path

    ## If not in the Azure environment, read the data from a csv
    ## file for testing purposes. os.path.join is used to build the
    ## full path.
    ## NOTE(review): the directory itself is a hard-coded Windows
    ## location, so this branch only works on that machine.
    Azure = False

    if not Azure:
        pathName = "C:/Users/Steve/Documents/AzureML/Data Sets/CA_Milk"
        fileName = "cadairydata.csv"
        filePath = os.path.join(pathName, fileName)
        frame1 = pd.read_csv(filePath)

    ## Trim the month codes to 3 characters to ensure they
    ## are consistent.
    frame1['Month'] = frame1['Month'].map(lambda x: str(x)[:3])

    ## Add a time index to the data frame
    frame1 = mu.add_time_index(frame1)

    ## Add a date-time type column.
    # frame1['datetime'] = frame1['Year'].apply(str) + '-' + frame1['Month.Number'].apply(str)+ '-01'
    # frame1['datetime'] = pd.to_datetime(frame1['datetime'], format="%Y-%m-%d")

    ## Compute new columns containing the polynomial values
    ## of the count of months.
    # Months counted from 1995 -- presumably the first year in the data set;
    # TODO confirm.
    frame1['Month.Count'] = frame1['Month.Number'] + \
        12 * (frame1['Year'] - 1995)

    # ``.values`` replaces ``Series.as_matrix()``, which was deprecated in
    # pandas 0.23 and removed in pandas 1.0.
    x = frame1['Month.Count'].values
    frame1['monthNumSqred'] = np.power(x, 2).tolist()
    frame1['monthNumCubed'] = np.power(x, 3).tolist()

    return frame1
def azureml_main(frame1):
    """Return the time-indexed milk data truncated at '2013-01-01'.

    When running outside Azure (``Azure`` is False) the incoming frame is
    discarded and the data is loaded from a local CSV file for testing. A
    time index is then attached through ``milkutilities.add_time_index`` and
    the rows are sliced up to the ``'2013-01-01'`` label, which is intended
    to cut the last 12 months off the end of the data.

    Args:
        frame1: pandas DataFrame of the milk data; replaced by the CSV
            contents when ``Azure`` is False.

    Returns:
        The slice of the time-indexed frame ending at ``'2013-01-01'``.
    """
    import pandas as pd
    import milkutilities as mu
    import os.path

    Azure = False

    ## Outside the Azure environment, load the data from a csv file for
    ## testing purposes; os.path.join assembles the full path.
    if not Azure:
        local_csv = os.path.join(
            "C:/Users/Steve/Documents/AzureML/Data Sets/CA_Milk",
            "cadairydata.csv",
        )
        frame1 = pd.read_csv(local_csv)

    ## Attach the time index, then drop the trailing months using
    ## pandas time series indexing.
    indexed = mu.add_time_index(frame1)
    return indexed[:'2013-01-01']
def azureml_main(frame1):
    """Evaluate scored milk-production data with plots and RMS errors.

    Adds a time index, computes residuals as ``Milk.Prod`` minus
    ``Scored Labels``, draws five diagnostic figures (fit vs. actual,
    residuals over the index, residuals boxed by month, QQ plot, histogram)
    and returns the error summary.

    Args:
        frame1: pandas DataFrame with the columns ``Milk.Prod``,
            ``Scored Labels``, ``Month.Number`` and ``Month``.

    Returns:
        A one-row pandas DataFrame with the columns ``rmse_Overall`` (``rmse``
        of all residuals) and ``rmse_test`` (``rmse`` of the residuals from
        the ``'2013-01-31'`` label onward).
    """
    # Set backend
    import matplotlib
    matplotlib.use('agg')
    import matplotlib.pyplot as plt
    import milkutilities as mu
    import statsmodels.api as sm
    import pandas as pd

    # Figures are written to PNG files only when this flag is True.
    Azure = False

    ## Add a time index to the data frame
    frame1 = mu.add_time_index(frame1)

    ## Compute the residuals
    frame1['Resid'] = frame1['Milk.Prod'] - frame1['Scored Labels']

    # Plot 1: actual vs. scored values.
    fig1 = plt.figure(1, figsize=(12, 9))
    ax = fig1.gca()
    frame1[['Milk.Prod', 'Scored Labels']].plot(ax=ax)
    plt.xlabel("Date")
    plt.ylabel("Log CA milk production")
    plt.title("Log of milk produciton vs. date")
    plt.show()
    if Azure:
        fig1.savefig('scatter1.png')

    # Plot 2: residuals against the index (figure 1 is reused, so clear it).
    fig2 = plt.figure(1, figsize=(12, 9))
    fig2.clf()
    ax = fig2.gca()
    frame1['Resid'].plot(ax=ax)
    plt.xlabel("Date")
    plt.ylabel("Residuals of linear model")
    plt.title("Residuals of linear model vs. date")
    plt.show()
    if Azure:
        fig2.savefig('scatter2.png')

    # Plot 3: residuals boxed by month.
    fig3 = plt.figure(1, figsize=(12, 9))
    fig3.clf()
    ax = fig3.gca()
    frame1.boxplot(column=['Resid'], ax=ax, by=['Month.Number', 'Month'])
    plt.xlabel("Month of year")
    plt.ylabel("Residuals of linear model")
    plt.title("Residuals of linear model by Month")
    plt.show()
    if Azure:
        fig3.savefig('scatter3.png')

    ## QQ Normal plot of residuals
    fig4 = plt.figure(figsize=(12, 6))
    fig4.clf()
    ax = fig4.gca()
    sm.qqplot(frame1['Resid'], ax=ax)
    ax.set_title('QQ Normal plot of residuals')
    if Azure:
        fig4.savefig('plot4.png')

    ## Histograms of the residuals
    fig5 = plt.figure(figsize=(12, 6))
    fig5.clf()  # the original cleared the figure twice; once is enough
    ax = fig5.gca()
    # ``.values`` replaces ``Series.as_matrix()`` (removed in pandas 1.0).
    # NOTE(review): hist() is not asked for a density, so the bars are
    # counts even though the axis label says "Density".
    ax.hist(frame1['Resid'].values, bins=40)
    ax.set_xlabel("Model residuals")
    ax.set_ylabel("Density")
    ax.set_title("Histogram of residuals")
    if Azure:
        fig5.savefig('plot5.png')

    ## Compute a data frame with the rms errors for the model
    # NOTE(review): ``rmse`` is neither defined nor imported inside this
    # function; it must be available at module scope -- confirm.
    # ``.loc`` replaces the ``.ix`` indexer, which was removed in pandas 1.0.
    out_frame = pd.DataFrame({
        'rmse_Overall': [rmse(frame1['Resid'])],
        'rmse_test': [rmse(frame1.loc['2013-01-31':, 'Resid'])],
    })
    return out_frame
def azureml_main(frame1):
    """Plot diagnostics for scored milk data and summarise its RMS errors.

    The frame gets a time index, a ``Resid`` column (``Milk.Prod`` minus
    ``Scored Labels``) and is charted five ways: fit vs. actual, residuals
    over the index, residuals boxed by month, a QQ plot of the residuals and
    a residual histogram.

    Args:
        frame1: pandas DataFrame with the columns ``Milk.Prod``,
            ``Scored Labels``, ``Month.Number`` and ``Month``.

    Returns:
        A one-row pandas DataFrame with ``rmse_Overall`` (``rmse`` over every
        residual) and ``rmse_test`` (``rmse`` over the residuals from the
        ``'2013-01-31'`` label onward).
    """
    # Set backend
    import matplotlib
    matplotlib.use('agg')
    import matplotlib.pyplot as plt
    import milkutilities as mu
    import statsmodels.api as sm
    import pandas as pd

    # PNG files are only written when this flag is True.
    Azure = False

    def _finish_plot(fig, x_label, y_label, title, file_name):
        """Label the current axes, show the plot and save it when on Azure."""
        plt.xlabel(x_label)
        plt.ylabel(y_label)
        plt.title(title)
        plt.show()
        if Azure:
            fig.savefig(file_name)

    ## Add a time index to the data frame
    frame1 = mu.add_time_index(frame1)

    ## Compute the residuals
    frame1['Resid'] = frame1['Milk.Prod'] - frame1['Scored Labels']

    # Plot 1: actual vs. scored values.
    fig1 = plt.figure(1, figsize=(12, 9))
    ax = fig1.gca()
    frame1[['Milk.Prod', 'Scored Labels']].plot(ax=ax)
    _finish_plot(fig1, "Date", "Log CA milk production",
                 "Log of milk produciton vs. date", 'scatter1.png')

    # Plot 2: residuals against the index (figure 1 is reused, so clear it).
    fig2 = plt.figure(1, figsize=(12, 9))
    fig2.clf()
    ax = fig2.gca()
    frame1['Resid'].plot(ax=ax)
    _finish_plot(fig2, "Date", "Residuals of linear model",
                 "Residuals of linear model vs. date", 'scatter2.png')

    # Plot 3: residuals boxed by month.
    fig3 = plt.figure(1, figsize=(12, 9))
    fig3.clf()
    ax = fig3.gca()
    frame1.boxplot(column=['Resid'], ax=ax, by=['Month.Number', 'Month'])
    _finish_plot(fig3, "Month of year", "Residuals of linear model",
                 "Residuals of linear model by Month", 'scatter3.png')

    ## QQ Normal plot of residuals
    fig4 = plt.figure(figsize=(12, 6))
    fig4.clf()
    ax = fig4.gca()
    sm.qqplot(frame1['Resid'], ax=ax)
    ax.set_title('QQ Normal plot of residuals')
    if Azure:
        fig4.savefig('plot4.png')

    ## Histograms of the residuals
    fig5 = plt.figure(figsize=(12, 6))
    fig5.clf()  # the original issued this call twice; the repeat was a no-op
    ax = fig5.gca()
    # ``Series.as_matrix()`` was removed in pandas 1.0; ``.values`` yields
    # the same NumPy array on old and new versions.
    ax.hist(frame1['Resid'].values, bins=40)
    ax.set_xlabel("Model residuals")
    ax.set_ylabel("Density")
    ax.set_title("Histogram of residuals")
    if Azure:
        fig5.savefig('plot5.png')

    ## Compute a data frame with the rms errors for the model
    # NOTE(review): ``rmse`` is not defined or imported in this function and
    # has to be provided at module scope -- confirm.
    # The ``.ix`` indexer was removed in pandas 1.0; ``.loc`` performs the
    # same label-based selection.
    overall_error = rmse(frame1['Resid'])
    test_error = rmse(frame1.loc['2013-01-31':, 'Resid'])
    out_frame = pd.DataFrame({
        'rmse_Overall': [overall_error],
        'rmse_test': [test_error],
    })
    return out_frame