class SgdLibraryLinearRegression:
    """Fit and evaluate scikit-learn's ``SGDRegressor`` on a CSV data set.

    The CSV is expected to have the regression target in its first column
    (mpg, according to the original comments) plus a ``horsepower`` column,
    which may contain ``'?'`` placeholders, and a ``car name`` column.

    Typical call order: ``preprocess()`` -> ``train()`` ->
    ``predictTrain()`` / ``predictTest()`` -> ``print()``.
    """

    def __init__(self, data):
        """Load the CSV and create a regressor with the default settings.

        Args:
            data: Anything accepted by ``pandas.read_csv`` (path, buffer...).
        """
        self.df = pd.read_csv(data)
        self.regressor = SGDRegressor(max_iter=40, tol=1e-5,
                                      learning_rate='constant', eta0=0.06)

    @staticmethod
    def _clean(frame):
        """Return a cleaned copy of *frame*; the input is left untouched."""
        # dropna()/drop_duplicates() return new frames, so the results must
        # be kept (the previous code discarded them, making both no-ops).
        frame = frame.dropna().drop_duplicates()
        # '?' marks a missing horsepower reading; drop those rows. The
        # explicit copy avoids assigning into a view of the original frame.
        frame = frame[frame['horsepower'] != '?'].copy()
        frame['horsepower'] = frame['horsepower'].astype('float')
        # The car name is not used as a feature for predicting the target.
        return frame.drop(columns=['car name'])

    def preprocess(self):
        """Clean the data, scale the features and create a 70:30 split.

        Sets ``self.df``, ``self.X``, ``self.Y`` and the
        ``X_train`` / ``X_test`` / ``Y_train`` / ``Y_test`` attributes.
        """
        self.df = self._clean(self.df)
        # Column 0 is the target, every remaining column is a feature.
        self.X = self.df.iloc[:, 1:].values
        self.Y = self.df.iloc[:, 0].values
        # Scaling the input attributes
        self.X = StandardScaler().fit_transform(self.X)
        # Fixed random_state keeps the split reproducible between runs.
        self.X_train, self.X_test, self.Y_train, self.Y_test = train_test_split(
            self.X, self.Y, test_size=0.3, random_state=1)

    def train(self, epoch_count=40, learning_rate=0.06):
        """Replace the regressor with a fresh one and fit it on the training split.

        Args:
            epoch_count: Passed to ``SGDRegressor`` as ``max_iter``.
            learning_rate: Constant step size, passed as ``eta0``.
        """
        self.regressor = SGDRegressor(max_iter=epoch_count, tol=1e-5,
                                      learning_rate='constant',
                                      eta0=learning_rate)
        self.regressor.fit(self.X_train, self.Y_train)

    def _evaluate(self, features, targets):
        """Predict on *features*, store the metrics on ``self``, return the MSE."""
        self.Y_pred = self.regressor.predict(features)
        # NOTE: for scikit-learn regressors ``score`` is the R^2 coefficient,
        # not a classification accuracy; the attribute name is kept so that
        # existing callers keep working.
        self.accuracy_score = self.regressor.score(features, targets)
        self.calculated_mse = mean_squared_error(targets, self.Y_pred)
        self.r2_scor = r2_score(targets, self.Y_pred)
        return self.calculated_mse

    def predictTrain(self):
        """Evaluate the fitted regressor on the training split.

        Returns:
            The mean squared error on the training data.
        """
        return self._evaluate(self.X_train, self.Y_train)

    def print(self):
        """Print the metrics stored by the last predictTrain()/predictTest()."""
        print("accuracy score ", self.accuracy_score)
        print("mean square error ", self.calculated_mse)
        print("r2_score ", self.r2_scor)

    def predictTest(self):
        """Evaluate the fitted regressor on the test split.

        Returns:
            The mean squared error on the test data.
        """
        return self._evaluate(self.X_test, self.Y_test)

    def plotLearningRate(self, epoch_count, min, max, step, color):
        """Scatter-plot the test MSE for learning rates from *max* down to *min*.

        The model is retrained for every learning rate, so ``self.regressor``
        and the stored metrics afterwards belong to the last rate tried.

        Args:
            epoch_count: ``max_iter`` used for every training run.
            min: Smallest learning rate to try (inclusive, subject to
                floating-point rounding).
            max: Largest learning rate to try; the sweep starts here.
            step: Positive decrement between consecutive learning rates.
            color: Marker colour forwarded to ``plt.scatter``.

        Returns:
            Whatever ``plt.scatter`` returns.

        Raises:
            ValueError: If ``step`` is not positive while ``max >= min``,
                because the sweep would never terminate.
        """
        # ``min``/``max`` shadow the builtins but are part of the public
        # signature, so the names are kept.
        if step <= 0 and max >= min:
            raise ValueError("step must be positive")

        rates = []
        errors = []
        index = 0
        rate = max
        while rate >= min:
            self.train(epoch_count, rate)
            errors.append(self.predictTest())
            rates.append(rate)
            index += 1
            # Derive each rate from the start value instead of subtracting
            # repeatedly, so rounding error does not accumulate.
            rate = max - index * step
        return plt.scatter(rates, errors, color=color)
# NOTE(review): linear_SGD, train_x/train_y, test_x/test_y, X, y, coef,
# plt_helper and myfont are not defined in this section; they are presumably
# set up earlier in the script -- confirm.

# Predictions of the SGD model on the test features.
y_SGD = linear_SGD.predict(test_x)

# `normalize=False` is no longer passed: False was the default, and the
# parameter was deprecated in scikit-learn 1.0 and removed in 1.2, where
# passing it raises TypeError.
linear_rg = LinearRegression(
    fit_intercept=True,  # fit an intercept term
    copy_X=True,  # copy X so the original values are not modified
    n_jobs=-1)  # use all CPUs
linear_rg.fit(train_x, train_y)
y_rg = linear_rg.predict(test_x)

# Print the reference coefficients next to the ones each model learned.
print('模拟数据参数', coef)
print('SGDRegressor模型参数', linear_SGD.coef_)
print('LinearRegression模型参数', linear_rg.coef_)

# 5-fold cross-validation scores on the training data, then the score on
# the test data, for the SGD model.
scores = cross_val_score(linear_SGD, train_x, train_y, cv=5)
print('SGDRegressor交叉验证R方值:', scores)
print('SGDRegressor交叉验证R方均值:', np.mean(scores))
print('SGDRegressor测试集R方值:', linear_SGD.score(test_x, test_y))

# Same report for the LinearRegression model.
scores = cross_val_score(linear_rg, train_x, train_y, cv=5)
print('LinearRegression交叉验证R方值:', scores)
print('LinearRegression交叉验证R方均值:', np.mean(scores))
print('LinearRegression测试集R方值:', linear_rg.score(test_x, test_y))

#%%
# Plot the data and both models' predictions against the first feature.
ax2 = plt_helper('ax1', '观察不同回归模型的效果')
ax2.plot(X[:, 0], y, 'r*', label="模拟数据")
# NOTE(review): both model curves use the same '-k' style, so they cannot
# be told apart in the figure -- consider giving one a different style.
ax2.plot(test_x[:, 0], y_SGD, '-k', label='SGDRegressor模型')
ax2.plot(test_x[:, 0], y_rg, '-k', label='线性回归模型')
ax2.legend(loc='best', prop=myfont)