def testStandRegres():
    """Fit standard linear regression on ex0.txt, plot the data and the
    fitted line, and print the correlation between predictions and truth.

    Fix: the original used a Python-2 ``print`` statement.
    """
    xArr, yArr = re.loadDataSet('ex0.txt')
    ws = re.standRegres(xArr, yArr)          # least-squares coefficients
    xMat = mat(xArr)
    yMat = mat(yArr)

    fig = plt.figure()
    ax = fig.add_subplot(111)                # 1x1 grid, first (only) panel
    # Raw data points; xMat[:, 1] is the feature column (column 0 is the bias)
    ax.scatter(xMat[:, 1].flatten().A[0], yMat.T[:, 0].flatten().A[0],
               c='purple', label='realData', marker='.')

    xCopy = xMat.copy()
    # xCopy.sort(0)  # sort by x before plotting; left disabled as in original
    yHat = xCopy * ws
    ax.plot(xCopy[:, 1], yHat, c='green')

    # Pearson correlation between predicted and true targets
    correlation = corrcoef(yHat.T, yMat)
    print(correlation)
    plt.show()
def readfile1():
    """Read the iris CSV file, map the three class names to floats 1/2/3,
    and print the standard-regression coefficients.

    Fixes: the file handle was never closed (now a ``with`` block), the
    Python-2 ``print`` statement is converted, and the label mapping uses
    an ``elif`` chain instead of three independent ``if`` tests.
    """
    data_mat = []
    label_mat = []
    with open('iris.data') as f:
        line = f.readline()
        # number of numeric features = fields per row minus the label column
        num_feat = len(line.split(',')) - 1
        while line:
            if line.strip() == '':
                # skip blank lines (e.g. trailing newline at end of file)
                line = f.readline()
                continue
            arr = line.strip('\n').split(',')
            data_mat.append([float(arr[i]) for i in range(num_feat)])
            label = arr[-1]
            if label == 'Iris-setosa':
                label_mat.append(1.0)
            elif label == 'Iris-versicolor':
                label_mat.append(2.0)
            elif label == 'Iris-virginica':
                label_mat.append(3.0)
            line = f.readline()
    print(regression.standRegres(data_mat, label_mat))
def testAbalone():
    """Fit standard regression on abalone.txt and evaluate LWLR error on
    several test windows for two kernel widths.

    Fix: the original used Python-2 ``print`` statements.
    """
    xArr, yArr = re.loadDataSet('abalone.txt')
    ws = re.standRegres(xArr, yArr)
    print(ws)
    # Train on rows 0-299, test on successive windows, for kernels k=2 and k=10
    for k in [2, 10]:
        calcErr(xArr, yArr, 0, 299, k, 300, 350, ws)
        calcErr(xArr, yArr, 0, 299, k, 350, 400, ws)
        calcErr(xArr, yArr, 0, 299, k, 400, 450, ws)
        calcErr(xArr, yArr, 0, 299, k, 450, 500, ws)
        calcErr(xArr, yArr, 0, 299, k, 500, 600, ws)
        print('')
def lineResult(fileName):
    """Run standard linear regression on ``fileName`` and show a plot with
    the raw points (red) plus the fitted line."""
    features, targets = regression.loadDataSet(fileName)
    coeffs = regression.standRegres(features, targets)   # regression weights

    featMat = mat(features)
    fitted = featMat * coeffs

    figure = plt.figure()
    panel = figure.add_subplot(111)

    # Sort by x so the fitted line is drawn left-to-right without zig-zags
    ordered = featMat.copy()
    ordered.sort(0)
    fitted = ordered * coeffs
    panel.plot(ordered[:, 1], fitted)
    panel.scatter(featMat[:, 1].flatten().A[0],
                  mat(targets).T.flatten().A[0], s=2, c='red')
    plt.show()
def useStandRegres():
    """Fit plain least-squares regression to the collected LEGO data and
    print the resulting price formula.

    Takes no parameters and returns nothing; output goes to stdout.
    """
    featureRows, prices = [], []
    setDataCollect(featureRows, prices)

    rowCount, featCount = np.shape(featureRows)
    # Prepend a constant-1 column so ws[0] acts as the intercept
    augmented = np.mat(np.ones((rowCount, featCount + 1)))
    augmented[:, 1:5] = np.mat(featureRows)

    ws = rg.standRegres(augmented, prices)
    print('%f%+f*年份%+f*部件数量%+f*是否为全新%+f*原价' % (ws[0], ws[1], ws[2], ws[3], ws[4]))
def abaloneTest():
    """
    Predict abalone age (Machine Learning in Action, example 8.3).

    Compares locally-weighted linear regression with several kernel widths
    on the training range and on held-out data, then compares with plain
    least squares.

    Fix: the three "new yHat" error prints compared predictions made on
    rows 100:199 against the WRONG ground truth (abY[0:99]); they now use
    abY[100:199], matching the book and the standRegres comparison below.
    """
    abX, abY = regression.loadDataSet("./data/abalone.txt")

    # LWLR evaluated on its own training range (rows 0..98)
    oldyHat01 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 0.1)
    oldyHat1 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 1)
    oldyHat10 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 10)
    print("old yHat01 error Size is :", regression.rssError(abY[0:99], oldyHat01.T))
    print("old yHat1 error Size is :", regression.rssError(abY[0:99], oldyHat1.T))
    print("old yHat10 error Size is :", regression.rssError(abY[0:99], oldyHat10.T))

    # LWLR evaluated on held-out rows 100..198 (trained on 0..98)
    newyHat01 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 0.1)
    print("new yHat01 error Size is :", regression.rssError(abY[100:199], newyHat01.T))
    newyHat1 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 1)
    print("new yHat1 error Size is :", regression.rssError(abY[100:199], newyHat1.T))
    newyHat10 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 10)
    print("new yHat10 error Size is :", regression.rssError(abY[100:199], newyHat10.T))

    # Plain least squares on the same split, for comparison
    standWs = regression.standRegres(abX[0:99], abY[0:99])
    standyHat = mat(abX[100:199]) * standWs
    print("standRegress error Size is:", regression.rssError(abY[100:199], standyHat.T.A))
import LoadData
import RidgeRegression
import regression
import matplotlib.pyplot as plt
from numpy import *

# Ridge-regression trace over 30 lambda values on abalone.txt,
# compared against plain least squares, then plotted.
X, Y, attNum, trainingSampleNum = LoadData.loadDataSet('abalone.txt')
featStd = std(X, 0)       # per-feature std-dev BEFORE standardisation
targetMean = mean(Y, 0)   # mean of Y BEFORE centring (a 1x1 matrix)
XStand, YCentered = LoadData.standardize(X), LoadData.centered(Y)

testNum = 30
wMat = RidgeRegression.ridgeTest(XStand, YCentered, testNum)
for i in range(testNum):
    theta = mat(wMat[i, :]).T
    YHat1 = XStand * theta + targetMean   # broadcasting adds the mean back
    print("参数", i, "的岭回归的Error为", LoadData.rssError(Y, YHat1))

wMat = wMat / featStd     # rescale coefficients back to the raw-feature units
xOne = LoadData.addAllOneColumn(X)
thetaStd = regression.standRegres(xOne, Y)
YHat2 = xOne * thetaStd
print("标准线性回归的Error为", LoadData.rssError(Y, YHat2))

# Ridge trace: coefficient paths as lambda varies
lambdas = [i - 10 for i in range(testNum)]
plt.plot(lambdas, wMat)
plt.show()
#!usr/bin/python
#coding=utf8
import regression
from numpy import *
import matplotlib.pyplot as plt

# Fit standard regression on ex0.txt and scatter the raw data points.
samples, targets = regression.loadDataSet('ex0.txt')
weights = regression.standRegres(samples, targets)

sampleMat = mat(samples)
targetMat = mat(targets)

canvas = plt.figure()
panel = canvas.add_subplot(111)
panel.scatter(sampleMat[:, 1].flatten().A[0],
              targetMat.T[:, 0].flatten().A[0], s=2, c='red')

# (An LWLR prediction/plot section was left commented out in the original.)
sortedSamples = sampleMat.copy()
sortedSamples.sort(0)
from numpy import *
import regression as regress
import matplotlib.pyplot as plt

# Non-interactive backend: figures are rendered without a display
plt.switch_backend('agg')

if __name__ == "__main__":
    # kernel width for LWLR, read from the user as a string
    k = input("input the k:")
    filename = "data/ex0.txt"
    xArr, yArr = regress.loadDataSet(filename)
    ws = regress.standRegres(xArr, yArr)
    # xMat is an n*2 matrix (bias column + one feature)
    xMat = mat(xArr)
    # argsort on column 1 gives the row order srtInd that sorts by x
    srtInd = xMat[:, 1].argsort(0)
    # apply that order; the extra [:, 0, :] squeezes the fancy-index axis
    xSort = xMat[srtInd][:, 0, :]
    yMat = mat(yArr)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    # NOTE(review): this scatter call is truncated in this chunk — the
    # remaining arguments continue beyond the visible source.
    ax.scatter(xMat[:, 1].flatten().A[0], yMat.T[:, 0].flatten().A[0], s=2,
    # NOTE(review): this chunk starts mid-function — the enclosing def (a
    # loadDataSet-style reader with fr, numFeat, dataMat, labelMat bound
    # above) begins outside this view. Python-2 print statements below are
    # left untouched.
    for line in fr.readlines():
        lineArr = []
        curLine = line.strip().split('\t')
        # first numFeat tab-separated fields are features ...
        for i in range(numFeat):
            lineArr.append(float(curLine[i]))
        dataMat.append(lineArr)
        # ... and the last field is the target value
        labelMat.append(float(curLine[-1]))
    return dataMat, labelMat


x, y = loadDataSet('ex0.txt')
ws = regression.standRegres(x, y)
ws2 = regression.gradRegres(x, y)
#ws3 = regression.gradRegressMatrix(x, y)
print ws
print ws2
#print ws3
xArr = np.asarray(x)
yArr = np.asarray(y)
# element-wise product against the coefficient array (not a matrix product)
yHat = xArr * ws
print np.corrcoef(yHat.T, yArr)
fig = plt.figure()
ax = fig.add_subplot(111)
import numpy as np
import matplotlib as cm
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import regression as re

if __name__ == '__main__':
    # Week-2 experiment data from Coursera's "Machine Learning"
    X, y = re.loadDataSet("data/ex1.txt")
    rows, cols = X.shape
    # prepend the bias column of ones
    X = np.concatenate((np.ones((rows, 1)), X), axis=1)

    theta, timeConsumed = re.standRegres(X, y)
    print('消耗[%s] s \n 参数矩阵:\n %s' % (timeConsumed, theta))

    canvas = plt.figure()
    title = 'StandRegress time: %s' % timeConsumed
    panel = canvas.add_subplot(111, title=title)
    trainingSet = panel.scatter(X[:, 1].flatten().A[0],
                                y[:, 0].flatten().A[0])

    # draw the fitted line over x sorted ascending
    sortedX = X.copy()
    sortedX.sort(0)
    predicted = sortedX * theta
    fittingLine, = panel.plot(sortedX[:, 1], predicted, color='g')

    panel.set_xlabel('Population of City in 10,000s')
    panel.set_ylabel('Profit in $10,000s')
    plt.legend([trainingSet, fittingLine], ['Training Set', 'Linear Regression'])
    plt.show()
# NOTE(review): this chunk starts mid-script — abX, abY and yHat01 are
# defined above this view.
yHat1 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 1.0)
yHat10 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 10)
error01 = regression.rssError(abY[0:99], yHat01)
error1 = regression.rssError(abY[0:99], yHat1)
error10 = regression.rssError(abY[0:99], yHat10)
# Conclusion: a smaller kernel gives a lower TRAINING error,
# but it overfits and need not predict new data best.
print("error01 is %s" % error01)    # error01 is 56.7862596807
print("error1 is %s" % error1)      # error1 is 429.89056187
print("error10 is %s" % error10)    # error10 is 549.118170883

# Same kernels evaluated on held-out rows 100..198
yyHat01 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 0.1)
yyHat1 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 1.0)
yyHat10 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 10)
eerror01 = regression.rssError(abY[100:199], yyHat01)
eerror1 = regression.rssError(abY[100:199], yyHat1)
eerror10 = regression.rssError(abY[100:199], yyHat10)
print("eerror01 is %s" % eerror01)  # eerror01 is 33652.8973161
print("eerror1 is %s" % eerror1)    # eerror1 is 573.52614419
print("eerror10 is %s" % eerror10)  # eerror10 is 517.571190538
# On new data, k=10 performs best.

# Compare with plain linear regression.
# Conclusion: models must be compared on unseen data to pick the best one.
ws = regression.standRegres(abX[0:99], abY[0:99])  # train on first 100 rows
yHat = mat(abX[100:199]) * ws
errorLine = regression.rssError(abY[100:199], yHat.T.A)
print("errorLine is %s" % errorLine)  # errorLine is 518.636315324
import regression
from numpy import *

# Standard regression on ex0.txt: fit, plot, and report correlation.
features, labels = regression.loadDataSet('ex0.txt')
coeffs = regression.standRegres(features, labels)

featMat = mat(features)
labelMat = mat(labels)
predictions = featMat * coeffs

import matplotlib.pyplot as plt
canvas = plt.figure()
panel = canvas.add_subplot(111)
panel.scatter(featMat[:, 1].flatten().A[0], labelMat.T[:, 0].flatten().A[0])

# sort x ascending so the fitted line plots cleanly
ordered = featMat.copy()
ordered.sort(0)
predictions = ordered * coeffs
panel.plot(ordered[:, 1], predictions)
#plt.show()

# correlation between fitted values and the true targets
predictions = featMat * coeffs
print(corrcoef(predictions.T, labelMat))
# regressionTest.py import regression from numpy import * import matplotlib.pyplot as plt xArray, yArray = regression.loadDataSet('ex0.txt') xArray = array(xArray, dtype=float) yArray = array(yArray, dtype=float) # print(xArray[0 : 2]) # print(yArray[0]) # print(regression.lwlr(xArray[0], xArray, yArray, 1.0)) ws = regression.standRegres(xArray, yArray) # print(ws) xMat = mat(xArray) yMat = mat(yArray) yHat = xMat * ws # print(corrcoef(yHat.T, yMat)) ''' fig = plt.figure() ax = fig.add_subplot(111) ax.scatter(xMat[:, 1].flatten().A[0], yMat.T[:, 0].flatten().A[0]) xCopy = xMat.copy() xCopy.sort(0) yHat = xCopy * ws
#!/usr/bin/python
import regression
from numpy import *

# Fit standard regression on ex0.txt and plot data + fitted line.
samples, targets = regression.loadDataSet('ex0.txt')
weights = regression.standRegres(samples, targets)

sampleMat = mat(samples)
targetMat = mat(targets)
fitted = sampleMat * weights

import matplotlib.pyplot as plt
canvas = plt.figure()
panel = canvas.add_subplot(111)
panel.scatter(sampleMat[:, 1].flatten().A[0],
              targetMat.T[:, 0].flatten().A[0])

# re-fit over sorted x so the line is drawn monotonically
ordered = sampleMat.copy()
ordered.sort(0)
fitted = ordered * weights
panel.plot(ordered[:, 1], fitted)
plt.show()
    # NOTE(review): this chunk starts mid-function — this is the tail of an
    # rssError(yArr, yHatArr) definition whose header is above this view.
    # Residual sum of squares between truth and prediction.
    return ((yArr - yHatArr) ** 2).sum()


if __name__ == '__main__':
    plotlwlrRegression()
    # Predict abalone age: LWLR with three kernel widths, on training data
    # and on held-out data, then compared with plain least squares.
    abX, abY = rg.loadDataSet('abalone.txt')
    print('训练集与测试集相同:局部加权线性回归,核k的大小对预测的影响:')
    yHat01 = lwlrTest(abX[0:99], abX[0:99], abY[0:99], 0.1)
    yHat1 = lwlrTest(abX[0:99], abX[0:99], abY[0:99], 1)
    yHat10 = lwlrTest(abX[0:99], abX[0:99], abY[0:99], 10)
    print('k=0.1时,误差大小为:', rssError(abY[0:99], yHat01.T))
    print('k=1  时,误差大小为:', rssError(abY[0:99], yHat1.T))
    print('k=10 时,误差大小为:', rssError(abY[0:99], yHat10.T))
    print('')
    print('训练集与测试集不同:局部加权线性回归,核k的大小是越小越好吗?更换数据集,测试结果如下:')
    yHat01 = lwlrTest(abX[100:199], abX[0:99], abY[0:99], 0.1)
    yHat1 = lwlrTest(abX[100:199], abX[0:99], abY[0:99], 1)
    yHat10 = lwlrTest(abX[100:199], abX[0:99], abY[0:99], 10)
    print('k=0.1时,误差大小为:', rssError(abY[100:199], yHat01.T))
    print('k=1  时,误差大小为:', rssError(abY[100:199], yHat1.T))
    print('k=10 时,误差大小为:', rssError(abY[100:199], yHat10.T))
    print('')
    print('训练集与测试集不同:简单的线性归回与k=1时的局部加权线性回归对比:')
    print('k=1时,误差大小为:', rssError(abY[100:199], yHat1.T))
    # Plain least squares trained on rows 0..98, tested on 100..198
    ws = rg.standRegres(abX[0:99], abY[0:99])
    yHat = np.mat(abX[100:199]) * ws
    print('简单的线性回归误差大小:', rssError(abY[100:199], yHat.T.A))
# LWLR kernel-width study on abalone data, then a comparison with
# plain least squares (Machine Learning in Action, ch. 8).
xArr, yArr = regression.loadDataSet(
    r'C:\Users\v_wangdehong\PycharmProjects\MachineLearning_V\Regression\data\abalone.txt'
)

# Evaluate on the first 99 rows (same rows used for training)
trainHat01 = regression.lwlrTest(xArr[0:99], xArr[0:99], yArr[0:99], 0.1)
trainHat1 = regression.lwlrTest(xArr[0:99], xArr[0:99], yArr[0:99], 1)
trainHat10 = regression.lwlrTest(xArr[0:99], xArr[0:99], yArr[0:99], 10)
print(regression.rssError(yArr[0:99], trainHat01))  # 56.7842091184
print(regression.rssError(yArr[0:99], trainHat1))   # 429.89056187
print(regression.rssError(yArr[0:99], trainHat10))  # 549.118170883
"""
从上面可以看到,使用较小的核将得到较低的误差,那么为什么不在所有数据集上都使用最小的核呢?
因为使用最小的核将造成过拟合,对新数据不一定能达到最好的效果,下面就看看它在新数据上的表现
"""
# Evaluate on held-out rows 100..198
testHat01 = regression.lwlrTest(xArr[100:199], xArr[0:99], yArr[0:99], 0.1)
testHat1 = regression.lwlrTest(xArr[100:199], xArr[0:99], yArr[0:99], 1)
testHat10 = regression.lwlrTest(xArr[100:199], xArr[0:99], yArr[0:99], 10)
print(regression.rssError(yArr[100:199], testHat01))  # 25119.4591112
print(regression.rssError(yArr[100:199], testHat1))   # 573.52614419
print(regression.rssError(yArr[100:199], testHat10))  # 517.571190538
"""
从上面结果可以看到,核大小等于10时测试误差最小,但是它在训练集上的误差却是最大的。
接下来再和简单的线性回归做个比较。
"""
ws = regression.standRegres(xArr[0:99], yArr[0:99])
yHat = mat(xArr[100:199]) * ws  # shape (99, 1)
print(regression.rssError(yArr[100:199], yHat.T.A))
"""
简单的线性回归达到了局部加权线性回归类似的效果。这也表明了一点,必须在未知数据上比较效果才能选取到最佳模型。
"""
# NOTE(review): this chunk starts mid-script — abX, abY, yHat01, yHat1 and
# yHat10 are defined above this view.
error01 = regression.rssError(abY[0:99], yHat01)
error1 = regression.rssError(abY[0:99], yHat1)
error10 = regression.rssError(abY[0:99], yHat10)
# Conclusion: a smaller kernel yields a lower TRAINING error, but it
# overfits and need not predict new data best.
print ("error01 is %s" % error01)   # error01 is 56.7862596807
print ("error1 is %s" % error1)     # error1 is 429.89056187
print ("error10 is %s" % error10)   # error10 is 549.118170883

# Same kernels evaluated on held-out rows 100..198
yyHat01 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 0.1)
yyHat1 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 1.0)
yyHat10 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 10)
eerror01 = regression.rssError(abY[100:199], yyHat01)
eerror1 = regression.rssError(abY[100:199], yyHat1)
eerror10 = regression.rssError(abY[100:199], yyHat10)
print ("eerror01 is %s" % eerror01)  # eerror01 is 33652.8973161
print ("eerror1 is %s" % eerror1)    # eerror1 is 573.52614419
print ("eerror10 is %s" % eerror10)  # eerror10 is 517.571190538
# On new data, k=10 performs best.

# Compare with plain linear regression.
# Conclusion: models must be compared on unseen data to pick the best one.
ws = regression.standRegres(abX[0:99], abY[0:99])  # train on first 100 rows
yHat = mat(abX[100:199]) * ws
errorLine = regression.rssError(abY[100:199], yHat.T.A)
print ("errorLine is %s" % errorLine)  # errorLine is 518.636315324
# LWLR error study on abalone data, ridge-trace plot, and a least-squares
# comparison. Fix: Python-2 ``print`` statements converted to functions.
abX, abY = regression.loadDataSet('abalone.txt')

# LWLR on the training rows themselves
yHat01 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 0.1)
yHat1 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 1)
yHat10 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 10)
print(regression.rssError(abY[0:99], yHat01.T))
print(regression.rssError(abY[0:99], yHat1.T))
print(regression.rssError(abY[0:99], yHat10.T))

# LWLR on held-out rows 100..198
yHat01 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 0.1)
yHat1 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 1)
yHat10 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 10)
print(regression.rssError(abY[100:199], yHat01.T))
print(regression.rssError(abY[100:199], yHat1.T))
print(regression.rssError(abY[100:199], yHat10.T))

# Plain least squares on the same split
ws = regression.standRegres(abX[0:99], abY[0:99])
yHat = mat(abX[100:199]) * ws
print(regression.rssError(abY[100:199], yHat.T.A))

# Ridge trace over the whole data set
ridgeWeights = regression.ridgeTest(abX, abY)
#print(ridgeWeights)
import matplotlib.pyplot as plt
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(ridgeWeights)
plt.show()

xArr, yArr = regression.loadDataSet('abalone.txt')
#regression.stageWise(xArr, yArr, 0.01, 200)
#regression.stageWise(xArr, yArr, 0.001, 5000)
    # NOTE(review): this chunk starts mid-function — it is the tail of a
    # stageWise(xArr, yArr, eps, numIter)-style definition; xMat, yMat, n,
    # eps and numIter are bound above this view.
    returnMat = zeros((numIter, n))   # coefficient history, one row per iter
    ws = zeros((n, 1))
    wsMax = ws.copy()
    for i in range(numIter):
        print(ws.T)
        lowestError = inf
        # greedily nudge each coefficient by +/- eps and keep the best move
        for j in range(n):
            for sign in [-1, 1]:
                wsTest = ws.copy()
                wsTest[j] += eps * sign
                yTest = xMat * wsTest
                rssE = rssError(yMat.A, yTest.A)
                if rssE < lowestError:
                    lowestError = rssE
                    wsMax = wsTest
        ws = wsMax.copy()
        returnMat[i, :] = ws.T
    return returnMat


if __name__ == '__main__':
    print('Forward Step-wise Regression: ')
    xArr, yArr = regression.loadDataSet('abalone.txt')
    stageWise(xArr, yArr, 0.005, 5000)
    print('Standard Regression: ')
    # standardise X and centre y so the two fits are comparable
    weights = regression.standRegres(regularize(mat(xArr)),
                                     (mat(yArr).T - mean(mat(yArr).T, 0)).T)
    print(weights.T)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'8.3 mechinelearing in action'

__author__ = 'lxp'

import regression
import numpy as np

# LWLR with three kernel widths on abalone data, evaluated on the
# training rows and on held-out rows, then compared with least squares.
abX, abY = regression.loadDataSet('abalone.txt')

for kernel in (0.1, 1, 10):
    fitted = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], kernel)
    print(regression.rssError(abY[0:99], fitted.T))

for kernel in (0.1, 1, 10):
    fitted = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], kernel)
    print(regression.rssError(abY[100:199], fitted.T))

# plain least squares on the same train/test split
ws = regression.standRegres(abX[0:99], abY[0:99])
yHat = np.mat(abX[100:199]) * ws
print(regression.rssError(abY[100:199], yHat.T.A))
import LoadData
import ForwardStagewiseR
import regression
import matplotlib.pyplot as plt
from numpy import *

# Forward-stagewise regression on abalone.txt, compared with plain least
# squares, then the coefficient paths are plotted.
X, Y, attNum, trainingSampleNum = LoadData.loadDataSet('abalone.txt')
featStd = std(X, 0)       # per-feature std-dev BEFORE standardisation
targetMean = mean(Y, 0)   # mean of Y BEFORE centring
XStand, YCentered = LoadData.standardize(X), LoadData.centered(Y)

numIt = 5000
allWS = ForwardStagewiseR.forwardStagewiseR(XStand, YCentered, 0.005, numIt)
YHat1 = XStand * (mat(allWS[numIt - 1]).T) + targetMean
allWS = allWS / featStd   # rescale coefficients to raw-feature units
print("前向逐步回归系数最后一次迭代系数为:", allWS[numIt - 1])
print("前向逐步回归的Error为", LoadData.rssError(Y, YHat1))

# Plain least squares on the raw features plus a bias column
xOne = LoadData.addAllOneColumn(X)
thetaStd = regression.standRegres(xOne, Y)
print('线性回归系数为:', thetaStd.T)
YHat2 = xOne * thetaStd
print("标准线性回归的Error为", LoadData.rssError(Y, YHat2))

# coefficient paths over the iterations
plt.plot(range(numIt), allWS)
plt.show()
# -*- coding: utf-8 -*-
"""
Created on Fri May 12 16:07:29 2017

@author: 凯风
"""

import regression
from numpy import *
from imp import reload
import matplotlib.pyplot as plt

reload(regression)  # pick up edits made to regression.py during the session

# fit standard regression on ex0.txt
features, targets = regression.loadDataSet('ex0.txt')
features[0:2]
coeffs = regression.standRegres(features, targets)  # regression weights
coeffs

featMat = mat(features)
targetMat = mat(targets)
fitLine = featMat * coeffs  # fitted values

# scatter the raw points and draw the fitted line over sorted x
canvas = plt.figure()
panel = canvas.add_subplot(111)
panel.scatter(featMat[:, 1].flatten().A[0], targetMat.T[:, 0].flatten().A[0])
ordered = featMat.copy()
ordered.sort(0)
fitLine = ordered * coeffs
panel.plot(ordered[:, 1], fitLine)
plt.show()
    # NOTE(review): this chunk starts mid-function — unStandCoff, xMean,
    # yMean and YHat are bound above this view (ridge-regression helper
    # that unscales coefficients back to the raw-feature units).
    print("岭回归的系数(已经缩放至原尺寸):", unStandCoff)
    # intercept recovered from the standardisation: y_mean - sum(w_i * x_mean_i)
    print("岭回归的截距(已经缩放至原尺寸):", -1 * sum(multiply(unStandCoff, xMean)) + yMean)
    print('岭回归RSS:', LoadData.rssError(Y, YHat))


if __name__ == '__main__':
    lgX = []
    lgY = []
    setDataCollect(lgX, lgY)
    lgX = mat(lgX)
    lgY = mat(lgY).T
    m = lgX.shape[0]
    n = lgX.shape[1]
    # add a constant-1 column so the first coefficient is the intercept
    lgX1 = LoadData.addAllOneColumn(lgX)
    print('属性个数:', n)
    print('训练实例个数:', m)
    set_printoptions(suppress=True)  # print floats without scientific notation
    print('属性矩阵:', lgX)
    print('类标签矩阵矩阵:', lgY.T)
    # theta is n*1; training data is reused as test data here
    theta = regression.standRegres(lgX1, lgY)
    print('线性回归系数为:', theta)
    YHat = lgX1 * theta  # YHat is m*1
    # for i in range(m):
    #     print('真实值:', lgY[i], '预测值:', YHat[i])
    print('标准线性回归RSS:', LoadData.rssError(lgY, YHat))
    crossValidationRidgeRegression(lgX, lgY)
# (Disabled: stagewise-vs-standard comparison)
# xMat=regression.regularize(xMat)
# yM = mean(yMat,0)
# yMat =yMat-yM
# weights=regression.standRegres(xMat,yMat.T)
# print(weights.T)

# (Disabled: code that produces figure 8-7)
# xArr,yArr =regression.loadDataSet('abalone.txt')
# rightweights=regression.stageWise(xArr,yArr,0.005,1000)
# import matplotlib.pyplot as plt
# fig = plt.figure()
# ax=fig.add_subplot(111)
# ax.plot(rightweights)
# plt.show()

# ---- LEGO set price prediction ----
import legoAPI

setFeatures = []
setPrices = []
# The original LEGO URLs are dead; legoAPI parses the local pages in setHtml/
legoAPI.setDataCollect(setFeatures, setPrices)
# regression.scrapePage('./setHtml/lego10030.html','out.txt', 2002, 3096, 269.99)

# prepend a bias column of ones to the 4 collected features
augmented = mat(ones((63, 5)))
augmented[:, 1:5] = mat(setFeatures)
print(setFeatures[0])
print(augmented[0])

# ordinary least squares
ws = regression.standRegres(augmented, setPrices)
print('ws', end='=')
print(ws)
# print('lgx1[0]*ws',end='=');print(lgx1[0]*ws)
# print('lgx1[0]*ws',end='=');print(lgx1[-1]*ws)
# print('lgx1[0]*ws',end='=');print(lgx1[43]*ws)

regression.crossValidation(setFeatures, setPrices, 10)
print(regression.ridgeTest(setFeatures, setPrices))
# Walk through the book's standard-regression demo on ex0.txt.
import regression
from numpy import *

# load the data set
samples, targets = regression.loadDataSet('ex0.txt')
samples[0:2]

# fit standard regression
weights = regression.standRegres(samples, targets)
weights
sampleMat = mat(samples)
targetMat = mat(targets)
fitted = sampleMat * weights

# scatter plot of the data plus the best-fit line
import matplotlib.pyplot as plt
canvas = plt.figure()
panel = canvas.add_subplot(111)
panel.scatter(sampleMat[:, 1].flatten().A[0],
              targetMat.T[:, 0].flatten().A[0])
# sort the points ascending before drawing the line
ordered = sampleMat.copy()
ordered.sort(0)
fitted = ordered * weights
panel.plot(ordered[:, 1], fitted)
plt.show()

# correlation between prediction and truth
fitted = sampleMat * weights
corrcoef(fitted.T, targetMat)
import matplotlib.pyplot as plt
import LoadData
import regression
from numpy import *

# Standard regression on abalone.txt: fit, report per-row predictions,
# plot, and print correlation + RSS.
# X is m*n (m rows, n attributes), Y is m*1.
X, Y, attNum, trainingSampleNum = LoadData.loadDataSet('abalone.txt')

theta = regression.standRegres(X, Y)  # theta is n*1; train data reused as test
print('线性回归系数为:', theta)
YHat = X * theta                      # YHat is m*1
print('标准线性回归预测y值的转置为:', YHat.T)
for row in range(trainingSampleNum):
    print('真实值:', Y[row], '预测值:', YHat[row])

# raw data as a scatter plot
canvas = plt.figure()
panel = canvas.add_subplot(1, 1, 1)
# X[:, 1] is m*1; flatten() makes it 1*m, .A[0] extracts the 1-D array
panel.scatter(X[:, 1].flatten().A[0].T, Y.flatten().A[0].T)

# fitted values drawn as a red line over the same x column
panel.plot(X[:, 1].flatten().A[0], YHat.flatten().A[0], 'r-')
plt.show()

print('皮尔逊积矩相关系数:', corrcoef(YHat.T, Y.T))
print('RSS:', LoadData.rssError(Y, YHat))
import regression

# Ordinary least squares on the scraped LEGO data; spot-check three rows.
setFeatures = []
setPrices = []
regression.setDataCollect(setFeatures, setPrices)
print(shape(setFeatures))

# prepend a bias column of ones to the 4 features (58 sets)
augmented = mat(ones((58, 5)))
augmented[:, 1:5] = mat(setFeatures)
print(setFeatures[0])
print(augmented[0])

ws = regression.standRegres(augmented, setPrices)
print(ws)
# predictions for the first, last, and 44th set
print(augmented[0] * ws)
print(augmented[-1] * ws)
print(augmented[43] * ws)
import LoadData
import LocallyWeightedLR
import regression
from numpy import *

# Compare LWLR (k=1, k=10) against plain least squares on held-out
# abalone rows 100..198, trained on the first 99 rows.
X, Y, attNum, trainingSampleNum = LoadData.loadDataSet('abalone.txt')

# (Disabled: k=0.1 runs into singular xTWX matrices, e.g. at instance 129)
# YHat01=LocallyWeightedLR.testLocallyWeightedLR(X[100:199],X[:99],Y[:99],0.1)
# theta01=LocallyWeightedLR.LocallyWeightedLR(X[129],X[:99],Y[:99],0.1)

YHat1 = LocallyWeightedLR.testLocallyWeightedLR(X[100:199], X[:99], Y[:99], 1)
YHat10 = LocallyWeightedLR.testLocallyWeightedLR(X[100:199], X[:99], Y[:99], 10)
# least-squares weights applied to the held-out feature rows
YStandR = X[100:199] * regression.standRegres(X[:99], Y[:99])

print("k=1目标函数为:", LoadData.rssError(Y[100:199], YHat1))
print("k=10目标函数为:", LoadData.rssError(Y[100:199], YHat10))
print("标准线性回归目标函数为:", LoadData.rssError(Y[100:199], YStandR))
#coding:utf-8
# Standard regression on ex0.txt with scatter + fitted-line plot.
# Fix: Python-2 ``print`` statements converted to functions.
from numpy import *
import regression

xArr, yArr = regression.loadDataSet('ex0.txt')
#print(xArr[0:2])
ws = regression.standRegres(xArr, yArr)
print(ws)

# predictions using the fitted coefficients
xMat = mat(xArr)
yMat = mat(yArr)
yHat = xMat * ws

# scatter plot of the data set plus the best-fit line
import matplotlib.pyplot as plt
fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(xMat[:, 1].flatten().A[0], yMat.T[:, 0].flatten().A[0])

# the line must be drawn over x sorted ascending, otherwise the plot
# zig-zags; sort(0) sorts each column in place along axis 0
xCopy = xMat.copy()
xCopy.sort(0)
yHat = xCopy * ws
ax.plot(xCopy[:, 1], yHat)
plt.show()

# single-point sanity check
print(yArr[0])
# -*- coding: utf-8 -*-
import regression
from numpy import *

# Standard regression on ex0.txt: fit, plot, and correlation check.
samples, targets = regression.loadDataSet('ex0.txt')
weights = regression.standRegres(samples, targets)
sampleMat = mat(samples)
targetMat = mat(targets)

import matplotlib.pyplot as plt
canvas = plt.figure()
panel = canvas.add_subplot(111)
# flatten() collapses the matrix column to a 1-row matrix; .A[0] is the array
panel.scatter(sampleMat[:, 1].flatten().A[0],
              targetMat.T[:, 0].flatten().A[0])
ordered = sampleMat.copy()
ordered.sort(0)
fitted = ordered * weights
panel.plot(ordered[:, 1], fitted)
plt.show()

sampleMat = mat(samples)
targetMat = mat(targets)
fitted = sampleMat * weights
corrcoef(fitted.T, targetMat)  # correlation coefficient matrix

# Locally-weighted linear regression on a single point
import regression
from numpy import *

samples, targets = regression.loadDataSet('ex0.txt')
targets[0]
regression.lwlr(samples[0], samples, targets, 1.0)
import regression
from numpy import *

# Forward-stagewise regression on abalone.txt at two step sizes, then the
# least-squares coefficients on the same standardised data for comparison.
xArr, yArr = regression.loadDataSet('abalone.txt')
regression.stageWise(xArr, yArr, 0.01, 200)
regression.stageWise(xArr, yArr, 0.001, 5000)

featMat = mat(xArr)
targetMat = mat(yArr).T
# standardise X and centre y so the fits are on the same footing
featMat = regression.regularize(featMat)
targetMat = targetMat - mean(targetMat, 0)
weights = regression.standRegres(featMat, targetMat.T)
print(weights.T)
# LWLR error study on abalone data, ridge-trace plot, and a least-squares
# comparison. Fix: Python-2 ``print`` statements converted to functions.
abX, abY = regression.loadDataSet('abalone.txt')

# LWLR on the training rows themselves
yHat01 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 0.1)
yHat1 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 1)
yHat10 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 10)
print(regression.rssError(abY[0:99], yHat01.T))
print(regression.rssError(abY[0:99], yHat1.T))
print(regression.rssError(abY[0:99], yHat10.T))

# LWLR on held-out rows 100..198
yHat01 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 0.1)
yHat1 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 1)
yHat10 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 10)
print(regression.rssError(abY[100:199], yHat01.T))
print(regression.rssError(abY[100:199], yHat1.T))
print(regression.rssError(abY[100:199], yHat10.T))

# Plain least squares on the same split
ws = regression.standRegres(abX[0:99], abY[0:99])
yHat = mat(abX[100:199]) * ws
print(regression.rssError(abY[100:199], yHat.T.A))

# Ridge trace over the whole data set
ridgeWeights = regression.ridgeTest(abX, abY)
#print(ridgeWeights)
import matplotlib.pyplot as plt
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(ridgeWeights)
plt.show()

xArr, yArr = regression.loadDataSet('abalone.txt')
#regression.stageWise(xArr, yArr, 0.01, 200)
#regression.stageWise(xArr, yArr, 0.001, 5000)