import regression


def best_k(xArr, yArr):
    """Narrow the interval [k_small, k_big] around its midpoint to find the
    LWLR kernel width k with the lowest RSS on (xArr, yArr)."""
    k_small = 0.01
    k_big = 2.0
    k = (k_big + k_small) / 2.0
    while True:
        yHat = regression.lwlrTest(xArr, xArr, yArr, k)
        yHat_small = regression.lwlrTest(xArr, xArr, yArr, k_small)
        yHat_big = regression.lwlrTest(xArr, xArr, yArr, k_big)
        re = regression.rssError(yArr, yHat)
        re_small = regression.rssError(yArr, yHat_small)
        re_big = regression.rssError(yArr, yHat_big)
        if re_small > re and re_big > re:
            # the midpoint beats both ends: shrink the interval from both sides
            k_big = k + (k_big - k) / 2.0
            k_small = k_small + (k - k_small) / 2.0
        elif re_small > re and re_big < re:
            # larger k is better: raise the lower bound to the midpoint
            k_small = k
            k = (k_big + k_small) / 2.0
        elif re_small < re and re_big > re:
            # smaller k is better: lower the upper bound to the midpoint
            k_big = k
            k = (k_big + k_small) / 2.0
        else:
            k_big = k + (k_big - k) / 2.0
            k_small = k_small + (k - k_small) / 2.0
        if k_big - k_small < 0.01:
            k = k_small
            break
    return k
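# A minimal driver for best_k: a sketch, assuming the book's regression.py
# (loadDataSet, lwlrTest, rssError) and abalone.txt are on the path. Because
# best_k minimizes training-set RSS it tends to drift toward small kernels,
# so the held-out check below is a sensible sanity test.
abX, abY = regression.loadDataSet('abalone.txt')
k = best_k(abX[0:99], abY[0:99])
print('best k on the training slice:', k)
# score the chosen k on points the search never saw
yHat = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], k)
print('held-out RSS:', regression.rssError(abY[100:199], yHat.T))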
from numpy import *
import regression
import ridge_regression


def crossValidation(xArr, yArr, numVal=10):
    """
    Cross-validation test for ridge regression
    :param xArr: feature data
    :param yArr: class labels
    :param numVal: number of cross-validation runs; defaults to 10
    :return:
    """
    # number of data points
    m = len(yArr)
    indexList = arange(m)
    errorMat = zeros((numVal, 30))  # error matrix: numVal rows, 30 columns
    # main loop
    for i in range(numVal):
        # containers for the training and test sets
        trainX = []
        trainY = []
        testX = []
        testY = []
        # shuffle indexList with numpy's shuffle so the training and
        # test points are picked at random
        random.shuffle(indexList)
        # split into training and test sets
        for j in range(m):
            # put 90% of the data into the training set
            if j < m * 0.9:
                trainX.append(xArr[indexList[j]])
                trainY.append(yArr[indexList[j]])
            else:
                testX.append(xArr[indexList[j]])
                testY.append(yArr[indexList[j]])
        # ridge regression returns a matrix of 30 sets of coefficients
        wMat = ridge_regression.ridgeTest(trainX, trainY)
        # loop over the 30 coefficient sets
        for k in range(30):
            # convert the training and test sets to matrices
            matTestX = mat(testX)
            matTrainX = mat(trainX)
            # standardize the test data with the training mean and variance
            meanTrain = mean(matTrainX, 0)
            varTrain = var(matTrainX, 0)
            matTestX = (matTestX - meanTrain) / varTrain
            # evaluate the regression and store the error
            yEst = matTestX * mat(wMat[k, :]).T + mean(trainY)
            errorMat[i, k] = regression.rssError(yEst.T.A, array(testY))
    # average the error estimates over the runs
    meanErrors = mean(errorMat, 0)
    minMean = float(min(meanErrors))
    bestWeights = wMat[nonzero(meanErrors == minMean)]
    # To compare the resulting coefficients with standRegres(), the averaged
    # errors above are used. Note that ridge regression standardizes the data
    # while standRegres() does not, so the coefficients must be un-standardized
    # before the two models can be compared.
    xMat = mat(xArr)
    yMat = mat(yArr).T
    meanX = mean(xMat, 0)
    varX = var(xMat, 0)
    unReg = bestWeights / varX
    print("the best model from Ridge Regression is:\n", unReg)
    print("with constant term: ", -1 * sum(multiply(meanX, unReg)) + mean(yMat))
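# Usage sketch for crossValidation, assuming the same book modules
# (regression, ridge_regression) and abalone.txt used elsewhere in this file.
abX, abY = regression.loadDataSet('abalone.txt')
# 10 random 90/10 splits; prints the best un-standardized ridge model
crossValidation(abX, abY, 10)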
import regression


def test1():
    abX, abY = regression.loadDataSet('abalone.txt')
    yHat01 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 0.1)
    yHat1 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 1)
    yHat10 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 10)
    print(regression.rssError(abY[0:99], yHat01.T))
    print(regression.rssError(abY[0:99], yHat1.T))
    print(regression.rssError(abY[0:99], yHat10.T))
    print('-------------------------------------------')
    yHat01 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 0.1)
    yHat1 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 1)
    yHat10 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 10)
    print(regression.rssError(abY[100:199], yHat01.T))
    print(regression.rssError(abY[100:199], yHat1.T))
    print(regression.rssError(abY[100:199], yHat10.T))
from numpy import *
import regression


def stageWise(xArr, yArr, eps=0.01, numIt=100):
    """
    Forward stagewise regression
    :param xArr: sample features
    :param yArr: class labels
    :param eps: step size applied at each iteration
    :param numIt: number of iterations
    :return:
    """
    xMat = mat(xArr)
    yMat = mat(yArr).T
    yMean = mean(yMat, 0)
    yMat = yMat - yMean
    # regularize() standardizes the features; assumed to come from the
    # book's regression.py
    xMat = regularize(xMat)
    m, n = shape(xMat)
    # numIt x n matrix of zeros to record the weights at each iteration
    returnMat = zeros((numIt, n))
    # n x 1 vector holding the current weights w
    ws = zeros((n, 1))
    wsMax = ws.copy()
    # start iterating
    for i in range(numIt):
        print(ws.T)
        lowestError = inf
        # loop over every feature
        for j in range(n):
            for sign in [-1, 1]:
                wsTest = ws.copy()
                # perturb one coefficient to get a new w
                wsTest[j] += eps * sign
                # compute the error under the new w
                yTest = xMat * wsTest
                rssE = regression.rssError(yMat.A, yTest.A)
                # if the error beats the current lowest, keep wsTest;
                # otherwise leave w unchanged
                if rssE < lowestError:
                    lowestError = rssE
                    wsMax = wsTest
        ws = wsMax.copy()
        returnMat[i, :] = ws.T
    return returnMat
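# A sketch of running stageWise and plotting the coefficient paths, as the
# book's demo does; matplotlib and abalone.txt are assumed available.
import matplotlib.pyplot as plt

xArr, yArr = regression.loadDataSet('abalone.txt')
weights = stageWise(xArr, yArr, 0.005, 1000)
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(weights)  # one curve per feature, showing how each coefficient evolves
plt.show()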
from numpy import mat
import regression


def abaloneTest():
    """
    Predict the age of abalone
    Description: Machine Learning in Action, example 8.3
    INPUT: none
    OUTPUT: none
    """
    # load the data
    abX, abY = regression.loadDataSet("./data/abalone.txt")
    # predict with different kernel widths
    oldyHat01 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 0.1)
    oldyHat1 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 1)
    oldyHat10 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 10)
    # print the error between each kernel's predictions and the training labels
    print("old yHat01 error Size is :", regression.rssError(abY[0:99], oldyHat01.T))
    print("old yHat1 error Size is :", regression.rssError(abY[0:99], oldyHat1.T))
    print("old yHat10 error Size is :", regression.rssError(abY[0:99], oldyHat10.T))
    # print the error between each kernel's predictions and the labels of the
    # new (test) data
    newyHat01 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 0.1)
    print("new yHat01 error Size is :", regression.rssError(abY[100:199], newyHat01.T))
    newyHat1 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 1)
    print("new yHat1 error Size is :", regression.rssError(abY[100:199], newyHat1.T))
    newyHat10 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 10)
    print("new yHat10 error Size is :", regression.rssError(abY[100:199], newyHat10.T))
    # predict with plain linear regression and compare with the results above
    standWs = regression.standRegres(abX[0:99], abY[0:99])
    standyHat = mat(abX[100:199]) * standWs
    print("standRegress error Size is:", regression.rssError(abY[100:199], standyHat.T.A))
import regression

abX, abY = regression.loadDataSet('abalone.txt')
yHat01 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 0.1)
yHat1 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 1)
yHat10 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 10)
print('Error on training data when K=0.1 - ', regression.rssError(abY[0:99], yHat01.T))
print('Error on training data when K=1.0 - ', regression.rssError(abY[0:99], yHat1.T))
print('Error on training data when K=10 - ', regression.rssError(abY[0:99], yHat10.T))

yHat01 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 0.1)
print('Error on Test Data when k=0.1: ', regression.rssError(abY[100:199], yHat01.T))
yHat1 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 1)
print('Error on Test Data when k=1: ', regression.rssError(abY[100:199], yHat1.T))
yHat10 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 10)
print('Error on Test Data when k=10:', regression.rssError(abY[100:199], yHat10.T))
""" @file:abalone.py @author:姚水林 @time:2018-12-16 16:02:01 @function: """ import regression import matplotlib.pyplot as plt abX, abY = regression.loadDataSet('abalone.txt') yHat01 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 0.1) yHat1 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 1) yHat10 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 10) ressError01 = regression.rssError(abY[0:99], yHat01.T) ressError1 = regression.rssError(abY[0:99], yHat1.T) ressError10 = regression.rssError(abY[0:99], yHat10.T) print("ressError01=", ressError01, "ressError1=", ressError1, "ressError10=", ressError10) ridgeWeights = regression.ridgeTest(abX, abY) print("ridgeWeights=", ridgeWeights) fig = plt.figure() ax = fig.add_subplot(111) ax.plot(ridgeWeights) plt.show()
# print "srtInd:",srtInd # print "xSort:",xSort fig = plt.figure() ax = fig.add_subplot(111) ax.plot(xSort[:, 1], yHat[srtInd]) ax.scatter(xMat[:, 1].flatten().A[0], mat(yArr).T.flatten().A[0], s=2, c='red') plt.show() #8.3 示例:预测鲍鱼的年龄 abX, abY = regression.loadDataSet(homedir + 'abalone.txt') print "abX:", abX print "abY:", abY yHat01 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 0.1) yHat1 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 1) yHat10 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 10) print "regression.rssError(abY[0:99],yHat01.T):", regression.rssError( abY[0:99], yHat01.T) print "regression.rssError(abY[0:99],yHat1.T):", regression.rssError( abY[0:99], yHat1.T) print "regression.rssError(abY[0:99],yHat10.T):", regression.rssError( abY[0:99], yHat10.T) yHat01 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 0.1) print "regression.rssError(abY[100:199],yHat01.T):", regression.rssError( abY[100:199], yHat01.T) yHat1 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 1) print "regression.rssError(abY[100:199],yHat1.T):", regression.rssError( abY[100:199], yHat1.T) yHat10 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 10) print "regression.rssError(abY[100:199],yHat10.T):", regression.rssError( abY[100:199], yHat10.T) ws = regression.standRegres(abX[0:99], abY[0:99]) yHat = mat(abX[100:199]) * ws
# invalidKNum, invalidKMin, bestKList, xMat, xTMat, yMat, yTMat and yIdx are
# assumed to be defined earlier in the file this excerpt comes from
invalidKMax = 0.009
yMatTmp = yMat[:, yIdx]
yTMatTmp = yTMat[:, yIdx]
for k in arange(5, 0.09, -0.1):  # find the best k
    yAssume = regression.lwlrTest(xTMat, xMat, yMatTmp.T, k)
    print k
    if yAssume.all() == 0:
        # print("%s %d: regression.lwlr failed by k = %f." % (myDebug.file(), myDebug.line(), k))
        invalidKNum += 1
        if k > invalidKMax:
            invalidKMax = k
        if k < invalidKMin:
            invalidKMin = k
        continue
    # convert the matrix to a flat list
    yTList = yTMatTmp.reshape(-1).tolist()
    yTList = [j for i in yTList for j in i]
    rssE = regression.rssError(yTList, yAssume)
    if len(bestKList) == 0:
        bestKList.insert(0, [rssE, k])
    else:
        for idx in range(0, len(bestKList)):
            if rssE < bestKList[idx][0]:
                bestKList.insert(idx, [rssE, k])
                if len(bestKList) > 50:  # keep only the 50 best k values
                    bestKList.pop()
                break
print bestKList
# coding=utf-8
from numpy import *
import regression

"""
Case 1: applying regression to real data
"""
if __name__ == '__main__':
    xArr, yArr = regression.loadDataSet(
        r'C:\Users\v_wangdehong\PycharmProjects\MachineLearning_V\Regression\data\abalone.txt'
    )
    # test the algorithm on the first 99 rows
    yHat01 = regression.lwlrTest(xArr[0:99], xArr[0:99], yArr[0:99], 0.1)
    yHat1 = regression.lwlrTest(xArr[0:99], xArr[0:99], yArr[0:99], 1)
    yHat10 = regression.lwlrTest(xArr[0:99], xArr[0:99], yArr[0:99], 10)
    print(regression.rssError(yArr[0:99], yHat01))  # 56.7842091184
    print(regression.rssError(yArr[0:99], yHat1))   # 429.89056187
    print(regression.rssError(yArr[0:99], yHat10))  # 549.118170883
    """
    As shown above, a smaller kernel gives a lower training error. So why not
    use the smallest kernel on every data set? Because the smallest kernel
    overfits and will not necessarily predict new data well. Let's see how each
    kernel does on new data.
    """
    yHat01 = regression.lwlrTest(xArr[100:199], xArr[0:99], yArr[0:99], 0.1)
    yHat1 = regression.lwlrTest(xArr[100:199], xArr[0:99], yArr[0:99], 1)
    yHat10 = regression.lwlrTest(xArr[100:199], xArr[0:99], yArr[0:99], 10)
    print(regression.rssError(yArr[100:199], yHat01))  # 25119.4591112
    print(regression.rssError(yArr[100:199], yHat1))   # 573.52614419
    print(regression.rssError(yArr[100:199], yHat10))  # 517.571190538
    """
    From the results above, a kernel size of 10 gives the lowest test error,
    even though its error on the training set is the highest. Next, compare
    with simple linear regression.
    """
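    # A hedged sketch of the comparison promised above, mirroring the book's
    # own demo (standRegres is assumed to be in the same regression.py):
    ws = regression.standRegres(xArr[0:99], yArr[0:99])
    yHat = mat(xArr[100:199]) * ws
    print(regression.rssError(yArr[100:199], yHat.T.A))  # lands near the k=10 error above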
# -*- coding=utf-8 -*-
import regression
from numpy import *

abX, abY = regression.loadDataSet("abalone.txt")
yHat01 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 0.1)
yHat1 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 1.0)
yHat10 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 10)
error01 = regression.rssError(abY[0:99], yHat01)
error1 = regression.rssError(abY[0:99], yHat1)
error10 = regression.rssError(abY[0:99], yHat10)
# Conclusion: a smaller kernel gives a lower training error,
# but a smaller kernel overfits and may not predict new data as well.
print("error01 is %s" % error01)    # error01 is 56.7862596807
print("error1 is %s" % error1)      # error1 is 429.89056187
print("error10 is %s" % error10)    # error10 is 549.118170883

yyHat01 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 0.1)
yyHat1 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 1.0)
yyHat10 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 10)
eerror01 = regression.rssError(abY[100:199], yyHat01)
eerror1 = regression.rssError(abY[100:199], yyHat1)
eerror10 = regression.rssError(abY[100:199], yyHat10)
print("eerror01 is %s" % eerror01)  # eerror01 is 33652.8973161
print("eerror1 is %s" % eerror1)    # eerror1 is 573.52614419
print("eerror10 is %s" % eerror10)  # eerror10 is 517.571190538
# On new data, k=10 gives the best result.
# Next, compare with linear regression.
# coding:utf-8
import regression
from numpy import *

abX, abY = regression.loadDataSet('abalone.txt')
yHat01 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 0.1)
yHat1 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 1)
yHat10 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 10)
print regression.rssError(abY[0:99], yHat01.T)
print regression.rssError(abY[0:99], yHat1.T)
print regression.rssError(abY[0:99], yHat10.T)

yHat01 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 0.1)
yHat1 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 1)
yHat10 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 10)
print regression.rssError(abY[100:199], yHat01.T)
print regression.rssError(abY[100:199], yHat1.T)
print regression.rssError(abY[100:199], yHat10.T)

ws = regression.standRegres(abX[0:99], abY[0:99])
yHat = mat(abX[100:199]) * ws
print regression.rssError(abY[100:199], yHat.T.A)

ridgeWeights = regression.ridgeTest(abX, abY)
# print ridgeWeights
import matplotlib.pyplot as plt
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(ridgeWeights)
plt.show()
yHat = xMat * ws
'''
# Retry lwlr to find the best k
corrcoefMin = 100
bestK = 1
keysets = [0.1, 1, 10, 0.02, 0.3]
for step in keysets:
    print(step)
    yHat = regression.lwlrTest(xArr[4000:], xArr[0:4000], yArr[0:4000], step)
    if sum(yHat) != 0:
        if corrcoefMin >= linalg.det(corrcoef(yHat.T, yArr[4000:])):
            corrcoefMin = linalg.det(corrcoef(yHat.T, yArr[4000:]))
            bestK = step
        print(regression.rssError(yArr[4000:], yHat.T))
        print("=======================")
print(bestK)
print(corrcoefMin)
'''
fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(xMat[:, 1], yMat.T[:, 0])
xCopy = xMat.copy()
xCopy.sort(0)
yHat = xCopy * ws
ax.plot(xCopy[:, 1], yHat)
plt.show()
# print yArr
# print
# ws = regression.standRegres(xArr, yArr)
# print ws
# yHat = regression.lwlrTest(xArr, xArr, yArr, 0.3)
# yHat = regression.lwlrTest(xArr, xArr, yArr, 0.001)
# print regression.rssError(yArr[:], yHat.T)
# yHat = regression.lwlrTest(xArr, xArr, yArr, 0.003)
# print regression.rssError(yArr[:], yHat.T)
yHat = regression.lwlrTest(xArr, xArr, yArr, 0.01)
# print regression.rssError(yArr[:], yHat.T)
# print yHat
# yHat = regression.lwlrTest(xArr, xArr, yArr, 0.1)
# print regression.rssError(yArr[:], yHat.T)
# yHat = regression.lwlrTest(xArr, xArr, yArr, 1)
print regression.rssError(yArr[:], yHat.T)
# exit(0)
# yHat = regression.lwlrTest(xArr, xArr, yArr, 0.01)
# print
# yHat = regression.lwlrTest(xArr, xArr, yArr, 0.003)
# print
xMat = mat(xArr)
srtInd = xMat[:, 1].argsort(0)
xSort = xMat[srtInd][:, 0, :]
import matplotlib.pyplot as plt
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(xSort[:, 1], yHat[srtInd])
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

'8.3 Machine Learning in Action'

__author__ = 'lxp'

import regression
import numpy as np

abX, abY = regression.loadDataSet('abalone.txt')
yHat01 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 0.1)
yHat1 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 1)
yHat10 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 10)
print(regression.rssError(abY[0:99], yHat01.T))
print(regression.rssError(abY[0:99], yHat1.T))
print(regression.rssError(abY[0:99], yHat10.T))

yHat01 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 0.1)
yHat1 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 1)
yHat10 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 10)
print(regression.rssError(abY[100:199], yHat01.T))
print(regression.rssError(abY[100:199], yHat1.T))
print(regression.rssError(abY[100:199], yHat10.T))

ws = regression.standRegres(abX[0:99], abY[0:99])
yHat = np.mat(abX[100:199]) * ws
print(regression.rssError(abY[100:199], yHat.T.A))
srtInd = xMat[:, 1].argsort(0)
xSort = xMat[srtInd][:, 0, :]
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(xSort[:, 1], yHat[srtInd])
ax.scatter(xMat[:, 1].flatten().A[0], mat(yArr).T.flatten().A[0], s=2, c='red')
plt.show()

# on real data
reload(regression)
abX, abY = regression.loadDataSet('abalone.txt')
yHat01 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 0.1)
yHat1 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 1)
yHat10 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 10)
# see which k works best on the training slice (a rough form of cross-validation)
regression.rssError(abY[0:99], yHat01.T)   # you are best~
regression.rssError(abY[0:99], yHat1.T)
regression.rssError(abY[0:99], yHat10.T)
# check whether that k is really the best: does it also do well on the test set?
yHat01 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 0.1)
yHat1 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 1)
yHat10 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 10)
regression.rssError(abY[100:199], yHat01.T)  # clearly overfit~
regression.rssError(abY[100:199], yHat1.T)
regression.rssError(abY[100:199], yHat10.T)  # you really are the best... did I get this wrong?...

# ridge regression test
reload(regression)
abX, abY = regression.loadDataSet('abalone.txt')
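# The transcript cuts off here. A plausible continuation (a sketch assuming the
# same ridgeTest used in the other snippets) plots the ridge coefficient paths:
ridgeWeights = regression.ridgeTest(abX, abY)
import matplotlib.pyplot as plt
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(ridgeWeights)  # one curve per feature across the 30 lambda settings
plt.show()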
import regression
from numpy import *


def rssError(yArr, yHatArr):
    # sum of squared errors between the labels and the predictions
    return ((yArr - yHatArr) ** 2).sum()


abX, abY = regression.loadDataSet('abalone.txt')
yHat01 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 0.1)
yHat1 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 1)
yHat10 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 10)
print regression.rssError(abY[0:99], yHat01.T)
print regression.rssError(abY[0:99], yHat1.T)
print regression.rssError(abY[0:99], yHat10.T)