Example #1
import regression as re  # assumed imports for this snippet; re aliases the book's regression.py
import matplotlib.pyplot as plt
from numpy import corrcoef, mat


def testStandRegres():
    xArr, yArr = re.loadDataSet('ex0.txt')
    ws = re.standRegres(xArr, yArr)
    xMat = mat(xArr)
    yMat = mat(yArr)
    fig = plt.figure()
    ax = fig.add_subplot(111)  # split the canvas into 1 row and 1 column; draw in the first panel
    ax.scatter(xMat[:, 1].flatten().A[0],
               yMat.T[:, 0].flatten().A[0],
               c='purple',
               label='realData',
               marker='.')
    # scatter plot of the raw data
    # matrix[a:b, c:d] selects rows a..b-1 and columns c..d-1 (half-open)
    # matrix[a, b] selects row a, column b
    xCopy = xMat.copy()
    # xCopy.sort(0)  # would sort along axis 0; left disabled here, so the line follows data order
    yHat = xCopy * ws
    ax.plot(xCopy[:, 1], yHat, c='green')
    # print(yHat.T.flatten().A[0].size)
    # print(yMat.flatten().A[0].size)
    # print(yHat)
    # print(yMat)
    correlation = corrcoef(yHat.T, yMat)
    print(correlation)
    plt.show()
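
Every example on this page calls standRegres from Machine Learning in Action's regression.py without showing it. For reference, a minimal sketch of that ordinary-least-squares solver: it solves the normal equation w = (X^T X)^(-1) X^T y, refusing to proceed when X^T X is singular.

from numpy import linalg, mat


def standRegres(xArr, yArr):
    xMat = mat(xArr)
    yMat = mat(yArr).T
    xTx = xMat.T * xMat
    if linalg.det(xTx) == 0.0:  # singular: no unique least-squares solution
        print("This matrix is singular, cannot do inverse")
        return
    ws = xTx.I * (xMat.T * yMat)  # w = (X^T X)^-1 X^T y
    return ws
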
Example #2
import regression  # assumed import for the standRegres call at the end


def readfile1():
	f = open('iris.data')
	#f = open('../Ch08/abalone.txt')
	line = f.readline()
	num_feat = len(line.split(',')) - 1
	#num_feat = len(line.split('\t')) - 1
	#print(num_feat)
	#exit(0)
	data_mat = []
	label_mat = []
	while line:
		if line.strip() == '':
			#print "line is null"
			line = f.readline()
			continue
		line = line.strip('\n')
		#print(line)
		#arr = line.split('\t')
		arr = line.split(',')
		f_arr = []
		for i in range(num_feat):
			#print(arr[i])
			f_arr.append(float(arr[i]))
		#exit(0)
		data_mat.append(f_arr)
		
		label = arr[-1]
		if label == 'Iris-setosa':
			label_mat.append(float(1.0))
		if label == 'Iris-versicolor':
			label_mat.append(float(2.0))
		if label == 'Iris-virginica':
			label_mat.append(float(3.0))
		#label_mat.append(float(arr[-1]))		
		line = f.readline()

	#print(data_mat)
	#print(shape(data_mat))
	#print(label_mat)
	#print(shape(label_mat))
	#print(data_mat[0:2])
	print(regression.standRegres(data_mat, label_mat))
Example #3
import regression as re  # assumed alias, as in Example #1; calcErr is defined
                         # elsewhere in the original file


def testAbalone():
    xArr, yArr = re.loadDataSet('abalone.txt')
    ws = re.standRegres(xArr, yArr)
    print(ws)
    for k in [2, 10]:
        calcErr(xArr, yArr, 0, 299, k, 300, 350, ws)
        calcErr(xArr, yArr, 0, 299, k, 350, 400, ws)
        calcErr(xArr, yArr, 0, 299, k, 400, 450, ws)
        calcErr(xArr, yArr, 0, 299, k, 450, 500, ws)
        calcErr(xArr, yArr, 0, 299, k, 500, 600, ws)
        print('')
Example #4
import regression  # assumed imports for this snippet
import matplotlib.pyplot as plt
from numpy import mat


def lineResult(fileName):
    xArr, yArr = regression.loadDataSet(fileName)
    ws = regression.standRegres(xArr, yArr)  # get the regression coefficient vector
    # plot the regression line over the data
    xMat = mat(xArr)
    yHat = xMat * ws
    fig = plt.figure()
    ax = fig.add_subplot(111)
    xCopy = xMat.copy()
    xCopy.sort(0)
    yHat = xCopy * ws
    ax.plot(xCopy[:, 1], yHat)
    ax.scatter(xMat[:, 1].flatten().A[0],
               mat(yArr).T.flatten().A[0],
               s=2,
               c='red')
    plt.show()
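
A hypothetical invocation, assuming the book's ex0.txt data file sits next to the script:

lineResult('ex0.txt')
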
Example #5
import numpy as np
import regression
rg = regression  # both names are used below; setDataCollect comes from the
                 # book's LEGO-scraping helpers in the original file


def useStandRegres():
    """
    Fit a plain linear regression to the LEGO set data.
    Parameters:
        None
    Returns:
        None
    """
    lgX = []
    lgY = []
    setDataCollect(lgX, lgY)
    data_num, features_num = np.shape(lgX)
    lgX1 = np.mat(np.ones((data_num, features_num + 1)))
    lgX1[:, 1:5] = np.mat(lgX)
    ws = rg.standRegres(lgX1, lgY)
    print('%f%+f*year%+f*piece count%+f*is new%+f*original price' %
          (ws[0], ws[1], ws[2], ws[3], ws[4]))
def abaloneTest():
    """Predict the age of an abalone.

    Machine Learning in Action, example 8.3.
    INPUT:
        None
    OUTPUT:
        None
    """
    # load the data
    abX, abY = regression.loadDataSet("./data/abalone.txt")
    # predict with kernels of different widths
    oldyHat01 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 0.1)
    oldyHat1 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 1)
    oldyHat10 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 10)
    # training-set error: each kernel's predictions vs. the true values for rows 0-98
    print("old yHat01 error Size is :",
          regression.rssError(abY[0:99], oldyHat01.T))
    print("old yHat1 error Size is :",
          regression.rssError(abY[0:99], oldyHat1.T))
    print("old yHat10 error Size is :",
          regression.rssError(abY[0:99], oldyHat10.T))
    # test-set error: each kernel's predictions vs. the true values on the new
    # (test) rows; the ground truth must be abY[100:199] to match the rows
    # being predicted
    newyHat01 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 0.1)
    print("new yHat01 error Size is :",
          regression.rssError(abY[100:199], newyHat01.T))
    newyHat1 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 1)
    print("new yHat1 error Size is :",
          regression.rssError(abY[100:199], newyHat1.T))
    newyHat10 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 10)
    print("new yHat10 error Size is :",
          regression.rssError(abY[100:199], newyHat10.T))
    # predict with plain linear regression and compare with the runs above
    standWs = regression.standRegres(abX[0:99], abY[0:99])
    standyHat = np.mat(abX[100:199]) * standWs
    print("standRegress error Size is:",
          regression.rssError(abY[100:199], standyHat.T.A))

Example #6

import LoadData
import RidgeRegression
import regression
import matplotlib.pyplot as plt
from numpy import *

X,Y,attNum,trainingSampleNum=LoadData.loadDataSet('abalone.txt')
xStd=std(X,0) # standard deviation of each X column, before standardization
yMean=mean(Y,0) # mean of Y before centering; yMean is a 1x1 matrix
XStand,YCentered=LoadData.standardize(X),LoadData.centered(Y)

testNum=30
wMat=RidgeRegression.ridgeTest(XStand,YCentered,testNum)
for i in range(testNum):
    theta=mat(wMat[i,:]).T
    YHat1=XStand*theta+yMean # broadcasting adds the mean back
    print("ridge regression error for parameter set",i,":", LoadData.rssError(Y, YHat1))
wMat=wMat/xStd # map the coefficients back to the unstandardized features

xOne=LoadData.addAllOneColumn(X)
thetaStd=regression.standRegres(xOne,Y)
YHat2=xOne*thetaStd
print("标准线性回归的Error为",LoadData.rssError(Y,YHat2))

# ridge trace plot: coefficient paths against log(lambda)
lambdas = [i - 10 for i in range(testNum)]  # exponents: lambda = exp(i - 10)
plt.plot(lambdas, wMat)
plt.show()
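
RidgeRegression.ridgeTest is not shown in this snippet. Here is a minimal sketch of what it presumably computes, following the book's ridgeRegres and assuming X and y arrive already standardized and centered, as they do above; the lambda exponents i - 10 match the x-axis of the ridge trace plot.

from numpy import exp, eye, linalg, shape, zeros


def ridgeRegres(xMat, yMat, lam=0.2):
    # ridge solution w = (X^T X + lambda * I)^-1 X^T y
    xTx = xMat.T * xMat
    denom = xTx + eye(shape(xMat)[1]) * lam
    if linalg.det(denom) == 0.0:  # can only happen when lam == 0
        print("This matrix is singular, cannot do inverse")
        return
    return denom.I * (xMat.T * yMat)


def ridgeTest(xMat, yMat, numTestPts=30):
    # one row of coefficients per lambda = exp(i - 10), i = 0 .. numTestPts-1
    n = shape(xMat)[1]
    wMat = zeros((numTestPts, n))
    for i in range(numTestPts):
        ws = ridgeRegres(xMat, yMat, exp(i - 10))
        wMat[i, :] = ws.T
    return wMat
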

Example #8
#!/usr/bin/python
#coding=utf8

import regression
from numpy import *
import matplotlib.pyplot as plt

dataMat, labels = regression.loadDataSet('ex0.txt')
ws = regression.standRegres(dataMat, labels)
# print(ws)
#
# # plot the raw data
xMat = mat(dataMat)
yMat = mat(labels)
fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(xMat[:, 1].flatten().A[0],
           yMat.T[:, 0].flatten().A[0],
           s=2,
           c='red')
#
#
# # predicted data (LWLR)
# yHat = regression.lwlrTest(dataMat, dataMat, labels,k=0.003)
# srtInd = xMat[:,1].argsort(0)
# xSort = xMat[srtInd][:,0,:]
# ax.plot(xSort[:,1], yHat[srtInd])
# plt.show()

xCopy = xMat.copy()
xCopy.sort(0)
# the source snippet is truncated here; the usual continuation plots the fit:
yHat = xCopy * ws
ax.plot(xCopy[:, 1], yHat)
plt.show()
Example #9
from numpy import *
import regression as regress
import matplotlib.pyplot as plt
plt.switch_backend('agg')

if __name__ == "__main__":
    k = input("input the k:")
    filename = "data/ex0.txt"
    xArr, yArr = regress.loadDataSet(filename)
    ws = regress.standRegres(xArr, yArr)
    # xMat is an n*2 matrix
    xMat = mat(xArr)
    # argsort along column 1: srtInd gives the row order after sorting
    srtInd = xMat[:, 1].argsort(0)
    # the rows of xMat rearranged into sorted order
    xSort = xMat[srtInd][:, 0, :]
    #print("srtInd============")
    #print(srtInd)
    #print("xMat============")
    #print(xMat)
    #print("xSort===========")
    #print(xSort)
    yMat = mat(yArr)

    fig = plt.figure()
    ax = fig.add_subplot(111)
    #print(xArr[:,1].flatten().A[0])
    #print(yArr.T[:,0].flatten().A[0])
    ax.scatter(xMat[:, 1].flatten().A[0],
               yMat.T[:, 0].flatten().A[0],
               s=2,
               c='red')
    # the source snippet is truncated here; presumably it plots the locally
    # weighted fit through the sorted points, as other examples on this page do
    yHat = regress.lwlrTest(xArr, xArr, yArr, k)
    ax.plot(xSort[:, 1], yHat[srtInd])
    plt.savefig('lwlr.png')  # hypothetical filename; the 'agg' backend writes to a file
Example #10
import regression
import numpy as np
import matplotlib.pyplot as plt


# reconstructed preamble: the snippet begins partway through the book's loadDataSet
def loadDataSet(fileName):
    numFeat = len(open(fileName).readline().split('\t')) - 1
    dataMat = []
    labelMat = []
    fr = open(fileName)
    for line in fr.readlines():
        lineArr = []
        curLine = line.strip().split('\t')
#        print "curLine:" , curLine
        for i in range(numFeat):
#            print "curLine[%d]" % i, (curLine[i])
            lineArr.append(float(curLine[i]))
#        print "lineArr: ", lineArr
#        print "curLine[-1]", curLine[2]
        dataMat.append(lineArr)
        labelMat.append(float(curLine[-1]))
    return dataMat, labelMat

x, y = loadDataSet('ex0.txt')

ws = regression.standRegres(x, y)
ws2 = regression.gradRegres(x, y)
#ws3 = regression.gradRegressMatrix(x, y)

print(ws)
print(ws2)
#print(ws3)

xArr = np.asarray(x)
yArr = np.asarray(y)
yHat = xArr * ws

print(np.corrcoef(yHat.T, yArr))

fig = plt.figure()
ax = fig.add_subplot(111)
Example #11
import numpy as np
from matplotlib import cm
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import regression as re

if __name__ == '__main__':
    X, y = re.loadDataSet("data/ex1.txt")  # week-2 data from Coursera's Machine Learning course
    m, n = X.shape
    X = np.concatenate((np.ones((m, 1)), X), axis=1)
    theta, timeConsumed = re.standRegres(X, y)
    print('took [%s] s\nparameter matrix:\n%s' % (timeConsumed, theta))

    fittingFig = plt.figure()
    title = 'StandRegress  time: %s' % timeConsumed
    ax = fittingFig.add_subplot(111, title=title)
    trainingSet = ax.scatter(X[:, 1].flatten().A[0], y[:, 0].flatten().A[0])
    xCopy = X.copy()
    xCopy.sort(0)
    yHat = xCopy * theta
    fittingLine, = ax.plot(xCopy[:, 1], yHat, color='g')
    ax.set_xlabel('Population of City in 10,000s')
    ax.set_ylabel('Profit in $10,000s')
    plt.legend([trainingSet, fittingLine],
               ['Training Set', 'Linear Regression'])
    plt.show()
Example #12
import regression
from numpy import *

# reconstructed: the snippet begins after loading the data and the k=0.1 fit
abX, abY = regression.loadDataSet('abalone.txt')
yHat01 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 0.1)
yHat1 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 1.0)
yHat10 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 10)

error01 = regression.rssError(abY[0:99], yHat01)
error1 = regression.rssError(abY[0:99], yHat1)
error10 = regression.rssError(abY[0:99], yHat10)

# conclusion: a smaller kernel gives a lower training error,
# but a small kernel overfits and may not predict new data well
print("error01 is %s" % error01)  #error01 is 56.7862596807
print("error1 is %s" % error1)  #error1 is 429.89056187
print("error10 is %s" % error10)  #error10 is 549.118170883

yyHat01 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 0.1)
yyHat1 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 1.0)
yyHat10 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 10)
eerror01 = regression.rssError(abY[100:199], yyHat01)
eerror1 = regression.rssError(abY[100:199], yyHat1)
eerror10 = regression.rssError(abY[100:199], yyHat10)
print("eerror01 is %s" % eerror01)  #eerror01 is 33652.8973161
print("eerror1 is %s" % eerror1)  #eerror1 is 573.52614419
print("eerror10 is %s" %
      eerror10)  #eerror10 is 517.571190538  # on new data, k=10 does best

# compare with plain linear regression
# conclusion: models must be compared on unseen data to pick the best one
ws = regression.standRegres(abX[0:99], abY[0:99])  # train on the first 100 records
yHat = mat(abX[100:199]) * ws
errorLine = regression.rssError(abY[100:199], yHat.T.A)
print("errorLine is %s" % errorLine)  #errorLine is 518.636315324
Example #13
import regression
from numpy import *

xarr, yarr = regression.loadDataSet('ex0.txt')
#print(xarr)

ws = regression.standRegres(xarr, yarr)
#print(ws)

xmat = mat(xarr)
ymat = mat(yarr)
yhat = xmat * ws

import matplotlib.pyplot as plt

fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(xmat[:, 1].flatten().A[0], ymat.T[:, 0].flatten().A[0])
xcopy = xmat.copy()
xcopy.sort(0)
yhat = xcopy * ws
ax.plot(xcopy[:, 1], yhat)
#plt.show()

yhat = xmat * ws
print(corrcoef(yhat.T, ymat))
Example #14
# regressionTest.py

import regression
from numpy import *

import matplotlib.pyplot as plt

xArray, yArray = regression.loadDataSet('ex0.txt')
xArray = array(xArray, dtype=float)
yArray = array(yArray, dtype=float)
# print(xArray[0 : 2])
# print(yArray[0])
# print(regression.lwlr(xArray[0], xArray, yArray, 1.0))

ws = regression.standRegres(xArray, yArray)

# print(ws)

xMat = mat(xArray)
yMat = mat(yArray)
yHat = xMat * ws

# print(corrcoef(yHat.T, yMat))
'''
fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(xMat[:, 1].flatten().A[0], yMat.T[:, 0].flatten().A[0])

xCopy = xMat.copy()
xCopy.sort(0)
yHat = xCopy * ws
ax.plot(xCopy[:, 1], yHat)
plt.show()
'''
Example #15
#!/usr/bin/python
import regression
from numpy import *
xArr, yArr = regression.loadDataSet('ex0.txt')
ws = regression.standRegres(xArr, yArr)
xMat = mat(xArr)
yMat = mat(yArr)
yHat = xMat * ws
import matplotlib.pyplot as plt
fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(xMat[:, 1].flatten().A[0], yMat.T[:, 0].flatten().A[0])
xCopy = xMat.copy()
xCopy.sort(0)
yHat = xCopy * ws
ax.plot(xCopy[:, 1], yHat)
plt.show()

Example #16
import numpy as np
import regression as rg  # assumed alias; lwlrTest and plotlwlrRegression below
                         # are defined elsewhere in the same original file


def rssError(yArr, yHatArr):
    # residual sum of squares between actual and predicted values
    return ((yArr - yHatArr) ** 2).sum()


if __name__ == '__main__':
    plotlwlrRegression()

    # predict the age of abalone
    abX, abY = rg.loadDataSet('abalone.txt')
    print('Training and test sets identical: how the LWLR kernel size k affects prediction:')
    yHat01 = lwlrTest(abX[0:99], abX[0:99], abY[0:99], 0.1)
    yHat1 = lwlrTest(abX[0:99], abX[0:99], abY[0:99], 1)
    yHat10 = lwlrTest(abX[0:99], abX[0:99], abY[0:99], 10)
    print('error with k=0.1:', rssError(abY[0:99], yHat01.T))
    print('error with k=1  :', rssError(abY[0:99], yHat1.T))
    print('error with k=10 :', rssError(abY[0:99], yHat10.T))
    print('')
    print('Training and test sets differ: is a smaller kernel always better? Rerunning on held-out data:')
    yHat01 = lwlrTest(abX[100:199], abX[0:99], abY[0:99], 0.1)
    yHat1 = lwlrTest(abX[100:199], abX[0:99], abY[0:99], 1)
    yHat10 = lwlrTest(abX[100:199], abX[0:99], abY[0:99], 10)
    print('error with k=0.1:', rssError(abY[100:199], yHat01.T))
    print('error with k=1  :', rssError(abY[100:199], yHat1.T))
    print('error with k=10 :', rssError(abY[100:199], yHat10.T))
    print('')
    print('Training and test sets differ: plain linear regression vs. LWLR with k=1:')
    print('error with k=1:', rssError(abY[100:199], yHat1.T))
    ws = rg.standRegres(abX[0:99], abY[0:99])
    yHat = np.mat(abX[100:199]) * ws
    print('plain linear regression error:', rssError(abY[100:199], yHat.T.A))

Example #17
import regression
from numpy import *


def abaloneTest():  # assumed wrapper: the snippet begins inside a function
    xArr, yArr = regression.loadDataSet(
        r'C:\Users\v_wangdehong\PycharmProjects\MachineLearning_V\Regression\data\abalone.txt'
    )
    # test the algorithm using the first 99 rows
    yHat01 = regression.lwlrTest(xArr[0:99], xArr[0:99], yArr[0:99], 0.1)
    yHat1 = regression.lwlrTest(xArr[0:99], xArr[0:99], yArr[0:99], 1)
    yHat10 = regression.lwlrTest(xArr[0:99], xArr[0:99], yArr[0:99], 10)
    print(regression.rssError(yArr[0:99], yHat01))  #56.7842091184
    print(regression.rssError(yArr[0:99], yHat1))  #429.89056187
    print(regression.rssError(yArr[0:99], yHat10))  #549.118170883
    """
    从上面可以看到,使用较小的核将得到较低的误差,那么为什么不在所有数据集上都使用最小的核呢?
    因为使用最小的核将造成过拟合,对新数据不一定能达到最好的效果,下面就看看它在新数据上的表现
    """
    yHat01 = regression.lwlrTest(xArr[100:199], xArr[0:99], yArr[0:99], 0.1)
    yHat1 = regression.lwlrTest(xArr[100:199], xArr[0:99], yArr[0:99], 1)
    yHat10 = regression.lwlrTest(xArr[100:199], xArr[0:99], yArr[0:99], 10)
    print(regression.rssError(yArr[100:199], yHat01))  # 25119.4591112
    print(regression.rssError(yArr[100:199], yHat1))  # 573.52614419
    print(regression.rssError(yArr[100:199], yHat10))  # 517.571190538
    """
    从上面结果可以看到,核大小等于10时测试误差最小,但是它在训练集上的误差却是最大的。
    接下来再和简单的线性回归做个比较。
    """
    ws = regression.standRegres(xArr[0:99], yArr[0:99])
    yHat = mat(xArr[100:199]) * ws  #shape(99,1)
    print(regression.rssError(yArr[100:199], yHat.T.A))
    """
    简单的线性回归达到了局部加权线性回归类似的效果。这也表明了一点,必须在未知数据上比较效果才能选取到最佳模型。
    """
Example #18
import regression
from numpy import *

# reconstructed: the snippet begins after loading the data and the training-set fits
abX, abY = regression.loadDataSet('abalone.txt')
yHat01 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 0.1)
yHat1 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 1.0)
yHat10 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 10)

error01 = regression.rssError(abY[0:99], yHat01)
error1 = regression.rssError(abY[0:99], yHat1)
error10 = regression.rssError(abY[0:99], yHat10)

# conclusion: a smaller kernel gives a lower training error,
# but a small kernel overfits and may not predict new data well
print("error01 is %s" % error01)  #error01 is 56.7862596807
print("error1 is %s" % error1)  #error1 is 429.89056187
print("error10 is %s" % error10)  #error10 is 549.118170883

yyHat01 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 0.1)
yyHat1 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 1.0)
yyHat10 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 10)
eerror01 = regression.rssError(abY[100:199], yyHat01)
eerror1 = regression.rssError(abY[100:199], yyHat1)
eerror10 = regression.rssError(abY[100:199], yyHat10)
print ("eerror01 is %s"  % eerror01)     #eerror01 is 33652.8973161
print ("eerror1 is %s"  % eerror1)         #eerror1 is 573.52614419
print ("eerror10 is %s"  % eerror10)     #eerror10 is 517.571190538       #对新数据,k=10得到较好的效果


# compare with plain linear regression
# conclusion: models must be compared on unseen data to pick the best one
ws = regression.standRegres(abX[0:99], abY[0:99])  # train on the first 100 records
yHat = mat(abX[100:199]) * ws
errorLine = regression.rssError(abY[100:199], yHat.T.A)
print ("errorLine is %s"  % errorLine)     #errorLine is 518.636315324


Example #19
import regression
from numpy import *

abX, abY = regression.loadDataSet('abalone.txt')
yHat01 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 0.1)
yHat1 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 1)
yHat10 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 10)
print(regression.rssError(abY[0:99], yHat01.T))
print(regression.rssError(abY[0:99], yHat1.T))
print(regression.rssError(abY[0:99], yHat10.T))

yHat01 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 0.1)
yHat1 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 1)
yHat10 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 10)
print(regression.rssError(abY[100:199], yHat01.T))
print(regression.rssError(abY[100:199], yHat1.T))
print(regression.rssError(abY[100:199], yHat10.T))

ws = regression.standRegres(abX[0:99], abY[0:99])
yHat = mat(abX[100:199]) * ws
print(regression.rssError(abY[100:199], yHat.T.A))

ridgeWeights = regression.ridgeTest(abX, abY)
#print(ridgeWeights)
import matplotlib.pyplot as plt
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(ridgeWeights)
plt.show()

xArr, yArr = regression.loadDataSet('abalone.txt')
#regression.stageWise(xArr, yArr, 0.01, 200)
#regression.stageWise(xArr, yArr, 0.001, 5000)
Example #20
import regression
from numpy import *
from regression import regularize, rssError


def stageWise(xArr, yArr, eps=0.01, numIter=100):
    # forward stagewise regression: each iteration nudges one coefficient by
    # +/- eps, keeping whichever move most lowers the residual sum of squares
    xMat = mat(xArr)
    yMat = mat(yArr).T
    yMean = mean(yMat, 0)
    yMat = yMat - yMean      # center y
    xMat = regularize(xMat)  # standardize X
    m, n = shape(xMat)
    returnMat = zeros((numIter, n))
    ws = zeros((n, 1))
    wsMax = ws.copy()

    for i in range(numIter):
        print(ws.T)
        lowestError = inf
        for j in range(n):
            for sign in [-1, 1]:
                wsTest = ws.copy()
                wsTest[j] += eps * sign
                yTest = xMat * wsTest
                rssE = rssError(yMat.A, yTest.A)
                if rssE < lowestError:
                    lowestError = rssE
                    wsMax = wsTest
        ws = wsMax.copy()
        returnMat[i, :] = ws.T
    return returnMat


if __name__ == '__main__':
    print('Forward Step-wise Regression: ')
    xArr, yArr = regression.loadDataSet('abalone.txt')
    stageWise(xArr, yArr, 0.005, 5000)

    print('Standard Regression: ')
    weights = regression.standRegres(regularize(mat(xArr)),
                                     (mat(yArr).T - mean(mat(yArr).T, 0)).T)
    print(weights.T)
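
Both stageWise above and the standRegres comparison rely on regression.regularize. In the book it standardizes each column of X; note the quirk that it divides by the column variance rather than the standard deviation.

from numpy import mean, var


def regularize(xMat):
    inMat = xMat.copy()
    inMeans = mean(inMat, 0)           # column means
    inVar = var(inMat, 0)              # column variances
    inMat = (inMat - inMeans) / inVar  # the book divides by variance, not std
    return inMat
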
Example #21
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

'Machine Learning in Action, section 8.3'

__author__ = 'lxp'

import regression
import numpy as np

abX, abY = regression.loadDataSet('abalone.txt')
yHat01 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 0.1)
yHat1 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 1)
yHat10 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 10)
print(regression.rssError(abY[0:99], yHat01.T))
print(regression.rssError(abY[0:99], yHat1.T))
print(regression.rssError(abY[0:99], yHat10.T))

yHat01 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 0.1)
yHat1 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 1)
yHat10 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 10)
print(regression.rssError(abY[100:199], yHat01.T))
print(regression.rssError(abY[100:199], yHat1.T))
print(regression.rssError(abY[100:199], yHat10.T))

ws = regression.standRegres(abX[0:99], abY[0:99])
yHat = np.mat(abX[100:199]) * ws
print(regression.rssError(abY[100:199], yHat.T.A))

Example #22

import LoadData
import ForwardStagewiseR
import regression
import matplotlib.pyplot as plt
from numpy import *

X,Y,attNum,trainingSampleNum=LoadData.loadDataSet('abalone.txt')
xStd=std(X,0) # standard deviation of each X column, before standardization
yMean=mean(Y,0) # mean of Y before centering
XStand,YCentered=LoadData.standardize(X),LoadData.centered(Y)

numIt=5000
allWS=ForwardStagewiseR.forwardStagewiseR(XStand,YCentered,0.005,numIt)
YHat1=XStand*(mat(allWS[numIt-1]).T)+yMean
allWS=allWS/xStd # map the coefficients back to the unstandardized features
print("forward stagewise coefficients after the final iteration:",allWS[numIt-1])
print("forward stagewise regression error:",LoadData.rssError(Y,YHat1))

xOne=LoadData.addAllOneColumn(X)
thetaStd=regression.standRegres(xOne, Y) # theta is n*1; training data reused as test data
print('linear regression coefficients:',thetaStd.T)
YHat2=xOne*thetaStd
print("标准线性回归的Error为",LoadData.rssError(Y,YHat2))

plt.plot(range(numIt),allWS)
plt.show()
Example #23
# -*- coding: utf-8 -*-
"""
Created on Fri May 12 16:07:29 2017

@author: 凯风
"""

import regression
from numpy import *
from imp import reload
import matplotlib.pyplot as plt

reload(regression)
xArr, yArr = regression.loadDataSet('ex0.txt')
xArr[0:2]
ws = regression.standRegres(xArr, yArr)  # compute the regression coefficients
ws

xMat = mat(xArr)
yMat = mat(yArr)
yHat = xMat * ws  # fitted values

# plot the fitted line and the data scatter
fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(xMat[:, 1].flatten().A[0], yMat.T[:, 0].flatten().A[0])
xCopy = xMat.copy()
xCopy.sort(0)
yHat = xCopy * ws
ax.plot(xCopy[:, 1], yHat)
plt.show()
Example #24
    print("岭回归的系数(已经缩放至原尺寸):",unStandCoff)
    print("岭回归的截距(已经缩放至原尺寸):",-1*sum(multiply(unStandCoff,xMean))+yMean)
    print('岭回归RSS:', LoadData.rssError(Y, YHat))




if __name__ == '__main__':
    lgX = []
    lgY = []
    setDataCollect(lgX, lgY)
    lgX = mat(lgX)
    lgY = mat(lgY).T
    m = lgX.shape[0]
    n = lgX.shape[1]
    lgX1 = LoadData.addAllOneColumn(lgX)
    print('number of attributes:', n)
    print('number of training instances:', m)
    set_printoptions(suppress=True)
    print('attribute matrix:', lgX)
    print('label matrix:', lgY.T)

    theta = regression.standRegres(lgX1, lgY)  # theta is n*1; training data reused as test data
    print('linear regression coefficients:', theta)
    YHat = lgX1 * theta  # YHat is m*1
    # for i in range(m):
    #     print('actual:', lgY[i], 'predicted:', YHat[i])
    print('standard linear regression RSS:', LoadData.rssError(lgY, YHat))

    crossValidationRidgeRegression(lgX, lgY)

Example #25

import regression
from numpy import *  # assumed imports for the live code below

# xMat = regression.regularize(xMat)
# yM = mean(yMat, 0)
# yMat = yMat - yM
# weights = regression.standRegres(xMat, yMat.T)
# print(weights.T)
# -------- code to produce Figure 8-7 -------- #
# xArr, yArr = regression.loadDataSet('abalone.txt')
# rightweights = regression.stageWise(xArr, yArr, 0.005, 1000)  # when running, restore the three commented-out lines inside stageWise()
# import  matplotlib.pyplot as plt
# fig = plt.figure()
# ax=fig.add_subplot(111)
# ax.plot(rightweights)
# plt.show()
# -------- LEGO set price prediction -------- #
import legoAPI
lgx = []
lgy = []
legoAPI.setDataCollect(lgx, lgy)  # the LEGO URLs have expired, so legoAPI.py parses local copies saved under setHtml
# regression.scrapePage('./setHtml/lego10030.html', 'out.txt', 2002, 3096, 269.99)  # or call the author's commented-out scrapePage() directly
lgx1 = mat(ones((63, 5)))
lgx1[:, 1:5] = mat(lgx)
print(lgx[0])
print(lgx1[0])
ws = regression.standRegres(lgx1, lgy)  # ordinary least squares, i.e. linear regression
print('ws', end='=')
print(ws)
# print('lgx1[0]*ws',end='=');print(lgx1[0]*ws)
# print('lgx1[0]*ws',end='=');print(lgx1[-1]*ws)
# print('lgx1[0]*ws',end='=');print(lgx1[43]*ws)
regression.crossValidation(lgx, lgy, 10)
print(regression.ridgeTest(lgx, lgy))
Example #26
# test loading the data
import regression
from numpy import *

xArr, yArr = regression.loadDataSet('ex0.txt')
xArr[0:2]

# test standard regression
ws = regression.standRegres(xArr, yArr)
ws
xMat = mat(xArr)
yMat = mat(yArr)
yHat = xMat * ws

# plot the data scatter and the best-fit line
import matplotlib.pyplot as plt

fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(xMat[:, 1].flatten().A[0], yMat.T[:, 0].flatten().A[0])
# sort the points into ascending order first
xCopy = xMat.copy()
xCopy.sort(0)
yHat = xCopy * ws
ax.plot(xCopy[:, 1], yHat)
plt.show()

# correlation between the predicted and actual values
yHat = xMat * ws
corrcoef(yHat.T, yMat)
Example #27
import matplotlib.pyplot as plt
import LoadData
import regression
from numpy import *

# m samples, n attributes
X, Y, attNum, trainingSampleNum = LoadData.loadDataSet(
    'abalone.txt')  # X is m*n, Y is m*1
theta = regression.standRegres(X, Y)  # theta is n*1; training data reused as test data
print('linear regression coefficients:', theta)
YHat = X * theta  # YHat 是 m*1
print('transpose of the standard regression predictions:', YHat.T)
for i in range(trainingSampleNum):
    print('actual:', Y[i], 'predicted:', YHat[i])

# plot the raw data
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)  # add_subplot(349) would split the canvas into 3 rows x 4 columns and draw in the 9th panel (left-to-right, top-to-bottom)
ax.scatter(X[:, 1].flatten().A[0].T, Y.flatten().A[0].T)
# scatter takes x coordinates, then y coordinates, both array-like
# X[:, 1] is an m*1 matrix; flatten() makes it 1*m, .A converts it to an
# array, and [0] takes the first (only) row
# X[:, 1].flatten().A is a 1*200 array

# plot the fitted values
ax.plot(X[:, 1].flatten().A[0], YHat.flatten().A[0], 'r-')
plt.show()  # without this the figure is not displayed

print('Pearson product-moment correlation:', corrcoef(YHat.T, Y.T))
print('RSS:', LoadData.rssError(Y, YHat))
Example #28
import regression
from numpy import *  # for shape, mat and ones used below

lgX = []
lgY = []
regression.setDataCollect(lgX, lgY)

print(shape(lgX))
lgX1 = mat(ones((58, 5)))
lgX1[:, 1:5] = mat(lgX)
print(lgX[0])
print(lgX1[0])

ws = regression.standRegres(lgX1, lgY)
print(ws)
print(lgX1[0] * ws)
print(lgX1[-1] * ws)
print(lgX1[43] * ws)
Example #29
import LoadData
import LocallyWeightedLR
import regression
from numpy import *

X, Y, attNum, trainingSampleNum = LoadData.loadDataSet('abalone.txt')
#YHat01=LocallyWeightedLR.testLocallyWeightedLR(X[100:199],X[:99],Y[:99],0.1) # many of the weighted matrices are not invertible
#theta01=LocallyWeightedLR.LocallyWeightedLR(X[129],X[:99],Y[:99],0.1) # the xTWX matrix for instance 129 is not full rank
#print("129:", X[129])
#print("129 theta:", theta01)
#print("129 predicted value:", X[129]*theta01)
YHat1 = LocallyWeightedLR.testLocallyWeightedLR(X[100:199], X[:99], Y[:99], 1)
YHat10 = LocallyWeightedLR.testLocallyWeightedLR(X[100:199], X[:99], Y[:99],
                                                 10)
YStandR = X[100:199] * regression.standRegres(X[:99], Y[:99])  # coefficients applied to the test rows
#print("k=1目标函数为:",LoadData.rssError(Y[100:199],YHat01))
print("k=1目标函数为:", LoadData.rssError(Y[100:199], YHat1))
print("k=10目标函数为:", LoadData.rssError(Y[100:199], YHat10))
print("标准线性回归目标函数为:", LoadData.rssError(Y[100:199], YStandR))
Example #30
#coding:utf-8
from numpy import *
import regression

xArr, yArr = regression.loadDataSet('ex0.txt')
#print(xArr[0:2])

ws = regression.standRegres(xArr, yArr)
print(ws)

# compute the predicted values yHat with the new ws
xMat = mat(xArr)
yMat = mat(yArr)
yHat = xMat * ws

# plot the data scatter and the best-fit line
import matplotlib.pyplot as plt
fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(xMat[:, 1].flatten().A[0], yMat.T[:, 0].flatten().A[0])

# to draw the best-fit line we need the yHat values;
# if the line's points are out of order the plot zig-zags, so sort them in ascending order first
xCopy = xMat.copy()
xCopy.sort(0)  # numpy matrix sort, in place along axis 0
yHat = xCopy * ws
ax.plot(xCopy[:, 1], yHat)
plt.show()

# estimate for a single point
print(yArr[0])
Example #31
# -*- coding: utf-8 -*-

import regression
from numpy import *
dm, ls = regression.loadDataSet('ex0.txt')
ws = regression.standRegres(dm, ls)

xMat = mat(dm)
yMat = mat(ls)
import matplotlib.pyplot as plt
fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(xMat[:, 1].flatten().A[0],
           yMat.T[:, 0].flatten().A[0])  # flatten() collapses the matrix to one dimension
xCopy = xMat.copy()
xCopy.sort(0)
yHat = xCopy * ws
ax.plot(xCopy[:, 1], yHat)
plt.show()

xMat = mat(dm)
yMat = mat(ls)
yHat = xMat * ws
corrcoef(yHat.T, yMat)  # correlation between predictions and actuals

# locally weighted linear regression
import regression
from numpy import *
dm, ls = regression.loadDataSet('ex0.txt')
ls[0]
regression.lwlr(dm[0], dm, ls, 1.0)
Example #32
import regression
from numpy import *

xArr, yArr = regression.loadDataSet('abalone.txt')
regression.stageWise(xArr, yArr, 0.01, 200)

regression.stageWise(xArr, yArr, 0.001, 5000)

xMat = mat(xArr)
yMat = mat(yArr).T
xMat = regression.regularize(xMat)
yM = mean(yMat, 0)
yMat = yMat - yM
weights = regression.standRegres(xMat, yMat.T)
print(weights.T)
Example #33
import regression
from numpy import *

abX, abY = regression.loadDataSet('abalone.txt')
yHat01 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 0.1)
yHat1 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 1)
yHat10 = regression.lwlrTest(abX[0:99], abX[0:99], abY[0:99], 10)
print(regression.rssError(abY[0:99], yHat01.T))
print(regression.rssError(abY[0:99], yHat1.T))
print(regression.rssError(abY[0:99], yHat10.T))

yHat01 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 0.1)
yHat1 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 1)
yHat10 = regression.lwlrTest(abX[100:199], abX[0:99], abY[0:99], 10)
print(regression.rssError(abY[100:199], yHat01.T))
print(regression.rssError(abY[100:199], yHat1.T))
print(regression.rssError(abY[100:199], yHat10.T))

ws = regression.standRegres(abX[0:99], abY[0:99])
yHat = mat(abX[100:199]) * ws
print(regression.rssError(abY[100:199], yHat.T.A))

ridgeWeights = regression.ridgeTest(abX, abY)
#print(ridgeWeights)
import matplotlib.pyplot as plt

fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(ridgeWeights)
plt.show()

xArr, yArr = regression.loadDataSet('abalone.txt')
#regression.stageWise(xArr, yArr, 0.01, 200)
#regression.stageWise(xArr, yArr, 0.001, 5000)