Example #1
def classification(test_data, test_bagOfWords, original_data, original_labels, original_bagOfWords, k=3):
    """
    kNN Model Based Classifier for test data (actual data)
    """
    for i in range(len(test_bagOfWords)):
        x = classify(np.array(test_bagOfWords[i]), np.array(original_bagOfWords), original_labels, k)
        print(test_data[i], x)
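Note: the snippets on this page all call a classify helper from a local kNN module that is not reproduced here, and the modules differ between authors (the argument order even varies between examples). As a point of reference only, here is a minimal sketch of one plausible implementation matching the (inX, dataSet, labels, k) signature used above, with Euclidean distance and a majority vote; it is an assumption, not the code behind any particular example.

import numpy as np

def classify(inX, dataSet, labels, k):
    """Sketch of a plain kNN classifier: Euclidean distance + majority vote.
    Assumes inX is a 1-D feature vector, dataSet an (m, n) array, labels of length m."""
    diff = dataSet - inX                          # broadcast inX against every training row
    distances = np.sqrt((diff ** 2).sum(axis=1))  # Euclidean distance to each sample
    nearest = distances.argsort()[:k]             # indices of the k closest samples
    votes = {}
    for idx in nearest:
        votes[labels[idx]] = votes.get(labels[idx], 0) + 1
    return max(votes.items(), key=lambda kv: kv[1])[0]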
Example #2
def handwritingClassTest():
    trainingFilePath = "data/digits/trainingDigits/"
    testFilePath = "data/digits/testDigits/"
    hwLabels = []
    trainingFileList = listdir(trainingFilePath)
    m = len(trainingFileList)  # 1934
    trainingMat = zeros((m, 1024))
    for i in range(m):
        fileNameStr = trainingFileList[i]  # 0_10.txt
        fileName = fileNameStr.split(".")[0]  # 0_10
        classNum = int(fileName.split("_")[0])  # 0
        hwLabels.append(classNum)
        trainingMat[i, :] = img2vector(trainingFilePath + fileNameStr)
    testFileList = listdir(testFilePath)
    errorCount = 0
    mTest = len(testFileList)
    for i in range(mTest):
        fileNameStr = testFileList[i]
        fileName = fileNameStr.split(".")[0]
        classNum = int(fileName.split("_")[0])
        testVector = img2vector(testFilePath + fileNameStr)
        classifyRes = kNN.classify(testVector, trainingMat, hwLabels, 3)
        print("kNN分类器分类结果为:{}, 真实的数字为:{}".format(classifyRes, classNum))
        if (classifyRes != classNum): errorCount += 1
    print("错误的数量为%d" % errorCount)
    print("错误率为{}".format(str(errorCount / mTest)))
Example #3
def handwritingClassTest1():
    hwLabels = []
    trainingFileList = listdir('trainingDigits')  # load the training set
    m = len(trainingFileList)  # number of files in the current directory
    trainingMat = np.zeros((m, 1024))  # initialize the training feature matrix

    for i in range(m):
        fileNameStr = trainingFileList[i]  # get the file name
        fileStr = fileNameStr.split('.')[0]  # parse the class digit from the file name
        classNumStr = int(fileStr.split('_')[0])
        hwLabels.append(classNumStr)
        trainingMat[i, :] = img2vector('trainingDigits/%s' % fileNameStr)

    testFileList = listdir('testDigits')  # load the test set
    errorCount = 0.0
    mTest = len(testFileList)
    for i in range(mTest):
        fileNameStr = testFileList[i]
        fileStr = fileNameStr.split('.')[0]  # parse the test sample's class from the file name
        classNumStr = int(fileStr.split('_')[0])
        vectorUnderTest = img2vector('testDigits/%s' % fileNameStr)

        classifierResult = kNN.classify(vectorUnderTest, trainingMat, hwLabels,
                                        3)  # classify
        print('Predicted digit: %d, true digit: %d' % (classifierResult, classNumStr))

        if (classifierResult != classNumStr):
            errorCount += 1.0  # count misclassified samples
    print('\nTotal misclassified samples: %d' % errorCount)
    print('\nError rate: %f' % (errorCount / float(mTest)))
Example #4
 def test_handwriting(self):
     train_set, train_lables = self.imgdir2dataset('knn/trainingDigits')
     test_set, test_labels = self.imgdir2dataset('knn/testDigits')
     err = 0
     for x, label in itertools.izip(test_set, test_labels):
         result = classify(x, train_set, train_lables, 6)
         if result != label:
             err += 1
     logging.info('Error rate: %f', float(err) / len(test_labels))
Example #5
def training_classification(data, label, bagOfWords, k=3):
    """
    kNN Model Based Classifier for the Training Set data;
    Parameters: -
    """
    errCount = 0
    for i in range(len(bagOfWords)):
        x = classify(np.array(bagOfWords[i]), np.array(bagOfWords), label, k)
        # print(data[i], x, label[i])
        if x != label[i]:
            errCount += 1
            print(data[i], x, label[i])
    return (errCount / len(bagOfWords)) * 100
Example #6
def datingClassTest():
    ratio = 0.50  # train/test split ratio
    datingDataMat, datingLabels = file2matrix('data/datingTestSet2.txt')
    normMat = autoNorm(datingDataMat)
    m = normMat.shape[0]  # number of rows
    numTestVecs = int(m * ratio)
    errorCount = 0
    for i in range(numTestVecs):
        classifyRes = kNN.classify(normMat[i, :], normMat[numTestVecs:m, :],
                                   datingLabels[numTestVecs:m], 3)
        print("kNN分类器分类结果为:{}, 真实的类别为:{}".format(classifyRes, datingLabels[i]))
        if (classifyRes != datingLabels[i]): errorCount += 1
    print("错误的数量为%d" % errorCount)
    print("错误率为{}".format(str(errorCount / numTestVecs)))
Example #7
def datingClassTest(testSetPercent, k):
    dataSetX, dataSetY = getDatingData()
    normDataSetX, ranges, mins = helper.autoNormalize(dataSetX)
    dataSize = normDataSetX.shape[0]
    testSize = int(dataSize * testSetPercent)
    errorCount = 0
    for i in range(testSize):
        inX = normDataSetX[i]
        outY = kNN.classify(inX, normDataSetX[testSize:,:], \
                            dataSetY[testSize:], \
                            k)
        if(outY != dataSetY[i]):
            errorCount = errorCount + 1
    errorRate = errorCount / float(testSize)
    return errorRate
Example #8
def datingClassTest(datingDataMat, datingLabels):
    hoRatio = 0.10  # fraction of the data held out as the test set
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)  # number of test samples

    errorCount = 0.0  # number of misclassified samples
    for i in range(numTestVecs):
        classifierResult = kNN.classify(normMat[i, :],
                                        normMat[numTestVecs:m, :],
                                        datingLabels[numTestVecs:m], 3)
        print('Predicted class: %d, true class: %d' % (classifierResult, datingLabels[i]))

        if (classifierResult != datingLabels[i]):
            errorCount += 1.0
    print('Error rate: %f' % (errorCount / float(numTestVecs)))
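The dating examples (#6, #8, #11) first rescale the feature matrix with autoNorm, which is likewise not shown. A minimal min-max scaling sketch matching the (normMat, ranges, minVals) return unpacked in Examples #8 and #11 (Example #6 keeps only the first value, so its autoNorm presumably returns just the matrix):

import numpy as np

def autoNorm(dataSet):
    """Scale each column to [0, 1]; also return the per-column ranges and minimums."""
    minVals = dataSet.min(axis=0)
    ranges = dataSet.max(axis=0) - minVals
    normDataSet = (dataSet - minVals) / ranges
    return normDataSet, ranges, minVals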
Example #9
 def do_txt_training(self, txt, k):
     dataset, labels = self.txt2dataset(txt)
     dataset = normalize(dataset)
     # 90% for training, 10% for verify
     index = int(0.9 * len(labels))
     training_set = dataset[:index]
     training_lables = labels[:index]
     ref_set = dataset[index:]
     ref_labels = labels[index:]
     # start testing
     errno = 0
     for (x, label) in itertools.izip(ref_set, ref_labels):
         result = classify(x, training_set, training_lables, k)
         msg = 'Data: %s, label: %s, result: %s' % (x, label, result)
         logging.debug(msg)
         if result != label:
             errno += 1
     return float(errno) / len(ref_labels)
Example #10
def classification(data, classified_file, unclassified_file):
    """
    Classifying the unclassified experience labels (will be especially useful for future as data grows...)
    Parameter: -Data: experience data
               -Classified_file: experience with labels
               -Unclassified_file: experience without labels
    """
    f = open(unclassified_file)
    lines = f.readlines()
    lines = [line.replace('\n', '') for line in lines]
    data, labels = parse_classified()
    vocabSet = c.vocabSet(data)
    bagOfWords = [c.bag_of_words(vocabSet, i) for i in data]
    errCount = 0
    for i in range(len(bagOfWords)):
        x = classify(np.array(bagOfWords[i]), np.array(bagOfWords), labels, 20)
        print(data[i], x, labels[i])
        if x != labels[i]: errCount += 1
    print(errCount / len(bagOfWords))
Example #11
def classifyPerson(datingDataMat, datingLabels):

    # features
    percentTats = float(input("Percentage of time spent playing video games: "))  # console input, e.g. 0.96
    ffMiles = float(input("Frequent flyer miles earned per year: "))  # console input, e.g. 50000
    iceCream = float(input("Liters of ice cream consumed per week: "))  # console input, e.g. 1.55

    # labels
    resultList = ['someone you dislike', 'someone of average charm', 'someone very charming']

    # normalize the training set
    normMat, ranges, minVals = autoNorm(datingDataMat)

    # normalize the input sample
    inArray = np.array([percentTats, ffMiles, iceCream])
    normInArray = (inArray - minVals) / ranges

    # classification result
    classifierResult = kNN.classify(normInArray, normMat, datingLabels, 3)
    print("This person is probably: %s" % (resultList[classifierResult - 1]))
Example #12
def getDigitsData(k):
    dirname = '../../data/kNN_data/digits/trainingDigits/'
    trainingSetX, trainingSetY = getTrainingSet(dirname)

    errorCount = 0
    dirname = '../../data/kNN_data/digits/testDigits/'
    for root, dirs, files in os.walk(dirname):
        break
    for filename in files:
        header = filename.strip('.txt')
        label = header.split('_')[0]
        label = int(label)
        filename = root +filename
        inX = getVector(filename)
        
        result = kNN.classify(inX, trainingSetX, trainingSetY, k)        
        print((label, result)) 
        if(label != result):
            errorCount += 1
    totalCount = len(files)
    errorRate = errorCount/float(totalCount)
    print((errorRate, errorCount, totalCount))
    return errorRate
Example #13
def handWriteDigitClassify():

    # training data set and labels
    mnist2Text.read_image('../caffe/data/mnist/train-images-idx3-ubyte', './train_image.txt')
    mnist2Text.read_label('../caffe/data/mnist/train-labels-idx1-ubyte', './train_label.txt')
    traingImage = kNN.img2Vector('./train_image.txt')
    traingLabel = kNN.label2Vector('./train_label.txt')

    # test data set and labels
    mnist2Text.read_image('../caffe/data/mnist/t10k-images-idx3-ubyte', './test_image.txt')
    mnist2Text.read_label('../caffe/data/mnist/t10k-labels-idx1-ubyte', './test_label.txt')
    testImage = kNN.img2Vector('./test_image.txt')
    testLabel = kNN.label2Vector('./test_label.txt')

    error = 0.0
    for i in range(200):
        knnClass = kNN.classify(testImage[i], traingImage, traingLabel, 5)
        print " the kNN's classifies result is " + str(knnClass)
        print " the True is " + str(testLabel[i])
        if knnClass != testLabel[i]:
            error += 1.0
    
    print "the error rate : " + str(error/200.0)
Example #14
import kNN

dataset, labels = kNN.file_to_array('datingTestSet2.txt')
normalize_dataset, min_array, max_array, range_array = kNN.normalize(dataset)
dataset_len = dataset.shape[0]
ratio = 0.1
test_len = int(dataset_len * ratio)
train_dataset = normalize_dataset[:dataset_len - test_len, :]
train_labels = labels[:dataset_len - test_len]
error_len = 0

for i in range(dataset_len - test_len, dataset_len):
    label = kNN.classify(train_dataset, train_labels, normalize_dataset[i, :],
                         5)
    print('index: %d, train: %d, real: %d' % (i, label, labels[i]))
    if label != labels[i]:
        error_len += 1
print('error rate: %f' % (error_len / float(test_len)))
Example #15
        trainLabels.append(copyLabels.pop(index))
    return [trainSet, copy, trainLabels, copyLabels]

filename = "../data/3079066.txt"
# filename = "../data/data_3121867.txt"
rankcount = 199
dataSet, labels = kNN.data_ready(filename, rankcount)

# input = array([0, 1])
K = 5
rank_range = 10    # allowed ranking error
splitRatio = 0.67  # train/test split ratio

trainingSet, testSet, trainLabels, copyLabels = splitDataset(dataSet.tolist(), splitRatio, labels.tolist())
print 'Split {0} rows into train={1} and test={2} rows'.format(len(dataSet), len(trainingSet), len(testSet))

success_count = 0

for i in range(0, len(testSet)):
    output = kNN.classify(testSet[i], np.array(trainingSet), np.array(trainLabels), K)
    copy_class = float(copyLabels[i])
    out_class = float(output)
    difference = int(abs(copy_class - out_class))
    # print difference
    if difference < rank_range:
        success_count += 1
    else:
        print testSet[i], output, copyLabels[i], difference
print success_count
print float(success_count)/len(testSet)
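Example #15 begins partway through its splitDataset helper; only the last two lines of that function survive in the snippet. A hedged reconstruction of what the full helper might look like, assuming a random holdout split driven by splitRatio (the names follow the fragment, the body is a guess):

import random

def splitDataset(dataSet, splitRatio, labels):
    """Randomly move splitRatio of the samples into the training set;
    whatever remains in the copies becomes the test set."""
    trainSize = int(len(dataSet) * splitRatio)
    trainSet, trainLabels = [], []
    copy, copyLabels = list(dataSet), list(labels)
    while len(trainSet) < trainSize:
        index = random.randrange(len(copy))
        trainSet.append(copy.pop(index))
        trainLabels.append(copyLabels.pop(index))
    return [trainSet, copy, trainLabels, copyLabels]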
Example #16
X_train = np.array(pd.read_csv("X_train.csv", header=None))
y_train = list(pd.read_csv("y_train.csv", header=None).iloc[:, 0])
X_test = np.array(pd.read_csv("X_test.csv", header=None))

XTrain = X_train[:nsample, :]  #use the first 4000 samples for training
yTrain = y_train[:nsample]
XVal = X_train[nsample:, :]  #use the rests for validation
yVal = y_train[nsample:]

#nVal = XVal.shape[0]
nVal = 100  #for simplicity...

valScore = 0
for i in range(nVal):
    prediction = knn.classify(XVal[i, :], XTrain, yTrain, 1)  #1-NN
    print("Validation sample ", i, "...    Prediction: ", prediction,
          " Truth: ", yVal[i])
    if prediction == yVal[i]:
        valScore = valScore + 1

print("Validation score ", float(valScore) / nVal)

nTest = X_test.shape[0]
yHatTest = []
for i in range(nTest):
    prediction = knn.classify(X_test[i, :], XTrain, yTrain, 1)
    print("Testing sample ", i, "...    Prediction: ", prediction)
    yHatTest.append(prediction)

np.savetxt('result_knn.txt', yHatTest)
Example #17
"""
kNN: movie classification
@author: Jerry
"""

import numpy as np
import kNN


# create the data set
def createDataSet():
    # [smiling-face shots, high-tech shots, kissing shots, fight shots]
    features = np.array([[5, 10, 32, 114], [2, 5, 23, 150], [1, 9, 8, 154],
                         [121, 10, 12, 11], [98, 2, 20, 5], [4, 97, 14, 10],
                         [8, 110, 13, 23], [9, 100, 5, 1], [4, 5, 90, 5],
                         [1, 3, 88, 10]])
    labels = [
        "action", "action", "action", "comedy", "comedy", "sci-fi", "sci-fi", "sci-fi", "romance", "romance"
    ]
    return features, labels


if __name__ == '__main__':
    features, labels = createDataSet()

    input = np.array([5, 100, 12, 6])
    k = 3

    label = kNN.classify(input, features, labels, k)
    print('Predicted result:', label)
Example #18
import numpy as np
import kNN

NUM_SETS = 10

# cross-validation do parzen
parzen_valid_err = np.loadtxt('files/cross-validation/parzen.txt')
parzen_valid_meanerr = np.mean(parzen_valid_err)
print('h = 1\nparzen_valid_meanerr = 0.5111\n')
print('h = 0.5\nparzen_valid_meanerr: %f\n' % parzen_valid_meanerr)
print('h = 0.3\nparzen_valid_meanerr = 0.2111\n')

# cross-validation do kNN
kNN_valid_err = np.array(range(NUM_SETS))
for conj in range(NUM_SETS):
    training_data = np.loadtxt('files/cross-validation/sets/train%d.txt' %
                               conj)
    training_indices = np.array(training_data[:, 2], dtype=np.int)
    training_data = training_data[:, :2]
    training_classes = np.ones(len(training_data), dtype=np.int)
    print(training_classes.shape)
    print(training_classes)

    test_data = np.loadtxt('files/cross-validation/sets/test%d.txt' % conj)
    test_indices = np.array(test_data[:, 2], dtype=np.int)

    (kNN_valid_err[conj], _, _, _) = kNN.classify(training_data, test_data,
                                                  training_classes,
                                                  test_classes)

    break
Example #19
    # ax = fig.add_subplot(111)
    # ax.scatter(normalized_training_set[:, 0], normalized_training_set[:, 1],
    #             np.array(class_label_vector), np.array(class_label_vector))
    # plt.show()

    # Classifier Test
    k = 5
    test_ratio = 0.15
    data_size = normalized_training_set.shape[0]
    test_set_size = int(data_size * test_ratio)

    error_count = 0
    for i in range(test_set_size):
        test_result = kNN.classify(
            input_data=normalized_training_set[i, :],
            training_set=normalized_training_set[test_set_size:, :],
            labels=class_label_vector[test_set_size:],
            k=k)

        if test_result != class_label_vector[i]:
            print(
                "- The classifier came back with: {0}, the real answer is : {1}"
                .format(test_result, class_label_vector[i]))
            error_count += 1
        else:
            pass
    error_rate = error_count / test_set_size

    info = """
    --------------------------------------------------
    * Data Set Shape: {0}
Example #20
from numpy import *
import sys
import matplotlib
import matplotlib.pyplot as plt
from imp import reload
sys.path.append(r"C:\Python34\code\machinelearninginaction\Ch02")
import kNN

reload(kNN)

df = kNN.createDataSet()
inputt = array([0.7, 0.8])
K = 3
output = kNN.classify(inputt, df, K)
print("测试数据为:", inputt, "分类结果为:", output)

fig = plt.figure(figsize=(6, 6))  # equal scale and aspect on the X and Y axes
ax = fig.add_subplot(1, 1, 1)
plt.plot(df['x'], df['y'], 'ro')  # plot the data points
plt.plot(inputt[0], inputt[1], 'go')
count = 0
## add annotations
for label in df.index:
    ax.annotate(label,
                xy=df.values[count],
                xytext=(df.values[count][0] + 0.1, df.values[count][1] + 0.05),
                arrowprops=(dict(facecolor='b',
                                 width=0.05,
                                 shrink=0.05,
                                 headwidth=1,
                                 connectionstyle="arc3")))
Example #21
    labels = np.array(labels)
    y = np.zeros(labels.shape)
    ''' Convert the labels to 0/1 '''
    y[labels == 'A'] = 1
    return x, y


# return the training data and the corresponding class labels
def createDataSet():
    # group = array([[1.0, 2.0], [1.2, 0.1], [0.1, 1.4], [0.3, 3.5]])
    # labels = ['A', 'A', 'B', 'B']
    # return group, labels

    return data_ready()


# 1 0 0 69 0 174 2 186 2 B
# 2 22 5 0 0 0 0 0 0 B

# 2 0 0 0 0 0 2 13 1 A
# 2 0 0 3 0 0 2 65 1 A

# 1 0 0 0 0 0 2 48 1 A
# 1 0 0 0 0 0 2 582 0 A

dataSet, labels = createDataSet()
input = array([1, 0, 0, 0, 0, 0, 2, 48, 1])
K = 3
output = kNN.classify(input, dataSet, labels, K)
print("测试数据为:", input, "分类结果为:", output)
Example #22
    return dataFromFile,resultClass

def auto_toOne(matrix):
    result=zeros((matrix.shape[0],matrix.shape[1]))
    rows=matrix.shape[0];
    coloum=matrix.shape[1];
    ran=zeros((1,coloum));
    ran=matrix.max(0)-matrix.min(0)
    
    norMatrix=matrix-tile(matrix.min(0),(rows,1))
    result=norMatrix/tile(ran,(rows,1))
    return result,ran,matrix.min(0)

dataFromFile,resultClass=textRead()
result,ran,mins=auto_toOne(dataFromFile)
len=result.shape[0]
errCount=0
classsCount=(int)(0.1*len)
for i in range(classsCount):
    r=classify(result[i],result[classsCount:len,:],resultClass[classsCount:len,0],3);
    if(r!=resultClass[i,0]):
       errCount+=1.0
print errCount
print "err is %f" % (errCount/classsCount)  


fig=pyt.figure()
ax=fig.add_subplot(111)
ax.scatter(result[:,0],result[:,1]);
#pyt.show()
Example #23
import kNN
g,l = kNN.create_dataset()
print kNN.classify([0,0],g,l,3)

Example #24
'''
File Name:    main
Description:  Main entry point; mainly calls the functions in kNN.py
Author:       jwj
Date:         2018/1/18
'''
__author__ = 'jwj'

import kNN

if __name__ == '__main__':
    group, labels = kNN.createDataSet()
    label = kNN.classify([0, 0], group, labels, 3)
    print(label)

    dataArray, dataLabels = kNN.file2matrix("datingTestSet2.txt")
    kNN.autoNorm(dataArray)

    normMat, ranges, minVals = kNN.autoNorm(dataArray)
    # print(normMat)

    # kNN.dataClassTest()
    # kNN.classifyPerson()

    kNN.handwritingClassTest()
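Example #24 relies on kNN.file2matrix to read datingTestSet2.txt. That loader is not included either; a minimal sketch under the common assumption that each line holds three tab-separated numeric features followed by an integer class label:

import numpy as np

def file2matrix(filename):
    """Parse a dating-data file: three numeric features plus an integer label per line."""
    with open(filename) as f:
        lines = f.readlines()
    mat = np.zeros((len(lines), 3))
    classLabels = []
    for i, line in enumerate(lines):
        parts = line.strip().split('\t')
        mat[i, :] = [float(x) for x in parts[0:3]]
        classLabels.append(int(parts[-1]))
    return mat, classLabels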
Example #25
from numpy import *
import matplotlib.pyplot as plt
import kNN
# create 5 my neighbors
neighbors,names = kNN.createDataSet(5)
# find two nearest neighbors to me
result = kNN.classify([0,0], neighbors, names, 2)
# x and y save positons of my neighbors
x = [0]* neighbors.shape[0]
y = [0]* neighbors.shape[0]
for i in range(0, neighbors.shape[0]):   
	x[i] = neighbors [i][0]      
	y[i] = neighbors [i][1]
# display my neighbors with blue color
plt.plot(x,y,'bo')
plt.axis([-0.2, 1.2, -0.2, 1.2])
# assign names to neighbors
for i, name in enumerate(names):
	plt.annotate(name,(x[i],y[i]),(x[i]-0.08,y[i]+0.01))
# display me with red color
plt.plot([0],[0],'ro')
# display the two nearest neighbors with messages and yellow color
for i, name in enumerate(names):    
	for r in result:           
		if name is r[0]:               
			plt.plot([x[i]],[y[i]],'yo')                  
			plt.annotate('I am here',(x[i],y[i]),(x[i]+0.01,y[i]-0.05))
plt.show()
Example #26
import kNN
import numpy

result_array = ['not at all', 'in small doses', 'in large doses']
dataset, labels = kNN.file_to_array('datingTestSet2.txt')
normalize_dataset, min_array, max_array, range_array = kNN.normalize(dataset)

fly_input = float(input('Frequent flyer miles earned per year >>>'))
game_input = float(input('Percentage of time spent playing video games >>>'))
icecream_input = float(input('Liters of ice cream consumed per week >>>'))
input_array = numpy.array([fly_input, game_input, icecream_input])
normalize_input_array = (input_array - min_array) / range_array
label = kNN.classify(normalize_dataset, labels, normalize_input_array, 5)
print('label %s' % result_array[label - 1])
Example #27
import kNN
import numpy
import os

train_file_array = os.listdir('digits/trainingDigits')
train_file_len = len(train_file_array)
train_labels = []
train_dataset = numpy.zeros((train_file_len, 1024))
for i in range(train_file_len):
    filename = train_file_array[i]
    label = filename.split('.')[0]
    label = label.split('_')[0]
    label = int(label)
    train_labels.append(label)
    train_dataset[i, :] = kNN.image_to_array('digits/trainingDigits/' +
                                             filename)

test_file_array = os.listdir('digits/testDigits')
test_file_len = len(test_file_array)
error_len = 0
for i in range(test_file_len):
    filename = test_file_array[i]
    label = filename.split('.')[0]
    label = label.split('_')[0]
    label = int(label)
    input = kNN.image_to_array('digits/testDigits/' + filename)
    result = kNN.classify(train_dataset, train_labels, input, 5)
    print('result: %d, label: %d' % (result, label))
    if result != label:
        error_len += 1
print('error count: %d' % error_len)
print('error rate: %f' % (error_len / float(test_file_len)))
Example #28
import kNN
g, l = kNN.create_dataset()
print kNN.classify([0, 0], g, l, 3)
Example #29
 def test_dataset():
     group = numpy.array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
     labels = ['A', 'A', 'B', 'B']
     x = numpy.array([1, 0.5])
     result = classify(x, normalize(group), labels, 3)
     logging.info(result)