def classification(test_data, test_bagOfWords, original_data, original_labels, original_bagOfWords, k=3):
    """kNN model-based classifier for the test (actual) data.

    Parameters:
        test_data: raw test samples, printed next to each prediction
        test_bagOfWords: bag-of-words vector per test sample
        original_data: raw training samples (unused here; kept for interface parity)
        original_labels: label per training sample
        original_bagOfWords: bag-of-words vector per training sample
        k: number of neighbours that vote (default 3)

    Returns:
        list of predicted labels, one per test sample (each is also printed,
        matching the original behavior).
    """
    # Hoist the training-matrix conversion out of the loop: the original
    # rebuilt np.array(original_bagOfWords) on every iteration.
    train_matrix = np.array(original_bagOfWords)
    predictions = []
    for sample, vector in zip(test_data, test_bagOfWords):
        predicted = classify(np.array(vector), train_matrix, original_labels, k)
        predictions.append(predicted)
        print(sample, predicted)
    return predictions
def handwritingClassTest():
    """Evaluate the kNN classifier on the handwritten-digit image files.

    Builds the training matrix from every file under
    data/digits/trainingDigits/, then classifies each file in
    data/digits/testDigits/ with k=3 and reports the error count and rate.
    The class label is parsed from the file name (e.g. "0_10.txt" -> 0).
    """
    trainingFilePath = "data/digits/trainingDigits/"
    testFilePath = "data/digits/testDigits/"
    trainingFileList = listdir(trainingFilePath)
    hwLabels = []
    trainingMat = zeros((len(trainingFileList), 1024))
    for row, fileNameStr in enumerate(trainingFileList):
        # "0_10.txt" -> "0_10" -> 0
        hwLabels.append(int(fileNameStr.split(".")[0].split("_")[0]))
        trainingMat[row, :] = img2vector(trainingFilePath + fileNameStr)
    testFileList = listdir(testFilePath)
    mTest = len(testFileList)
    errorCount = 0
    for fileNameStr in testFileList:
        classNum = int(fileNameStr.split(".")[0].split("_")[0])
        testVector = img2vector(testFilePath + fileNameStr)
        classifyRes = kNN.classify(testVector, trainingMat, hwLabels, 3)
        print("kNN分类器分类结果为:{}, 真实的数字为:{}".format(classifyRes, classNum))
        if classifyRes != classNum:
            errorCount += 1
    print("错误的数量为%d" % errorCount)
    print("错误率为{}".format(str(errorCount / mTest)))
def handwritingClassTest1():
    """Holdout test of the kNN digit classifier.

    Training vectors come from 'trainingDigits', test vectors from
    'testDigits'; each file name encodes its class ("3_12.txt" -> 3).
    Prints the prediction for every test sample, then the total number of
    misclassified samples and the error rate.
    """
    trainingFileList = listdir('trainingDigits')           # load the training set
    hwLabels = []
    trainingMat = np.zeros((len(trainingFileList), 1024))  # one 1x1024 row per image
    for row, fileNameStr in enumerate(trainingFileList):
        # parse the class digit out of the file name
        hwLabels.append(int(fileNameStr.split('.')[0].split('_')[0]))
        trainingMat[row, :] = img2vector('trainingDigits/%s' % fileNameStr)
    testFileList = listdir('testDigits')                   # load the test set
    mTest = len(testFileList)
    errorCount = 0.0
    for fileNameStr in testFileList:
        classNumStr = int(fileNameStr.split('.')[0].split('_')[0])
        vectorUnderTest = img2vector('testDigits/%s' % fileNameStr)
        classifierResult = kNN.classify(vectorUnderTest, trainingMat, hwLabels, 3)
        print('预测数字: %d, 真实数字: %d' % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1.0                              # count misclassified samples
    print('\n总错误样本数: %d' % errorCount)
    print('\n错误率: %f' % (errorCount / float(mTest)))
def test_handwriting(self):
    """Classify every image in knn/testDigits against knn/trainingDigits
    with k=6 and log the resulting error rate.
    """
    train_set, train_labels = self.imgdir2dataset('knn/trainingDigits')
    test_set, test_labels = self.imgdir2dataset('knn/testDigits')
    err = 0
    # zip() instead of itertools.izip: izip exists only on Python 2,
    # while zip works on both 2 and 3 (the list/iterator difference is
    # irrelevant here — we only iterate once).
    for x, label in zip(test_set, test_labels):
        if classify(x, train_set, train_labels, 6) != label:
            err += 1
    logging.info('Error rate: %f', float(err) / len(test_labels))
def training_classification(data, label, bagOfWords, k=3):
    """kNN classifier evaluated on the training set itself.

    Every training vector is classified against the full training set (so a
    sample's own vector participates in the vote), and each misclassified
    sample is printed.

    Parameters:
        data: raw training samples, printed on misclassification
        label: true label per sample
        bagOfWords: bag-of-words vector per sample
        k: number of neighbours that vote (default 3)

    Returns:
        the training error rate, as a percentage.
    """
    train_matrix = np.array(bagOfWords)
    errCount = 0
    for idx, vector in enumerate(bagOfWords):
        predicted = classify(np.array(vector), train_matrix, label, k)
        if predicted != label[idx]:
            errCount += 1
            print(data[idx], predicted, label[idx])
    return (errCount / len(bagOfWords)) * 100
def datingClassTest():
    """Holdout evaluation of the kNN dating classifier.

    The first half of the normalized samples is used as the test set and the
    second half as the training set; prints each prediction, the number of
    errors, and the error rate.
    """
    holdout = 0.50  # train/test split ratio
    features, labels = file2matrix('data/datingTestSet2.txt')
    normalized = autoNorm(features)
    total = normalized.shape[0]        # number of rows (samples)
    test_count = int(total * holdout)
    mistakes = 0
    for i in range(test_count):
        predicted = kNN.classify(normalized[i, :],
                                 normalized[test_count:total, :],
                                 labels[test_count:total], 3)
        print("kNN分类器分类结果为:{}, 真实的类别为:{}".format(predicted, labels[i]))
        if predicted != labels[i]:
            mistakes += 1
    print("错误的数量为%d" % mistakes)
    print("错误率为{}".format(str(mistakes / test_count)))
def datingClassTest(testSetPercent, k):
    """Measure the kNN error rate on the dating data.

    The first testSetPercent fraction of the normalized samples is
    classified against the remainder.

    Parameters:
        testSetPercent: fraction of samples held out as the test set
        k: number of neighbours that vote

    Returns:
        the error rate over the held-out samples.
    """
    dataSetX, dataSetY = getDatingData()
    normDataSetX, ranges, mins = helper.autoNormalize(dataSetX)
    testSize = int(normDataSetX.shape[0] * testSetPercent)
    errors = 0
    for i in range(testSize):
        predicted = kNN.classify(normDataSetX[i],
                                 normDataSetX[testSize:, :],
                                 dataSetY[testSize:],
                                 k)
        if predicted != dataSetY[i]:
            errors += 1
    return errors / float(testSize)
def datingClassTest(datingDataMat, datingLabels):
    """Holdout test: classify the first 10% of samples against the rest.

    Prints each prediction next to its true class, then the overall error
    rate.

    Parameters:
        datingDataMat: raw feature matrix
        datingLabels: class label per row
    """
    hoRatio = 0.10  # fraction of the data reserved as the test set
    normMat, ranges, minVals = autoNorm(datingDataMat)
    total = normMat.shape[0]
    testCount = int(total * hoRatio)   # number of test samples
    wrong = 0.0                        # misclassified-sample counter
    for i in range(testCount):
        result = kNN.classify(normMat[i, :], normMat[testCount:total, :],
                              datingLabels[testCount:total], 3)
        print('预测类别: %d, 真实类别: %d' % (result, datingLabels[i]))
        if result != datingLabels[i]:
            wrong += 1.0
    print('错误率: %f' % (wrong / float(testCount)))
def do_txt_training(self, txt, k):
    """Train/verify split evaluation on a text dataset.

    Loads samples via self.txt2dataset, normalizes them, uses the first 90%
    as the reference (training) set, and classifies the remaining 10%.

    Parameters:
        txt: path of the text file to load
        k: number of neighbours that vote

    Returns:
        the error rate over the 10% verification slice.
    """
    dataset, labels = self.txt2dataset(txt)
    dataset = normalize(dataset)
    # 90% for training, 10% for verification
    index = int(0.9 * len(labels))
    training_set = dataset[:index]
    training_labels = labels[:index]
    ref_set = dataset[index:]
    ref_labels = labels[index:]
    errno = 0
    # zip() instead of itertools.izip: izip is Python-2-only; zip works on both.
    for x, label in zip(ref_set, ref_labels):
        result = classify(x, training_set, training_labels, k)
        # lazy %-args: the message is only formatted if DEBUG is enabled
        logging.debug('Data: %s, label: %s, result: %s', x, label, result)
        if result != label:
            errno += 1
    return float(errno) / len(ref_labels)
def classification(data, classified_file, unclassified_file):
    """Classify the unclassified experience labels.

    (Will be especially useful in future as data grows.)

    Parameters:
        data: experience data (immediately replaced by parse_classified();
            kept for interface compatibility)
        classified_file: experience with labels (not referenced here —
            presumably consumed by parse_classified(); TODO confirm)
        unclassified_file: experience without labels

    Prints every sample with its predicted and true label, then the fraction
    of mismatches over the classified corpus.
    """
    # Read the unclassified records; `with` guarantees the handle is closed
    # (the original leaked the open file object).
    with open(unclassified_file) as f:
        lines = [line.replace('\n', '') for line in f.readlines()]
    # NOTE(review): `lines` is never used below — the loop only re-scores
    # the already-classified corpus. Confirm intent before removing.
    data, labels = parse_classified()
    vocabSet = c.vocabSet(data)
    bagOfWords = [c.bag_of_words(vocabSet, i) for i in data]
    # hoist the training-matrix conversion out of the loop
    train_matrix = np.array(bagOfWords)
    errCount = 0
    for i, vector in enumerate(bagOfWords):
        x = classify(np.array(vector), train_matrix, labels, 20)
        print(data[i], x, labels[i])
        if x != labels[i]:
            errCount += 1
    print(errCount / len(bagOfWords))
def classifyPerson(datingDataMat, datingLabels):
    """Interactively classify one person with the kNN dating model.

    Prompts for the three features on the console, scales them with the
    training set's normalization parameters, and prints the predicted
    attractiveness class.

    Parameters:
        datingDataMat: training feature matrix
        datingLabels: class label (1..3) per training row
    """
    # features, entered manually on the console (e.g. 0.96 / 50000 / 1.55)
    percentTats = float(input("玩视频游戏所耗时间百分比:"))
    ffMiles = float(input("每年获得的飞行常客里程数:"))
    iceCream = float(input("每周消费的冰激淋公升数:"))
    # class names, indexed by (label - 1)
    resultList = ['不喜欢的人', '魅力一般的人', '极具魅力的人']
    # normalize the training set, then apply the same scaling to the input
    normMat, ranges, minVals = autoNorm(datingDataMat)
    normInArray = (np.array([percentTats, ffMiles, iceCream]) - minVals) / ranges
    # classify and report
    classifierResult = kNN.classify(normInArray, normMat, datingLabels, 3)
    print("他可能是你%s" % (resultList[classifierResult - 1]))
def getDigitsData(k):
    """Evaluate the kNN digit classifier on the digits test directory.

    Parameters:
        k: number of neighbours that vote

    Returns:
        the error rate over all test files (also printed together with the
        error count and the total count).
    """
    trainingSetX, trainingSetY = getTrainingSet('../../data/kNN_data/digits/trainingDigits/')
    testDir = '../../data/kNN_data/digits/testDigits/'
    errorCount = 0
    # flat directory: listdir replaces the original os.walk + break idiom
    files = os.listdir(testDir)
    for filename in files:
        # BUGFIX: the original used filename.strip('.txt'), which strips any
        # leading/trailing characters in the set {'.', 't', 'x'} rather than
        # removing the extension; split('.')[0] parses the stem reliably and
        # matches the other test harnesses in this file.
        label = int(filename.split('.')[0].split('_')[0])
        inX = getVector(testDir + filename)
        result = kNN.classify(inX, trainingSetX, trainingSetY, k)
        print((label, result))
        if label != result:
            errorCount += 1
    totalCount = len(files)
    errorRate = errorCount / float(totalCount)
    print((errorRate, errorCount, totalCount))
    return errorRate
def handWriteDigitClassify():
    """Classify the first 200 MNIST test images with kNN (k=5).

    Converts the raw MNIST idx files to text, loads them as vectors, then
    prints each prediction, the true label, and the final error rate.
    """
    # training images and labels
    mnist2Text.read_image('../caffe/data/mnist/train-images-idx3-ubyte', './train_image.txt')
    mnist2Text.read_label('../caffe/data/mnist/train-labels-idx1-ubyte', './train_label.txt')
    traingImage = kNN.img2Vector('./train_image.txt')
    traingLabel = kNN.label2Vector('./train_label.txt')
    # test images and labels
    mnist2Text.read_image('../caffe/data/mnist/t10k-images-idx3-ubyte', './test_image.txt')
    mnist2Text.read_label('../caffe/data/mnist/t10k-labels-idx1-ubyte', './test_label.txt')
    testImage = kNN.img2Vector('./test_image.txt')
    testLabel = kNN.label2Vector('./test_label.txt')
    error = 0.0
    for i in range(200):
        knnClass = kNN.classify(testImage[i], traingImage, traingLabel, 5)
        # print() calls instead of Python-2 print statements, so this also
        # parses on Python 3; with a single argument the output text is
        # unchanged under Python 2 as well.
        print(" the kNN's classifies result is " + str(knnClass))
        print(" the True is " + str(testLabel[i]))
        if knnClass != testLabel[i]:
            error += 1.0
    print("the error rate : " + str(error/200.0))
# Holdout test of the kNN dating classifier: the last 10% of the normalized
# samples are classified against the first 90%.
import kNN

dataset, labels = kNN.file_to_array('datingTestSet2.txt')
normalize_dataset, min_array, max_array, range_array = kNN.normalize(dataset)

dataset_len = dataset.shape[0]
ratio = 0.1                          # fraction held out for testing
test_len = int(dataset_len * ratio)
train_len = dataset_len - test_len

train_dataset = normalize_dataset[:train_len, :]
train_labels = labels[:train_len]

error_len = 0
for i in range(train_len, dataset_len):
    label = kNN.classify(train_dataset, train_labels, normalize_dataset[i, :], 5)
    print('index: %d, train: %d, real: %d' % (i, label, labels[i]))
    if label != labels[i]:
        error_len += 1

print('error rate: %f' % (error_len / float(test_len)))
trainLabels.append(copyLabels.pop(index)) return [trainSet, copy, trainLabels, copyLabels] filename = "../data/3079066.txt" # filename = "../data/data_3121867.txt" rankcount = 199 dataSet, labels = kNN.data_ready(filename, rankcount) # input = array([0, 1]) K = 5 rank_range = 10 # 排名误差 splitRatio = 0.67 # 训练集数据 测试集数据 trainingSet, testSet, trainLabels, copyLabels = splitDataset(dataSet.tolist(), splitRatio, labels.tolist()) print 'Split {0} rows into train={1} and test={2} rows'.format(len(dataSet), len(trainingSet), len(testSet)) success_count = 0 for i in range(0, len(testSet)): output = kNN.classify(testSet[i], np.array(trainingSet), np.array(trainLabels), K) copy_class = float(copyLabels[i]) out_class = float(output) difference = int(abs(copy_class - out_class)) # print difference if difference < rank_range: success_count += 1 else: print testSet[i], output, copyLabels[i], difference print success_count print float(success_count)/len(testSet)
# Validate a 1-NN classifier on held-out training rows, then predict labels
# for the test set and save them to result_knn.txt.
X_train = np.array(pd.read_csv("X_train.csv", header=None))
# .iloc replaces the long-deprecated (and now removed) DataFrame.ix accessor.
y_train = list(pd.read_csv("y_train.csv", header=None).iloc[:, 0])
X_test = np.array(pd.read_csv("X_test.csv", header=None))

XTrain = X_train[:nsample, :]  # the first nsample rows train the model
yTrain = y_train[:nsample]
XVal = X_train[nsample:, :]    # the rest are for validation
yVal = y_train[nsample:]

nVal = 100  # for simplicity, only score the first 100 validation rows
valScore = 0
for i in range(nVal):
    prediction = knn.classify(XVal[i, :], XTrain, yTrain, 1)  # 1-NN
    print("Validation sample ", i, "... Prediction: ", prediction, " Truth: ", yVal[i])
    if prediction == yVal[i]:
        valScore = valScore + 1
print("Validation score ", float(valScore) / nVal)

nTest = X_test.shape[0]
yHatTest = []
for i in range(nTest):
    prediction = knn.classify(X_test[i, :], XTrain, yTrain, 1)
    print("Testing sample ", i, "... Prediction: ", prediction)
    yHatTest.append(prediction)
np.savetxt('result_knn.txt', yHatTest)
kNN: 电影分类 @author: Jerry """ import numpy as np import kNN # 创建数据集 def createDataSet(): #[笑脸镜头 高科技镜头 接吻镜头 打斗镜头] features = np.array([[5, 10, 32, 114], [2, 5, 23, 150], [1, 9, 8, 154], [121, 10, 12, 11], [98, 2, 20, 5], [4, 97, 14, 10], [8, 110, 13, 23], [9, 100, 5, 1], [4, 5, 90, 5], [1, 3, 88, 10]]) labels = [ "动作片", "动作片", "动作片", "喜剧片", "喜剧片", "科幻片", "科幻片", "科幻片", "爱情片", "爱情片" ] return features, labels if __name__ == '__main__': features, labels = createDataSet() input = np.array([5, 100, 12, 6]) k = 3 label = kNN.classify(input, features, labels, k) print('预测结果:', label)
# Compare Parzen-window and kNN classifiers via cross-validation error files.
import numpy as np
import kNN

NUM_SETS = 10  # number of cross-validation folds

# cross-validation for the Parzen classifier: only h=0.5 is recomputed here;
# the h=1 and h=0.3 mean errors are hard-coded from earlier runs
parzen_valid_err = np.loadtxt('files/cross-validation/parzen.txt')
parzen_valid_meanerr = np.mean(parzen_valid_err)
print('h = 1\nparzen_valid_meanerr = 0.5111\n')
print('h = 0.5\nparzen_valid_meanerr: %f\n' % parzen_valid_meanerr)
print('h = 0.3\nparzen_valid_meanerr = 0.2111\n')

# cross-validation for kNN
# NOTE(review): np.array(range(...)) has integer dtype, so assigning an
# error *rate* into kNN_valid_err[conj] below truncates it to an int.
kNN_valid_err = np.array(range(NUM_SETS))
for conj in range(NUM_SETS):
    training_data = np.loadtxt('files/cross-validation/sets/train%d.txt' % conj)
    # third column appears to carry sample indices — confirm.
    # NOTE(review): np.int is deprecated and removed in NumPy >= 1.24
    # (use int or np.int64); left as-is here.
    training_indices = np.array(training_data[:, 2], dtype=np.int)
    training_data = training_data[:, :2]
    training_classes = np.ones(len(training_data), dtype=np.int)
    print(training_classes.shape)
    print(training_classes)
    test_data = np.loadtxt('files/cross-validation/sets/test%d.txt' % conj)
    test_indices = np.array(test_data[:, 2], dtype=np.int)
    # NOTE(review): `test_classes` is not defined anywhere in this span —
    # this call raises NameError unless it is defined elsewhere; confirm.
    (kNN_valid_err[conj], _, _, _) = kNN.classify(training_data, test_data,
                                                  training_classes, test_classes)
    # NOTE(review): this unconditional break means only fold 0 ever runs.
    break
# ax = fig.add_subplot(111) # ax.scatter(normalized_training_set[:, 0], normalized_training_set[:, 1], # np.array(class_label_vector), np.array(class_label_vector)) # plt.show() # Classifier Test k = 5 test_ratio = 0.15 data_size = normalized_training_set.shape[0] test_set_size = int(data_size * test_ratio) error_count = 0 for i in range(test_set_size): test_result = kNN.classify( input_data=normalized_training_set[i, :], training_set=normalized_training_set[test_set_size:, :], labels=class_label_vector[test_set_size:], k=k) if test_result != class_label_vector[i]: print( "- The classifier came back with: {0}, the real answer is : {1}" .format(test_result, class_label_vector[i])) error_count += 1 else: pass error_rate = error_count / test_set_size info = """ -------------------------------------------------- * Data Set Shape: {0}
# Toy kNN demo: classify the point (0.7, 0.8) against kNN.createDataSet()
# and plot the dataset with annotated labels.
from numpy import *
import sys
import matplotlib
import matplotlib.pyplot as plt
from imp import reload  # NOTE(review): imp is deprecated; importlib.reload is the modern home
sys.path.append(r"C:\Python34\code\machinelearninginaction\Ch02")
import kNN
reload(kNN)  # pick up edits to kNN.py without restarting the interpreter

df = kNN.createDataSet()
inputt = array([0.7, 0.8])  # the query point to classify
K = 3
# NOTE(review): this classify variant takes (input, dataset, k) — no label
# argument — unlike the 4-argument classify used elsewhere; confirm.
output = kNN.classify(inputt, df, K)
print("测试数据为:", inputt, "分类结果为:", output)

fig = plt.figure(figsize=(6, 6))  # equal scale and proportion on both axes
ax = fig.add_subplot(1, 1, 1)
plt.plot(df['x'], df['y'], 'ro')      # plot the dataset points
plt.plot(inputt[0], inputt[1], 'go')  # plot the query point in green
count = 0
# annotate every dataset point with its label
# NOTE(review): `count` is never incremented in this span, so every
# annotation is anchored at df.values[0]; confirm an increment follows
# past this chunk.
for label in df.index:
    ax.annotate(label,
                xy=df.values[count],
                xytext=(df.values[count][0] + 0.1,
                        df.values[count][1] + 0.05),
                arrowprops=(dict(facecolor='b',
                                 width=0.05,
                                 shrink=0.05,
                                 headwidth=1,
                                 connectionstyle="arc3")))
labels = np.array(labels) y = np.zeros(labels.shape) ''''' 标签转换为0/1 ''' y[labels == 'A'] = 1 return x, y # 给出训练数据以及对应的类别 def createDataSet(): # group = array([[1.0, 2.0], [1.2, 0.1], [0.1, 1.4], [0.3, 3.5]]) # labels = ['A', 'A', 'B', 'B'] # return group, labels return data_ready() # 1 0 0 69 0 174 2 186 2 B # 2 22 5 0 0 0 0 0 0 B # 2 0 0 0 0 0 2 13 1 A # 2 0 0 3 0 0 2 65 1 A # 1 0 0 0 0 0 2 48 1 A # 1 0 0 0 0 0 2 582 0 A dataSet, labels = createDataSet() input = array([1, 0, 0, 0, 0, 0, 2, 48, 1]) K = 3 output = kNN.classify(input, dataSet, labels, K) print("测试数据为:", input, "分类结果为:", output)
return dataFromFile,resultClass def auto_toOne(matrix): result=zeros((matrix.shape[0],matrix.shape[1])) rows=matrix.shape[0]; coloum=matrix.shape[1]; ran=zeros((1,coloum)); ran=matrix.max(0)-matrix.min(0) norMatrix=matrix-tile(matrix.min(0),(rows,1)) result=norMatrix/tile(ran,(rows,1)) return result,ran,matrix.min(0) dataFromFile,resultClass=textRead() result,ran,mins=auto_toOne(dataFromFile) len=result.shape[0] errCount=0 classsCount=(int)(0.1*len) for i in range(classsCount): r=classify(result[i],result[classsCount:len,:],resultClass[classsCount:len,0],3); if(r!=resultClass[i,0]): errCount+=1.0 print errCount print "err is %f" % (errCount/classsCount) fig=pyt.figure() ax=fig.add_subplot(111) ax.scatter(result[:,0],result[:,1]); #pyt.show()
# Smoke test: classify the origin against the toy dataset with k=3.
import kNN

g, l = kNN.create_dataset()
# print as a function call so the script runs on Python 3 as well as 2
# (with a single argument the output is identical under both).
print(kNN.classify([0, 0], g, l, 3))
'''
File Name: main
Description: entry point; mainly drives the functions in kNN.py
Author: jwj
Date: 2018/1/18
'''
__author__ = 'jwj'

import kNN

if __name__ == '__main__':
    # toy dataset sanity check
    group, labels = kNN.createDataSet()
    label = kNN.classify([0, 0], group, labels, 3)
    print(label)
    # dating data: load and normalize.  BUGFIX: the original called
    # kNN.autoNorm(dataArray) twice and discarded the first result;
    # a single call is sufficient.
    dataArray, dataLabels = kNN.file2matrix("datingTestSet2.txt")
    normMat, ranges, minVals = kNN.autoNorm(dataArray)
    # print(normMat)
    # kNN.dataClassTest()
    # kNN.classifyPerson()
    kNN.handwritingClassTest()
# Plot five generated "neighbors", classify the origin against them with
# k=2, and highlight the two nearest neighbors on the plot.
from numpy import *
import matplotlib.pyplot as plt
import kNN

# create 5 of my neighbors
neighbors, names = kNN.createDataSet(5)
# find the two neighbors nearest to me (the origin)
result = kNN.classify([0, 0], neighbors, names, 2)

# x and y hold the positions of my neighbors
x = [0] * neighbors.shape[0]
y = [0] * neighbors.shape[0]
for i in range(0, neighbors.shape[0]):
    x[i] = neighbors[i][0]
    y[i] = neighbors[i][1]

# display my neighbors in blue
plt.plot(x, y, 'bo')
plt.axis([-0.2, 1.2, -0.2, 1.2])
# label each neighbor with its name
for i, name in enumerate(names):
    plt.annotate(name, (x[i], y[i]), (x[i] - 0.08, y[i] + 0.01))
# display me in red
plt.plot([0], [0], 'ro')
# highlight the two nearest neighbors in yellow, with a message
for i, name in enumerate(names):
    for r in result:
        # BUGFIX: compare strings with ==, not `is`; identity comparison
        # only works by accident when the strings happen to be interned.
        if name == r[0]:
            plt.plot([x[i]], [y[i]], 'yo')
            plt.annotate('I am here', (x[i], y[i]), (x[i] + 0.01, y[i] - 0.05))
plt.show()
# Interactive dating-preference prediction: read the three features from the
# console, scale them with the training ranges, and print the class name.
import kNN
import numpy

# class names, indexed by (label - 1)
result_array = ['not at all', 'in small doses', 'in large doses']

dataset, labels = kNN.file_to_array('datingTestSet2.txt')
normalize_dataset, min_array, max_array, range_array = kNN.normalize(dataset)

# console input for each feature
fly_input = float(input('每年获得的飞行常客里程数 >>>'))
game_input = float(input('玩视频游戏所耗时间百分比 >>>'))
icecream_input = float(input('每周消费的冰淇淋公升数 >>>'))

# scale the query exactly like the training data
input_array = numpy.array([fly_input, game_input, icecream_input])
normalize_input_array = (input_array - min_array) / range_array

label = kNN.classify(normalize_dataset, labels, normalize_input_array, 5)
print('label %s' % result_array[label - 1])
# Handwritten-digit evaluation: train on digits/trainingDigits, test on
# digits/testDigits, reporting the error count and error rate.
import numpy
import os
import kNN  # NOTE(review): kNN is used below but was not imported in this span

train_file_array = os.listdir('digits/trainingDigits')
train_file_len = len(train_file_array)
train_labels = []
train_dataset = numpy.zeros((train_file_len, 1024))
for i in range(train_file_len):
    filename = train_file_array[i]
    # "3_12.txt" -> 3: the digit before the underscore is the class
    train_labels.append(int(filename.split('.')[0].split('_')[0]))
    train_dataset[i, :] = kNN.image_to_array('digits/trainingDigits/' + filename)

test_file_array = os.listdir('digits/testDigits')
test_file_len = len(test_file_array)
error_len = 0
for filename in test_file_array:
    label = int(filename.split('.')[0].split('_')[0])
    # renamed from `input`, which shadowed the builtin
    sample = kNN.image_to_array('digits/testDigits/' + filename)
    result = kNN.classify(train_dataset, train_labels, sample, 5)
    print('result: %d, label: %d' % (result, label))
    if result != label:
        error_len += 1

print('error count: %d' % error_len)
print('error rate: %f' % (error_len / float(test_file_len)))
# Smoke test: classify the origin against the toy dataset with k=3.
import kNN

g, l = kNN.create_dataset()
# print as a function call so the script runs on Python 3 as well as 2
# (with a single argument the output is identical under both).
print(kNN.classify([0, 0], g, l, 3))
def test_dataset():
    """Sanity-check classify() on a tiny hand-built dataset.

    Normalizes four 2-D points labelled A/A/B/B, classifies the point
    (1, 0.5) with k=3, and logs the resulting label.
    """
    samples = numpy.array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    tags = ['A', 'A', 'B', 'B']
    query = numpy.array([1, 0.5])
    logging.info(classify(query, normalize(samples), tags, 3))