def do_txt_training(self, txt, k):
    """Measure the classifier's error rate on a hold-out slice of ``txt``.

    The samples obtained from ``self.txt2dataset(txt)`` are normalized, the
    first 90% are used as the training set, and each of the remaining 10% is
    classified against that training set.

    Args:
        txt: Input handed unchanged to ``self.txt2dataset``, which must
            return a ``(dataset, labels)`` pair.
        k: Passed through as the last argument of ``classify``
            (presumably the number of neighbours -- confirm against
            ``classify``).

    Returns:
        float: The fraction of hold-out samples whose predicted label
        differs from the recorded label.

    Raises:
        ZeroDivisionError: If the hold-out slice is empty, which happens
            when no labels were parsed at all.
    """
    dataset, labels = self.txt2dataset(txt)
    dataset = normalize(dataset)

    # 90% for training, 10% for verification.
    split = int(0.9 * len(labels))
    training_set, training_labels = dataset[:split], labels[:split]
    ref_set, ref_labels = dataset[split:], labels[split:]

    mistakes = 0
    # The built-in zip replaces itertools.izip, which was removed in
    # Python 3; zip pairs the items identically on both major versions.
    for x, label in zip(ref_set, ref_labels):
        result = classify(x, training_set, training_labels, k)
        # Lazy %-arguments: the message is only built if DEBUG is enabled.
        logging.debug('Data: %s, label: %s, result: %s', x, label, result)
        if result != label:
            mistakes += 1
    return float(mistakes) / len(ref_labels)
import kNN
import numpy


def _read_float(prompt):
    # Show the prompt on stdin/stdout and parse the reply as a float.
    return float(input(prompt))


# Display names for the predicted classes; label n is shown as entry n - 1.
result_array = ['not at all', 'in small doses', 'in large doses']

# Load the reference samples and scale them; the min/range arrays returned by
# kNN.normalize are reused below to scale the user's answers the same way.
dataset, labels = kNN.file_to_array('datingTestSet2.txt')
normalize_dataset, min_array, max_array, range_array = kNN.normalize(dataset)

# The three prompts ask (in Chinese) for: frequent-flyer miles earned per
# year, percentage of time spent playing video games, and liters of ice
# cream consumed per week -- in that order.
fly_input = _read_float('每年获得的飞行常客里程数 >>>')
game_input = _read_float('玩视频游戏所耗时间百分比 >>>')
icecream_input = _read_float('每周消费的冰淇淋公升数 >>>')

input_array = numpy.array([fly_input, game_input, icecream_input])
normalize_input_array = (input_array - min_array) / range_array

label = kNN.classify(normalize_dataset, labels, normalize_input_array, 5)
predicted_name = result_array[label - 1]
print('label %s' % predicted_name)
each_line = line.split('\t') to_matrix[index, :] = each_line[0:3] class_label_vector.append(int( each_line[-1])) # int("label\n") => label: int index += 1 return to_matrix, class_label_vector if __name__ == "__main__": root_path = os.path.dirname(os.path.abspath(__file__)) data_folder = os.path.join(root_path, "data") data_file = os.path.join(data_folder, "dating_DataSet.txt") training_set, class_label_vector = file_to_matrix(data_file) normalized_training_set, range_values, min_value = kNN.normalize( training_set) # fig = plt.figure() # ax = fig.add_subplot(111) # ax.scatter(normalized_training_set[:, 0], normalized_training_set[:, 1], # np.array(class_label_vector), np.array(class_label_vector)) # plt.show() # Classifier Test k = 5 test_ratio = 0.15 data_size = normalized_training_set.shape[0] test_set_size = int(data_size * test_ratio) error_count = 0 for i in range(test_set_size):
def classifyBykNN(dataSet, labels, k, predictX):
    """Classify ``predictX`` with kNN against the normalized ``dataSet``.

    ``dataSet`` is run through ``kNN.normalize`` and the result is handed,
    together with ``labels``, ``k`` and ``predictX``, to ``kNN.kNNclassify``.
    Whatever that call returns is returned unchanged.

    NOTE(review): ``predictX`` is forwarded as given, without being passed
    through ``kNN.normalize`` -- confirm that is intended.
    """
    normalized = kNN.normalize(dataSet)
    return kNN.kNNclassify(normalized, labels, k, predictX)
classes = Rocchio.train(dataSet, labels) return Rocchio.predict(classes, predictX) def classifyByNBC(predictX): return NBCpredict.predict(predictX) def classifyBySVM(predictX): return SVMpredict.predict(predictX) def classifyByANN(predictX): return ANNpredict.predict(predictX) if __name__ == "__main__": ''' Classify the document using the given method ''' ds, labels = loadDataSet(CLASSIFY_FILE, 5400, 967) fh = open('D:/e.txt', 'r') x = vectorArticleByTFIDF(fh.read()) fh.close() ds = kNN.normalize(ds) c1 = int(classifyBykNN(ds, labels, 10, x)) c2 = int(classifyByRocchio(ds, labels, x)) ##print c1;exit() for c in CLASS_LABEL: cid = CLASS_LABEL[c][0] cname = CLASS_LABEL[c][1] if cid == c1: print 'Predicted by kNN: %s - %d(%s)' % (c, cid, cname) if cid == c2: print 'Predicted by Rocchio: %s - %d(%s)' % (c, cid, cname)
# Change lines into array featureCount = len(lines[0].split()) - 1 group = np.zeros((len(lines), featureCount)) labels = [] for i in range(len(lines)): lst = lines[i].split() group[i] = np.array(lst[:-1]) labels.append(lst[-1]) return (group, labels) # Get training set group, labels = createDataSetFromFile('data/dating/training.txt') group = kNN.normalize(group) # Try on test set testGroup, testLabels = createDataSetFromFile('data/dating/test.txt') testGroup = kNN.normalize(testGroup) correct = 0 for i in range(testGroup.shape[0]): res = kNN.classify0(testGroup[i], group, labels, 3) if res == 'didntLike': res = '1' elif res == 'smallDoses': res = '2' else: res = '3' if res == testLabels[i]:
def test_dataset():
    """Classify one query point against a four-sample toy set and log it.

    The toy set holds two points labelled 'A' and two labelled 'B'. The
    samples are passed through ``normalize`` before being given to
    ``classify`` with 3 as its last argument; the result is logged at INFO
    level. Nothing is asserted or returned.
    """
    samples = numpy.array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    sample_labels = ['A', 'A', 'B', 'B']
    query = numpy.array([1, 0.5])
    logging.info(classify(query, normalize(samples), sample_labels, 3))