# Imports assumed by this section; USE_KD_TREE is expected to be a module-level
# flag (defined elsewhere) selecting the scipy KDTree path over the brute-force path.
import numpy as np
from scipy.spatial import KDTree


def get_knn_probability(k, training_data, training_data_class, test_data):
    """
    Use kNN to estimate the probability of each test instance belonging to each class.
    k: number of nearest neighbours
    training_data: training data instances
    training_data_class: class label of each training instance
    test_data: instances to classify
    returns: (unique_classes, probabilities) where probabilities[n, i] is the fraction
             of the k neighbours of test_data[n] that belong to unique_classes[i]
    """
    num_inputs = np.shape(test_data)[0]
    unique_classes = np.unique(training_data_class)
    num_classes = len(unique_classes)
    # Map each class label to a column index in the probability matrix.
    unique_class_to_index = {}
    for i in range(num_classes):
        unique_class_to_index[unique_classes[i]] = i

    probabilities = np.zeros((num_inputs, num_classes), dtype='f')

    if USE_KD_TREE:
        print 'Training kd tree'
        kd_tree = KDTree(training_data)
        print 'Done training kd tree'

    for n in range(num_inputs):
        if USE_KD_TREE:
            distances, indices = kd_tree.query(test_data[n, :], k=k)
            # scipy returns scalars when k == 1; make sure indices is sliceable below.
            indices = np.atleast_1d(indices)
        else:
            # Brute force: Euclidean distance from this test point to every training instance.
            distances = np.sqrt(np.sum((training_data - test_data[n, :])**2, axis=1))
            indices = np.argsort(distances, axis=0)

        # Class labels of the k nearest neighbours.
        classes = training_data_class[indices[:k]]
        class_totals = np.zeros(num_classes)
        for i in range(classes.shape[0]):
            class_totals[unique_class_to_index[classes[i]]] += 1
        # Convert neighbour counts into per-class probabilities.
        probabilities[n, :] = class_totals / classes.shape[0]

    return unique_classes, probabilities
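# A minimal usage sketch for get_knn_probability (not part of the original source):
# the tiny 2-D dataset and class labels below are made up purely for illustration,
# and USE_KD_TREE is assumed to have been set elsewhere in the module.
def _demo_get_knn_probability():
    training_data = np.array([[0.0, 0.0], [0.1, 0.2], [1.0, 1.0], [0.9, 1.1]])
    training_data_class = np.array([0, 0, 1, 1])
    test_data = np.array([[0.05, 0.1], [0.95, 1.0]])
    unique_classes, probabilities = get_knn_probability(3, training_data,
                                                        training_data_class, test_data)
    # Each row of probabilities is ordered like unique_classes and sums to 1,
    # e.g. roughly [[0.67, 0.33], [0.33, 0.67]] for the data above.
    print 'unique_classes =', unique_classes
    print 'probabilities =', probabilities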
def get_knn(k, training_data_class, test_data, kd_tree):
    """
    k-nearest-neighbour classification using a pre-built kd tree
    k: number of nearest neighbours
    training_data_class: class label of each training instance
    test_data: instances to classify
    kd_tree: KDTree built over the training instances
    returns: predicted class of each input instance
    """
    num_inputs = np.shape(test_data)[0]
    print 'k =', k
    print 'num_inputs =', num_inputs
    print 'training_data_class =', training_data_class[:20]

    closest = np.zeros(num_inputs)
    for n in range(num_inputs):
        distances, indices = kd_tree.query(test_data[n, :], k=k)
        if k == 1:
            # scipy returns scalars for k == 1; wrap them so the slicing below works.
            indices = np.array([indices])
            distances = np.array([distances])

        # Unique class labels among the k nearest neighbours.
        classes = np.unique(training_data_class[indices[:k]])
        if len(classes) == 1:
            # All neighbours agree on the class.
            closest[n] = classes[0]
        else:
            # Majority vote; class labels are assumed to be small non-negative integers
            # so they can be used directly as indices into the counts array.
            counts = np.zeros(max(classes) + 1)
            for i in range(k):
                counts[training_data_class[indices[i]]] += 1
            closest[n] = np.argmax(counts)
    return closest
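# A minimal usage sketch for get_knn (not part of the original source): it builds a
# scipy KDTree over the same tiny made-up dataset as above and predicts with k = 3.
def _demo_get_knn():
    training_data = np.array([[0.0, 0.0], [0.1, 0.2], [1.0, 1.0], [0.9, 1.1]])
    training_data_class = np.array([0, 0, 1, 1])
    test_data = np.array([[0.05, 0.1], [0.95, 1.0]])
    kd_tree = KDTree(training_data)
    predicted = get_knn(3, training_data_class, test_data, kd_tree)
    # Expected output for the data above: [0., 1.]
    print 'predicted =', predicted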