import numpy as np
from sklearn.metrics import confusion_matrix


def _calc_score(predict_y, true_y, nlabels, with_cfmat=False):
    """
    Calculate various scores of the prediction.
    :param predict_y: predicted values (enumerated)
    :param true_y: actual values (enumerated)
    :param nlabels: number of labels (true_y might not contain all labels)
    :param with_cfmat: if True, also return the confusion matrix
    :return: score (fraction of correct predictions)
             label_hits (ndarray of hit counts per label; labels not present
                         in true_y have a count of NaN)
             label_misses (same as label_hits, but counting misses)
             cf_matrix (optional) the confusion matrix (nlabels x nlabels)
    """
    hits = (predict_y == true_y).astype(int)
    misses = 1 - hits
    score = hits.sum() / len(true_y)

    # Labels absent from true_y keep NaN counts, so they can be told apart
    # from labels that are present but scored zero hits.
    label_hits = np.full((nlabels,), np.nan)
    label_misses = np.full((nlabels,), np.nan)

    unique_test_labels = np.unique(true_y)
    _label_hits = accum(true_y, hits, func=np.sum, dtype=int)
    _label_misses = accum(true_y, misses, func=np.sum, dtype=int)

    label_hits[unique_test_labels] = _label_hits[unique_test_labels]
    label_misses[unique_test_labels] = _label_misses[unique_test_labels]

    if with_cfmat:
        cf_matrix = confusion_matrix(true_y, predict_y)
        return score, label_hits, label_misses, cf_matrix
    return score, label_hits, label_misses
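# `accum` is not defined in this module and is assumed to be an
# accumarray-style helper provided elsewhere in the repo: it groups `vals`
# by the integer indices in `idx` and reduces each group with `func`.
# A minimal sketch compatible with the calls in this file might be:
def accum(idx, vals, func=np.sum, dtype=float):
    """Reduce `vals` grouped by the integer labels in `idx` using `func`."""
    idx = np.asarray(idx)
    vals = np.asarray(vals)
    out = np.zeros(idx.max() + 1, dtype=dtype)
    for i in np.unique(idx):
        out[i] = func(vals[idx == i])
    return out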
def sorted_labels_by_prevalence_then_distance(neighbour_labels,
                                              neighbour_distances,
                                              distance_func):
    # Example inputs:
    # labels = np.array(['a', 'a', 'b', 'c', 'b', 'a', 'c', 'd'])
    # dists = np.array([0.1, 0.1, 0.3, 0.3, 0.5, 0.2, 0.4, 0.6])

    # Order the neighbours by distance. The caller passes exactly the
    # k nearest neighbours, so all of them are used.
    sorted_indices = np.argsort(neighbour_distances)
    closests_labels = neighbour_labels[sorted_indices]
    unique_labels, indices = np.unique(closests_labels, return_inverse=True)
    prevalences = np.bincount(indices)

    closests_distances = neighbour_distances[sorted_indices]
    mean_distance_by_labels = accum(
        indices, closests_distances, func=distance_func, dtype=float)

    # np.lexsort sorts ascendingly and treats the *last* key as the primary
    # one. We want prevalence (descending, hence negated) as the primary key
    # and mean distance (ascending) as the tie-breaker, so prevalence goes
    # last in the key tuple.
    sorted_indices_by_prevalence_then_mean_distance = np.lexsort(
        (mean_distance_by_labels, 0 - prevalences))
    sorted_labels = unique_labels[
        sorted_indices_by_prevalence_then_mean_distance]
    return sorted_labels
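# Sanity check of the ordering, using the example arrays from the comments
# above: 'a' occurs three times (most prevalent); 'b' and 'c' both occur
# twice, but 'c' has the smaller mean distance (0.35 vs 0.4), so 'c' ranks
# ahead of 'b'.
#
#   labels = np.array(['a', 'a', 'b', 'c', 'b', 'a', 'c', 'd'])
#   dists = np.array([0.1, 0.1, 0.3, 0.3, 0.5, 0.2, 0.4, 0.6])
#   sorted_labels_by_prevalence_then_distance(labels, dists, np.mean)
#   -> array(['a', 'c', 'b', 'd'], dtype='<U1')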
def k_nearest(distmat, train_labels, nlabels, k, map_order):
    # Leave-one-out k-nearest-neighbour classification: for each element,
    # find its k nearest neighbours and rank the candidate labels by
    # prevalence, then by mean distance.
    element_count = distmat.shape[0]
    actual_labels = []
    predicted_labels = []
    for j in range(element_count):
        distances_from_j = distmat[j, :]
        sorted_indices = np.argsort(distances_from_j)
        # Skip index 0: the closest element to j is j itself.
        closests_indices = sorted_indices[1:k + 1]
        closests_labels = train_labels[closests_indices]
        closests_distances = distances_from_j[closests_indices]

        predicted_label = sorted_labels_by_prevalence_then_distance(
            closests_labels, closests_distances, np.mean)
        predicted_labels.append(predicted_label)
        actual_labels.append([train_labels[j]])

    label_prediction_map_score = mapk(actual_labels, predicted_labels,
                                      map_order)

    hits = np.array(
        [1 if a[0] in p else 0
         for a, p in zip(actual_labels, predicted_labels)],
        dtype=int)
    misses = 1 - hits

    label_hits = np.full((nlabels,), np.nan)
    label_misses = np.full((nlabels,), np.nan)

    unique_test_labels = np.unique(train_labels)
    _label_hits = accum(train_labels, hits, func=np.sum, dtype=int)
    _label_misses = accum(train_labels, misses, func=np.sum, dtype=int)

    label_hits[unique_test_labels] = _label_hits[unique_test_labels]
    label_misses[unique_test_labels] = _label_misses[unique_test_labels]

    return label_prediction_map_score, label_hits, label_misses
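# `mapk` is assumed to come from elsewhere as well; its signature matches the
# mean-average-precision-at-k metric popularised by the `ml_metrics` package.
# The hypothetical `_apk`/`_mapk` helpers below are a minimal self-contained
# sketch of that metric, for reference only:
def _apk(actual, predicted, k):
    """Average precision at k for a single query."""
    if not len(actual):
        return 0.0
    predicted = predicted[:k]
    score = 0.0
    num_hits = 0
    for i, p in enumerate(predicted):
        # Count p only the first time it appears in the ranking.
        if p in actual and p not in predicted[:i]:
            num_hits += 1
            score += num_hits / (i + 1)
    return score / min(len(actual), k)


def _mapk(actual, predicted, k):
    """Mean average precision at k over all queries."""
    return np.mean([_apk(a, p, k) for a, p in zip(actual, predicted)])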