# Example #1
# 0
def _calc_score(predict_y, true_y, nlabels, with_cfmat=False):
    """
    Calculate various scoring of the prediction.

    :param predict_y: predicted value (enumerated)
    :param true_y: actual value (enumerated)
    :param nlabels: number of labels (true_y might not have all labels)
    :param with_cfmat: if True will also return the confusion matrix
    :return: score (fraction of correct predictions)
             label_hits (ndarray of hit count for each label. Labels not
                 present in true_y will have count of NaN)
             label_misses (similar to label_hits but counts the misses)
             cf_matrix (optional) the confusion matrix (nlabels x nlabels)
    """
    # np.int was deprecated in NumPy 1.20 and removed in 1.24 -- use the
    # builtin int instead (same behaviour, no DeprecationWarning/AttributeError).
    hits = (predict_y == true_y).astype(int)
    misses = 1 - hits
    score = hits.sum() / len(true_y)

    # Initialise with NaN so labels absent from true_y stay NaN rather than 0.
    label_hits = np.full((nlabels, ), np.nan)
    label_misses = np.full((nlabels, ), np.nan)

    unique_test_labels = np.unique(true_y)

    # Per-label totals; `accum` aggregates hits/misses grouped by label index.
    _label_hits = accum(true_y, hits, func=np.sum, dtype=int)
    _label_misses = accum(true_y, misses, func=np.sum, dtype=int)

    label_hits[unique_test_labels] = _label_hits[unique_test_labels]
    label_misses[unique_test_labels] = _label_misses[unique_test_labels]

    if with_cfmat:
        cf_matrix = confusion_matrix(true_y, predict_y)
        return score, label_hits, label_misses, cf_matrix
    else:
        return score, label_hits, label_misses
def sorted_labels_by_prevalence_then_distance(neighbour_labels,
                                              neighbour_distances,
                                              distance_func,
                                              max_candidates=7):
    """
    Rank the unique neighbour labels by prevalence (descending), breaking
    ties by aggregated distance (ascending).

    :param neighbour_labels: labels of the neighbours
    :param neighbour_distances: distances corresponding to neighbour_labels
    :param distance_func: aggregation applied to each label's distances
        (e.g. np.mean)
    :param max_candidates: only the closest max_candidates neighbours are
        considered (generalises the previously hard-coded 7)
    :return: unique labels sorted by (prevalence desc, aggregated distance asc)
    """
    # e.g. labels = ['a', 'a', 'b', 'c', 'b', 'a', 'c', 'd']
    #      dists  = [0.1, 0.1, 0.3, 0.3, 0.5, 0.2, 0.4, 0.6]

    sorted_indices = np.argsort(neighbour_distances)
    closests = sorted_indices[:max_candidates]
    closests_labels = neighbour_labels[closests]
    unique_labels, indices = np.unique(closests_labels, return_inverse=True)

    # Occurrence count of each unique label among the closest neighbours.
    prevalences = np.bincount(indices)

    closests_distances = neighbour_distances[closests]
    # np.float was removed in NumPy 1.24 -- use the builtin float instead.
    mean_distance_by_labels = accum(indices,
                                    closests_distances,
                                    func=distance_func,
                                    dtype=float)

    # np.lexsort treats its LAST key as the PRIMARY key. To sort by
    # prevalence first (descending, hence the negation) and distance second
    # (ascending), prevalence must be the last key. The original had the
    # keys reversed, which sorted primarily by distance despite the
    # function's name and the comment's stated intent.
    order = np.lexsort((mean_distance_by_labels, -prevalences))

    return unique_labels[order]
def _calc_score(predict_y, test_y, nlabels):
    """
    Calculate the prediction score and per-label hit/miss counts.

    NOTE(review): this redefinition shadows the earlier ``_calc_score`` in
    this module (which additionally supports ``with_cfmat``) -- consider
    keeping only one of the two.

    :param predict_y: predicted value (enumerated)
    :param test_y: actual value (enumerated)
    :param nlabels: number of labels (test_y might not have all labels)
    :return: score (fraction of correct predictions)
             label_hits (NaN for labels absent from test_y)
             label_misses (NaN for labels absent from test_y)
    """
    # np.int was removed in NumPy 1.24 -- use the builtin int instead.
    hits = (predict_y == test_y).astype(int)
    misses = 1 - hits
    score = hits.sum() / len(test_y)

    # Initialise with NaN so labels absent from test_y stay NaN rather than 0.
    label_hits = np.full((nlabels, ), np.nan)
    label_misses = np.full((nlabels, ), np.nan)

    unique_test_labels = np.unique(test_y)

    _label_hits = accum(test_y, hits, func=np.sum, dtype=int)
    _label_misses = accum(test_y, misses, func=np.sum, dtype=int)

    label_hits[unique_test_labels] = _label_hits[unique_test_labels]
    label_misses[unique_test_labels] = _label_misses[unique_test_labels]

    return score, label_hits, label_misses
def k_nearest(distmat, train_labels, nlabels, k, map_order):
    """
    Leave-one-out k-nearest-neighbour evaluation.

    For each element, rank the labels of its k nearest neighbours (excluding
    the element itself) and score the rankings with MAP@map_order.

    :param distmat: square pairwise distance matrix (n x n)
    :param train_labels: label index of each of the n elements
    :param nlabels: total number of labels (labels absent from train_labels
        get NaN hit/miss counts)
    :param k: number of neighbours to consider
    :param map_order: order passed to mapk (mean average precision at k)
    :return: label_prediction_map_score, label_hits, label_misses
    """
    element_count = distmat.shape[0]

    actual_labels = []
    predicted_labels = []

    for j in range(element_count):
        distances_from_j = distmat[j, :]
        sorted_indices = np.argsort(distances_from_j)
        # Skip index 0: with a self-distance of 0 the closest "neighbour"
        # is the element itself (leave-one-out).
        closests_indices = sorted_indices[1:k + 1]
        closests_labels = train_labels[closests_indices]
        closests_distances = distances_from_j[closests_indices]

        predicted_label = sorted_labels_by_prevalence_then_distance(
            closests_labels, closests_distances, np.mean)

        predicted_labels.append(predicted_label)
        actual_labels.append([train_labels[j]])

    label_prediction_map_score = mapk(actual_labels, predicted_labels,
                                      map_order)

    # np.int was removed in NumPy 1.24 -- use the builtin int instead.
    hits = np.array(
        [1 if a in p else 0 for a, p in zip(actual_labels, predicted_labels)],
        dtype=int)
    # Misses are exactly the complement of hits; no need to recompute.
    misses = 1 - hits

    # Initialise with NaN so labels absent from train_labels stay NaN.
    label_hits = np.full((nlabels, ), np.nan)
    label_misses = np.full((nlabels, ), np.nan)

    unique_test_labels = np.unique(train_labels)

    _label_hits = accum(train_labels, hits, func=np.sum, dtype=int)
    _label_misses = accum(train_labels, misses, func=np.sum, dtype=int)

    label_hits[unique_test_labels] = _label_hits[unique_test_labels]
    label_misses[unique_test_labels] = _label_misses[unique_test_labels]

    return label_prediction_map_score, label_hits, label_misses