Example #1
import time

import faiss
import numpy as np


def listed_k_nearest_neighbors(vectors, sample_idcs, negative_idcs=None, k=1):
    print('Find high dimensional neighbors...')
    start = time.time()
    index = faiss.IndexFlatL2(vectors.shape[1])  # build the index
    index.add(vectors.astype(np.float32))  # add vectors to the index
    _, idx = index.search(vectors[sample_idcs].astype(np.float32),
                          len(vectors))

    mask = ~np.isin(idx, sample_idcs)
    idx = idx[mask].reshape(len(idx), -1)
    scores = np.arange(idx.shape[1]).reshape(1, -1) * np.ones(idx.shape)

    # list nearest neighbors column-wise and make them unique
    idx = idx.transpose().flatten()
    scores = scores.transpose().flatten()
    neighbors = []
    for i in idx:
        if i not in neighbors:
            neighbors.append(i)
        if len(neighbors) == k:
            break

    stop = time.time()
    print('Done. ({}min {}s)'.format(
        int((stop - start) // 60), (stop - start) % 60))
    return neighbors, scores
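
A minimal usage sketch for the snippet above; the random vectors and the sample indices are made up purely for illustration, and faiss must be installed.

import numpy as np

# hypothetical data: 1000 vectors of dimension 64 with three labelled samples
vectors = np.random.rand(1000, 64).astype(np.float32)
sample_idcs = np.array([3, 17, 42])

# the 5 nearest neighbors over all sample queries, excluding the samples themselves
neighbors, scores = listed_k_nearest_neighbors(vectors, sample_idcs, k=5)
print(neighbors)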
Example #2
def mutual_k_nearest_neighbors(vectors, sample_idcs, negative_idcs=None, k=1):
    print('Find high dimensional neighbors...')
    start = time.time()
    index = faiss.IndexFlatL2(vectors.shape[1])  # build the index
    index.add(vectors.astype(np.float32))  # add vectors to the index
    dist, idx = index.search(vectors[sample_idcs].astype(np.float32),
                             len(vectors))

    # the neighbor score for each sample is computed as the sum of distances to all the samples
    scores = np.zeros(len(vectors))
    for i in range(len(vectors)):
        mask = idx == i
        scores[i] = dist[mask].sum()
    neighbors = np.argsort(scores)
    neighbors = neighbors[~np.isin(neighbors, sample_idcs)][:k]

    stop = time.time()
    print('Done. ({}min {}s)'.format(
        int((stop - start) // 60), (stop - start) % 60))
    return neighbors, scores[neighbors]
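
The per-index scoring loop above can also be expressed with np.add.at; a small self-contained sketch of the equivalent vectorized form, using a tiny made-up search result in place of the arrays returned by index.search:

import numpy as np

# tiny made-up search result: 2 query rows over 5 database vectors
idx = np.array([[0, 3, 1, 4, 2], [3, 0, 2, 1, 4]])
dist = np.array([[0.0, 0.2, 0.5, 0.7, 0.9], [0.0, 0.1, 0.4, 0.6, 0.8]])

scores = np.zeros(5)
np.add.at(scores, idx.ravel(), dist.ravel())  # accumulate each distance at its neighbor index
print(scores)  # same result as the per-index loop in the function above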
Example #3
def get_neighborhood(position, idx_modified):
    """Use faiss to compute neighborhood of modified samples."""
    if len(idx_modified) == 0:
        return []
    print('Infer neighborhood...')
    start = time.time()

    index = faiss.IndexFlatL2(position.shape[1])  # build the index
    index.add(position.astype(np.float32))  # add vectors to the index
    # define nearest neighbors wrt cluster center
    center = np.mean(position[idx_modified], axis=0).reshape(1, -1)
    dist, idx = index.search(center.astype(np.float32), len(position))
    in_modified = np.isin(idx, idx_modified)
    max_dist = 1.1 * np.max(dist[in_modified])
    neighbors = idx[(dist <= max_dist) & ~in_modified]

    stop = time.time()
    print('Done. ({}min {}s)'.format(
        int((stop - start) // 60), (stop - start) % 60))

    return neighbors
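
A usage sketch for the snippet above with made-up data. Note that IndexFlatL2 returns squared L2 distances, so the 1.1 factor here widens the squared radius rather than the radius itself.

import numpy as np

# hypothetical 2-D layout in which a small cluster of points was modified
position = np.random.rand(500, 2).astype(np.float32)
idx_modified = np.array([10, 11, 12, 40])

neighbors = get_neighborhood(position, idx_modified)
print('{} unmodified points lie within the widened radius'.format(len(neighbors)))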
Example #4
def get_neighborhood(data, sample_idcs, buffer=0., use_faiss=True):
    """Determine all data points within the sphere in data space defined by the samples.
    Args:
        data (np.ndarray): NxD array containing N D-dimensional data vectors
        sample_idcs (iterable ints): indices of the data points that define the sphere
        buffer (optional, float): fraction of radius which to additionally include in sphere
        use_faiss (optional, bool): whether to use faiss library for distance calculation
        """
    # get center of samples
    center = np.mean(data[sample_idcs], axis=0, keepdims=True)

    if use_faiss:
        index = faiss.IndexFlatL2(data.shape[1])  # build the index
        index.add(data.astype('float32'))  # add vectors to the index
        distances, indices = index.search(center.astype('float32'), len(data))
        # faiss returns squared distances
        distances, indices = np.sqrt(distances[0]), indices[0]

        radius = max(
            [d for d, i in zip(distances, indices) if i in sample_idcs])
        radius += buffer * radius

        local_idcs = []
        for d, i in zip(distances, indices):
            if d > radius:
                break
            local_idcs.append(i)
        local_idcs = np.array(local_idcs)

    else:
        # euclidean (e.g. scipy.spatial.distance.euclidean) expects 1-D vectors, hence center[0]
        distances = np.array([euclidean(d, center[0]) for d in data])

        radius = max(distances[sample_idcs])
        radius += buffer * radius

        local_idcs = np.where(distances <= radius)[0]

    return local_idcs, center, radius
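
A sketch of how the function above might be called, comparing the two branches on the same made-up data; scipy's euclidean and faiss are assumed to be available.

import numpy as np

# hypothetical data: 2000 points in 32 dimensions, three of them defining the sphere
data = np.random.rand(2000, 32)
sample_idcs = np.array([5, 123, 987])

idcs_faiss, center, radius = get_neighborhood(data, sample_idcs,
                                              buffer=0.05, use_faiss=True)
idcs_plain, _, _ = get_neighborhood(data, sample_idcs,
                                    buffer=0.05, use_faiss=False)
print(len(idcs_faiss), len(idcs_plain))  # both branches should select (nearly) the same points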
Example #5
def score_k_nearest_neighbors(vectors, sample_idcs, negative_idcs=None, k=1):
    print('Find high dimensional neighbors...')
    start = time.time()
    index = faiss.IndexFlatL2(vectors.shape[1])  # build the index
    index.add(vectors.astype(np.float32))  # add vectors to the index
    dist, idx = index.search(vectors[sample_idcs].astype(np.float32),
                             len(vectors))

    mask = ~np.isin(idx, sample_idcs)
    idx = idx[mask].reshape(len(idx), -1)

    # get the neighbor score for each sample
    # the neighbor score for each sample is computed as the sum of the column numbers in which it appears
    scores = np.zeros(len(vectors), dtype=np.longlong)
    for col in range(idx.shape[1]):
        scores[idx[:, col]] += col
    neighbors = np.argsort(scores)[len(sample_idcs) + 1:len(sample_idcs) + 1 +
                                   k]

    stop = time.time()
    print('Done. ({}min {}s)'.format(
        int((stop - start) // 60), (stop - start) % 60))
    return neighbors, scores[neighbors]
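
A short usage sketch for the rank-based variant above, with made-up data; a smaller summed column rank means a point appears consistently early in every sample's neighbor list.

import numpy as np

# hypothetical feature matrix with three query samples
vectors = np.random.rand(1000, 128)
sample_idcs = np.array([0, 1, 2])

neighbors, scores = score_k_nearest_neighbors(vectors, sample_idcs, k=10)
print(neighbors)  # 10 indices ranked by summed column rank (samples themselves skipped)
print(scores)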
Example #6
def generate_triplets(positives,
                      negatives,
                      N,
                      n_pos_pa=1,
                      n_neg_pp=1,
                      seed=123,
                      consider_neighborhood=False,
                      embedding=None,
                      n_nn_neg_pp=1):
    np.random.seed(seed)
    neighbor_sampling = consider_neighborhood and embedding is not None
    if neighbor_sampling:
        assert np.concatenate([
            positives, negatives
        ]).max() < len(embedding), 'sample index out of embedding shape'

    n_pos_pa = min(n_pos_pa, len(positives) - 1)
    n_neg_pp = min(n_neg_pp, len(negatives) - 1)
    if n_pos_pa <= 0 or n_neg_pp <= 0:
        return np.array([], dtype=int)
    N_anchors = min(int(N * 1.0 / (n_neg_pp * n_pos_pa)), len(positives))
    N_tot = N_anchors * n_pos_pa * n_neg_pp

    if N != N_tot:
        warnings.warn(
            'Too few data to generate {} triplets. Instead generate {} triplets using:\n'
            '{} anchors, {} positives per anchor, {} negatives per positive'.
            format(N, N_tot, N_anchors, n_pos_pa, n_neg_pp), RuntimeWarning)
        N = N_tot

    triplets = np.empty((N, 3), dtype=np.int64)

    anchors = np.random.choice(positives, N_anchors, replace=False)
    if neighbor_sampling:
        # get the embedding neighbors for the anchors
        index = faiss.IndexFlatL2(embedding.shape[1])  # build the index
        index.add(embedding.astype('float32'))  # add vectors to the index
        _, neighbors = index.search(embedding[anchors].astype('float32'),
                                    len(embedding))

    for i, a in enumerate(anchors):
        pos = np.random.choice(np.delete(positives,
                                         np.where(positives == a)[0][0]),
                               n_pos_pa,
                               replace=False)

        if neighbor_sampling:  # get the nearest negatives
            nn_negatives = np.array(
                [nghbr for nghbr in neighbors[i] if nghbr in negatives])
            n_neg_neighbors = min(
                len(nn_negatives) - 1, n_pos_pa * n_nn_neg_pp)
            nn_negatives = nn_negatives[:n_neg_neighbors]
            outer_negatives = np.array(
                [n for n in negatives if n not in nn_negatives])
            n_outer_neg_pp = min(n_neg_pp - n_nn_neg_pp,
                                 len(outer_negatives) - 1)

            if n_outer_neg_pp + n_nn_neg_pp != n_neg_pp:
                n_nn_neg_pp = n_neg_pp - n_outer_neg_pp
                warnings.warn(
                    'cannot generate {} negatives. Use {} negatives from neighborhood '
                    'and {} from outside.'.format(n_neg_pp, n_nn_neg_pp,
                                                  n_outer_neg_pp))

        for j, p in enumerate(pos):
            if neighbor_sampling:
                nn_neg = np.random.choice(nn_negatives,
                                          n_nn_neg_pp,
                                          replace=False)
                neg = np.random.choice(outer_negatives,
                                       n_outer_neg_pp,
                                       replace=False)
                neg = np.concatenate([nn_neg, neg])
            else:
                neg = np.random.choice(negatives, n_neg_pp, replace=False)
            t = np.stack([np.repeat(a, n_neg_pp),
                          np.repeat(p, n_neg_pp), neg],
                         axis=1)
            i_start = (i * n_pos_pa + j) * n_neg_pp
            triplets[i_start:i_start + n_neg_pp] = t

    return triplets
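
A usage sketch for the triplet generator above with made-up index sets; numpy, warnings and (for the neighborhood branch) faiss are assumed to be imported at module level.

import numpy as np

# hypothetical labelled indices into a 2-D embedding of 1000 points
positives = np.arange(0, 50)
negatives = np.arange(50, 200)
embedding = np.random.rand(1000, 2)

triplets = generate_triplets(positives, negatives, N=100,
                             n_pos_pa=2, n_neg_pp=2,
                             consider_neighborhood=True,
                             embedding=embedding,
                             n_nn_neg_pp=1)
print(triplets.shape)  # (100, 3): one (anchor, positive, negative) row per triplet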
Example #7
def create_graph(names, positions, label=None, labels=None):
    """Compute nearest neighbor graph with inverse distances used as edge weights ('link strength')."""
    global features, n_neighbors
    print('Compute nearest neighbor graph...')
    tic = time()

    def create_links(neighbors, distances, strength_range=(0, 1)):
        """Computes links between data points based on nearest neighbor distance."""
        # depending on nn layout remove samples themselves from nn list
        if neighbors[0, 0] == 0:  # first column contains sample itself
            neighbors = neighbors[:, 1:]
            distances = distances[:, 1:]

        # normalize quadratic distances to strength_range to use them as link strength
        a, b = strength_range
        dmin = distances.min()**2
        dmax = distances.max()**2
        distances = (b - a) * (distances**2 - dmin) / (dmax - dmin) + a

        links = {}
        for i in range(len(neighbors)):
            links[i] = {}
            for n, d in zip(neighbors[i], distances[i]):
                # # prevent double linking      # allow double linking!
                # if n < i:
                #     if i in neighbors[n]:
                #         continue
                links[i][n] = float(d)
        return links

    # compute nearest neighbors and distances with faiss library
    index = faiss.IndexFlatL2(features.shape[1])  # build the index
    index.add(np.stack(features).astype('float32'))  # add vectors to the index
    # search n_neighbors + 1 because each sample is returned as its own nearest neighbor
    knn_distances, knn_indices = index.search(
        np.stack(features).astype('float32'), n_neighbors + 1)

    # get links between the nodes
    # invert strength range because distances are used as measure and we want distant points to be weakly linked
    links = create_links(knn_indices[:, :n_neighbors + 1],
                         knn_distances[:, :n_neighbors + 1],
                         strength_range=(1, 0))

    if label is None:
        label = [None] * len(names)

    if labels is None:
        labels = {0: [None] * len(names)}
    elif not isinstance(labels, dict):
        labels = {0: labels}

    nodes = {}
    for i, (name, (x, y)) in enumerate(zip(names, positions)):
        multi_label = [l[i] for l in labels.values()]
        nodes[i] = {
            'name': name,
            'label': str(label[i]),
            'labels': multi_label,
            'x': x,
            'y': y,
            'links': links[i]
        }

    toc = time()
    print('Done. ({:2.0f}min {:2.1f}s)'.format((toc - tic) // 60,
                                               (toc - tic) % 60))
    return nodes, labels.keys()
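
A usage sketch for the graph builder above. create_graph reads the module-level globals features and n_neighbors, and from time import time, faiss and numpy are assumed to be imported; all data below is made up.

import numpy as np

# hypothetical module-level globals that create_graph expects
features = np.random.rand(200, 64).astype('float32')
n_neighbors = 5

names = ['img_{:03d}.png'.format(i) for i in range(200)]
positions = np.random.rand(200, 2)
class_labels = np.random.randint(0, 3, size=200)

nodes, label_keys = create_graph(names, positions, labels=class_labels)
print(nodes[0]['name'], nodes[0]['links'])  # each node links to its 5 nearest neighbors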
Example #8
def evaluate(ground_truth,
             plot_decision_boundary=True,
             plot_GTE=True,
             compute_GTE=True,
             eval_local=True):
    global reset, gte_fig, db_fig
    with open('_svm_prediction.pkl', 'rb') as f:
        svm_data = pickle.load(f)
        predictions = svm_data['labels']
        distances = svm_data['distance']
        local_idcs = svm_data['local_indices'] if eval_local else np.arange(
            len(predictions)).astype(int)
        train_idcs = np.concatenate([
            svm_data['idcs_positives_train'], svm_data['idcs_negatives_train']
        ])
        if compute_GTE:
            local_embedding = svm_data['local_embedding']
            local_triplets = svm_data['local_triplets']

    test_idcs = np.array([
        idx for idx in range(len(ground_truth))
        if (idx in local_idcs and idx not in train_idcs)
    ],
                         dtype=int)

    # evaluate precision, recall and true negative rate
    # training
    (tn_rate_train,
     prec_train), (_, recall_train), _, _ = precision_recall_fscore_support(
         ground_truth[train_idcs], predictions[train_idcs])
    (tn_rate_test,
     prec_test), (_, recall_test), _, _ = precision_recall_fscore_support(
         ground_truth[test_idcs], predictions[test_idcs])

    train_acc = np.sum(predictions[train_idcs] ==
                       ground_truth[train_idcs]) * 1.0 / len(train_idcs)
    test_acc = np.sum(predictions[test_idcs] ==
                      ground_truth[test_idcs]) * 1.0 / len(test_idcs)
    print('Train SVM: '
          '\n\taccuracy: {:2.1f}%'
          '\n\tprecision: {:.3f}'
          '\n\trecall: {:.3f}'
          '\n\ttn_rate: {:.3f}'
          '\nTest SVM: '
          '\n\taccuracy: {:2.1f}%'
          '\n\tprecision: {:.3f}'
          '\n\trecall: {:.3f}'
          '\n\ttn_rate: {:.3f}'.format(100 * train_acc, prec_train,
                                       recall_train, tn_rate_train,
                                       100 * test_acc, prec_test, recall_test,
                                       tn_rate_test))

    if plot_decision_boundary:
        if db_fig is None:
            db_fig, ax = plt.subplots(1, 3, figsize=(12, 4))
        ax = db_fig.axes
        for a in ax:
            a.clear()

        # Plot decision boundary accuracy
        ax[0].set_title('test')
        ax[0].plot(distances[test_idcs][predictions[test_idcs] ==
                                        ground_truth[test_idcs]],
                   np.zeros(
                       len(distances[test_idcs][predictions[test_idcs] ==
                                                ground_truth[test_idcs]])),
                   c='g',
                   linewidth=0.0,
                   marker='o',
                   alpha=0.1)
        ax[0].plot(
            distances[test_idcs][
                predictions[test_idcs] != ground_truth[test_idcs]],
            np.zeros(
                len(distances[test_idcs][
                    predictions[test_idcs] != ground_truth[test_idcs]])),
            c='r',
            linewidth=0.0,
            marker='o',
            alpha=0.1)

        # train
        ax[1].set_title('train')
        ax[1].plot(distances[train_idcs][predictions[train_idcs] ==
                                         ground_truth[train_idcs]],
                   np.zeros(
                       len(distances[train_idcs][predictions[train_idcs] ==
                                                 ground_truth[train_idcs]])),
                   c='g',
                   linewidth=0.0,
                   marker='o',
                   alpha=0.1)
        ax[1].plot(
            distances[train_idcs][
                predictions[train_idcs] != ground_truth[train_idcs]],
            np.zeros(
                len(distances[train_idcs][
                    predictions[train_idcs] != ground_truth[train_idcs]])),
            c='r',
            linewidth=0.0,
            marker='o',
            alpha=0.1)

        # all images
        ax[2].set_title('global')
        ax[2].plot(distances[predictions == ground_truth],
                   np.zeros(len(distances[predictions == ground_truth])),
                   c='g',
                   linewidth=0.0,
                   marker='o',
                   alpha=0.1)
        ax[2].plot(distances[predictions != ground_truth],
                   np.zeros(len(distances[predictions != ground_truth])),
                   c='r',
                   linewidth=0.0,
                   marker='o',
                   alpha=0.1)

        plt.pause(8)

    if compute_GTE:
        # evaluate triplet error in embedding
        positives = np.where(ground_truth[test_idcs] == 1)[0]
        negatives = np.where(ground_truth[test_idcs] == 0)[0]

        N_test_triplets = 200
        test_triplets = generate_triplets(positives,
                                          negatives,
                                          n_pos_pa=2,
                                          N=N_test_triplets,
                                          seed=234)
        for i, t in enumerate(test_triplets):
            if any([(t == lt).all() for lt in local_triplets]):
                print('duplicate')
                accept = False
                while not accept:
                    print('try new t')
                    t_new = generate_triplets(positives,
                                              negatives,
                                              N=1,
                                              seed=345 + i)[0]
                    if not (any([(t_new == lt).all() for lt in test_triplets])
                            or any([(t_new == lt).all()
                                    for lt in local_triplets])):
                        accept = True
                test_triplets[i] = t_new

        # evaluate GTE
        # compute distances in local embedding
        index = faiss.IndexFlatL2(local_embedding.shape[1])  # build the index
        index.add(np.stack(local_embedding).astype(
            'float32'))  # add vectors to the index
        knn_distances, knn_indices = index.search(
            np.stack(local_embedding).astype('float32'), len(local_embedding))

        GTE = 0.0
        for (a, p, n) in test_triplets:
            if knn_distances[a, p] >= knn_distances[a, n]:
                GTE += 1
        GTE = GTE / len(test_triplets)
    else:
        GTE = float('nan')

    # Write to evaluation file
    f = open('_eval.csv', 'w', newline='') if reset else open(
        '_eval.csv', 'a', newline='')
    outdict = {'n_labeled': len(train_idcs), 'test_acc': test_acc, 'GTE': GTE}
    writer = csv.DictWriter(f, fieldnames=outdict.keys())
    if reset:
        writer.writeheader()
    writer.writerow(outdict)
    f.close()

    if plot_GTE:
        # plot result
        n_labeled = []
        test_acc = []
        GTE = []
        with open('_eval.csv', 'r', newline='') as f:
            reader = csv.DictReader(f, fieldnames=outdict.keys())
            next(reader, None)  # skip the headers
            for row in reader:
                n_labeled.append(int(row['n_labeled']))
                test_acc.append(float(row['test_acc']))
                GTE.append(float(row['GTE']))

        if gte_fig is None:
            gte_fig, ax = plt.subplots(1, 2)
        ax = gte_fig.axes
        for a in ax:
            a.clear()
        ax[0].set_title('test acc')
        ax[0].plot(n_labeled, test_acc)
        ax[1].set_title('GTE')
        n_labeled = np.array(n_labeled)
        GTE = np.array(GTE)
        ax[1].plot(n_labeled[np.isfinite(GTE)], GTE[np.isfinite(GTE)])
        plt.pause(5)

    reset = False
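
A minimal driver sketch for the evaluation routine above. The module-level globals and the _svm_prediction.pkl file name follow from the code, but the pipeline that writes that pickle, as well as the pickle, csv, matplotlib, faiss and sklearn precision_recall_fscore_support imports, is assumed and not shown; the ground-truth file name is hypothetical.

import numpy as np

# module-level globals the function expects
reset = True
gte_fig = None
db_fig = None

# hypothetical ground-truth labels, aligned with the samples in _svm_prediction.pkl
ground_truth = np.load('ground_truth_labels.npy')

evaluate(ground_truth,
         plot_decision_boundary=False,
         plot_GTE=False,
         compute_GTE=False)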