Example #1
def analyze(q, x, ed):
    # Compare the given distances (ed) against L2 distances between q and x.
    l2 = l2_dist(q, x)

    from matplotlib import pyplot as plt

    # Scatter-plot a random sample of 1000 (ed, l2) pairs.
    idx = np.random.choice(np.size(ed), 1000)
    plt.scatter(ed.reshape(-1)[idx], l2.reshape(-1)[idx], color="r")
    plt.show()
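Every snippet on this page calls l2_dist, whose definition is not shown here and whose exact signature varies across these codebases (full matrices in some snippets, single vectors or scalars in others). For orientation only, a minimal NumPy sketch of a pairwise variant, under the assumption that it returns a distance matrix with one row per query point, might look like this (l2_dist_sketch is a hypothetical name):

import numpy as np

def l2_dist_sketch(a, b=None):
    # Hypothetical pairwise Euclidean distance between rows of a and b;
    # if b is omitted, distances are computed within a.
    if b is None:
        b = a
    a = np.atleast_2d(np.asarray(a, dtype=np.float64))
    b = np.atleast_2d(np.asarray(b, dtype=np.float64))
    # ||a_i - b_j||^2 = ||a_i||^2 + ||b_j||^2 - 2 * a_i . b_j
    sq = (a * a).sum(1)[:, None] + (b * b).sum(1)[None, :] - 2.0 * a @ b.T
    return np.sqrt(np.maximum(sq, 0.0))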
def run_kmeans(ds, qu, neigh, n_bins, n_clusters, height, ht2cutsz, opt):
    # Used when evaluating performance on the training set.
    swap_query_to_data = False
    if swap_query_to_data:
        qu = ds
        # Mask the diagonal so a point's nearest neighbor is not itself.
        dist = utils.l2_dist(ds)
        dist += 2 * torch.max(dist).item() * torch.eye(len(ds))
        val, neigh = torch.topk(dist, k=opt.k, dim=1, largest=False)
        neigh = neigh.numpy()

    # Pick the serialized-tree path for the current dataset and settings.
    if opt.sift:
        kmeans_path = os.path.join(data_dir, 'sift', 'sift_dsroot{}ht{}'.format(n_clusters, height))
    elif opt.glove:
        if opt.fast_kmeans:
            kmeans_path = os.path.join(data_dir, 'kmeans', 'fastkmeans_dsroot{}{}{}_{}'.format(n_clusters, km_method, max_loyd, height))
        else:
            kmeans_path = os.path.join(data_dir, 'kmeans', 'kmeans_dsroot{}{}{}_{}'.format(n_clusters, km_method, max_loyd, height))
    elif opt.glove_c:
        kmeans_path = os.path.join(data_dir, 'kmeans_glove_c', 'fastkmeans_dsroot{}{}{}_{}'.format(n_clusters, km_method, max_loyd, height))
    elif opt.sift_c:
        kmeans_path = os.path.join(data_dir, 'kmeans_sift_c', 'fastkmeans_dsroot{}{}{}_{}'.format(n_clusters, km_method, max_loyd, height))
    else:
        if opt.fast_kmeans:
            kmeans_path = os.path.join(data_dir, 'kmeans_mnist', 'fastkmeans_dsroot{}{}{}_{}'.format(n_clusters, km_method, max_loyd, height))
        else:
            kmeans_path = os.path.join(data_dir, 'kmeans_mnist', 'kmeans_dsroot{}{}{}_{}'.format(n_clusters, km_method, max_loyd, height))

    save_data = True
    if os.path.exists(kmeans_path) and not (opt.pca or opt.rp or opt.st):
        # Load a previously built tree.
        with open(kmeans_path, 'rb') as file:
            root = pickle.load(file)
    elif opt.cplsh and hasattr(opt, 'cplsh_root'):
        # The C++ object cannot be serialized, so reuse the in-memory root.
        root = opt.cplsh_root
    else:
        print("Building ...")
        d_idx = np.array(list(range(len(ds))))

        # Maps dataset element indices to bin indices.
        ds2bins = {}
        root = KNode(d_idx, ds, n_clusters, height, ds2bins, ht2cutsz, opt)
        if save_data:
            if opt.cplsh:
                opt.cplsh_root = root
            elif not (opt.rp or opt.pca or opt.st):
                with open(kmeans_path, "wb") as output:
                    pickle.dump(root, output)
                opt.saved_path = kmeans_path

    acc, probe, probe95 = check_res_single(root, qu, neigh, n_bins, root.ds2bins, opt)
    print('n_clusters: {} n_bins: {} height: {} acc: {} probe: {} probe95: {}'.format(n_clusters, n_bins, height, acc, probe, probe95))
    return acc, probe, probe95
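run_kmeans above, like Examples #6 and #8 below, builds ground-truth neighbors for a set matched against itself by adding a large constant to the diagonal of the distance matrix, so torch.topk(..., largest=False) never returns a point as its own nearest neighbor. A small self-contained illustration (torch.cdist stands in for utils.l2_dist, which is not shown on this page):

import torch

x = torch.randn(5, 3)                      # 5 points in R^3
dist = torch.cdist(x, x)                   # stand-in for utils.l2_dist(x)
# Push the diagonal above every off-diagonal entry so "self" is never selected.
dist += 2 * torch.max(dist).item() * torch.eye(len(x))
val, neigh = torch.topk(dist, k=2, dim=1, largest=False)
assert not (neigh == torch.arange(5).unsqueeze(1)).any()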
Example #3
def update_heap_from_cached_result(group_sample, heap, input_batch, k,
                                   cached_neuron_group_result):
    # Maintain a max-heap (via negated distances) of the k inputs whose cached
    # neuron-group activations are closest to the sample's activations.
    for input_id, real_id in enumerate(input_batch):
        neuron_group_result = cached_neuron_group_result[real_id]
        dist = l2_dist(neuron_group_result, group_sample)
        if len(heap) < k:
            heapq.heappush(heap, (-dist, real_id))
        elif (-dist, real_id) > heap[0]:
            heapq.heapreplace(heap, (-dist, real_id))
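The heap here (and in Example #9) stores (-distance, input_id) tuples, so Python's min-heap acts as a max-heap over distances: heap[0] is always the farthest of the k nearest inputs kept so far, and it is replaced whenever a closer input arrives. A tiny standalone illustration of the same pattern:

import heapq

def k_nearest(dists, k):
    # dists: iterable of (distance, item_id); keep the k smallest distances.
    heap = []
    for dist, item_id in dists:
        entry = (-dist, item_id)
        if len(heap) < k:
            heapq.heappush(heap, entry)
        elif entry > heap[0]:            # closer than the current farthest kept item
            heapq.heapreplace(heap, entry)
    return sorted((-d, i) for d, i in heap)

print(k_nearest([(3.0, 'a'), (1.0, 'b'), (2.0, 'c'), (0.5, 'd')], k=2))
# [(0.5, 'd'), (1.0, 'b')]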
Example #4
def get_partition_access_order_list(group_sample, n_partitions, neuron_group,
                                    lower_bounds, partition_pointer_list):
    partition_access_order_list = list()
    for neuron_id in range(len(neuron_group.neuron_idx_list)):
        if partition_pointer_list[neuron_id] is None:
            # No pointer for this neuron: visit partitions 1..n_partitions-1 in order.
            partition_access_order_list.append(
                [i for i in range(1, n_partitions)])
        else:
            # Start from the partition containing the sample and expand outward,
            # always taking the side whose lower bound is closer to the sample.
            partition_access_order_list.append(
                [partition_pointer_list[neuron_id][0]])
            while True:
                pointer_dec = -1
                pointer_inc = -1
                if partition_pointer_list[neuron_id][0] - 1 >= 0:
                    pointer_dec = partition_pointer_list[neuron_id][0] - 1
                if partition_pointer_list[neuron_id][1] + 1 < n_partitions:
                    pointer_inc = partition_pointer_list[neuron_id][1] + 1

                if pointer_dec == -1 and pointer_inc == -1:
                    break
                else:
                    if pointer_dec == -1:
                        partition_access_order_list[neuron_id].append(
                            pointer_inc)
                        partition_pointer_list[neuron_id][1] += 1
                    elif pointer_inc == -1:
                        partition_access_order_list[neuron_id].append(
                            pointer_dec)
                        partition_pointer_list[neuron_id][0] -= 1
                    else:
                        if l2_dist(lower_bounds[neuron_id][pointer_dec], group_sample[neuron_id]) \
                                <= l2_dist(lower_bounds[neuron_id][pointer_inc], group_sample[neuron_id]):
                            partition_access_order_list[neuron_id].append(
                                pointer_dec)
                            partition_pointer_list[neuron_id][0] -= 1
                        else:
                            partition_access_order_list[neuron_id].append(
                                pointer_inc)
                            partition_pointer_list[neuron_id][1] += 1
    return partition_access_order_list
Example #5
def get_access_order(neuron_group, group_sample, n_inputs_in_partition_0,
                     activations_with_idx_list, pointer_list):
    access_order_list = list()
    boundary_with_highest_activation_reached = [False] * len(
        neuron_group.neuron_idx_list)

    for neuron_id, activations_with_idx in enumerate(
            activations_with_idx_list):
        if pointer_list[neuron_id] is None:
            # The sample is not in partition 0 for this neuron; nothing to order.
            access_order_list.append(None)
            continue
        access_order_list.append(list())

        for round_cnt in range(n_inputs_in_partition_0):
            if pointer_list[neuron_id][0] - 1 >= 0:
                pointer_dec = pointer_list[neuron_id][0] - 1
            else:
                pointer_dec = pointer_list[neuron_id][0]

            if pointer_list[neuron_id][1] + 1 < n_inputs_in_partition_0:
                pointer_inc = pointer_list[neuron_id][1] + 1
            else:
                pointer_inc = pointer_list[neuron_id][1]

            if boundary_with_highest_activation_reached[neuron_id] \
                    or l2_dist(activations_with_idx[pointer_dec][0], group_sample[neuron_id]) \
                    <= l2_dist(activations_with_idx[pointer_inc][0], group_sample[neuron_id]):
                access_order_list[neuron_id].append(pointer_dec)
                if pointer_list[neuron_id][0] - 1 >= 0:
                    pointer_list[neuron_id][0] -= 1
            else:
                access_order_list[neuron_id].append(pointer_inc)
                if pointer_list[neuron_id][1] + 1 < n_inputs_in_partition_0:
                    pointer_list[neuron_id][1] += 1
                else:
                    boundary_with_highest_activation_reached[neuron_id] = True

    return access_order_list
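get_access_order above and get_partition_access_order_list in Example #4 share the same idea: per-neuron values are pre-sorted, a two-ended pointer brackets what has already been visited, and each step expands toward whichever side is closer to the sample's activation. Stripped of the bookkeeping, the ordering can be sketched as follows (nearest_first_order is a hypothetical simplification, not a function from the repository):

def nearest_first_order(sorted_vals, start):
    # Yield indices of sorted_vals in order of increasing distance to sorted_vals[start],
    # expanding a [lo, hi] bracket toward the nearer neighboring value each step.
    lo = hi = start
    yield start
    target = sorted_vals[start]
    while lo > 0 or hi < len(sorted_vals) - 1:
        left_gap = target - sorted_vals[lo - 1] if lo > 0 else float('inf')
        right_gap = sorted_vals[hi + 1] - target if hi < len(sorted_vals) - 1 else float('inf')
        if left_gap <= right_gap:
            lo -= 1
            yield lo
        else:
            hi += 1
            yield hi

print(list(nearest_first_order([1, 4, 6, 7, 10], start=2)))
# [2, 3, 1, 4, 0] -> values 6, 7, 4, 10, 1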
Example #6
    
    opt = utils.parse_args()

    # Debug toggle: the else branch would instead compute ground-truth
    # neighbors on the training set.
    if True:
        if opt.glove:
            queryset = utils.load_glove_data('query').to(utils.device)
            neighbors = utils.load_glove_data('answers').to(utils.device)
        elif opt.sift:
            queryset = utils.load_sift_data('query').to(utils.device)
            neighbors = utils.load_sift_data('answers').to(utils.device)
        else:
            queryset = utils.load_data('query').to(utils.device)
            neighbors = utils.load_data('answers').to(utils.device)
    else:
        queryset = utils.load_data('train').to(utils.device)
        dist = utils.l2_dist(queryset)
        # Mask the diagonal so a point is never its own nearest neighbor.
        dist += 2 * torch.max(dist).item() * torch.eye(len(dist))
        val, neighbors = torch.topk(dist, k=opt.k, dim=1, largest=False)

    # Debug toggle: compute query-to-train neighbors on the fly instead of
    # loading precomputed answers.
    if False:
        trainset = utils.load_data('train').to(utils.device)
        dist = utils.l2_dist(queryset, trainset)
        val, neighbors = torch.topk(dist, k=opt.k, dim=1, largest=False)

    height = 1
    # Alternative sweep ranges; only the last assignment takes effect.
    n_bins_l = list(range(1, 45, 2))
    n_bins_l = list(range(1, 100))
    n_bins_l = list(range(1, 10, 2))
    n_clusters_l = [64]
Example #7
def ann(xq, xb, xt, query_dist, train_dist, args):
    # analyze(xt, xt, train_dist)
    # analyze(xq, xb, query_dist)
    bias = 0.0
    # Default scale sweep; the ranges below are hand-tuned per dataset and embedding.
    scales = 2.0**(np.arange(-10, 20))
    if args.dataset == "gen50ks.txt" and args.embed == 'cnn':
        scales = np.linspace(0.01, 2.0, num=50)
    if args.dataset == "gen50ks.txt" and args.embed == 'gru':
        scales = np.linspace(0, 4.0, num=50)
    if args.dataset == "trec" and args.embed == 'cnn':
        scales = np.linspace(0, 2.0, num=50)
    if args.dataset == "trec" and args.embed == 'gru':
        scales = np.linspace(2.5, 3.1, num=50)
        bias = 60
    if args.dataset == "enron" and args.embed == 'cnn':
        scales = np.linspace(0., 1.0, num=50)
    if args.dataset == "enron" and args.embed == 'gru':
        scales = np.linspace(0., 2.000, num=50)
    if args.dataset == "dblp" and args.embed == 'cnn':
        scales = np.linspace(0.1, 2.0, num=50)
    if args.dataset == "dblp" and args.embed == 'gru':
        scales = np.linspace(0.5, 1.6, num=50)
    if args.dataset == "uniref" and args.embed == 'cnn':
        scales = np.linspace(0.5, 4.0, num=50)
    if args.dataset == "uniref" and args.embed == 'gru':
        scales = np.linspace(0., 1.4, num=50)

    print(scales)
    thresholds = [
        1, 5, 10, 15, 20, 25, 50, 75, 100, 125, 150, 300, 500, 800, 1000, 2000
    ]
    # Distances in the embedding (L2) space, plus a fitted map from thresholds
    # in the original metric to L2 thresholds.
    train_dist_l2 = l2_dist(xt, xt)
    query_dist_l2 = l2_dist(xq, xb)
    threshold2dist = linear_fit(train_dist, train_dist_l2)
    print("thres\t l2thres\t", end='')
    for scale in scales:
        print("%2.3f\t" % scale, end='')
    print()

    # Recall: how many true neighbors (within threshold) fall inside the scaled L2 radius.
    for threshold in thresholds:
        gt = [np.argwhere(dist <= threshold) for dist in query_dist]
        threshold_l2 = threshold2dist(threshold)
        print("%6d\t %.6f\t" % (threshold, threshold_l2), end='')
        for scale in scales:
            items = [
                np.argwhere(dist <= bias + threshold_l2 * scale)
                for dist in query_dist_l2
            ]
            recall = np.mean([
                len(np.intersect1d(i, j)) / len(i) for i, j in zip(gt, items)
                if len(i) > 0
            ])
            print("%.3f\t" % (recall), end='')
        print()
    # Precision: how many retrieved items (within the scaled L2 radius) are true neighbors.
    for threshold in thresholds:
        gt = [np.argwhere(dist <= threshold) for dist in query_dist]
        threshold_l2 = threshold2dist(threshold)
        print("%6d\t %.6f\t" % (threshold, threshold_l2), end='')
        for scale in scales:
            items = [
                np.argwhere(dist <= threshold_l2 * scale)
                for dist in query_dist_l2
            ]
            precs = np.mean([
                len(np.intersect1d(i, j)) / len(j) if len(j) > 0 else 0
                for i, j in zip(gt, items) if len(i) > 0
            ])
            print("%.3f\t" % (precs), end='')
        print()
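linear_fit is not shown in this snippet; from its use above, it takes paired training distances (the original metric and the L2 embedding distance) and returns a callable mapping a threshold in the original metric to an L2 threshold. A plausible degree-1 least-squares sketch, assuming NumPy arrays (linear_fit_sketch is a hypothetical stand-in, not the repository's implementation):

import numpy as np

def linear_fit_sketch(train_dist, train_dist_l2):
    # Fit l2 as approximately slope * dist + intercept over all training pairs.
    x = np.asarray(train_dist, dtype=np.float64).reshape(-1)
    y = np.asarray(train_dist_l2, dtype=np.float64).reshape(-1)
    slope, intercept = np.polyfit(x, y, deg=1)
    return lambda threshold: slope * threshold + intercept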
Example #8
    train_node = train.TrainNode(-1, opt, -1)

    if opt.glove:
        dataset = utils.load_glove_data('train').to(utils.device)
        queryset = utils.load_glove_data('query').to(utils.device)
        neighbors = utils.load_glove_data('answers').to(utils.device)
    else:
        dataset = utils.load_data('train').to(utils.device)
        queryset = utils.load_data('query').to(utils.device)
        neighbors = utils.load_data('answers').to(utils.device)

    # Debug toggle: evaluate on the training set itself, recomputing neighbors
    # with the diagonal masked so a point is never its own nearest neighbor.
    if False:
        queryset = dataset
        dist = utils.l2_dist(dataset)
        dist += 2 * torch.max(dist).item() * torch.eye(len(dist))
        val, neighbors = torch.topk(dist, k=opt.k, dim=1, largest=False)

    # Debug toggle: recompute neighbors within the query set.
    if False:
        dist = utils.l2_dist(queryset)
        dist += 2 * torch.max(dist).item() * torch.eye(len(dist))
        val, neighbors = torch.topk(dist, k=opt.k, dim=1, largest=False)

    #dsnode_path = opt.dsnode_path + str(opt.n_clusters)
    #print('dsnode path {}'.format(dsnode_path))
    #dsnode = utils.pickle_load(dsnode_path)
    #print('dsnode {}'.format(dsnode))
Example #9
def answer_query_with_guarantee(model,
                                dataset,
                                act,
                                idx_act,
                                bit_arr,
                                idx_of_idx,
                                par_low_bound,
                                input_sample_id,
                                neuron_group,
                                k,
                                n_partitions,
                                bits_per_input,
                                BATCH_SIZE,
                                batch_size,
                                where=None):
    layer_id = neuron_group.layer_id
    group_sample = get_group_sample(dataset, input_sample_id, layer_id, model,
                                    neuron_group)

    n_inputs = len(dataset)
    n_inputs_rerun = 1
    # Cache group activations per input; the query sample itself seeds both the
    # cache and the result heap (at distance 0).
    group_activation_cached = [None] * dataset.shape[0]
    group_activation_cached[input_sample_id] = group_sample
    heap = [(0.0, input_sample_id)]

    activations_with_idx_list, pointer_list = initialize_activations_and_pointers_for_phase_one(
        idx_of_idx, input_sample_id, group_sample, neuron_group, act, idx_act)
    is_sample_in_partition_0 = [
        pointer is not None for pointer in pointer_list
    ]
    n_inputs_in_partition_0 = len(activations_with_idx_list[0])

    access_order_list = get_access_order(neuron_group, group_sample,
                                         n_inputs_in_partition_0,
                                         activations_with_idx_list,
                                         pointer_list)

    print(
        f"input {input_sample_id}, size of neuron group {len(neuron_group.neuron_idx_list)}"
    )

    exit_msg = None
    input_batch = set()
    ta_exited = False

    for round_cnt in range(n_inputs_in_partition_0):
        round_activations_with_idx = list()
        for neuron_id, activations_with_idx in enumerate(
                activations_with_idx_list):
            if access_order_list[neuron_id] is None:
                round_activations_with_idx.append(None)
            else:
                round_activations_with_idx.append(activations_with_idx[
                    access_order_list[neuron_id][round_cnt]])

        for item in round_activations_with_idx:
            if item is None:
                continue
            activation, input_idx = item
            if group_activation_cached[input_idx] is None:
                if where is not None and not where(input_idx):
                    continue
                input_batch.add(input_idx)

        if len(input_batch) >= batch_size \
                or n_inputs_rerun + len(input_batch) == dataset.shape[0] \
                or round_cnt + 1 == n_inputs_in_partition_0:
            if len(input_batch) == 0:
                break
            run_nn_and_update_things(dataset, group_activation_cached,
                                     group_sample, heap, input_batch, k,
                                     layer_id, model, neuron_group, BATCH_SIZE)
            n_inputs_rerun += len(input_batch)
            input_batch = set()

        if len(input_batch) == 0 and len(heap) == k:
            round_activations = list()
            for round_activation_id, item in enumerate(
                    round_activations_with_idx):
                if item is None:
                    round_activations.append(group_sample[round_activation_id])
                    continue
                activation, input_idx = item
                round_activations.append(activation)
            round_activations = np.array(round_activations).reshape(
                group_sample.shape)
            threshold = l2_dist(round_activations, group_sample)

            # TA-style stopping rule: heap[0] holds the negated k-th nearest
            # distance found so far, so the comparison succeeds once that
            # distance is already below the lower bound for unseen inputs.
            if heap[0] > (-threshold, n_inputs_in_partition_0):
                ta_exited = True
                break

    if ta_exited:
        return heap, exit_msg, is_sample_in_partition_0, n_inputs_rerun

    partitions_of_input = unpack_bits_and_get_input_partitions(
        idx_of_idx, neuron_group, bit_arr)

    input_batch, n_inputs_rerun = deal_with_remaining_inputs_in_partition_0(
        dataset, group_activation_cached, group_sample, heap, input_batch, k,
        layer_id, model, n_inputs_rerun, neuron_group, partitions_of_input,
        pointer_list, bits_per_input, BATCH_SIZE, where)

    bound_list, partition_pointer_list = initialize_bounds_and_pointers_for_phase_two(
        activations_with_idx_list, input_sample_id, neuron_group,
        partitions_of_input, pointer_list, bits_per_input)

    lower_bound_of_partitions = get_lower_bound_of_partitions(
        idx_of_idx, neuron_group, par_low_bound)
    partition_access_order_list = get_partition_access_order_list(
        group_sample, n_partitions, neuron_group, lower_bound_of_partitions,
        partition_pointer_list)

    round_cnt = 0
    row_cnt = 0
    boundary_partition_processed = [
        [False, False] for idx in range(len(neuron_group.neuron_idx_list))
    ]
    for neuron_id in range(len(neuron_group.neuron_idx_list)):
        if pointer_list[neuron_id] is not None:
            boundary_partition_processed[neuron_id][0] = True
    while n_inputs_rerun < dataset.shape[0]:

        inputs_for_neuron_list = list()
        for neuron_id, partition_of_input in enumerate(partitions_of_input):
            if round_cnt >= len(partition_access_order_list[neuron_id]):
                continue
            inputs_for_current_neuron = get_input_ids_by_partition_id(
                partition_of_input,
                partition_access_order_list[neuron_id][round_cnt],
                bits_per_input, n_inputs)
            inputs_for_neuron_list.append(inputs_for_current_neuron)
            add_inputs_to_batch(input_batch, inputs_for_current_neuron,
                                group_activation_cached, where)

        row_cnt += (n_inputs - n_inputs_in_partition_0) // (n_partitions - 1)
        if len(input_batch) > 0:
            run_nn_and_update_things(dataset, group_activation_cached,
                                     group_sample, heap, input_batch, k,
                                     layer_id, model, neuron_group, BATCH_SIZE)
            n_inputs_rerun += len(input_batch)
            input_batch = set()

        for neuron_id in range(len(neuron_group.neuron_idx_list)):
            if partition_access_order_list[neuron_id][round_cnt] == 0 or (
                    n_inputs_in_partition_0 == 0 and
                    partition_access_order_list[neuron_id][round_cnt] == 1):
                boundary_partition_processed[neuron_id][0] = True
            if partition_access_order_list[neuron_id][
                    round_cnt] == n_partitions - 1:
                boundary_partition_processed[neuron_id][1] = True

        for idx in range(len(neuron_group.neuron_idx_list)):
            for input_id in inputs_for_neuron_list[idx]:
                if bound_list[idx] is None:
                    bound_list[idx] = [
                        group_activation_cached[input_id][idx],
                        group_activation_cached[input_id][idx]
                    ]
                else:
                    bound_list[idx][0] = min(
                        bound_list[idx][0],
                        group_activation_cached[input_id][idx])
                    bound_list[idx][1] = max(
                        bound_list[idx][1],
                        group_activation_cached[input_id][idx])

        if len(heap) == k:
            round_activations = np.array(group_sample)
            for idx in range(len(neuron_group.neuron_idx_list)):
                if boundary_partition_processed[idx][
                        0] and not boundary_partition_processed[idx][1]:
                    round_activations[idx] = bound_list[idx][0]
                elif boundary_partition_processed[idx][
                        1] and not boundary_partition_processed[idx][0]:
                    round_activations[idx] = bound_list[idx][1]
                elif pointer_list[idx] is None:
                    if l2_dist(bound_list[idx][0],
                               group_sample[idx]) < l2_dist(
                                   bound_list[idx][1], group_sample[idx]):
                        round_activations[idx] = bound_list[idx][0]
                    else:
                        round_activations[idx] = bound_list[idx][1]
                else:
                    round_activations[idx] = bound_list[idx][0]

            threshold = l2_dist(round_activations, group_sample)

            # Same TA-style stopping rule as in phase one.
            if heap[0] > (-threshold, n_inputs_in_partition_0):
                ta_exited = True
                break

        round_cnt += 1

    if ta_exited:
        exit_msg = f"termination: phase 2, round {round_cnt}; inputs re-run: {n_inputs_rerun}"
    else:
        exit_msg = f"termination: none; inputs re-run: {n_inputs_rerun}"

    return heap, exit_msg, is_sample_in_partition_0, n_inputs_rerun
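The early-exit test heap[0] > (-threshold, ...) used in both phases relies on Python's tuple ordering: heap[0] holds the negated distance of the current k-th nearest input, so the comparison succeeds exactly when that distance is already smaller than the lower bound threshold that any not-yet-examined input could achieve. The condition in isolation:

import heapq

# Heap of (-distance, input_id): heap[0] corresponds to the largest kept distance.
heap = [(-0.8, 17), (-0.5, 3), (-0.2, 42)]   # distances 0.8, 0.5, 0.2 with k = 3
heapq.heapify(heap)

threshold = 1.0   # lower bound on the distance of every input not examined yet
# -0.8 > -1.0: the farthest kept input (0.8) already beats the bound, so stop early.
assert heap[0] > (-threshold, 0)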