def analyze(q, x, ed):
    # Scatter-plot a random sample of embedding distances against the true L2 distances.
    from matplotlib import pyplot as plt
    l2 = l2_dist(q, x)
    idx = np.random.choice(np.size(ed), 1000)
    plt.scatter(ed.reshape(-1)[idx], l2.reshape(-1)[idx], color="r")
    plt.show()
def run_kmeans(ds, qu, neigh, n_bins, n_clusters, height, ht2cutsz, opt):
    # Used when evaluating performance on the training set.
    swap_query_to_data = False
    if swap_query_to_data:
        qu = ds
        # Exclude self-matches so a point's nearest neighbor is not itself.
        dist = utils.l2_dist(ds)
        dist += 2 * torch.max(dist).item() * torch.eye(len(ds))
        val, neigh = torch.topk(dist, k=opt.k, dim=1, largest=False)
        neigh = neigh.numpy()

    if opt.sift:
        kmeans_path = os.path.join(data_dir, 'sift', 'sift_dsroot{}ht{}'.format(n_clusters, height))
    elif opt.glove:
        if opt.fast_kmeans:
            kmeans_path = os.path.join(data_dir, 'kmeans', 'fastkmeans_dsroot{}{}{}_{}'.format(n_clusters, km_method, max_loyd, height))
        else:
            kmeans_path = os.path.join(data_dir, 'kmeans', 'kmeans_dsroot{}{}{}_{}'.format(n_clusters, km_method, max_loyd, height))
    elif opt.glove_c:
        kmeans_path = os.path.join(data_dir, 'kmeans_glove_c', 'fastkmeans_dsroot{}{}{}_{}'.format(n_clusters, km_method, max_loyd, height))
    elif opt.sift_c:
        kmeans_path = os.path.join(data_dir, 'kmeans_sift_c', 'fastkmeans_dsroot{}{}{}_{}'.format(n_clusters, km_method, max_loyd, height))
    else:
        if opt.fast_kmeans:
            kmeans_path = os.path.join(data_dir, 'kmeans_mnist', 'fastkmeans_dsroot{}{}{}_{}'.format(n_clusters, km_method, max_loyd, height))
        else:
            kmeans_path = os.path.join(data_dir, 'kmeans_mnist', 'kmeans_dsroot{}{}{}_{}'.format(n_clusters, km_method, max_loyd, height))

    save_data = True
    if os.path.exists(kmeans_path) and not (opt.pca or opt.rp or opt.st):
        with open(kmeans_path, 'rb') as file:
            root = pickle.load(file)
    elif opt.cplsh and hasattr(opt, 'cplsh_root'):
        # The C++ LSH object cannot be serialized, so reuse the cached root.
        root = opt.cplsh_root
    else:
        print("Building ...")
        d_idx = np.array(list(range(len(ds))))
        # Maps dataset element indices to bin indices.
        ds2bins = {}
        root = KNode(d_idx, ds, n_clusters, height, ds2bins, ht2cutsz, opt)
        if save_data:
            if opt.cplsh:
                opt.cplsh_root = root
            elif not (opt.rp or opt.pca or opt.st):
                with open(kmeans_path, "wb") as output:
                    pickle.dump(root, output)
                opt.saved_path = kmeans_path

    acc, probe, probe95 = check_res_single(root, qu, neigh, n_bins, root.ds2bins, opt)
    print('n_clusters: {} n_bins: {} height: {} acc: {} probe: {} probe95: {}'.format(
        n_clusters, n_bins, height, acc, probe, probe95))
    return acc, probe, probe95
def update_heap_from_cached_result(group_sample, heap, input_batch, k, cached_neuron_group_result):
    for input_id, real_id in enumerate(input_batch):
        neuron_group_result = cached_neuron_group_result[real_id]
        dist = l2_dist(neuron_group_result, group_sample)
        if len(heap) < k:
            heapq.heappush(heap, (-dist, real_id))
        elif (-dist, real_id) > heap[0]:
            heapq.heapreplace(heap, (-dist, real_id))
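# A small, self-contained sketch (not part of the original module) of the heap pattern
# used above: distances are negated so Python's min-heap acts as a max-heap over the
# current k nearest candidates, with heap[0] always holding the worst of them. The toy
# vectors and the _euclidean helper below are illustrative assumptions only.
def _knn_heap_demo():
    import heapq
    import numpy as np

    def _euclidean(a, b):
        return float(np.linalg.norm(np.asarray(a) - np.asarray(b)))

    sample = [0.0, 0.0]
    candidates = {1: [1.0, 0.0], 2: [3.0, 4.0], 3: [0.5, 0.5], 4: [2.0, 2.0]}
    k, knn_heap = 2, []
    for cand_id, vec in candidates.items():
        dist = _euclidean(vec, sample)
        if len(knn_heap) < k:
            heapq.heappush(knn_heap, (-dist, cand_id))
        elif (-dist, cand_id) > knn_heap[0]:
            heapq.heapreplace(knn_heap, (-dist, cand_id))
    # Yields candidates 3 and 1, the two closest to the sample.
    return sorted((-neg_dist, cand_id) for neg_dist, cand_id in knn_heap)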
def get_partition_access_order_list(group_sample, n_partitions, neuron_group,
                                    lower_bounds, partition_pointer_list):
    partition_access_order_list = list()
    for neuron_id in range(len(neuron_group.neuron_idx_list)):
        if partition_pointer_list[neuron_id] is None:
            partition_access_order_list.append(
                [i for i in range(1, n_partitions)])
        else:
            partition_access_order_list.append(
                [partition_pointer_list[neuron_id][0]])
            while True:
                pointer_dec = -1
                pointer_inc = -1
                if partition_pointer_list[neuron_id][0] - 1 >= 0:
                    pointer_dec = partition_pointer_list[neuron_id][0] - 1
                if partition_pointer_list[neuron_id][1] + 1 < n_partitions:
                    pointer_inc = partition_pointer_list[neuron_id][1] + 1
                if pointer_dec == -1 and pointer_inc == -1:
                    break
                else:
                    if pointer_dec == -1:
                        partition_access_order_list[neuron_id].append(pointer_inc)
                        partition_pointer_list[neuron_id][1] += 1
                    elif pointer_inc == -1:
                        partition_access_order_list[neuron_id].append(pointer_dec)
                        partition_pointer_list[neuron_id][0] -= 1
                    else:
                        if l2_dist(lower_bounds[neuron_id][pointer_dec], group_sample[neuron_id]) \
                                <= l2_dist(lower_bounds[neuron_id][pointer_inc], group_sample[neuron_id]):
                            partition_access_order_list[neuron_id].append(pointer_dec)
                            partition_pointer_list[neuron_id][0] -= 1
                        else:
                            partition_access_order_list[neuron_id].append(pointer_inc)
                            partition_pointer_list[neuron_id][1] += 1
    return partition_access_order_list
def get_access_order(neuron_group, group_sample, n_inputs_in_partition_0,
                     activations_with_idx_list, pointer_list):
    access_order_list = list()
    boundary_with_highest_activation_reached = [False] * len(
        neuron_group.neuron_idx_list)
    for neuron_id, activations_with_idx in enumerate(activations_with_idx_list):
        if pointer_list[neuron_id] is None:
            access_order_list.append(None)
            continue
        else:
            access_order_list.append(list())
        for round_cnt in range(n_inputs_in_partition_0):
            if pointer_list[neuron_id][0] - 1 >= 0:
                pointer_dec = pointer_list[neuron_id][0] - 1
            else:
                pointer_dec = pointer_list[neuron_id][0]
            if pointer_list[neuron_id][1] + 1 < n_inputs_in_partition_0:
                pointer_inc = pointer_list[neuron_id][1] + 1
            else:
                pointer_inc = pointer_list[neuron_id][1]
            if boundary_with_highest_activation_reached[neuron_id] \
                    or l2_dist(activations_with_idx[pointer_dec][0], group_sample[neuron_id]) \
                    <= l2_dist(activations_with_idx[pointer_inc][0], group_sample[neuron_id]):
                access_order_list[neuron_id].append(pointer_dec)
                if pointer_list[neuron_id][0] - 1 >= 0:
                    pointer_list[neuron_id][0] -= 1
            else:
                access_order_list[neuron_id].append(pointer_inc)
                if pointer_list[neuron_id][1] + 1 < n_inputs_in_partition_0:
                    pointer_list[neuron_id][1] += 1
                else:
                    boundary_with_highest_activation_reached[neuron_id] = True
    return access_order_list
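# A minimal, standalone sketch (the names below are not from the original code) of the
# access-order idea used above: starting from the sample's position in a sorted list of
# activations, expand a left/right pointer pair outward, each step taking the side whose
# next activation is closer to the sample's activation.
def _outward_access_order_demo(sorted_vals, start, sample_val):
    lo, hi = start, start
    order = [start]
    while lo > 0 or hi < len(sorted_vals) - 1:
        left = lo - 1 if lo > 0 else None
        right = hi + 1 if hi < len(sorted_vals) - 1 else None
        if right is None or (left is not None and
                             abs(sorted_vals[left] - sample_val) <= abs(sorted_vals[right] - sample_val)):
            order.append(left)
            lo = left
        else:
            order.append(right)
            hi = right
    return order

# _outward_access_order_demo([0.1, 0.4, 0.5, 0.9, 2.0], start=2, sample_val=0.5)
# -> [2, 1, 0, 3, 4]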
opt = utils.parse_args()

# Load queries and their precomputed ground-truth neighbors; the else branch instead
# derives neighbors from the training set by brute force.
if True:
    if opt.glove:
        queryset = utils.load_glove_data('query').to(utils.device)
        neighbors = utils.load_glove_data('answers').to(utils.device)
    elif opt.sift:
        queryset = utils.load_sift_data('query').to(utils.device)
        neighbors = utils.load_sift_data('answers').to(utils.device)
    else:
        queryset = utils.load_data('query').to(utils.device)
        neighbors = utils.load_data('answers').to(utils.device)
else:
    queryset = utils.load_data('train').to(utils.device)
    dist = utils.l2_dist(queryset)
    # Exclude self-matches from the top-k.
    dist += 2 * torch.max(dist).item() * torch.eye(len(dist))
    val, neighbors = torch.topk(dist, k=opt.k, dim=1, largest=False)

# Optional: compute query-to-train neighbors by brute force instead.
if False:
    trainset = utils.load_data('train').to(utils.device)
    dist = utils.l2_dist(queryset, trainset)
    val, neighbors = torch.topk(dist, k=opt.k, dim=1, largest=False)

height = 1
# Earlier sweeps used list(range(1, 45, 2)) and list(range(1, 100)).
n_bins_l = list(range(1, 10, 2))
n_clusters_l = [64]
def ann(xq, xb, xt, query_dist, train_dist, args):
    # analyze(xt, xt, train_dist)
    # analyze(xq, xb, query_dist)
    bias = 0.0
    scales = 2.0**(np.arange(-10, 20))
    if args.dataset == "gen50ks.txt" and args.embed == 'cnn':
        scales = np.linspace(0.01, 2.0, num=50)
    if args.dataset == "gen50ks.txt" and args.embed == 'gru':
        scales = np.linspace(0, 4.0, num=50)
    if args.dataset == "trec" and args.embed == 'cnn':
        scales = np.linspace(0, 2.0, num=50)
    if args.dataset == "trec" and args.embed == 'gru':
        scales = np.linspace(2.5, 3.1, num=50)
        bias = 60
    if args.dataset == "enron" and args.embed == 'cnn':
        scales = np.linspace(0., 1.0, num=50)
    if args.dataset == "enron" and args.embed == 'gru':
        scales = np.linspace(0., 2.0, num=50)
    if args.dataset == "dblp" and args.embed == 'cnn':
        scales = np.linspace(0.1, 2.0, num=50)
    if args.dataset == "dblp" and args.embed == 'gru':
        scales = np.linspace(0.5, 1.6, num=50)
    if args.dataset == "uniref" and args.embed == 'cnn':
        scales = np.linspace(0.5, 4.0, num=50)
    if args.dataset == "uniref" and args.embed == 'gru':
        scales = np.linspace(0., 1.4, num=50)
    print(scales)

    thresholds = [1, 5, 10, 15, 20, 25, 50, 75, 100, 125, 150, 300, 500, 800, 1000, 2000]
    train_dist_l2 = l2_dist(xt, xt)
    query_dist_l2 = l2_dist(xq, xb)
    threshold2dist = linear_fit(train_dist, train_dist_l2)

    # Recall table: rows are distance thresholds, columns are L2 scale factors.
    print("thres\t l2thres\t", end='')
    for scale in scales:
        print("%2.3f\t" % scale, end='')
    print()
    for threshold in thresholds:
        gt = [np.argwhere(dist <= threshold) for dist in query_dist]
        threshold_l2 = threshold2dist(threshold)
        print("%6d\t %.6f\t" % (threshold, threshold_l2), end='')
        for scale in scales:
            items = [
                np.argwhere(dist <= bias + threshold_l2 * scale)
                for dist in query_dist_l2
            ]
            recall = np.mean([
                len(np.intersect1d(i, j)) / len(i)
                for i, j in zip(gt, items) if len(i) > 0
            ])
            print("%.3f\t" % recall, end='')
        print()

    # Precision table over the same threshold/scale grid.
    for threshold in thresholds:
        gt = [np.argwhere(dist <= threshold) for dist in query_dist]
        threshold_l2 = threshold2dist(threshold)
        print("%6d\t %.6f\t" % (threshold, threshold_l2), end='')
        for scale in scales:
            items = [
                np.argwhere(dist <= threshold_l2 * scale)
                for dist in query_dist_l2
            ]
            precs = np.mean([
                len(np.intersect1d(i, j)) / len(j) if len(j) > 0 else 0
                for i, j in zip(gt, items) if len(i) > 0
            ])
            print("%.3f\t" % precs, end='')
        print()
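# linear_fit is not defined in this file; below is a minimal sketch, assuming it fits a
# simple least-squares line mapping the true (e.g. edit) distances to embedding L2
# distances and returns that mapping as a callable. The name linear_fit_sketch and its
# body are illustrative assumptions, not the original implementation.
def linear_fit_sketch(true_dist, l2_dist_vals):
    x = np.asarray(true_dist, dtype=float).reshape(-1)
    y = np.asarray(l2_dist_vals, dtype=float).reshape(-1)
    slope, intercept = np.polyfit(x, y, 1)  # degree-1 least-squares fit
    return lambda threshold: slope * threshold + intercept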
train_node = train.TrainNode(-1, opt, -1)
if opt.glove:
    dataset = utils.load_glove_data('train').to(utils.device)
    queryset = utils.load_glove_data('query').to(utils.device)
    neighbors = utils.load_glove_data('answers').to(utils.device)
else:
    dataset = utils.load_data('train').to(utils.device)
    queryset = utils.load_data('query').to(utils.device)
    neighbors = utils.load_data('answers').to(utils.device)

# Optional: use the dataset itself as the query set and compute ground-truth
# neighbors by brute force (self-matches are pushed out of the top-k).
if False:
    queryset = dataset
    dist = utils.l2_dist(dataset)
    dist += 2 * torch.max(dist).item() * torch.eye(len(dist))
    val, neighbors = torch.topk(dist, k=opt.k, dim=1, largest=False)

# Optional: compute ground-truth neighbors within the query set itself.
if False:
    dist = utils.l2_dist(queryset)
    dist += 2 * torch.max(dist).item() * torch.eye(len(dist))
    val, neighbors = torch.topk(dist, k=opt.k, dim=1, largest=False)

#dsnode_path = opt.dsnode_path + str(opt.n_clusters)
#print('dsnode path {}'.format(dsnode_path))
#dsnode = utils.pickle_load(dsnode_path)
#print('dsnode {}'.format(dsnode))
def answer_query_with_guarantee(model, dataset, act, idx_act, bit_arr, idx_of_idx,
                                par_low_bound, input_sample_id, neuron_group, k,
                                n_partitions, bits_per_input, BATCH_SIZE, batch_size,
                                where=None):
    layer_id = neuron_group.layer_id
    group_sample = get_group_sample(dataset, input_sample_id, layer_id, model, neuron_group)
    n_inputs = len(dataset)
    n_inputs_rerun = 1
    group_activation_cached = [None] * dataset.shape[0]
    group_activation_cached[input_sample_id] = group_sample
    heap = [(0.0, input_sample_id)]
    activations_with_idx_list, pointer_list = initialize_activations_and_pointers_for_phase_one(
        idx_of_idx, input_sample_id, group_sample, neuron_group, act, idx_act)
    is_sample_in_partition_0 = [pointer is not None for pointer in pointer_list]
    n_inputs_in_partition_0 = len(activations_with_idx_list[0])
    access_order_list = get_access_order(neuron_group, group_sample,
                                         n_inputs_in_partition_0,
                                         activations_with_idx_list, pointer_list)
    print(f"input {input_sample_id}, size of neuron group {len(neuron_group.neuron_idx_list)}")

    exit_msg = None
    input_batch = set()
    ta_exited = False

    # Phase 1: walk partition 0 in access order, re-running the model in batches.
    for round_cnt in range(n_inputs_in_partition_0):
        round_activations_with_idx = list()
        for neuron_id, activations_with_idx in enumerate(activations_with_idx_list):
            if access_order_list[neuron_id] is None:
                round_activations_with_idx.append(None)
            else:
                round_activations_with_idx.append(
                    activations_with_idx[access_order_list[neuron_id][round_cnt]])
        for item in round_activations_with_idx:
            if item is None:
                continue
            activation, input_idx = item
            if group_activation_cached[input_idx] is None:
                if where is not None and not where(input_idx):
                    continue
                input_batch.add(input_idx)
            if len(input_batch) >= batch_size \
                    or n_inputs_rerun + len(input_batch) == dataset.shape[0] \
                    or round_cnt + 1 == n_inputs_in_partition_0:
                if len(input_batch) == 0:
                    break
                run_nn_and_update_things(dataset, group_activation_cached, group_sample,
                                         heap, input_batch, k, layer_id, model,
                                         neuron_group, BATCH_SIZE)
                n_inputs_rerun += len(input_batch)
                input_batch = set()
        if len(input_batch) == 0 and len(heap) == k:
            # Threshold-algorithm stopping test: build the best possible unseen
            # activation vector from this round and compare it to the heap top.
            round_activations = list()
            for round_activation_id, item in enumerate(round_activations_with_idx):
                if item is None:
                    round_activations.append(group_sample[round_activation_id])
                    continue
                activation, input_idx = item
                round_activations.append(activation)
            round_activations = np.array(round_activations).reshape(group_sample.shape)
            threshold = l2_dist(round_activations, group_sample)
            if heap[0] > (-threshold, n_inputs_in_partition_0):
                ta_exited = True
                break

    if ta_exited:
        # Early termination in phase 1; exit_msg is left as None here.
        return heap, exit_msg, is_sample_in_partition_0, n_inputs_rerun

    partitions_of_input = unpack_bits_and_get_input_partitions(idx_of_idx, neuron_group, bit_arr)
    input_batch, n_inputs_rerun = deal_with_remaining_inputs_in_partition_0(
        dataset, group_activation_cached, group_sample, heap, input_batch, k, layer_id,
        model, n_inputs_rerun, neuron_group, partitions_of_input, pointer_list,
        bits_per_input, BATCH_SIZE, where)
    bound_list, partition_pointer_list = initialize_bounds_and_pointers_for_phase_two(
        activations_with_idx_list, input_sample_id, neuron_group, partitions_of_input,
        pointer_list, bits_per_input)
    lower_bound_of_partitions = get_lower_bound_of_partitions(idx_of_idx, neuron_group, par_low_bound)
    partition_access_order_list = get_partition_access_order_list(
        group_sample, n_partitions, neuron_group, lower_bound_of_partitions,
        partition_pointer_list)
    round_cnt = 0
    row_cnt = 0
    boundary_partition_processed = [
        [False, False] for idx in range(len(neuron_group.neuron_idx_list))
    ]
    for neuron_id in range(len(neuron_group.neuron_idx_list)):
        if pointer_list[neuron_id] is not None:
            boundary_partition_processed[neuron_id][0] = True

    # Phase 2: visit the remaining partitions in access order until the bound test passes.
    while n_inputs_rerun < dataset.shape[0]:
        inputs_for_neuron_list = list()
        for neuron_id, partition_of_input in enumerate(partitions_of_input):
            if round_cnt >= len(partition_access_order_list[neuron_id]):
                continue
            inputs_for_current_neuron = get_input_ids_by_partition_id(
                partition_of_input,
                partition_access_order_list[neuron_id][round_cnt],
                bits_per_input, n_inputs)
            inputs_for_neuron_list.append(inputs_for_current_neuron)
            add_inputs_to_batch(input_batch, inputs_for_current_neuron,
                                group_activation_cached, where)
        row_cnt += (n_inputs - n_inputs_in_partition_0) // (n_partitions - 1)
        if len(input_batch) > 0:
            run_nn_and_update_things(dataset, group_activation_cached, group_sample,
                                     heap, input_batch, k, layer_id, model,
                                     neuron_group, BATCH_SIZE)
            n_inputs_rerun += len(input_batch)
            input_batch = set()
        for neuron_id in range(len(neuron_group.neuron_idx_list)):
            if partition_access_order_list[neuron_id][round_cnt] == 0 or (
                    n_inputs_in_partition_0 == 0
                    and partition_access_order_list[neuron_id][round_cnt] == 1):
                boundary_partition_processed[neuron_id][0] = True
            if partition_access_order_list[neuron_id][round_cnt] == n_partitions - 1:
                boundary_partition_processed[neuron_id][1] = True
        for idx in range(len(neuron_group.neuron_idx_list)):
            for input_id in inputs_for_neuron_list[idx]:
                if bound_list[idx] is None:
                    bound_list[idx] = [
                        group_activation_cached[input_id][idx],
                        group_activation_cached[input_id][idx]
                    ]
                else:
                    bound_list[idx][0] = min(bound_list[idx][0],
                                             group_activation_cached[input_id][idx])
                    bound_list[idx][1] = max(bound_list[idx][1],
                                             group_activation_cached[input_id][idx])
        if len(heap) == k:
            round_activations = np.array(group_sample)
            for idx in range(len(neuron_group.neuron_idx_list)):
                if boundary_partition_processed[idx][0] and not boundary_partition_processed[idx][1]:
                    round_activations[idx] = bound_list[idx][0]
                elif boundary_partition_processed[idx][1] and not boundary_partition_processed[idx][0]:
                    round_activations[idx] = bound_list[idx][1]
                elif pointer_list[idx] is None:
                    if l2_dist(bound_list[idx][0], group_sample[idx]) < l2_dist(
                            bound_list[idx][1], group_sample[idx]):
                        round_activations[idx] = bound_list[idx][0]
                    else:
                        round_activations[idx] = bound_list[idx][1]
                else:
                    round_activations[idx] = bound_list[idx][0]
            threshold = l2_dist(round_activations, group_sample)
            if heap[0] > (-threshold, n_inputs_in_partition_0):
                ta_exited = True
                break
        round_cnt += 1

    if ta_exited:
        exit_msg = f"termination: phase 2, round {round_cnt}; inputs re-run: {n_inputs_rerun}"
    else:
        exit_msg = f"termination: none; inputs re-run: {n_inputs_rerun}"
    return heap, exit_msg, is_sample_in_partition_0, n_inputs_rerun