import time
import multiprocessing

import numpy as np
import torch

# dir_io, read_data, evaluate and get_score_table_single_query are repo-local
# helpers; the import path below is an assumption.
from util import dir_io, read_data


def integrate_save_score_table_total_parallel(predict_cluster_l, label_map_l, config, save):
    start_time = time.time()
    query_len = predict_cluster_l[0].shape[0]
    predict_cluster_l = np.array(predict_cluster_l)
    # 2-dimensional array: the first axis is the query index, the second holds the
    # predicted cluster index of every classifier
    query_predict_cluster = predict_cluster_l.transpose()
    # use roughly 90% of the available cores
    with multiprocessing.Pool(multiprocessing.cpu_count() // 10 * 9) as pool:
        score_table = list(
            pool.imap(
                get_score_table_single_query,
                zip(query_predict_cluster,
                    [label_map_l for _ in range(query_len)],
                    [config['n_item'] for _ in range(query_len)])))
    total_score_table_dir = '%s/total_score_table.txt' % config['program_train_para_dir']
    if save:
        dir_io.save_array_txt(total_score_table_dir, score_table, '%d')
    end_time = time.time()
    intermediate = {'time': end_time - start_time}
    print('save score table success')
    return score_table, intermediate
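# NOTE: a minimal sketch of the worker consumed by pool.imap above, assuming it
# mirrors the serial voting loop in integrate_save_score_table_total further down;
# the repo's actual get_score_table_single_query may differ. The name
# _get_score_table_single_query_sketch and its tuple layout are illustrative.
def _get_score_table_single_query_sketch(args):
    pred_cluster_per_classifier, label_map_l, n_item = args
    score_arr = np.zeros(n_item, dtype=np.int64)
    for classifier_idx, pred_cluster in enumerate(pred_cluster_per_classifier):
        # every item stored in the predicted cluster receives one vote
        score_arr[label_map_l[classifier_idx][pred_cluster]] += 1
    return score_arr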
def integrate_single(score_table, gnd, config):
    torch.set_num_threads(12)
    start_time = time.time()
    # long_term_config, short_term_config, short_term_config_before_run, intermediate_result, total_score_table
    dir_io.mkdir(config['program_result_dir'])
    recall_l = []
    print("start evaluate")
    for i, score_arr in enumerate(score_table):
        if i % 50 == 0:
            print("evaluate " + str(i))
        efsearch_recall_l = evaluate(score_arr, config['efSearch_l'], gnd[i], config['k'])
        recall_l.append(efsearch_recall_l)
    print('get all the recall')
    # transpose so that every row of recall_l holds the recalls of one efSearch value
    recall_l = np.array(recall_l).transpose()
    result_n_candidate_recall = []
    for i, efSearch in enumerate(config['efSearch_l']):
        recall_avg = np.mean(recall_l[i])
        result_item = {'n_candidate': efSearch, 'recall': recall_avg}
        result_n_candidate_recall.append(result_item)
        print('recall: {}, n_candidates: {}'.format(recall_avg, efSearch))
    dir_io.save_json(config['program_result_dir'], 'result.json', result_n_candidate_recall)
    recall_l_save_dir = '%s/recall_l.txt' % config['program_result_dir']
    dir_io.save_array_txt(recall_l_save_dir, recall_l, '%.3f')
    end_time = time.time()
    intermediate = {'time': end_time - start_time}
    return intermediate
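# Hedged sketch of the contract evaluate() is assumed to satisfy above (the real
# implementation lives elsewhere in the repo): for every efSearch value, take the
# efSearch highest-scoring items as candidates and report which fraction of the
# k true nearest neighbours they cover.
def _evaluate_sketch(score_arr, efSearch_l, gnd_row, k):
    rank = np.argsort(-np.asarray(score_arr))  # item ids, best score first
    true_nn = set(np.asarray(gnd_row)[:k].tolist())
    return [len(set(rank[:efSearch].tolist()) & true_nn) / k for efSearch in efSearch_l]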
def integrate_save_score_table_total(cluster_score_l, label_l, config, save=False):
    start_time = time.time()
    shape = cluster_score_l[0].shape
    total_score_table_dir = '%s/total_score_table_total.txt' % config['program_train_para_dir']
    score_table = np.zeros(shape=(shape[0], config['n_item']), dtype=np.float32)
    # iterate over every query
    for i in range(shape[0]):
        # iterate over every classifier
        for k, tmp_cluster_score in enumerate(cluster_score_l):
            label_map = label_l[k]
            # iterate over every cluster, spreading its score to the items it contains
            for j in range(shape[1]):
                score_item_idx_l = label_map[j]
                score_table[i][score_item_idx_l] += tmp_cluster_score[i][j]
    if save:
        dir_io.save_array_txt(total_score_table_dir, score_table, '%.3f')
    end_time = time.time()
    intermediate = {'time': end_time - start_time}
    print('save score table success')
    return score_table, intermediate
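# Shape note (an assumption read off the loops above): cluster_score_l is a list of
# (n_query, n_cluster) float arrays, one per classifier, and label_l[k][j] is the
# list of item ids stored in cluster j of classifier k. Toy illustration:
def _integrate_total_demo():
    cluster_score_l = [np.array([[0.7, 0.3]], dtype=np.float32)]  # 1 query, 2 clusters
    label_l = [{0: [0, 2], 1: [1]}]                               # cluster -> item ids
    config = {'n_item': 3, 'program_train_para_dir': '/tmp'}
    table, _ = integrate_save_score_table_total(cluster_score_l, label_l, config)
    return table  # [[0.7, 0.3, 0.7]]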
def _partition(self, base, base_base_gnd, ins_intermediate):
    start_time = time.time()
    # normalize by the largest norm so every point lies inside the unit ball
    norm = np.linalg.norm(base, axis=1)
    self.norm_div = np.max(norm)
    base_normalize = base / self.norm_div
    # p-stable LSH bucket rule: h(x) = floor((<x, a> + b) / r) mod n_cluster
    self.a = np.random.normal(loc=self.a_miu, scale=self.a_sigma, size=base.shape[1])
    proj_result = np.dot(base_normalize, self.a)
    self.b = np.random.random() * self.r
    arr = np.floor((proj_result + self.b) / self.r) % self.n_cluster
    self.labels = arr.astype(int)  # np.int is removed in NumPy >= 1.24
    partition_dir = '%s/partition.txt' % self.save_dir
    dir_io.save_array_txt(partition_dir, self.labels, '%d')
    end_time = time.time()
    self.intermediate['hashing_time'] = end_time - start_time
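# Tiny self-contained illustration (not called anywhere in the repo) of the
# p-stable LSH bucket rule used by _partition above; the helper name and the
# toy parameters are made up for the example.
def _lsh_bucket_demo():
    rng = np.random.default_rng(0)
    base = rng.normal(size=(5, 4))
    base = base / np.max(np.linalg.norm(base, axis=1))  # same normalization as above
    a = rng.normal(loc=0.0, scale=1.0, size=4)          # random projection direction
    r, n_cluster = 1.0, 4
    b = rng.random() * r                                # random offset in [0, r)
    labels = (np.floor((base @ a + b) / r) % n_cluster).astype(int)
    return labels  # one bucket id in [0, n_cluster) per point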
def integrate(score_table_ptr_l, gnd, config):
    # long_term_config, short_term_config, short_term_config_before_run, intermediate_result, total_score_table
    dir_io.mkdir(config['program_result_dir'])
    recall_l = []
    iter_idx = 0
    while True:
        end_of_file = False
        # sum the scores of every classifier to get the total score of one query
        total_score_arr = None
        for score_table_ptr in score_table_ptr_l:
            line = score_table_ptr.readline()
            if not line:
                end_of_file = True
                break
            tmp_score_table = np.array([float(number) for number in line.split(' ')])
            if total_score_arr is None:
                total_score_arr = tmp_score_table
            else:
                total_score_arr += tmp_score_table
        if end_of_file:
            break
        efsearch_recall_l = evaluate(total_score_arr, config['efSearch_l'], gnd[iter_idx], config['k'])
        recall_l.append(efsearch_recall_l)
        iter_idx += 1
    print('get all the recall')
    # transpose so that every row of recall_l holds the recalls of one efSearch value
    recall_l = np.array(recall_l).transpose()
    result_n_candidate_recall = []
    for i, efSearch in enumerate(config['efSearch_l']):
        recall_avg = np.mean(recall_l[i])
        result_item = {'n_candidate': efSearch, 'recall': recall_avg}
        result_n_candidate_recall.append(result_item)
        print('recall: {}, n_candidates: {}'.format(recall_avg, efSearch))
    dir_io.save_json(config['program_result_dir'], 'result.json', result_n_candidate_recall)
    recall_l_save_dir = '%s/recall_l.txt' % config['program_result_dir']
    dir_io.save_array_txt(recall_l_save_dir, recall_l, '%.3f')
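# Hypothetical call-site sketch for integrate(): it expects one open text handle
# per classifier, each file holding one whitespace-separated score row per query.
# The path pattern below is illustrative, not the repo's actual layout.
def _open_score_tables_sketch(program_train_para_dir, n_classifier):
    return [open('%s/score_table_%d.txt' % (program_train_para_dir, i), 'r')
            for i in range(n_classifier)]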
def _partition(self, base, base_base_gnd, ins_intermediate):
    start_time = time.time()
    # use random projection to order the points, then split them into
    # 2 ** partition_depth equal-sized parts
    partition_idx = self.random_projection(base)
    n_part = 2 ** self.partition_depth
    labels = np.empty(len(base), dtype=int)  # np.int is removed in NumPy >= 1.24
    start_idx = 0
    for i in range(n_part):
        # slicing past the end of partition_idx is safe for the last part
        end_idx = int(np.ceil(len(base) / n_part)) * (i + 1)
        labels[partition_idx[start_idx:end_idx]] = i
        start_idx = end_idx
    self.labels = labels
    partition_dir = '%s/partition.txt' % self.save_dir
    dir_io.save_array_txt(partition_dir, self.labels, '%d')
    end_time = time.time()
    self.intermediate['hashing_time'] = end_time - start_time
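# Assumed contract of self.random_projection(base), sketched here for reference
# (the repo's implementation may differ): project every point onto one random
# direction and return the item indices sorted by projection value, so that the
# slicing above yields balanced, spatially coherent parts.
def _random_projection_sketch(base, seed=0):
    rng = np.random.default_rng(seed)
    direction = rng.normal(size=base.shape[1])
    return np.argsort(base @ direction)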
def graph_partition(self, config):
    # invoke KaHIP on every part and read back the resulting partition files
    save_dir = config['save_dir']
    graph_partition_type = config['graph_partition_type']
    n_part = 2 ** self.partition_iter
    n_cluster_l = self.count_proportion(self.partition_idx, n_part, config['n_cluster'])
    print(n_cluster_l)
    kahip_dir = config['kahip_dir']
    res_labels = np.empty([config['n_item']], dtype=int)  # np.int is removed in NumPy >= 1.24
    # cumulative sums give the label offset of each part, so the cluster ids of
    # different parts do not collide
    n_cluster_cumsum_l = np.insert(n_cluster_l, 0, values=0, axis=0)
    n_cluster_cumsum_l = np.cumsum(n_cluster_cumsum_l)
    for i in range(n_part):
        partition_dir = '%s/partition_%d.txt' % (save_dir, i)
        graph_dir = '%s/graph_%d.graph' % (save_dir, i)
        preconfiguration = config['preconfiguration']
        if graph_partition_type == 'kaffpa':
            kahip_command = '%s/deploy/kaffpa %s --preconfiguration=%s --output_filename=%s ' \
                            '--k=%d' % (kahip_dir, graph_dir, preconfiguration, partition_dir, n_cluster_l[i])
            print(kahip_command)
            dir_io.kahip(partition_dir, kahip_command)
        elif graph_partition_type == 'parhip':
            kahip_command = 'mpirun -n %d %s/deploy/parhip %s --preconfiguration %s ' \
                            '--save_partition --k %d' % (
                                multiprocessing.cpu_count() // 2, kahip_dir, graph_dir,
                                preconfiguration, n_cluster_l[i])
            print(kahip_command)
            # parhip writes its result to ./tmppartition.txtp, so move it into place
            dir_io.kahip('./tmppartition.txtp', kahip_command)
            dir_io.move_file('tmppartition.txtp', partition_dir)
        tmp_labels = read_data.read_partition(partition_dir)
        # shift this part's local cluster ids by the offset of all previous parts
        for j in range(len(tmp_labels)):
            res_labels[self.partition_idx[i][j]] = tmp_labels[j] + n_cluster_cumsum_l[i]
    partition_dir = '%s/partition.txt' % save_dir
    dir_io.save_array_txt(partition_dir, res_labels, '%d')
    return res_labels
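# Minimal sketch of the partition reader assumed above. KaHIP's partition output
# is one integer block id per line, in input-vertex order; the name mirrors
# read_data.read_partition, but this body is illustrative.
def _read_partition_sketch(partition_dir):
    with open(partition_dir, 'r') as f:
        return np.array([int(line) for line in f if line.strip()], dtype=int)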
def integrate_save_score_table_total(predict_cluster_l, label_map_l, config, save):
    start_time = time.time()
    query_len = predict_cluster_l[0].shape[0]
    score_table = np.zeros(shape=(query_len, config['n_item']), dtype=int)  # np.int is removed in NumPy >= 1.24
    for i in range(query_len):  # for every query
        for j in range(len(predict_cluster_l)):  # for every classifier
            pred_cluster = predict_cluster_l[j][i]
            for k in label_map_l[j][pred_cluster]:  # for every item in pred_cluster
                score_table[i][k] += 1
    total_score_table_dir = '%s/total_score_table.txt' % config['program_train_para_dir']
    if save:
        dir_io.save_array_txt(total_score_table_dir, score_table, '%d')
    end_time = time.time()
    intermediate = {'time': end_time - start_time}
    print('save score table success')
    return score_table, intermediate
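# The innermost item loop above can be collapsed into one fancy-indexing
# assignment; this behavior-equivalent variant is a sketch, not part of the repo:
def _vote_row_sketch(score_row, label_map_l, pred_cluster_row):
    for j, pred_cluster in enumerate(pred_cluster_row):
        score_row[label_map_l[j][pred_cluster]] += 1  # one vote per item, no inner loop
    return score_row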
def save(self):
    dir_io.mkdir(self.save_dir)
    eval_res_dir = '%s/eval_res.txt' % self.save_dir
    dir_io.save_array_txt(eval_res_dir, self.result, fmt='%.3f')
def save(self):
    save_label_dir = '%s/partition.txt' % self.save_dir
    dir_io.save_array_txt(save_label_dir, self.labels, '%i')