def integrate_save_score_table_total_parallel(predict_cluster_l, label_map_l,
                                              config, save):
    start_time = time.time()

    query_len = predict_cluster_l[0].shape[0]
    predict_cluster_l = np.array(predict_cluster_l)
    # 2-D array: rows are queries, columns are classifiers, entries are the predicted cluster indices
    query_predict_cluster = predict_cluster_l.transpose()

    with multiprocessing.Pool(multiprocessing.cpu_count() // 10 * 9) as pool:
        score_table = list(
            pool.imap(
                get_score_table_single_query,
                zip(query_predict_cluster,
                    [label_map_l for _ in range(query_len)],
                    [config['n_item'] for _ in range(query_len)])))

    total_score_table_dir = '%s/total_score_table.txt' % config[
        'program_train_para_dir']
    if save:
        dir_io.save_array_txt(total_score_table_dir, score_table, '%d')

    end_time = time.time()
    intermediate = {'time': end_time - start_time}
    print('save score table success')
    return score_table, intermediate
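# The worker passed to pool.imap above is defined elsewhere in the project; the
# function below is a minimal sketch of what it plausibly does, based only on
# how its arguments are zipped (the *_sketch name and body are assumptions, not
# the project's code): it receives (predicted cluster per classifier for one
# query, label_map_l, n_item) and returns that query's per-item vote counts.
def get_score_table_single_query_sketch(args):
    pred_cluster_per_classifier, label_map_l, n_item = args
    score_arr = np.zeros(n_item, dtype=np.int64)
    for classifier_idx, cluster_idx in enumerate(pred_cluster_per_classifier):
        # every item assigned to the predicted cluster receives one vote
        score_arr[label_map_l[classifier_idx][cluster_idx]] += 1
    return score_arr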
Example #2
def integrate_single(score_table, gnd, config):
    torch.set_num_threads(12)
    start_time = time.time()
    # long_term_config, short_term_config, short_term_config_before_run, intermediate_result, total_score_table
    dir_io.mkdir(config['program_result_dir'])
    recall_l = []
    print("start evaluate")
    for i, score_arr in enumerate(score_table, 0):
        if i % 50 == 0: print("evaluate " + str(i))
        efsearch_recall_l = evaluate(score_arr, config['efSearch_l'], gnd[i],
                                     config['k'])
        recall_l.append(efsearch_recall_l)
    print('get all the recall')
    # transpose so that each row of recall_l holds the recalls of one efSearch value
    recall_l = np.array(recall_l).transpose()

    result_n_candidate_recall = []
    for i, efSearch in enumerate(config['efSearch_l'], 0):
        recall_avg = np.mean(recall_l[i])
        result_item = {'n_candidate': efSearch, "recall": recall_avg}
        result_n_candidate_recall.append(result_item)
        print('recall: {}, n_candidates: {}'.format(recall_avg, efSearch))

    dir_io.save_json(config['program_result_dir'], 'result.json',
                     result_n_candidate_recall)

    recall_l_save_dir = '%s/recall_l.txt' % config['program_result_dir']
    dir_io.save_array_txt(recall_l_save_dir, recall_l, '%.3f')
    end_time = time.time()
    intermediate = {'time': end_time - start_time}
    return intermediate
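# 'evaluate' is defined elsewhere in the project; below is a minimal sketch of
# what it plausibly computes (an assumption, not the project's code): for each
# efSearch value, take the efSearch highest-scoring items as candidates and
# measure recall against the k ground-truth neighbours of that query.
def evaluate_sketch(score_arr, efSearch_l, gnd_per_query, k):
    true_set = set(np.asarray(gnd_per_query[:k]).tolist())
    recall_l = []
    for efSearch in efSearch_l:
        n_candidate = min(efSearch, len(score_arr))
        candidate_idx = np.argpartition(-score_arr, n_candidate - 1)[:n_candidate]
        recall_l.append(len(true_set.intersection(candidate_idx.tolist())) / k)
    return recall_l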
Example #3
def integrate_save_score_table_total(cluster_score_l,
                                     label_l,
                                     config,
                                     save=False):
    start_time = time.time()
    shape = cluster_score_l[0].shape

    total_score_table_dir = '%s/total_score_table_total.txt' % config[
        'program_train_para_dir']

    score_table = np.zeros(shape=(shape[0], config['n_item']),
                           dtype=np.float32)
    # iterate over every query
    for i in range(shape[0]):
        # iterate over every classifier
        for k, tmp_cluster_score in enumerate(cluster_score_l, 0):
            label_map = label_l[k]
            # iterate over every cluster
            for j in range(shape[1]):
                score_item_idx_l = label_map[j]
                score_table[i][score_item_idx_l] += tmp_cluster_score[i][j]

    if save:
        dir_io.save_array_txt(total_score_table_dir, score_table, '%.3f')
    end_time = time.time()
    intermediate = {'time': end_time - start_time}
    print('save score table success')
    return score_table, intermediate
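# Toy call with hypothetical data (two classifiers, 3 queries, 2 clusters each,
# 5 items); save=False, so dir_io is not touched and nothing is written.
toy_config = {'n_item': 5, 'program_train_para_dir': '/tmp/train_para'}
cluster_score_l = [np.random.rand(3, 2).astype(np.float32) for _ in range(2)]
# label map of each classifier: cluster index -> indices of the items inside it
label_l = [
    [np.array([0, 1, 2]), np.array([3, 4])],
    [np.array([0, 3]), np.array([1, 2, 4])],
]
score_table, intermediate = integrate_save_score_table_total(
    cluster_score_l, label_l, toy_config, save=False)
print(score_table.shape)  # (3, 5): one aggregated score per query and item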
Example #4
    def _partition(self, base, base_base_gnd, ins_intermediate):
        start_time = time.time()
        norm = np.linalg.norm(base, axis=1)
        # print(norm)
        self.norm_div = np.max(norm)
        # print(norm_div)
        base_normalized = base / self.norm_div
        self.a = np.random.normal(loc=self.a_miu,
                                  scale=self.a_sigma,
                                  size=base.shape[1])
        proj_result = np.dot(base_normalized, self.a)
        self.b = np.random.random() * self.r
        arr = np.floor((proj_result + self.b) / self.r) % self.n_cluster
        self.labels = arr.astype(int)
        partition_dir = '%s/partition.txt' % self.save_dir
        dir_io.save_array_txt(partition_dir, self.labels, '%d')
        end_time = time.time()
        self.intermediate['hashing_time'] = end_time - start_time
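# Standalone sketch of the hashing above with toy data and assumed
# hyper-parameter values: one Gaussian random projection, then
# floor((projection + b) / r) mod n_cluster assigns every vector to a bucket.
import numpy as np

rng = np.random.default_rng(0)
base = rng.normal(size=(100, 8)).astype(np.float32)
a_miu, a_sigma, r, n_cluster = 0.0, 1.0, 0.5, 16  # assumed values
norm_div = np.max(np.linalg.norm(base, axis=1))
a = rng.normal(loc=a_miu, scale=a_sigma, size=base.shape[1])
b = rng.random() * r
proj_result = np.dot(base / norm_div, a)
labels = (np.floor((proj_result + b) / r) % n_cluster).astype(int)
print(np.bincount(labels, minlength=n_cluster))  # bucket sizes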
Example #5
def integrate(score_table_ptr_l, gnd, config):
    # long_term_config, short_term_config, short_term_config_before_run, intermediate_result, total_score_table
    dir_io.mkdir(config['program_result_dir'])
    recall_l = []
    iter_idx = 0
    while True:
        end_of_file = False
        # accumulate the total score of this query over all classifiers
        total_score_arr = None
        for score_table_ptr in score_table_ptr_l:
            line = score_table_ptr.readline()
            if not line:
                end_of_file = True
                break
            tmp_score_table = np.array(
                [float(number) for number in line.split(' ')])
            if total_score_arr is None:
                total_score_arr = tmp_score_table
            else:
                total_score_arr += tmp_score_table
        if end_of_file:
            break

        efsearch_recall_l = evaluate(total_score_arr, config['efSearch_l'],
                                     gnd[iter_idx], config['k'])
        recall_l.append(efsearch_recall_l)

        iter_idx += 1
    print('get all the recall')
    # transpose so that each row of recall_l holds the recalls of one efSearch value
    recall_l = np.array(recall_l).transpose()

    result_n_candidate_recall = []
    for i, efSearch in enumerate(config['efSearch_l'], 0):
        recall_avg = np.mean(recall_l[i])
        result_item = {'n_candidate': efSearch, "recall": recall_avg}
        result_n_candidate_recall.append(result_item)
        print('recall: {}, n_candidates: {}'.format(recall_avg, efSearch))

    dir_io.save_json(config['program_result_dir'], 'result.json',
                     result_n_candidate_recall)

    recall_l_save_dir = '%s/recall_l.txt' % config['program_result_dir']
    dir_io.save_array_txt(recall_l_save_dir, recall_l, '%.3f')
Example #6
    def _partition(self, base, base_base_gnd, ins_intermediate):
        start_time = time.time()

        # use a random projection to order the data, then split it into 2**partition_depth equal parts
        partition_idx = self.random_projection(base)

        n_part = 2**self.partition_depth
        labels = np.empty(len(base), dtype=int)

        start_idx = 0
        for i in range(n_part):
            end_idx = int(np.ceil(len(base) / n_part)) * (i + 1)
            labels[partition_idx[start_idx:end_idx]] = i
            start_idx = end_idx
        self.labels = labels

        partition_dir = '%s/partition.txt' % self.save_dir
        dir_io.save_array_txt(partition_dir, self.labels, '%d')
        end_time = time.time()
        self.intermediate['hashing_time'] = end_time - start_time
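# Standalone sketch of the balanced split above with toy data; random_projection
# is assumed to return the base indices sorted by their projection onto one
# random direction, so consecutive chunks of that ordering become the parts.
import numpy as np

rng = np.random.default_rng(0)
base = rng.normal(size=(10, 4))
partition_depth = 1  # assumed value -> 2 ** 1 = 2 parts
partition_idx = np.argsort(base @ rng.normal(size=base.shape[1]))

n_part = 2 ** partition_depth
labels = np.empty(len(base), dtype=int)
start_idx = 0
for i in range(n_part):
    end_idx = int(np.ceil(len(base) / n_part)) * (i + 1)
    labels[partition_idx[start_idx:end_idx]] = i
    start_idx = end_idx
print(np.bincount(labels))  # roughly equal part sizes, e.g. [5 5]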
Example #7
    def graph_partition(self, config):
        # invoke KaHIP on every part and read back the resulting partition files
        save_dir = config['save_dir']
        graph_partition_type = config['graph_partition_type']
        n_part = 2**self.partition_iter
        n_cluster_l = self.count_proportion(self.partition_idx, n_part,
                                            config['n_cluster'])
        print(n_cluster_l)
        kahip_dir = config['kahip_dir']
        res_labels = np.empty([config['n_item']], dtype=int)
        # cumulative sums give the cluster-id offset of each part
        n_cluster_cumsum_l = np.insert(n_cluster_l, 0, values=0, axis=0)
        n_cluster_cumsum_l = np.cumsum(n_cluster_cumsum_l)

        for i in range(n_part):
            partition_dir = '%s/partition_%d.txt' % (save_dir, i)
            graph_dir = "%s/graph_%d.graph" % (save_dir, i)
            preconfiguration = config['preconfiguration']
            if graph_partition_type == 'kaffpa':
                kahip_command = '%s/deploy/kaffpa %s --preconfiguration=%s --output_filename=%s ' \
                                '--k=%d' % (
                                    kahip_dir, graph_dir, preconfiguration,
                                    partition_dir,
                                    n_cluster_l[i])
                print(kahip_command)
                dir_io.kahip(partition_dir, kahip_command)
            elif graph_partition_type == 'parhip':
                kahip_command = 'mpirun -n %d %s/deploy/parhip %s --preconfiguration %s ' \
                                '--save_partition --k %d' % (
                                    multiprocessing.cpu_count() // 2, kahip_dir, graph_dir, preconfiguration,
                                    n_cluster_l[i])
                print(kahip_command)
                dir_io.kahip('./tmppartition.txtp', kahip_command)
                dir_io.move_file('tmppartition.txtp', partition_dir)
            tmp_labels = read_data.read_partition(partition_dir)
            for j in range(len(tmp_labels)):
                res_labels[self.partition_idx[i][j]] = \
                    tmp_labels[j] + n_cluster_cumsum_l[i]
        partition_dir = '%s/partition.txt' % save_dir
        dir_io.save_array_txt(partition_dir, res_labels, '%d')
        return res_labels
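# Standalone sketch (toy values) of the offset bookkeeping above: each part is
# clustered by KaHIP independently, and the cumulative sums of n_cluster_l turn
# the per-part cluster ids into globally unique ids.
import numpy as np

n_cluster_l = np.array([3, 5])  # assumed number of clusters in each part
n_cluster_cumsum_l = np.cumsum(np.insert(n_cluster_l, 0, 0))  # [0, 3, 8]
part_labels_l = [np.array([0, 2, 1]), np.array([4, 0])]  # assumed KaHIP output
global_labels_l = [labels + n_cluster_cumsum_l[i]
                   for i, labels in enumerate(part_labels_l)]
print(global_labels_l)  # [array([0, 2, 1]), array([7, 3])]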
def integrate_save_score_table_total(predict_cluster_l, label_map_l, config,
                                     save):
    start_time = time.time()

    query_len = predict_cluster_l[0].shape[0]
    score_table = np.zeros(shape=(query_len, config['n_item']), dtype=int)
    for i in range(query_len):  # for every query
        for j in range(len(predict_cluster_l)):  # for every classifier
            pred_cluster = predict_cluster_l[j][i]
            # for every item in the predicted cluster
            for k in label_map_l[j][pred_cluster]:
                score_table[i][k] += 1

    total_score_table_dir = '%s/total_score_table.txt' % config[
        'program_train_para_dir']
    if save:
        dir_io.save_array_txt(total_score_table_dir, score_table, '%d')

    end_time = time.time()
    intermediate = {'time': end_time - start_time}
    print('save score table success')
    return score_table, intermediate
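# Toy call with hypothetical data, mirroring the weighted variant further above
# but with hard cluster assignments; save=False keeps dir_io untouched.
toy_config = {'n_item': 5, 'program_train_para_dir': '/tmp/train_para'}
predict_cluster_l = [np.array([0, 1, 0]), np.array([1, 1, 0])]  # 2 classifiers
label_map_l = [
    [np.array([0, 1, 2]), np.array([3, 4])],
    [np.array([0, 3]), np.array([1, 2, 4])],
]
score_table, _ = integrate_save_score_table_total(predict_cluster_l,
                                                  label_map_l, toy_config,
                                                  False)
print(score_table)  # per-item vote counts for each of the 3 queries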
    def save(self):
        dir_io.mkdir(self.save_dir)
        eval_res_dir = '%s/eval_res.txt' % self.save_dir
        dir_io.save_array_txt(eval_res_dir, self.result, fmt='%.3f')
    def save(self):
        save_label_dir = '%s/partition.txt' % self.save_dir
        dir_io.save_array_txt(save_label_dir, self.labels, '%i')