def prepare_train(base, base_base_gnd, partition_info, config):
    save_dir = '%s/Classifier_%d/prepare_train_sample' % (
        config['program_train_para_dir'], config['classifier_number'])
    dir_io.mkdir(save_dir)
    classifier_number = config['classifier_number']

    prepare_method = factory(config['type'])
    global label_k
    if 'label_k' in config:
        label_k = config['label_k']
        print(
            "label k %d =========================================================================================="
            % label_k)

    start_time = time.time()
    print('start prepare data %s' % classifier_number)
    trainloader, valloader = prepare_method(base, base_base_gnd,
                                            partition_info,
                                            config['n_cluster'])
    print('finish prepare_data %s' % classifier_number)
    end_time = time.time()
    intermediate_config = {'time': end_time - start_time}
    if save_sample:
        save(trainloader, valloader, save_dir)

    return (trainloader, valloader), intermediate_config
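For reference, a hypothetical config dictionary for prepare_train; only the key names are taken from the function body above, the values are placeholders.

# Hypothetical config for prepare_train; only the key names come from the function body.
config = {
    'program_train_para_dir': './train_para',  # root directory for per-classifier artifacts
    'classifier_number': 1,                    # used in the save path and log messages
    'type': 'knn',                             # passed to factory() to pick prepare_method (placeholder value)
    'label_k': 10,                             # optional, overrides the module-level label_k
    'n_cluster': 16,                           # number of partitions / output classes
}
# (trainloader, valloader), intermediate = prepare_train(base, base_base_gnd, partition_info, config)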
Example #2
def execute(pq, X, Q, G, metric, config, train_size=100000):
    np.random.seed(123)
    print("# ranking metric {}".format(metric))
    print("# " + pq.class_message())
    pq.fit(X[:train_size].astype(dtype=np.float32), iter=40)

    print('# compress items')
    compressed = chunk_compress(pq, X)
    print(compressed.dtype)
    print("# sorting items")
    Ts = [2**i for i in range(1 + int(math.log2(len(X))))]
    recalls = BatchSorter(compressed,
                          Q,
                          X,
                          G,
                          Ts,
                          metric=metric,
                          batch_size=200).recall()
    print("# searching!")

    res_l = []
    # print("expected items, overall time, avg recall, avg precision, avg error, avg items")
    for i, (t, recall) in enumerate(zip(Ts, recalls)):
        tmp_res = {'n_candidate': t, 'recall': recall}
        res_l.append(tmp_res)
        # print("{}, {}, {}, {}, {}, {}".format(
        #     2 ** i, 0, recall, recall * len(G[0]) / t, 0, t))
    save_data_dir = '/home/zhengbian/NN_as_Classification/data/result/%s_%d_baseline_%d_%s' % (
        config['dataset'], config['n_cluster'], config['codebook'],
        config['method'])
    dir_io.delete_dir_if_exist(save_data_dir)
    dir_io.mkdir(save_data_dir)
    dir_io.save_json(save_data_dir, 'result.json', res_l)
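As a point of reference, a minimal sketch of the recall@T computation that BatchSorter.recall() above is assumed to report; this is only an illustration, not the actual BatchSorter code.

def recall_at_t(candidate_ids, gnd_ids, t):
    # Assumed definition: fraction of the true neighbors that appear among the top-t candidates.
    retrieved = set(candidate_ids[:t])
    truth = set(gnd_ids)
    return len(retrieved & truth) / len(truth)

# recall_at_t([3, 7, 1, 9], [7, 2], t=2) == 0.5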
Example #3
def integrate_single(score_table, gnd, config):
    torch.set_num_threads(12)
    start_time = time.time()
    # long_term_config, short_term_config, short_term_config_before_run, intermediate_result, total_score_table
    dir_io.mkdir(config['program_result_dir'])
    recall_l = []
    print("start evaluate")
    for i, score_arr in enumerate(score_table):
        if i % 50 == 0:
            print("evaluate " + str(i))
        efsearch_recall_l = evaluate(score_arr, config['efSearch_l'], gnd[i],
                                     config['k'])
        recall_l.append(efsearch_recall_l)
    print('get all the recall')
    # transpose so that each row holds the recalls of one efSearch value across all queries
    recall_l = np.array(recall_l).transpose()

    result_n_candidate_recall = []
    for i, efSearch in enumerate(config['efSearch_l']):
        recall_avg = np.mean(recall_l[i])
        result_item = {'n_candidate': efSearch, "recall": recall_avg}
        result_n_candidate_recall.append(result_item)
        print('recall: {}, n_candidates: {}'.format(recall_avg, efSearch))

    dir_io.save_json(config['program_result_dir'], 'result.json',
                     result_n_candidate_recall)

    recall_l_save_dir = '%s/recall_l.txt' % config['program_result_dir']
    dir_io.save_array_txt(recall_l_save_dir, recall_l, '%.3f')
    end_time = time.time()
    intermediate = {'time': end_time - start_time}
    return intermediate
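A rough sketch of what the per-query evaluate() call above is assumed to do (rank items by score, then measure recall within the top-efSearch candidates for each efSearch value); the real evaluate() is not shown here and may differ.

import numpy as np

def evaluate_sketch(score_arr, efSearch_l, gnd_i, k):
    # Assumed behaviour: a higher score means a better candidate.
    ranked = np.argsort(-np.asarray(score_arr))
    truth = set(np.asarray(gnd_i)[:k])
    recall_per_efsearch = []
    for efSearch in efSearch_l:
        candidates = set(ranked[:efSearch])
        recall_per_efsearch.append(len(candidates & truth) / len(truth))
    return recall_per_efsearch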
def prepare_data(config):
    save_dir = '%s/data/dataset/%s_%d' % (config['save_dir'],
                                          config['data_fname'], config['k'])
    dir_io.delete_dir_if_exist(save_dir)
    dir_io.mkdir(save_dir)
    base = np.random.normal(loc=config['miu'],
                            scale=config['sigma'],
                            size=(config['base']['length'],
                                  config['base']['dim']))
    base_save_dir = '%s/base.fvecs' % save_dir
    vecs_io.fvecs_write(base_save_dir, base)

    query = np.random.normal(loc=config['miu'],
                             scale=config['sigma'],
                             size=(config['query']['length'],
                                   config['query']['dim']))
    query_save_dir = '%s/query.fvecs' % save_dir
    vecs_io.fvecs_write(query_save_dir, query)

    base = base.astype(np.float32)
    query = query.astype(np.float32)

    gnd = groundtruth.get_gnd(base, query, config['k'])
    gnd_save_dir = '%s/gnd.ivecs' % save_dir
    vecs_io.ivecs_write(gnd_save_dir, gnd)

    base_base_gnd = groundtruth.get_gnd(base, base, config['base_base_gnd_k'])
    base_base_gnd_save_dir = '%s/base_base_gnd.ivecs' % save_dir
    vecs_io.ivecs_write(base_base_gnd_save_dir, base_base_gnd)

    print("base", base.shape)
    print("query", query.shape)
    print("gnd", gnd.shape)
    print("base_base_gnd", base_base_gnd.shape)
Example #5
def preprocess(base, config):
    dir_io.mkdir(config['program_train_para_dir'])
    start_time = time.time()
    multiple_model_ins = factory(config)
    model_l, model_intermediate = multiple_model_ins.preprocess(base)

    end_time = time.time()
    model_intermediate['total_time'] = end_time - start_time
    return model_l, model_intermediate
def prepare_data(config):
    data_dir = '%s/data/dataset/%s' % (config['project_dir'],
                                       config['data_fname'])
    print("data_dir", data_dir)
    config['data_dir'] = data_dir

    dir_io.delete_dir_if_exist(data_dir)
    dir_io.mkdir(data_dir)

    # read data
    hdfFile = h5py.File(config['source_data_dir'], 'r')

    base_info = config['file']['base']
    base = hdfFile.get(base_info['name'])
    if base_info['length'] != -1 and base_info['length'] < len(base):
        base = base[:base_info['length']]
    if config['normalization']:
        base = normalization(base)
        print("normalize base")
    base = base.astype(np.float32)
    save_base_dir = '%s/base.fvecs' % data_dir
    vecs_io.fvecs_write(save_base_dir, base)
    print("save base")

    query_info = config['file']['query']
    query = hdfFile.get(query_info['name'])
    if query_info['length'] != -1 and query_info['length'] < len(query):
        query = query[:query_info['length']]
    if config['normalization']:
        query = normalization(query)
        print("normalize query")
    query = query.astype(np.float32)
    save_query_dir = '%s/query.fvecs' % data_dir
    vecs_io.fvecs_write(save_query_dir, query)
    print("save query")

    save_gnd_dir = '%s/gnd-%d.ivecs' % (data_dir, config['k'])
    gnd = groundtruth.get_gnd(base, query, config['k'])
    vecs_io.ivecs_write(save_gnd_dir, gnd)
    print("save gnd")

    base_base_gnd_save_dir = '%s/base_base_gnd-%d.ivecs' % (
        config['data_dir'], config['base_base_gnd_k'])
    base_base_gnd = groundtruth.get_gnd(
        base, base, max(config['base_base_gnd_k'], config['k']))
    vecs_io.ivecs_write(base_base_gnd_save_dir, base_base_gnd)
    print("save base_base_gnd for the training set preparation")

    print("base:", base.shape)
    print("query:", query.shape)
    print("gnd:", gnd.shape)
    print("base_base_gnd:", base_base_gnd.shape)
    hdfFile.close()
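For context, a minimal sketch of the .fvecs layout that vecs_io.fvecs_write is assumed to follow (the TEXMEX convention: each vector stored as an int32 dimension followed by its float32 components); the actual vecs_io implementation is not shown here and may differ.

import struct
import numpy as np

def fvecs_write_sketch(path, vectors):
    # Assumed .fvecs layout: per vector, an int32 dimension, then dim float32 values.
    vectors = np.asarray(vectors, dtype=np.float32)
    with open(path, 'wb') as f:
        for vec in vectors:
            f.write(struct.pack('i', len(vec)))
            f.write(vec.tobytes())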
Example #7
    def __init__(self, config):
        self.type = config['type']
        self.save_dir = '%s/dataset_partition' % config['save_dir']
        dir_io.mkdir(self.save_dir)
        self.classifier_number = config['classifier_number']
        self.obj_id = "%s_%d" % (self.type, self.classifier_number)
        # number of cluster
        self.n_cluster = config['n_cluster']
        self.model_info = None
        # key: cluster (class) id; value: indices of the base points assigned to that cluster
        self.label_map = {}
        # counts the number of points in each bucket
        self.n_point_label = None

        self.labels = None
Example #8
def preprocess(base, config):
    program_train_para_dir = config['program_train_para_dir']
    dir_io.mkdir(program_train_para_dir)

    start_time = time.time()
    ds_partition_config = config['dataset_partition']
    ds_partition_config['program_train_para_dir'] = program_train_para_dir
    ds_partition_config['n_cluster'] = config['n_cluster']
    ds_partition_config['n_instance'] = config['n_instance']
    multiple_model = factory(ds_partition_config)
    model_l, model_intermediate = multiple_model.preprocess(base)

    end_time = time.time()
    model_intermediate['total_time'] = end_time - start_time
    return model_l, model_intermediate
Example #9
def convert_data_type(config):
    dir_io.mkdir(config['data_dir'])
    print("create directory")

    base_dir = '%s/%s' % (config['source_data_dir'],
                          config['source_data_fname']['base'])
    base_save_dir = '%s/%s' % (config['data_dir'], 'base.fvecs')
    base = vecs2vecs(base_dir,
                     base_save_dir,
                     config['source_data_type']['base'],
                     'fvecs',
                     file_len=config['base_len'])
    print("extract base")

    query_dir = '%s/%s' % (config['source_data_dir'],
                           config['source_data_fname']['query'])
    query_save_dir = '%s/%s' % (config['data_dir'], 'query.fvecs')
    query = vecs2vecs(query_dir,
                      query_save_dir,
                      config['source_data_type']['query'],
                      'fvecs',
                      file_len=config['query_len'])
    print("extract query")

    if config['minus_avg']:
        average_vecs = np.average(base, axis=0)
        print(average_vecs)
        base = base - average_vecs
        query = query - average_vecs
        print("minus average number in each dimension")

    gnd = groundtruth.get_gnd(base, query, config['k'])
    gnd_save_dir = '%s/%s' % (config['data_dir'], 'gnd.ivecs')
    vecs_io.ivecs_write(gnd_save_dir, gnd)
    print("extract gnd")

    base_base_gnd = groundtruth.get_gnd(base, base, config['base_base_gnd_k'])
    base_base_gnd_save_dir = '%s/%s' % (config['data_dir'],
                                        'base_base_gnd.ivecs')
    vecs_io.ivecs_write(base_base_gnd_save_dir, base_base_gnd)
    print("extract base_base_gnd for the training set preparation")

    print("base: ", base.shape)
    print("query: ", query.shape)
    print("gnd: ", gnd.shape)
    print("base_base_gnd: ", base_base_gnd.shape)
    return base, query, gnd, base_base_gnd
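A hypothetical config for convert_data_type; the key names come from the function body, the filenames and sizes are placeholders.

config = {
    'data_dir': './data/dataset/sift',
    'source_data_dir': './raw/sift',
    'source_data_fname': {'base': 'sift_base.bvecs', 'query': 'sift_query.bvecs'},
    'source_data_type': {'base': 'bvecs', 'query': 'bvecs'},
    'base_len': 1000000,     # number of base vectors to extract (placeholder)
    'query_len': 10000,      # number of query vectors to extract (placeholder)
    'minus_avg': True,       # subtract the per-dimension average of base from base and query
    'k': 10,
    'base_base_gnd_k': 50,
}
# base, query, gnd, base_base_gnd = convert_data_type(config)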
    def __init__(self, config):
        self.save_dir = '%s/dataset_partition' % config['save_dir']
        dir_io.mkdir(self.save_dir)
        self.type = config['type']
        self.classifier_number = config['classifier_number']
        # pq_nn configs do not include this parameter, so fall back to 'l2'
        self.distance_metric = config.get('distance_metric', 'l2')

        # number of cluster
        self.n_cluster = config['n_cluster']
        self.model_info = None
        # key: cluster (class) id; value: indices of the base points assigned to that cluster
        self.label_map = []
        # counts the number of points in each bucket
        self.n_point_label = None
        self.intermediate = {}

        self.labels = None
Example #11
    def __init__(self, config):
        self.program_train_para_dir = config['program_train_para_dir']
        self.type = config['type']
        self.n_instance = config['n_instance']
        self.n_cluster = config['n_cluster']
        self.model_l = []
        self.intermediate = {}
        # create one sub-model per instance; classifier_number identifies each one
        for i in range(self.n_instance):
            tmp_config = copy.deepcopy(config)
            tmp_config['type'] = self.type
            tmp_config['classifier_number'] = i + 1
            tmp_config['n_cluster'] = self.n_cluster
            tmp_config['save_dir'] = '%s/Classifier_%d' % (
                self.program_train_para_dir, tmp_config['classifier_number'])
            dir_io.mkdir(tmp_config['save_dir'])
            tmp_model = self.get_model(tmp_config)
            self.model_l.append(tmp_model)
Example #12
def integrate(score_table_ptr_l, gnd, config):
    # long_term_config, short_term_config, short_term_config_before_run, intermediate_result, total_score_table
    dir_io.mkdir(config['program_result_dir'])
    recall_l = []
    iter_idx = 0
    while True:
        end_of_file = False
        # get the total recall for each query
        total_score_arr = None
        for score_table_ptr in score_table_ptr_l:
            line = score_table_ptr.readline()
            if not line:
                end_of_file = True
                break
            tmp_score_table = np.array(
                [float(number) for number in line.split(' ')])
            if total_score_arr is None:
                total_score_arr = tmp_score_table
            else:
                total_score_arr += tmp_score_table
        if end_of_file:
            break

        efsearch_recall_l = evaluate(total_score_arr, config['efSearch_l'],
                                     gnd[iter_idx], config['k'])
        recall_l.append(efsearch_recall_l)

        iter_idx += 1
    print('get all the recall')
    # transpose so that each row holds the recalls of one efSearch value across all queries
    recall_l = np.array(recall_l).transpose()

    result_n_candidate_recall = []
    for i, efSearch in enumerate(config['efSearch_l']):
        recall_avg = np.mean(recall_l[i])
        result_item = {'n_candidate': efSearch, "recall": recall_avg}
        result_n_candidate_recall.append(result_item)
        print('recall: {}, n_candidates: {}'.format(recall_avg, efSearch))

    dir_io.save_json(config['program_result_dir'], 'result.json',
                     result_n_candidate_recall)

    recall_l_save_dir = '%s/recall_l.txt' % config['program_result_dir']
    dir_io.save_array_txt(recall_l_save_dir, recall_l, '%.3f')
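A small, self-contained illustration of the score-table text format that the parsing loop above assumes: one line per query, space-separated scores over all base items, one table per classifier, summed per query.

import io
import numpy as np

# Two hypothetical classifiers, each with scores for 4 base items over 2 queries.
table_a = io.StringIO("0.1 0.9 0.3 0.2\n0.5 0.1 0.2 0.7\n")
table_b = io.StringIO("0.2 0.8 0.1 0.4\n0.3 0.3 0.3 0.6\n")

line_a = table_a.readline()
line_b = table_b.readline()
total_score_arr = np.array([float(x) for x in line_a.split(' ')]) + \
    np.array([float(x) for x in line_b.split(' ')])
print(total_score_arr)  # [0.3 1.7 0.4 0.6], summed scores for the first query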
Example #13
    def __init__(self, config):
        self.program_train_para_dir = config['program_train_para_dir']
        self.type = config['dataset_partition']['type']
        self.n_instance = config['n_instance']
        self.n_cluster = config['n_cluster']
        self.kahip_dir = config['kahip_dir']
        self.distance_metric = config['distance_metric']
        self.model_l = []
        for i in range(self.n_instance):
            tmp_config = copy.deepcopy(config['dataset_partition'])
            tmp_config['type'] = self.type
            tmp_config['classifier_number'] = i
            tmp_config['n_cluster'] = self.n_cluster
            tmp_config['kahip_dir'] = self.kahip_dir
            tmp_config['distance_metric'] = self.distance_metric
            tmp_config['save_dir'] = '%s/Classifier_%d' % (
                self.program_train_para_dir, tmp_config['classifier_number'])
            dir_io.mkdir(tmp_config['save_dir'])
            tmp_model = self.get_model(tmp_config)
            self.model_l.append(tmp_model)

    def save(self):
        dir_io.mkdir(self.save_dir)
        eval_res_dir = '%s/eval_res.txt' % self.save_dir
        dir_io.save_array_txt(eval_res_dir, self.result, fmt='%.3f')
def prepare_data(config):
    data_dir = '%s/data/dataset/%s_%d' % (config['project_dir'],
                                          config['data_fname'], config['k'])
    print("data_dir", data_dir)

    dir_io.delete_dir_if_exist(data_dir)
    '''
    dataset preparation
    make directory, extract base, query, gnd
    '''

    dir_io.mkdir(data_dir)
    print("create directory")

    dataset_dir = '%s/%s' % (config['source_data_dir'],
                             config['source_data']['name'])
    dataset, alphabet = read_txt(dataset_dir,
                                 word_len=config['padding_length'],
                                 n_character=config['n_character'])
    print(alphabet)

    base = dataset[:config['source_data']['base_len']]
    query = dataset[-config['source_data']['query_len']:]

    start = time.time()
    gnd = groundtruth.get_gnd(base, query, config['k'], metrics="string")
    gnd_save_dir = '%s/%s' % (data_dir, 'gnd.npy')
    print(gnd.dtype)
    dir_io.save_numpy(gnd_save_dir, gnd)
    # vecs_io.ivecs_write(gnd_save_dir, gnd)
    end = time.time()
    print("save gnd, time:", end - start)
    print("gnd: ", gnd.shape)
    del gnd

    start = time.time()
    base_base_gnd, n_base_base_gnd = get_base_base_gnd(base, config)
    base_base_gnd_save_dir = '%s/%s' % (data_dir, 'base_base_gnd.npy')
    # vecs_io.ivecs_write(base_base_gnd_save_dir, base_base_gnd)
    print(base_base_gnd.dtype)
    dir_io.save_numpy(base_base_gnd_save_dir, base_base_gnd)
    end = time.time()
    print("save base_base_gnd for the training set preparation, time:",
          end - start)
    print("base_base_gnd: ", base_base_gnd.shape)
    del base_base_gnd

    # encoding and padding
    base_save_dir = '%s/%s' % (data_dir, 'base.npy')
    start = time.time()
    base = words2vector(base, config['padding_length'], alphabet)
    print(base.dtype)
    print("wrods2vector time consume %d" % (time.time() - start))
    # vecs_io.ivecs_write(base_save_dir, base)
    dir_io.save_numpy(base_save_dir, base)
    print("save base")
    print("base: ", base.shape)

    query_save_dir = '%s/%s' % (data_dir, 'query.npy')
    query = words2vector(query, config['padding_length'], alphabet)
    # vecs_io.ivecs_write(query_save_dir, query)
    print(query.dtype)
    dir_io.save_numpy(query_save_dir, query)
    print("save query")
    print("query: ", query.shape)

    description_dir = '%s/%s' % (data_dir, 'readme.txt')
    ptr = dir_io.write_ptr(description_dir)
    ptr.write('the max base_base_gnd is %d\n' % n_base_base_gnd)
    ptr.close()
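A hypothetical config for the string-dataset prepare_data above; the key names come from the function body, the values are illustrative only.

config = {
    'project_dir': '.',
    'data_fname': 'word',
    'k': 10,
    'source_data_dir': './raw',
    'source_data': {'name': 'words.txt', 'base_len': 100000, 'query_len': 1000},
    'padding_length': 32,    # strings are padded / truncated to this length
    'n_character': 26,       # alphabet size passed to read_txt
}
# prepare_data(config)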