Example no. 1
def compute_GT(dataset, nb, nt, k, input_dir, output_dir, train=False,
    cpu=False):
    if dataset == 'DEEP':
        xb = util.mmap_fvecs('{}deep1B_base.fvecs'.format(
            input_dir))[:nb*1000000]
        xq = util.mmap_fvecs('{}deep1B_query.fvecs'.format(input_dir))
        xt = util.mmap_fvecs('{}deep1B_learn.fvecs'.format(
            input_dir))[:nt*1000000]
    elif dataset == 'SIFT':
        xb = util.mmap_bvecs('{}bigann_base.bvecs'.format(
            input_dir))[:nb*1000000]
        xq = util.mmap_bvecs('{}bigann_query.bvecs'.format(input_dir))
        xt = util.mmap_bvecs('{}bigann_learn.bvecs'.format(
            input_dir))[:nt*1000000]
    elif dataset == 'GIST':
        # For GIST we ignore nb and nt and always use the full database and
        # training sets.
        xb = util.mmap_fvecs('{}gist_base.fvecs'.format(input_dir))
        xq = util.mmap_fvecs('{}gist_query.fvecs'.format(input_dir))
        xt = util.mmap_fvecs('{}gist_learn.fvecs'.format(input_dir))
    else:
        raise ValueError('unknown dataset: {}'.format(dataset))
    if train:
        data = []
        # Process the training vectors in chunks of 1 million to reduce the
        # memory footprint.
        for i_t in range(nt):
            if cpu:
                gt_I, gt_D = compute_GT_CPU(xb,
                    xt[i_t*1000000:(i_t+1)*1000000], k)
            else:
                gt_I, gt_D = compute_GT_GPU(xb,
                    xt[i_t*1000000:(i_t+1)*1000000], k)
            for i in range(len(gt_I)):
                # Keep every neighbor tied with the nearest distance.
                candidate = []
                for j in range(k):
                    if gt_D[i][j] == gt_D[i][0]:
                        candidate.append(gt_I[i][j])
                data.append(candidate)
        if dataset == 'GIST':
            util.write_tsv(data, '{}gtGIST1Mtrain500K.tsv'.format(output_dir))
        else:
            util.write_tsv(data, '{}gt{}{}Mtrain{}M.tsv'.format(output_dir,
                dataset, nb, nt))
    else:
        if cpu:
            gt_I, gt_D = compute_GT_CPU(xb, xq, k)
        else:
            gt_I, gt_D = compute_GT_GPU(xb, xq, k)
        data = []
        for i in range(len(gt_I)):
            # Keep every neighbor tied with the nearest distance.
            candidate = []
            for j in range(k):
                if gt_D[i][j] == gt_D[i][0]:
                    candidate.append(gt_I[i][j])
            data.append(candidate)
        util.write_tsv(data, '{}gt{}{}Mtest.tsv'.format(output_dir, dataset,
            nb))
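The examples on this page all funnel their rows through util.write_tsv, a project-local helper that is not shown. A minimal sketch consistent with how Examples no. 1, 3, and 4 call it (rows first, output path second; one tab-separated line per row) might look like the following; note that Example no. 2's util takes its arguments in the opposite order (path first), so it evidently comes from a different project:

import csv

def write_tsv(data, path):
    # Hypothetical sketch: write each row of `data` as one tab-separated
    # line; csv.writer handles the str() conversion and any quoting.
    with open(path, 'w', newline='') as f:
        csv.writer(f, delimiter='\t').writerows(data)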
Example no. 2
import argparse
import itertools
import random

import nltk

import util  # project-local helper; here write_tsv takes (path, rows)


def main(args: argparse.Namespace) -> None:
    # This only needs to be run once, but leaving it here is harmless.
    assert nltk.download("names")
    random.seed(args.seed)
    # This looks strange but this is how the NLTK corpus readers work.
    m_names = nltk.corpus.names.words("male.txt")
    f_names = nltk.corpus.names.words("female.txt")
    # Constructs a data set of tuples where the first element is the name and
    # the second element is the sex label. This is then shuffled, split, and
    # written out in TSV format.
    x = m_names + f_names
    y = itertools.chain(
        itertools.repeat("M", len(m_names)),
        itertools.repeat("F", len(f_names)),
    )
    data = list(zip(x, y))
    random.shuffle(data)
    ten = len(data) // 10
    util.write_tsv(args.test, data[:ten])
    util.write_tsv(args.dev, data[ten:ten + ten])
    util.write_tsv(args.train, data[ten + ten:])
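The attributes main reads (args.seed, args.test, args.dev, args.train) imply command-line wiring along these lines; the flag names, defaults, and help strings below are assumptions, not the original script's:

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Split NLTK names by sex.")
    parser.add_argument("--seed", type=int, default=0, help="PRNG seed")
    parser.add_argument("train", help="output path for the training TSV")
    parser.add_argument("dev", help="output path for the development TSV")
    parser.add_argument("test", help="output path for the test TSV")
    main(parser.parse_args())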
Example no. 3
            # Fragment: the enclosing parameter sweep and per-query search
            # loop are not shown; inside that loop, each per-query feature
            # row is collected with data.append(line).
            tr1 = total_recall_at1 / float(nq)
            tr10 = total_recall_at10 / float(nq)
            tr100 = total_recall_at100 / float(nq)
            tt = total_latency * 1000.0 / nq
            print('{}{:.4f} {:.4f} {:.4f} {:.4f}'.format(
                parametersets[param].ljust(len(parametersets[-1]) + 1),
                tr1, tr10, tr100, tt))
            if it > 0 or num_iter == 1:
                recall_list[param] += total_recall_at100
                latency_list[param] += total_latency
    # Write the training/testing data files.
    if search_mode == -1:
        util.write_tsv(
            data, '{}{}_{}_test.tsv'.format(TRAINING_DIR, dbname, index_key))
    if search_mode == -2:
        util.write_tsv(
            data, '{}{}_{}_train.tsv'.format(TRAINING_DIR, dbname, index_key))

    denom = float(nq * max(num_iter - 1, 1))
    recall_list = [x / denom for x in recall_list]
    latency_list = [round(x * 1000.0 / denom, 4) for x in latency_list]

    print('param_list = {}'.format(param_list))
    print('recall target = {}'.format(recall_list))
    print('average latency(ms) = {}'.format(latency_list))
    print('result_{}_{} = {}'.format(dbname, index_key,
                                     [latency_list, recall_list]))
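The totals printed above (total_recall_at1/10/100, total_latency) are accumulated in the per-query search loop that this fragment omits. A hypothetical sketch of that accumulation, assuming gt_nn holds each query's true nearest-neighbor id and result_ids the ids the index returned:

total_recall_at1 = total_recall_at10 = total_recall_at100 = 0
for qi in range(nq):
    nn = gt_nn[qi]               # assumed: true nearest neighbor of query qi
    ids = list(result_ids[qi])   # assumed: ids returned by the index
    total_recall_at1 += nn in ids[:1]
    total_recall_at10 += nn in ids[:10]
    total_recall_at100 += nn in ids[:100]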
# (Remainder of this example elided: an else branch binary-searches for a
# minimum fixed configuration for the baseline.)
Example no. 4
import math
import os

import lightgbm as lgb
import numpy as np
import pandas as pd

import util  # project-local helper; write_tsv takes (rows, path)


def preprocess_and_train(training_dir,
                         model_dir,
                         dbname,
                         index_key,
                         xt,
                         xq,
                         full_feature,
                         pred_thresh,
                         feature_idx,
                         billion_scale=False):
    train_file = '{}{}_{}_train.tsv'.format(training_dir, dbname, index_key)
    test_file = '{}{}_{}_test.tsv'.format(training_dir, dbname, index_key)
    if not os.path.isfile(train_file):
        print('training file {} not found'.format(train_file))
        return
    if not os.path.isfile(test_file):
        print('testing file {} not found'.format(test_file))
        return

    if dbname.startswith('SIFT'):
        dim = 128
    elif dbname.startswith('DEEP'):
        dim = 96
    else:
        dim = 960

    # Use a log2-transformed prediction target for HNSW indexes and for
    # billion-scale runs.
    log_target = index_key[:4] == 'HNSW' or billion_scale

    out_buffer = []
    suffix = ''
    if log_target:
        suffix += '_Log'
    if full_feature:
        suffix += '_Full'
    else:
        suffix += '_Query'
    model_name = '{}{}_{}_model_thresh{}{}.txt'.format(model_dir, dbname,
                                                       index_key, pred_thresh,
                                                       suffix)
    log_name = '{}{}_{}_log_thresh{}{}.tsv'.format(model_dir, dbname,
                                                   index_key, pred_thresh,
                                                   suffix)

    df_train = pd.read_csv(train_file, header=None, sep='\t')
    df_test = pd.read_csv(test_file, header=None, sep='\t')

    # Which intermediate search result features will be used.
    if index_key[:4] == 'HNSW':
        keep_idx = ([2] + list(range(feature_idx * 4 + 3, feature_idx * 4 + 7))
                    + list(range(len(df_train.columns)))[-10:])
    else:
        keep_idx = (list(range(2, 12))
                    + list(range(feature_idx * 4 + 12, feature_idx * 4 + 16)))
    drop_idx = sorted(set(range(len(df_train.columns))) - set(keep_idx))

    train_target = (df_train[0].values).astype('float32')
    train_query = xt[df_train[1].values]
    if full_feature:
        train_other = df_train.drop(drop_idx, axis=1).values
        train_feature = np.concatenate((train_query, train_other), axis=1)
    else:
        train_feature = train_query
    valid_training = []
    for i in range(len(train_feature)):
        # Keep only rows with a positive target; optionally log2-transform.
        if train_target[i] > 0:
            if log_target:
                train_target[i] = math.log(train_target[i], 2)
            valid_training.append(i)
    train_target = train_target[valid_training]
    train_feature = train_feature[valid_training, :]
    out_buffer.append([
        'training count: {} valid rows out of {} total'.format(
            len(train_target), len(df_train))
    ])

    test_target = (df_test[0].values).astype('float32')
    test_query = xq[df_test[1].values]
    if full_feature:
        test_other = df_test.drop(drop_idx, axis=1).values
        test_feature = np.concatenate((test_query, test_other), axis=1)
    else:
        test_feature = test_query
    out_buffer.append(
        ['testing count: {} total rows'.format(len(test_target))])

    if train_feature.shape[0] < 2:
        print('training file {} too small'.format(train_file))
        return

    # Create the LightGBM training dataset.
    lgb_train = lgb.Dataset(train_feature, train_target)

    params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': {'l2', 'l1'},
        'num_leaves': 31,
        'boost_from_average': False,
        'learning_rate': 0.2,
        'feature_fraction': 1.0,
        'bagging_fraction': 1.0,
        'bagging_freq': 0,
        'verbose': 0,
        'num_threads': 20
    }
    num_round = 100
    if billion_scale:
        params['learning_rate'] = 0.05
        num_round = 500

    if train_feature.shape[0] < 100:
        params['min_data'] = 1
        params['min_data_in_bin'] = 1

    feature_name = []

    for i in range(dim):
        feature_name.append('F0_query_dim{}'.format(i))

    if full_feature:
        if index_key[:4] == 'HNSW':
            feature_name += [
                'F1_d_start', 'F2_d_1st', 'F3_d_10th', 'F4_1st_to_start',
                'F5_10th_to_start', 'F6_prior1', 'F7_prior2', 'F8_prior3',
                'F9_prior4', 'F10_prior5', 'F11_prior6', 'F12_prior7',
                'F13_prior8', 'F14_prior9', 'F15_prior10'
            ]
        else:
            feature_name += [
                'F1_c_10th_to_c_1st', 'F1_c_20th_to_c_1st',
                'F1_c_30th_to_c_1st', 'F1_c_40th_to_c_1st',
                'F1_c_50th_to_c_1st', 'F1_c_60th_to_c_1st',
                'F1_c_70th_to_c_1st', 'F1_c_80th_to_c_1st',
                'F1_c_90th_to_c_1st', 'F1_c_100th_to_c_1st', 'F2_d_1st',
                'F3_d_10th', 'F4_d_1st_to_d_10th', 'F5_d_1st_to_c_1st'
            ]
    train_helper(out_buffer,
                 params,
                 lgb_train,
                 feature_name, [train_feature, test_feature],
                 [train_target, test_target], ['Train', 'Test'],
                 model_name,
                 num_round=num_round,
                 log_target=log_target,
                 pred_thresh=pred_thresh,
                 full_feature=full_feature)
    util.write_tsv(out_buffer, log_name)
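train_helper is another project-local function that is not shown. The core fitting step it presumably wraps can be sketched with just the public LightGBM API; the evaluation and pred_thresh handling that the real helper performs are not reconstructed here:

import lightgbm as lgb

def train_core(params, lgb_train, feature_name, model_name, num_round):
    # Attach the feature names to the Dataset, fit a gradient-boosted
    # regressor, and persist it to disk.
    lgb_train.set_feature_name(feature_name)
    booster = lgb.train(params, lgb_train, num_boost_round=num_round)
    booster.save_model(model_name)
    return booster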