def compute_GT(dataset, nb, nt, k, input_dir, output_dir, train=False, cpu=False):
    if dataset == 'DEEP':
        xb = util.mmap_fvecs('{}deep1B_base.fvecs'.format(input_dir))[:nb * 1000000]
        xq = util.mmap_fvecs('{}deep1B_query.fvecs'.format(input_dir))
        xt = util.mmap_fvecs('{}deep1B_learn.fvecs'.format(input_dir))[:nt * 1000000]
    elif dataset == 'SIFT':
        xb = util.mmap_bvecs('{}bigann_base.bvecs'.format(input_dir))[:nb * 1000000]
        xq = util.mmap_bvecs('{}bigann_query.bvecs'.format(input_dir))
        xt = util.mmap_bvecs('{}bigann_learn.bvecs'.format(input_dir))[:nt * 1000000]
    elif dataset == 'GIST':
        # For GIST we don't use nb and nt, since we always use all of the
        # database and training vectors.
        xb = util.mmap_fvecs('{}gist_base.fvecs'.format(input_dir))
        xq = util.mmap_fvecs('{}gist_query.fvecs'.format(input_dir))
        xt = util.mmap_fvecs('{}gist_learn.fvecs'.format(input_dir))

    if train:
        data = []
        # Split the training vectors into 1-million-vector chunks to reduce the
        # memory footprint.
        for i_t in range(nt):
            if cpu:
                gt_I, gt_D = compute_GT_CPU(xb, xt[i_t * 1000000:(i_t + 1) * 1000000], k)
            else:
                gt_I, gt_D = compute_GT_GPU(xb, xt[i_t * 1000000:(i_t + 1) * 1000000], k)
            for i in range(len(gt_I)):
                # Keep every neighbor whose distance ties with the nearest one.
                candidate = []
                for j in range(k):
                    if gt_D[i][j] == gt_D[i][0]:
                        candidate.append(gt_I[i][j])
                data.append(candidate)
        if dataset == 'GIST':
            util.write_tsv(data, '{}gtGIST1Mtrain500K.tsv'.format(output_dir))
        else:
            util.write_tsv(data, '{}gt{}{}Mtrain{}M.tsv'.format(output_dir, dataset, nb, nt))
    else:
        if cpu:
            gt_I, gt_D = compute_GT_CPU(xb, xq, k)
        else:
            gt_I, gt_D = compute_GT_GPU(xb, xq, k)
        data = []
        for i in range(len(gt_I)):
            # Keep every neighbor whose distance ties with the nearest one.
            candidate = []
            for j in range(k):
                if gt_D[i][j] == gt_D[i][0]:
                    candidate.append(gt_I[i][j])
            data.append(candidate)
        util.write_tsv(data, '{}gt{}{}Mtest.tsv'.format(output_dir, dataset, nb))
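# compute_GT_CPU / compute_GT_GPU are called above but not defined in this
# excerpt. Below is a minimal CPU sketch, assuming the helpers return
# (indices, distances) of the exact k nearest neighbors under L2 computed with
# a brute-force faiss index; the original implementations may differ.
import faiss
import numpy as np

def compute_GT_CPU(xb, xq, k):
    """Exact k-NN ground truth of xq against xb (L2, brute force, CPU)."""
    xb = np.ascontiguousarray(xb, dtype='float32')
    xq = np.ascontiguousarray(xq, dtype='float32')
    index = faiss.IndexFlatL2(xb.shape[1])
    index.add(xb)
    gt_D, gt_I = index.search(xq, k)  # faiss returns (distances, indices)
    return gt_I, gt_D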
def main(args: argparse.Namespace) -> None:
    # This only needs to be run once, but leaving it here is harmless.
    assert nltk.download("names")
    random.seed(args.seed)
    # This looks strange, but this is how the NLTK corpus readers work.
    m_names = nltk.corpus.names.words("male.txt")
    f_names = nltk.corpus.names.words("female.txt")
    # Construct a data set of tuples where the first element is the name and
    # the second element is the sex label. This is then shuffled, split, and
    # written out in TSV format.
    x = m_names + f_names
    y = itertools.chain(
        itertools.repeat("M", len(m_names)),
        itertools.repeat("F", len(f_names)),
    )
    data = list(zip(x, y))
    random.shuffle(data)
    ten = len(data) // 10
    util.write_tsv(args.test, data[:ten])
    util.write_tsv(args.dev, data[ten:ten + ten])
    util.write_tsv(args.train, data[ten + ten:])
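# A minimal sketch of the CLI wrapper implied by main() above. The flag names
# mirror the attributes accessed on args (seed, train, dev, test); the actual
# script may declare them differently (e.g., as positional arguments).
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Split NLTK names into train/dev/test TSVs.")
    parser.add_argument("--seed", type=int, default=0, help="random seed for shuffling")
    parser.add_argument("--train", required=True, help="path for the training TSV")
    parser.add_argument("--dev", required=True, help="path for the development TSV")
    parser.add_argument("--test", required=True, help="path for the test TSV")
    main(parser.parse_args())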
                data.append(line)

            tr1 = total_recall_at1 / float(nq)
            tr10 = total_recall_at10 / float(nq)
            tr100 = total_recall_at100 / float(nq)
            tt = total_latency * 1000.0 / nq
            print(parametersets[param] +
                  ' ' * (len(parametersets[-1]) + 1 - len(parametersets[param])) +
                  '{:.4f} {:.4f} {:.4f} {:.4f}'.format(
                      round(tr1, 4), round(tr10, 4), round(tr100, 4), round(tt, 4)))
            # Skip the warm-up iteration when accumulating averages.
            if it > 0 or num_iter == 1:
                recall_list[param] += total_recall_at100
                latency_list[param] += total_latency

    # Write the training/testing data files.
    if search_mode == -1:
        util.write_tsv(data, '{}{}_{}_test.tsv'.format(TRAINING_DIR, dbname, index_key))
    if search_mode == -2:
        util.write_tsv(data, '{}{}_{}_train.tsv'.format(TRAINING_DIR, dbname, index_key))

    denom = float(nq * max(num_iter - 1, 1))
    recall_list = [x / denom for x in recall_list]
    latency_list = [round(x * 1000.0 / denom, 4) for x in latency_list]
    print('param_list = {}'.format(param_list))
    print('recall target = {}'.format(recall_list))
    print('average latency(ms) = {}'.format(latency_list))
    print('result_{}_{} = {}'.format(dbname, index_key,
                                     [latency_list, recall_list]))
else:
    # Binary search to find minimum fixed configuration (for baseline) or minimum
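# util.write_tsv is used throughout these scripts but not shown. A minimal
# sketch, assuming the (rows, path) argument order of the call sites in this
# benchmark code, with each element of `rows` being a sequence of fields (the
# name-splitting script above passes its arguments in the opposite order and
# presumably uses a different helper):
import csv

def write_tsv(rows, path):
    """Write an iterable of row sequences to `path` as tab-separated values."""
    with open(path, 'w', newline='') as f:
        csv.writer(f, delimiter='\t').writerows(rows)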
def preprocess_and_train(training_dir, model_dir, dbname, index_key, xt, xq,
                         full_feature, pred_thresh, feature_idx,
                         billion_scale=False):
    train_file = '{}{}_{}_train.tsv'.format(training_dir, dbname, index_key)
    test_file = '{}{}_{}_test.tsv'.format(training_dir, dbname, index_key)
    if not os.path.isfile(train_file):
        print('training file {} not found'.format(train_file))
        return
    if not os.path.isfile(test_file):
        print('testing file {} not found'.format(test_file))
        return

    if dbname.startswith('SIFT'):
        dim = 128
    elif dbname.startswith('DEEP'):
        dim = 96
    else:
        dim = 960

    # Predict log2 of the target for HNSW indexes and for billion-scale datasets.
    if index_key[:4] == 'HNSW':
        log_target = True
    else:
        log_target = billion_scale

    out_buffer = []
    suffix = ''
    if log_target:
        suffix += '_Log'
    if full_feature:
        suffix += '_Full'
    else:
        suffix += '_Query'
    model_name = '{}{}_{}_model_thresh{}{}.txt'.format(
        model_dir, dbname, index_key, pred_thresh, suffix)
    log_name = '{}{}_{}_log_thresh{}{}.tsv'.format(
        model_dir, dbname, index_key, pred_thresh, suffix)

    df_train = pd.read_csv(train_file, header=None, sep='\t')
    df_test = pd.read_csv(test_file, header=None, sep='\t')

    # Select which intermediate search result features will be used.
    if index_key[:4] == 'HNSW':
        keep_idx = [2] + list(range(feature_idx * 4 + 3, feature_idx * 4 + 7)) + \
            list(range(len(df_train.columns)))[-10:]
    else:
        keep_idx = list(range(2, 12)) + list(
            range(feature_idx * 4 + 12, feature_idx * 4 + 16))
    drop_idx = list(set(range(len(df_train.columns))) - set(keep_idx))

    train_target = (df_train[0].values).astype('float32')
    train_query = xt[df_train[1].values]
    if full_feature:
        train_other = df_train.drop(drop_idx, axis=1).values
        train_feature = np.concatenate((train_query, train_other), axis=1)
    else:
        train_feature = train_query

    # Keep only rows with a positive target; optionally log2-transform it.
    valid_training = []
    for i in range(len(train_feature)):
        if train_target[i] > 0:
            if log_target:
                train_target[i] = math.log(train_target[i], 2)
            valid_training.append(i)
    train_target = train_target[valid_training]
    train_feature = train_feature[valid_training, :]
    out_buffer.append([
        'training count: {} valid rows out of {} total'.format(
            len(train_target), len(df_train))
    ])

    test_target = (df_test[0].values).astype('float32')
    test_query = xq[df_test[1].values]
    if full_feature:
        test_other = df_test.drop(drop_idx, axis=1).values
        test_feature = np.concatenate((test_query, test_other), axis=1)
    else:
        test_feature = test_query
    out_buffer.append(
        ['testing count: {} total rows'.format(len(test_target))])

    if train_feature.shape[0] < 2:
        print('training file {} too small'.format(train_file))
        return

    # Create the dataset for LightGBM.
    lgb_train = lgb.Dataset(train_feature, train_target)
    params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': {'l2', 'l1'},
        'num_leaves': 31,
        'boost_from_average': False,
        'learning_rate': 0.2,
        'feature_fraction': 1.0,
        'bagging_fraction': 1.0,
        'bagging_freq': 0,
        'verbose': 0,
        'num_threads': 20
    }
    num_round = 100
    if billion_scale:
        params['learning_rate'] = 0.05
        num_round = 500
    if train_feature.shape[0] < 100:
        params['min_data'] = 1
        params['min_data_in_bin'] = 1

    feature_name = []
    for i in range(dim):
        feature_name.append('F0_query_dim{}'.format(i))
    if full_feature:
        if index_key[:4] == 'HNSW':
            feature_name += [
                'F1_d_start', 'F2_d_1st', 'F3_d_10th', 'F4_1st_to_start',
                'F5_10th_to_start', 'F6_prior1', 'F7_prior2', 'F8_prior3',
                'F9_prior4', 'F10_prior5', 'F11_prior6', 'F12_prior7',
                'F13_prior8', 'F14_prior9', 'F15_prior10'
            ]
        else:
            feature_name += [
                'F1_c_10th_to_c_1st', 'F1_c_20th_to_c_1st',
                'F1_c_30th_to_c_1st', 'F1_c_40th_to_c_1st',
                'F1_c_50th_to_c_1st', 'F1_c_60th_to_c_1st',
                'F1_c_70th_to_c_1st', 'F1_c_80th_to_c_1st',
                'F1_c_90th_to_c_1st', 'F1_c_100th_to_c_1st',
                'F2_d_1st', 'F3_d_10th', 'F4_d_1st_to_d_10th',
                'F5_d_1st_to_c_1st'
            ]

    train_helper(out_buffer, params, lgb_train, feature_name,
                 [train_feature, test_feature], [train_target, test_target],
                 ['Train', 'Test'], model_name, num_round=num_round,
                 log_target=log_target, pred_thresh=pred_thresh,
                 full_feature=full_feature)
    util.write_tsv(out_buffer, log_name)
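# Usage sketch: once train_helper has written model_name, the model can be
# reloaded with LightGBM's standard Booster API and applied to per-query
# feature vectors at search time. predict_termination is a hypothetical helper
# name; only model_name, the feature layout, and the log2 target transform
# come from the code above.
def predict_termination(model_path, features, log_target):
    """Predict the (possibly log2-scaled) termination target for each query."""
    booster = lgb.Booster(model_file=model_path)
    pred = booster.predict(features)
    if log_target:
        pred = np.power(2.0, pred)  # undo the log2 transform applied to targets
    return pred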