def time_measurement_for_ivpq_batch(con, cur, search_parameters, names, query, k, param_query, parameter_variables, num_queries, num_targets):
    """Measure outer and inner execution times of the ivpq batch query.

    For every entry of ``search_parameters`` and every value in
    ``parameter_variables`` the query is executed NUM_ITERATIONS times on
    freshly drawn random samples; the mean of the measured timings is
    appended under the corresponding name.

    Returns a pair ``(all_execution_times, all_inner_times)`` mapping each
    name in ``names`` to a list of mean timings (one per parameter value).
    """
    all_execution_times = defaultdict(list)
    all_inner_times = defaultdict(list)
    data_size = ev.get_vector_dataset_size(cur)
    # total number of runs, used only for the progress display
    total_runs = NUM_ITERATIONS * len(parameter_variables) * len(search_parameters)
    progress = 0
    for idx, search_params in enumerate(search_parameters):
        set_search_params(con, cur, search_params)
        for elem in parameter_variables:
            # set parameter variable
            cur.execute(param_query.format(elem))
            con.commit()
            outer_measurements = []
            inner_measurements = []
            for _ in range(NUM_ITERATIONS):
                samples = ev.get_samples(con, cur, num_queries, data_size)
                target_samples = ev.get_samples(con, cur, num_targets, data_size)
                call_args = [(add_escapes(samples), k, add_escapes(target_samples))]
                trackings, execution_times = ev.create_track_statistics(
                    cur, con, query, call_args, log=False)
                print(param_query.format(elem), search_params, execution_times)
                outer_measurements.append(execution_times)
                # inner time as reported by the query's own tracking output
                inner_measurements.append(float(trackings[0]['total_time'][0][0]))
                progress += 1
                print(str(round((progress * 100) / total_runs, 2)) + '%', end='\r')
            all_execution_times[names[idx]].append(mean(outer_measurements))
            all_inner_times[names[idx]].append(mean(inner_measurements))
    return all_execution_times, all_inner_times
# set parameters
cur.execute('SELECT set_pvf({:d});'.format(pvf))
cur.execute('SELECT set_alpha({:d});'.format(alpha))
cur.execute('SELECT set_method_flag({:d});'.format(method))
cur.execute("SELECT set_use_targetlist('{!s}');".format(use_target_list))
con.commit()

# sizes 1*step_size .. 10*step_size for whichever dimension is varied below
dynamic_sizes = [i * step_size for i in range(1, 11)]
query = ("SELECT * FROM knn_in_iv_batch("
         "{!s}"
         ", {:d}, "
         "{!s}"
         ", 'ivpq_search_in') LIMIT 1;")


def convert(values):
    # Render *values* as a PostgreSQL array literal. Quotes, braces and
    # commas inside an element are escaped so they cannot break the literal.
    # Fix: the original escaped "{" twice (mapping it to "\}") and never
    # escaped "}" at all; both braces are now escaped correctly.
    escaped = [
        s.replace("'", "''")
         .replace('"', '\\"')
         .replace("{", "\\{")
         .replace("}", "\\}")
         .replace(",", "\\,")
        for s in values
    ]
    return "'{" + ",".join(escaped) + "}'"


def get_samples(count):
    # Draw *count* random sample vectors from the data set.
    # NOTE(review): relies on con/cur/data_size from the surrounding scope.
    return ev.get_samples(con, cur, count, data_size)


# one tracking dict per dynamic size, accumulated over all iterations
all_trackings = [defaultdict(list) for _ in dynamic_sizes]
for iteration in range(num_iters):
    if variable_parameter == 'query_size':
        params = [(convert(get_samples(size)), k,
                   convert(get_samples(number_of_target_samples)))
                  for size in dynamic_sizes]
    elif variable_parameter == 'target_size':
        params = [(convert(get_samples(number_of_query_samples)), k,
                   convert(get_samples(size)))
                  for size in dynamic_sizes]
    trackings, execution_times = ev.create_track_statistics(
        cur, con, query, params)
    for idx, tracking in enumerate(trackings):
        for key in tracking.keys():
            all_trackings[idx][key].append(tracking[key])
# Load the evaluation configuration.
k = config['k']
alpha = config['alpha_start']
method_flag = config['method']
step_size = config['step_size']
query_template = ('SELECT query, target FROM knn_in_ivpq_batch('
                  '{!s}'
                  ', {:d}, '
                  '{!s}'
                  ');')
ev.set_vec_table_name(config['vecs_table_name'])

# Apply the initial search parameters (pvf fixed to 1).
search_params = {'pvf': 1, 'alpha': alpha, 'method': method_flag}
ivpqEv.set_search_params(con, cur, search_params)

# Draw the query and target sample sets once, up front.
number_of_vectors = ev.get_vector_dataset_size(cur)
query_samples = ev.get_samples(con, cur, number_of_query_samples,
                               number_of_vectors)
target_samples = ev.get_samples(con, cur, number_of_target_samples,
                                number_of_vectors)

# Collect predicted target counts vs. actually retrieved counts.
prediction = []
real = []
divergence = []
divergence_relative = []
for _ in range(num_iters):
    params = [(ivpqEv.add_escapes([sample]), k,
               ivpqEv.add_escapes(target_samples))
              for sample in query_samples]
    trackings, _ = ev.create_track_statistics(cur, con, query_template, params)
    predicted_counts = [float(t['target_count'][0][0]) for t in trackings]
    prediction += predicted_counts
    real += [float(t['retrieved'][0][0]) for t in trackings]
    print(predicted_counts)
    # divergence +=
def time_and_precision_measurement_for_ivpq_batch(con, cur, search_parameters, names, query, k, param_query, parameter_variables, num_queries, num_targets, small_sample_size, outlier_detect=0):
    """Measure execution time and result precision of the ivpq batch query.

    For each search-parameter set and each of its parameter variables the
    query runs NUM_ITERATIONS times. Precision is estimated by comparing the
    approximated results against exact results computed on a bootstrapped
    subsample of ``small_sample_size`` queries.

    Returns ``(all_execution_times, all_inner_times, all_precision_values,
    raw_data)``: the first three map each name to per-variable aggregates
    (median or mean of the timings, outlier-filtered mean when
    ``outlier_detect`` is truthy; precision is always the mean), and
    ``raw_data`` holds deep copies of the unaggregated measurement lists.
    """
    USE_MEDIAN = True
    all_execution_times = defaultdict(list)
    all_inner_times = defaultdict(list)
    all_precision_values = defaultdict(list)
    count = 0
    # fix: original contained a duplicated assignment
    # (data_size = data_size = ev.get_vector_dataset_size(cur))
    data_size = ev.get_vector_dataset_size(cur)
    # init phase: one empty measurement list per parameter variable
    for i, search_params in enumerate(search_parameters):
        num_vars = len(parameter_variables[i])
        all_execution_times[names[i]] = [[] for _ in range(num_vars)]
        all_inner_times[names[i]] = [[] for _ in range(num_vars)]
        all_precision_values[names[i]] = [[] for _ in range(num_vars)]
    # total number of runs, hoisted out of the loop for the progress display
    total_runs = NUM_ITERATIONS * sum(len(p) for p in parameter_variables)
    # measurement phase
    for iteration in range(NUM_ITERATIONS):
        print('Start Iteration', iteration)
        for i, search_params in enumerate(search_parameters):
            for j, elem in enumerate(parameter_variables[i]):
                # TODO set parameter variable
                cur.execute(param_query.format(elem))
                con.commit()
                times = all_execution_times[names[i]][j]
                inner_times = all_inner_times[names[i]][j]
                precision_values = all_precision_values[names[i]][j]
                # big sample set
                samples = ev.get_samples(con, cur, num_queries, data_size)
                # create smaller sample set (bootstrapping); '_' avoids
                # reusing the outer loop index name inside the comprehension
                small_samples = [
                    samples[random.randint(0, num_queries - 1)]
                    for _ in range(small_sample_size)
                ]
                target_samples = ev.get_samples(con, cur, num_targets,
                                                data_size)
                # calculate exact results
                start_time = time.time()
                exact_results = get_exact_results(
                    cur, con, add_escapes(small_samples), k,
                    add_escapes(target_samples))
                print("--- %s seconds ---" % (time.time() - start_time))
                set_search_params(con, cur, search_params)
                cur.execute(param_query.format(elem))
                con.commit()
                # the 'Baseline' configuration runs without the 'iv' flag
                params = [('iv' if names[i] != 'Baseline' else '',
                           add_escapes(samples), k,
                           add_escapes(target_samples))]
                trackings, execution_times = ev.create_track_statistics(
                    cur, con, query, params, log=False)
                times.append(execution_times)
                print(names[i], search_params, elem, "arguments:",
                      len(samples), params[0][2], len(target_samples),
                      float(trackings[0]['total_time'][0][0]))
                inner_times.append(float(trackings[0]['total_time'][0][0]))
                # execute approximated query to obtain results
                cur.execute(query.format(*(params[0])))
                approximated_results = defaultdict(list)
                for res in cur.fetchall():
                    approximated_results[res[0]].append(res[1])
                precision_values.append(
                    calculate_precision(exact_results, approximated_results,
                                        k))
                count += 1
                print(str(round((count * 100) / total_runs, 2)) + '%',
                      end='\r')
    # evaluation phase: snapshot the raw lists before aggregating in place
    raw_data = {
        'execution_times': copy.deepcopy(all_execution_times),
        'inner_times': copy.deepcopy(all_inner_times),
        'precision_values': copy.deepcopy(all_precision_values)
    }
    for i, search_params in enumerate(search_parameters):
        for j, elem in enumerate(parameter_variables[i]):
            exec_vals = all_execution_times[names[i]][j]
            inner_vals = all_inner_times[names[i]][j]
            if outlier_detect:
                all_execution_times[names[i]][j] = mean(
                    [v for v in exec_vals if not is_outlier(v, exec_vals)])
                all_inner_times[names[i]][j] = mean(
                    [v for v in inner_vals if not is_outlier(v, inner_vals)])
            elif USE_MEDIAN:
                all_execution_times[names[i]][j] = median(exec_vals)
                all_inner_times[names[i]][j] = median(inner_vals)
            else:
                all_execution_times[names[i]][j] = mean(exec_vals)
                all_inner_times[names[i]][j] = mean(inner_vals)
            all_precision_values[names[i]][j] = mean(
                all_precision_values[names[i]][j])
    return all_execution_times, all_inner_times, all_precision_values, raw_data