Example #1
# Stdlib imports assumed for this snippet; `ev`, `set_search_params`,
# `add_escapes` and `NUM_ITERATIONS` come from the surrounding evaluation
# module.
from collections import defaultdict
from statistics import mean


def time_measurement_for_ivpq_batch(con, cur, search_parameters, names, query,
                                    k, param_query, parameter_variables,
                                    num_queries, num_targets):
    all_execution_times = defaultdict(list)
    all_inner_times = defaultdict(list)
    count = 0
    data_size = ev.get_vector_dataset_size(cur)
    for i, search_params in enumerate(search_parameters):
        set_search_params(con, cur, search_params)
        for elem in parameter_variables:
            # set parameter variable
            cur.execute(param_query.format(elem))
            con.commit()
            times = []
            inner_times = []
            for iteration in range(NUM_ITERATIONS):
                samples = ev.get_samples(con, cur, num_queries, data_size)
                target_samples = ev.get_samples(con, cur, num_targets,
                                                data_size)
                params = [(add_escapes(samples), k,
                           add_escapes(target_samples))]
                trackings, execution_times = ev.create_track_statistics(
                    cur, con, query, params, log=False)
                print(param_query.format(elem), search_params, execution_times)
                times.append(execution_times)
                inner_times.append(float(trackings[0]['total_time'][0][0]))
                count += 1
                print(str(
                    round((count * 100) /
                          (NUM_ITERATIONS * len(parameter_variables) *
                           len(search_parameters)), 2)) + '%',
                      end='\r')
            all_execution_times[names[i]].append(mean(times))
            all_inner_times[names[i]].append(mean(inner_times))
    return all_execution_times, all_inner_times
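For orientation, a minimal usage sketch of the function above. The psycopg2 connection, the concrete parameter values and the SQL strings are assumptions for illustration (the query and param_query strings are borrowed from the other examples on this page), not part of the original code.

import psycopg2

con = psycopg2.connect('dbname=postgres')  # placeholder connection string
cur = con.cursor()

exec_times, inner_times = time_measurement_for_ivpq_batch(
    con, cur,
    search_parameters=[{'pvf': 1, 'alpha': 100, 'method': 0}],  # illustrative
    names=['IVPQ'],
    query=("SELECT * FROM knn_in_iv_batch({!s}, {:d}, {!s}, "
           "'ivpq_search_in') LIMIT 1;"),
    k=5,
    param_query='SELECT set_pvf({:d});',
    parameter_variables=[1, 2, 4, 8],
    num_queries=100,
    num_targets=1000)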
Example #2
# set parameters
cur.execute('SELECT set_pvf({:d});'.format(pvf))
cur.execute('SELECT set_alpha({:d});'.format(alpha))
cur.execute('SELECT set_method_flag({:d});'.format(method))
cur.execute("SELECT set_use_targetlist('{!s}');".format(use_target_list))
con.commit()

dynamic_sizes = [i * step_size for i in range(1, 11)]

query = "SELECT * FROM knn_in_iv_batch({!s}, {:d}, {!s}, 'ivpq_search_in') LIMIT 1;"
convert = lambda x: "'{" + ",".join([
    s.replace("'", "''").replace('"', '\\"').replace("{", "\\{").replace(
        "}", "\\}").replace(",", "\\,") for s in x
]) + "}'"

get_samples = lambda x: ev.get_samples(con, cur, x, data_size)

all_trackings = [defaultdict(list) for i in dynamic_sizes]
for iteration in range(num_iters):
    if variable_parameter == 'query_size':
        params = [(convert(get_samples(i)), k,
                   convert(get_samples(number_of_target_samples)))
                  for i in dynamic_sizes]
    elif variable_parameter == 'target_size':
        params = [(convert(get_samples(number_of_query_samples)), k,
                   convert(get_samples(i))) for i in dynamic_sizes]
    trackings, execution_times = ev.create_track_statistics(
        cur, con, query, params)
    for i, tracking in enumerate(trackings):
        for key, value in tracking.items():
            all_trackings[i][key].append(value)
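Example #1 calls a `set_search_params` helper that is not part of these listings. Judging from the SET block at the top of this snippet and the dict used in Example #3, it plausibly just wraps the three parameter UDF calls; the following is a sketch under that assumption, not the original implementation.

def set_search_params(con, cur, params):
    # assumed wrapper around the parameter UDFs used above
    cur.execute('SELECT set_pvf({:d});'.format(params['pvf']))
    cur.execute('SELECT set_alpha({:d});'.format(params['alpha']))
    cur.execute('SELECT set_method_flag({:d});'.format(params['method']))
    con.commit()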
Example #3
k = config['k']
alpha = config['alpha_start']
method_flag = config['method']
step_size = config['step_size']

query_template = "SELECT query, target FROM knn_in_ivpq_batch({!s}, {:d}, {!s});"

ev.set_vec_table_name(config['vecs_table_name'])

search_params = {'pvf': 1, 'alpha': alpha, 'method': method_flag}

ivpqEv.set_search_params(con, cur, search_params)

number_of_vectors = ev.get_vector_dataset_size(cur)

query_samples = ev.get_samples(con, cur, number_of_query_samples,
                               number_of_vectors)
target_samples = ev.get_samples(con, cur, number_of_target_samples,
                                number_of_vectors)

prediction = []
real = []
divergence = []
divergence_relative = []
for i in range(num_iters):
    params = [(ivpqEv.add_escapes([x]), k, ivpqEv.add_escapes(target_samples))
              for x in query_samples]
    trackings, _ = ev.create_track_statistics(cur, con, query_template, params)
    predicted_counts = [float(t['target_count'][0][0]) for t in trackings]
    retrieved_counts = [float(t['retrieved'][0][0]) for t in trackings]
    prediction += predicted_counts
    real += retrieved_counts
    print(predicted_counts)
    # divergence +=
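    # Sketch only (not in the original source): one plausible way to fill the
    # divergence lists declared above, comparing predicted against actually
    # retrieved target counts per query.
    divergence += [abs(p - r)
                   for p, r in zip(predicted_counts, retrieved_counts)]
    divergence_relative += [abs(p - r) / r if r else 0.0
                            for p, r in zip(predicted_counts, retrieved_counts)]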
Example #4
# Stdlib imports assumed for this snippet; `ev`, `set_search_params`,
# `add_escapes`, `get_exact_results`, `is_outlier`, `calculate_precision` and
# `NUM_ITERATIONS` come from the surrounding evaluation module.
import copy
import random
import time
from collections import defaultdict
from statistics import mean, median


def time_and_precision_measurement_for_ivpq_batch(con,
                                                  cur,
                                                  search_parameters,
                                                  names,
                                                  query,
                                                  k,
                                                  param_query,
                                                  parameter_variables,
                                                  num_queries,
                                                  num_targets,
                                                  small_sample_size,
                                                  outlier_detect=0):
    USE_MEDIAN = True
    all_execution_times = defaultdict(list)
    all_inner_times = defaultdict(list)
    all_precision_values = defaultdict(list)
    count = 0
    data_size = ev.get_vector_dataset_size(cur)

    # init phase
    for i, search_params in enumerate(search_parameters):
        all_execution_times[names[i]] = [
            [] for j in range(len(parameter_variables[i]))
        ]
        all_inner_times[names[i]] = [
            [] for j in range(len(parameter_variables[i]))
        ]
        all_precision_values[names[i]] = [
            [] for j in range(len(parameter_variables[i]))
        ]
    # measurement phase
    for iteration in range(NUM_ITERATIONS):
        print('Start Iteration', iteration)
        for i, search_params in enumerate(search_parameters):
            for j, elem in enumerate(parameter_variables[i]):
                # set parameter variable
                cur.execute(param_query.format(elem))
                con.commit()
                times = all_execution_times[names[i]][j]
                inner_times = all_inner_times[names[i]][j]
                precision_values = all_precision_values[names[i]][j]

                # big sample set
                samples = ev.get_samples(con, cur, num_queries, data_size)

                # create smaller sample set (bootstrapping)
                small_samples = [
                    samples[random.randint(0, num_queries - 1)]
                    for _ in range(small_sample_size)
                ]
                target_samples = ev.get_samples(con, cur, num_targets,
                                                data_size)
                # calculate exact results
                start_time = time.time()
                exact_results = get_exact_results(cur, con,
                                                  add_escapes(small_samples),
                                                  k,
                                                  add_escapes(target_samples))
                print("--- %s seconds ---" % (time.time() - start_time))
                set_search_params(con, cur, search_params)
                cur.execute(param_query.format(elem))
                con.commit()
                params = [('iv' if names[i] != 'Baseline' else '',
                           add_escapes(samples), k,
                           add_escapes(target_samples))]

                trackings, execution_times = ev.create_track_statistics(
                    cur, con, query, params, log=False)

                times.append(execution_times)
                print(names[i], search_params, elem, "arguments:",
                      len(samples), params[0][2], len(target_samples),
                      float(trackings[0]['total_time'][0][0]))
                inner_times.append(float(trackings[0]['total_time'][0][0]))

                # execute approximated query to obtain results
                cur.execute(query.format(*(params[0])))
                approximated_results = defaultdict(list)
                for res in cur.fetchall():
                    approximated_results[res[0]].append(res[1])
                precision_values.append(
                    calculate_precision(exact_results, approximated_results,
                                        k))
                count += 1
                print(str(
                    round((count * 100) /
                          (NUM_ITERATIONS *
                           sum([len(p)
                                for p in parameter_variables])), 2)) + '%',
                      end='\r')
    # evaluation phase
    raw_data = {
        'execution_times': copy.deepcopy(all_execution_times),
        'inner_times': copy.deepcopy(all_inner_times),
        'precision_values': copy.deepcopy(all_precision_values)
    }
    for i, search_params in enumerate(search_parameters):
        for j, elem in enumerate(parameter_variables[i]):
            if outlier_detect:
                all_execution_times[names[i]][j] = mean([
                    v for v in all_execution_times[names[i]][j]
                    if not is_outlier(v, all_execution_times[names[i]][j])
                ])
                all_inner_times[names[i]][j] = mean([
                    v for v in all_inner_times[names[i]][j]
                    if not is_outlier(v, all_inner_times[names[i]][j])
                ])
            else:
                if USE_MEDIAN:
                    all_execution_times[names[i]][j] = median(
                        all_execution_times[names[i]][j])
                    all_inner_times[names[i]][j] = median(
                        all_inner_times[names[i]][j])
                else:
                    all_execution_times[names[i]][j] = mean(
                        all_execution_times[names[i]][j])
                    all_inner_times[names[i]][j] = mean(
                        all_inner_times[names[i]][j])
            all_precision_values[names[i]][j] = mean(
                all_precision_values[names[i]][j])
    return all_execution_times, all_inner_times, all_precision_values, raw_data
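The helpers `get_exact_results`, `set_search_params`, `is_outlier` and `calculate_precision` used above are defined elsewhere in the evaluation module and are not part of these listings. Purely as an illustration of the outlier and precision steps, here are sketches of the latter two under stated assumptions; the original implementations may differ.

from statistics import mean, stdev


def is_outlier(value, values, z_threshold=3.0):
    # sketch: flag a measurement that lies more than z_threshold standard
    # deviations away from the mean of its series (assumed behaviour)
    if len(values) < 2:
        return False
    mu, sigma = mean(values), stdev(values)
    return sigma > 0 and abs(value - mu) > z_threshold * sigma


def calculate_precision(exact_results, approximated_results, k):
    # sketch: average fraction of the exact k nearest targets per query term
    # that the approximated batch search also returned; assumes both arguments
    # map a query term to a list of target terms, as the approximated dict
    # built above does
    precisions = []
    for query_term, exact_targets in exact_results.items():
        approx = set(approximated_results.get(query_term, []))
        hits = sum(1 for target in exact_targets[:k] if target in approx)
        precisions.append(hits / k)
    return sum(precisions) / len(precisions) if precisions else 0.0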