Пример #1
0
def _build_search_row(indexname, words, numeric_range_str):
    """Build one randomly chosen FT.SEARCH row, or return None if `words` is too short.

    One of three query flavours is picked uniformly at random. The optional
    `numeric_range_str` prefix (possibly empty) is prepended to the query text.
    """
    choice = random.choice(["simple-1word-query", "2word-union-query", "2word-intersection-query"])
    if choice == "simple-1word-query" and len(words) >= 1:
        return generate_ft_search_row(indexname, choice,
                                      "{}{}".format(numeric_range_str, words[0]))
    if choice == "2word-union-query" and len(words) >= 2:
        # In RediSearch query syntax `|` is the union (OR) operator.
        return generate_ft_search_row(indexname, choice,
                                      "{}{}|{}".format(numeric_range_str, words[0], words[1]))
    if choice == "2word-intersection-query" and len(words) >= 2:
        # Space-separated terms are implicitly intersected (AND).
        return generate_ft_search_row(indexname, choice,
                                      "{}{} {}".format(numeric_range_str, words[0], words[1]))
    return None


def _report_query_time_ranges(query_range_digest, ts_digest, min_ts, max_ts):
    """Print summary lines and show a CDF scatter plot of the queried time-range widths."""
    p90 = query_range_digest.percentile(90.0)
    # NOTE(review): p90 and (max_ts - min_ts) are range widths, yet they are fed to
    # the cdf of the timestamp digest -- kept as in the original, confirm intent.
    dataset_percent = ts_digest.cdf(p90)

    print("90% of the read queries target at max {} percent o keyspace".format(dataset_percent))
    print("100% of the read queries target at max {} percent o keyspace".format(ts_digest.cdf(max_ts - min_ts)))

    xx = [centroid["m"] for centroid in query_range_digest.centroids_to_list()]
    yy = [query_range_digest.cdf(ts_m) for ts_m in xx]
    plt.scatter(xx, yy)

    plt.title('EnWiki pages Query time range')
    plt.xlabel('Query time range')
    plt.ylabel('cdf')
    plt.xscale('log')
    plt.show()


def generate_benchmark_commands(total_benchmark_commands, bench_fname, all_fname, indexname, docs, stop_words,
                                use_numeric_range_searchs, ts_digest, p_writes):
    """Generate a random mix of write and search benchmark commands.

    For each command a random document is drawn from `docs`. With probability
    `p_writes` a write row is built via `use_case_to_cmd`; otherwise a search row
    is built from words extracted from the document by `getQueryWords`. Attempts
    that yield no row (the document has too few usable words for the chosen query
    flavour) are retried with another random document and are not counted.

    Args:
        total_benchmark_commands: number of commands to generate.
        bench_fname: path of the benchmark CSV file (opened in write mode).
        all_fname: path of the aggregate CSV file (opened in append mode).
        indexname: index name forwarded to `generate_ft_search_row`.
        docs: indexable, non-empty sequence of documents; each must provide the
            "title", "text", "comment", "username" and "timestamp" keys.
        stop_words: forwarded to `getQueryWords`.
        use_numeric_range_searchs: when truthy, every search query is prefixed
            with an `@timestamp:[min max]` numeric range filter.
        ts_digest: digest of document timestamps providing `percentile()` and
            `cdf()` (presumably a TDigest -- verify against caller).
        p_writes: probability in [0, 1] that a generated command is a write.

    Returns:
        A `(total_benchmark_reads, total_benchmark_writes)` tuple counting the
        commands actually generated; the two always sum to
        `total_benchmark_commands`.

    Note:
        The loop only terminates if enough documents can produce a row; with
        `p_writes == 0` and no document yielding query words it never ends.
    """
    total_benchmark_reads = 0
    total_benchmark_writes = 0
    total_docs = len(docs)

    ## timestamp related
    timestamps_pdist = generate_lognormal_dist(total_benchmark_commands)
    min_ts = ts_digest.percentile(0.0)
    max_ts = ts_digest.percentile(100.0)
    query_range_digest = TDigest()
    range_samples = 0

    # Context managers guarantee the files and the progress bar are released even
    # if command generation raises part-way through.
    with open(all_fname, 'a', newline='') as all_csvfile, \
            open(bench_fname, 'w', newline='') as bench_csvfile, \
            tqdm(unit="docs", total=total_benchmark_commands) as progress:
        all_csv_writer = csv.writer(all_csvfile, delimiter=',', quoting=csv.QUOTE_ALL)
        bench_csv_writer = csv.writer(bench_csvfile, delimiter=',', quoting=csv.QUOTE_ALL)

        generated_commands = 0
        while generated_commands < total_benchmark_commands:
            doc = docs[random.randint(0, total_docs - 1)]
            # decide read or write
            is_write = random.random() < p_writes
            query_min_ts = None
            if is_write:
                generated_row, _ = use_case_to_cmd(use_ftadd, doc["title"], doc["text"], doc["comment"],
                                                   doc["username"],
                                                   doc["timestamp"],
                                                   generated_commands)
            else:
                words, _ = getQueryWords(doc, stop_words, 2)
                numeric_range_str = ""
                if use_numeric_range_searchs:
                    # Lower bound of the range is taken from the timestamp digest at
                    # a percentile driven by the pre-generated distribution sample.
                    query_ts_pdist = timestamps_pdist[generated_commands]
                    query_min_ts = ts_digest.percentile((1.0 - query_ts_pdist) * 100.0)
                    numeric_range_str = "@timestamp:[{} {}] ".format(query_min_ts, max_ts)
                generated_row = _build_search_row(indexname, words, numeric_range_str)

            if generated_row is None:
                # Nothing usable for this document/flavour: retry without counting it.
                continue

            # Only commands that were really generated contribute to the statistics.
            if is_write:
                total_benchmark_writes += 1
            else:
                total_benchmark_reads += 1
                if use_numeric_range_searchs:
                    query_range_digest.update(int(max_ts - query_min_ts))
                    range_samples += 1

            # NOTE(review): persisting the rows is disabled in the original code, so
            # both files are only created/truncated. Re-enable once confirmed:
            #     all_csv_writer.writerow(generated_row)
            #     bench_csv_writer.writerow(generated_row)
            progress.update()
            generated_commands += 1

    # Without any recorded range sample the digest is empty and has no percentiles
    # to report or plot, so the summary is skipped.
    if range_samples:
        _report_query_time_ranges(query_range_digest, ts_digest, min_ts, max_ts)

    return total_benchmark_reads, total_benchmark_writes
Пример #2
0
    plt.hist(docs_sizes, bins=bins, alpha=0.5)
    plt.title('EnWiki pages document size frequency. Avg document size: {} Bytes'.format(int(np.average(docs_sizes))))
    plt.xlabel('Document Size in Bytes')
    plt.ylabel('count')
    plt.xscale('log')

    plt.show()

    xx = []
    yy = []

    for centroid in ts_digest.centroids_to_list():
        # print(centroid)
        ts_m = centroid["m"]
        xx.append(ts_m)
        yy.append(ts_digest.cdf(ts_m))
    plt.scatter(xx, yy)

    plt.title('EnWiki pages timestamp range')
    plt.xlabel('timestamp')
    plt.ylabel('cdf')
    #     plt.xscale('log')
    plt.show()

    progress.close()
    all_csvfile.close()
    setup_csvfile.close()

    print("-- generating {} full text search commands -- ".format(total_benchmark_commands))
    print("\t saving to {} and {}".format(bench_fname, all_fname))
    total_benchmark_reads, total_benchmark_writes = generate_benchmark_commands(total_benchmark_commands, bench_fname,