Python TDigest.cdf примеры использования

Язык программирования: Python

Пространство имен/Пакет: tdigest

Класс/Тип: TDigest

Метод/Функция: cdf

Примеров на hotexamples.com: 2

Python TDigest.cdf — найдено 2 примера. Это лучшие примеры Python-кода для tdigest.TDigest.cdf, полученные из проектов с открытым исходным кодом. Вы можете оценить каждый пример, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

TDigest(29)

update(15)

percentile(13)

batch_update(7)

cdf(2)

centroids_to_list(2)

compress(2)

delta(2)

add(1)

merge(1)

quantile(1)

serialize(1)

update_from_dict(1)

Пример #1

Показать файл

def generate_benchmark_commands(total_benchmark_commands, bench_fname, all_fname, indexname, docs, stop_words,
                                use_numeric_range_searchs, ts_digest, p_writes):
    """Generate a random mix of write and full-text-search read commands.

    For every attempt a random document is picked from ``docs`` and a random
    draw against ``p_writes`` decides whether a write command or a read query
    is built. Attempts that yield no row (for example when the document does
    not provide enough query words for the chosen query type) are retried
    until ``total_benchmark_commands`` rows have been produced.

    After generation, the distribution of the queried timestamp ranges is
    summarised on stdout and shown as a scatter plot of the range digest's
    CDF.

    Args:
        total_benchmark_commands: Number of commands to generate.
        bench_fname: Path of the benchmark CSV file; opened in write mode.
        all_fname: Path of the aggregate CSV file; opened in append mode.
        indexname: Index name forwarded to ``generate_ft_search_row``.
        docs: Indexable sequence of mappings providing the ``title``,
            ``text``, ``comment``, ``username`` and ``timestamp`` keys.
        stop_words: Forwarded to ``getQueryWords``.
        use_numeric_range_searchs: When truthy, every read query is prefixed
            with an ``@timestamp:[min max]`` filter and the width of that
            range is recorded.
        ts_digest: Digest of document timestamps; its ``percentile`` and
            ``cdf`` methods are used.
        p_writes: Threshold compared against ``random.random()``; draws
            below it produce a write.

    Returns:
        A ``(total_benchmark_reads, total_benchmark_writes)`` tuple.
    """
    total_benchmark_reads = 0
    total_benchmark_writes = 0

    # Context managers guarantee both files are closed even when command
    # generation raises part-way through.
    with open(all_fname, 'a', newline='') as all_csvfile, \
            open(bench_fname, 'w', newline='') as bench_csvfile:
        # Row emission is currently disabled (see the commented-out writerow
        # calls below); the writers are kept so it can be switched back on.
        all_csv_writer = csv.writer(all_csvfile, delimiter=',', quoting=csv.QUOTE_ALL)
        bench_csv_writer = csv.writer(bench_csvfile, delimiter=',', quoting=csv.QUOTE_ALL)
        progress = tqdm(unit="docs", total=total_benchmark_commands)
        try:
            total_docs = len(docs)

            ## timestamp related
            timestamps_pdist = generate_lognormal_dist(total_benchmark_commands)
            min_ts = ts_digest.percentile(0.0)
            max_ts = ts_digest.percentile(100.0)
            query_range_digest = TDigest()

            generated_commands = 0
            while generated_commands < total_benchmark_commands:
                # Lower bound of the queried time range for this command.
                query_ts_pdist = timestamps_pdist[generated_commands]
                percentile = (1.0 - query_ts_pdist) * 100.0
                query_min_ts = ts_digest.percentile(percentile)

                doc = docs[random.randint(0, total_docs - 1)]
                generated_row = None

                # decide read or write
                # NOTE(review): the read/write counters advance per attempt,
                # including attempts that produce no row -- confirm intended.
                if random.random() < p_writes:
                    ## WRITE
                    total_benchmark_writes += 1
                    generated_row, _doc_size = use_case_to_cmd(use_ftadd, doc["title"], doc["text"],
                                                               doc["comment"],
                                                               doc["username"],
                                                               doc["timestamp"],
                                                               generated_commands)
                else:
                    ## READ
                    total_benchmark_reads += 1
                    words, _total_words = getQueryWords(doc, stop_words, 2)

                    choice = random.choices(["simple-1word-query", "2word-union-query",
                                             "2word-intersection-query"])[0]
                    numeric_range_str = ""
                    if use_numeric_range_searchs:
                        numeric_range_str = "@timestamp:[{} {}] ".format(query_min_ts, max_ts)
                        query_range_digest.update(int(max_ts - query_min_ts))
                    if choice == "simple-1word-query" and len(words) >= 1:
                        generated_row = generate_ft_search_row(indexname, "simple-1word-query",
                                                               "{}{}".format(numeric_range_str, words[0]))
                    elif choice == "2word-union-query" and len(words) >= 2:
                        generated_row = generate_ft_search_row(indexname, "2word-union-query",
                                                               "{}{} {}".format(numeric_range_str, words[0],
                                                                                words[1]))
                    elif choice == "2word-intersection-query" and len(words) >= 2:
                        generated_row = generate_ft_search_row(indexname, "2word-intersection-query",
                                                               "{}{}|{}".format(numeric_range_str, words[0],
                                                                                words[1]))

                # Only attempts that produced a row count towards the target.
                if generated_row is not None:
                    #             all_csv_writer.writerow(generated_row)
                    #             bench_csv_writer.writerow(generated_row)
                    progress.update()
                    generated_commands += 1
        finally:
            progress.close()

    # NOTE(review): when no range was recorded (e.g. numeric range searches
    # disabled) the range digest is empty here -- confirm that percentile()
    # and cdf() cope with an empty digest.
    p90 = query_range_digest.percentile(90.0)
    dataset_percent = ts_digest.cdf(p90)

    print("90% of the read queries target at max {} percent o keyspace".format(dataset_percent))
    print("100% of the read queries target at max {} percent o keyspace".format(ts_digest.cdf(max_ts - min_ts)))

    # One point per centroid: centroid mean vs. CDF of the range digest there.
    centroid_means = [centroid["m"] for centroid in query_range_digest.centroids_to_list()]
    cdf_values = [query_range_digest.cdf(mean) for mean in centroid_means]
    plt.scatter(centroid_means, cdf_values)

    plt.title('EnWiki pages Query time range')
    plt.xlabel('Query time range')
    plt.ylabel('cdf')
    plt.xscale('log')
    plt.show()

    return total_benchmark_reads, total_benchmark_writes

Пример #2

Показать файл

    plt.hist(docs_sizes, bins=bins, alpha=0.5)
    plt.title('EnWiki pages document size frequency. Avg document size: {} Bytes'.format(int(np.average(docs_sizes))))
    plt.xlabel('Document Size in Bytes')
    plt.ylabel('count')
    plt.xscale('log')

    plt.show()

    xx = []
    yy = []

    for centroid in ts_digest.centroids_to_list():
        # print(centroid)
        ts_m = centroid["m"]
        xx.append(ts_m)
        yy.append(ts_digest.cdf(ts_m))
    plt.scatter(xx, yy)

    plt.title('EnWiki pages timestamp range')
    plt.xlabel('timestamp')
    plt.ylabel('cdf')
    #     plt.xscale('log')
    plt.show()

    progress.close()
    all_csvfile.close()
    setup_csvfile.close()

    print("-- generating {} full text search commands -- ".format(total_benchmark_commands))
    print("\t saving to {} and {}".format(bench_fname, all_fname))
    total_benchmark_reads, total_benchmark_writes = generate_benchmark_commands(total_benchmark_commands, bench_fname,