示例#1
0
def test_sort_desc_desc():

    field_names = ('_0', '_1')

    sort_expressions = [
        SortExpression('_0', float, 'DESC'),
        SortExpression('_1', float, 'DESC')
    ]

    heap = []

    heappush(heap, HeapSortableTuple((1, 1), field_names, sort_expressions))
    heappush(heap, HeapSortableTuple((1, 2), field_names, sort_expressions))
    heappush(heap, HeapSortableTuple((2, 1), field_names, sort_expressions))
    heappush(heap, HeapSortableTuple((2, 2), field_names, sort_expressions))

    popped_t1 = heappop(heap).tuple
    popped_t2 = heappop(heap).tuple
    popped_t3 = heappop(heap).tuple
    popped_t4 = heappop(heap).tuple

    assert popped_t1 == (2, 2)
    assert popped_t2 == (2, 1)
    assert popped_t3 == (1, 2)
    assert popped_t4 == (1, 1)
示例#2
0
def test_min_heap():
    print('\n\n\nminheap')
    sort_exp = SortExpression('_4', int, 'DESC', 'l_quantity')
    heap = MinHeap(10)
    for t in tuples:
        ht = HeapTuple(t, field_names, sort_exp)
        heap.push(ht)

    for t in heap.get_all_items(sort=True):
        print(t.tuple)
示例#3
0
def test_sort_desc():
    """Executes a sorted query. The results are collated.

    :return: None
    """

    query_plan = QueryPlan()

    # Query plan
    ts = query_plan.add_operator(
        SQLTableScan('supplier.csv', 'select * from S3Object '
                     'limit 3;', False, 'ts', query_plan, False))

    s = query_plan.add_operator(
        Sort([SortExpression('_5', float, 'DESC')], 's', query_plan, False))

    c = query_plan.add_operator(Collate('c', query_plan, False))

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"),
                           gen_test_id())

    ts.connect(s)
    s.connect(c)

    # Start the query
    query_plan.execute()

    # Assert the results
    # num_rows = 0
    # for t in c.tuples():
    #     num_rows += 1
    #     print("{}:{}".format(num_rows, t))

    assert len(c.tuples()) == 3 + 1

    assert c.tuples()[0] == ['_0', '_1', '_2', '_3', '_4', '_5', '_6']

    assert c.tuples()[1] == [
        '1', 'Supplier#000000001', ' N kD4on9OM Ipw3,gf0JBoQDd7tgrzrddZ', '17',
        '27-918-335-1736', '5755.94', 'each slyly above the careful'
    ]
    assert c.tuples()[2] == [
        '3', 'Supplier#000000003', 'q1,G3Pj6OjIuUYfUoH18BFTKP5aU9bEV3', '1',
        '11-383-516-1199', '4192.40',
        'blithely silent requests after the express dependencies are sl'
    ]
    assert c.tuples()[3] == [
        '2', 'Supplier#000000002', '89eJ5ksX3ImxJQBvxObC,', '5',
        '15-679-861-2259', '4032.68', ' slyly bold instructions. idle dependen'
    ]

    # Write the metrics
    query_plan.print_metrics()
示例#4
0
def test_sort_empty():
    """Executes a sorted query with no results returned. We tst this as it's somewhat peculiar with s3 select, in so much
    as s3 does not return column names when selecting data, meaning, unlike a traditional DBMS, no field names tuple
    should be present in the results.

    :return: None
    """

    query_plan = QueryPlan()

    # Query plan
    ts = query_plan.add_operator(
        SQLTableScan('supplier.csv', 'select * from S3Object '
                     'limit 0;', False, 'ts', query_plan, False))

    s = query_plan.add_operator(
        Sort([SortExpression('_5', float, 'ASC')], 's', query_plan, False))

    c = query_plan.add_operator(Collate('c', query_plan, False))

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"),
                           gen_test_id())

    ts.connect(s)
    s.connect(c)

    # Start the query
    query_plan.execute()

    # Assert the results
    # num_rows = 0
    # for t in c.tuples():
    #     num_rows += 1
    #     print("{}:{}".format(num_rows, t))

    assert len(c.tuples()) == 0

    # Write the metrics
    query_plan.print_metrics()
示例#5
0
def run_head_table_sampling(stats,
                            sort_field_index,
                            sort_field,
                            k,
                            sample_size,
                            parallel,
                            use_pandas,
                            sort_order,
                            buffer_size,
                            table_parts_start,
                            table_parts_end,
                            tbl_s3key,
                            shards_path,
                            format_,
                            sampling_only=True):

    secure = False
    use_native = False
    print('')
    print(
        "Top K Benchmark, Head Table Sampling. Sort Field: {}, Order: {}, k: {}, Sample Size:{}"
        .format(sort_field, sort_order, k, sample_size))
    print("----------------------")

    stats += [
        'sampling_{}_{}'.format('head_table', 'non-filtered'), shards_path,
        sort_field, sort_order, k, sample_size, 1
    ]

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Sampling
    table_parts = table_parts_end - table_parts_start + 1
    per_part_samples = int(sample_size / table_parts)
    table_name = os.path.basename(tbl_s3key)
    sample_scan = map(
        lambda p: query_plan.add_operator(
            SQLTableScan(
                "{}.{}".format(shards_path, p),
                'select {} from S3Object limit {};'.format(
                    sort_field, per_part_samples), format_, use_pandas, secure,
                use_native, 'sample_scan_{}'.format(p), query_plan, False)),
        range(table_parts_start, table_parts_end + 1))

    # Sampling project
    def project_fn(df):
        df.columns = [sort_field]
        df[[sort_field]] = df[[sort_field]].astype(np.float)
        return df

    project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)]

    sample_project = map(
        lambda p: query_plan.add_operator(
            Project(project_exprs, 'sample_project_{}'.format(p), query_plan,
                    False, project_fn)),
        range(table_parts_start, table_parts_end + 1))

    # TopK samples
    sort_expr = [SortExpression(sort_field, float, sort_order)]
    sample_topk = map(
        lambda p: query_plan.add_operator(
            Top(k, sort_expr, use_pandas, 'sample_topk_{}'.format(p),
                query_plan, False)),
        range(table_parts_start, table_parts_end + 1))

    sample_topk_reduce = query_plan.add_operator(
        Top(k, sort_expr, use_pandas, 'sample_topk_reduce', query_plan, False))

    # Generate SQL command for second scan
    sql_gen = query_plan.add_operator(
        TopKFilterBuild(sort_order, 'float', 'select * from S3object ',
                        ' CAST({} as float) '.format(sort_field), 'sql_gen',
                        query_plan, False))

    if not sampling_only:
        # Scan
        scan = map(
            lambda p: query_plan.add_operator(
                SQLTableScan("{}.{}".format(shards_path, p), "", format_,
                             use_pandas, secure, use_native, 'scan_{}'.format(
                                 p), query_plan, False)),
            range(table_parts_start, table_parts_end + 1))

        # Project
        def project_fn(df):
            df.columns = [
                sort_field if x == sort_field_index else x for x in df.columns
            ]
            df[[sort_field]] = df[[sort_field]].astype(np.float)
            return df

        project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)]

        project = map(
            lambda p: query_plan.add_operator(
                Project(project_exprs, 'project_{}'.format(p), query_plan,
                        False, project_fn)),
            range(table_parts_start, table_parts_end + 1))

        # TopK
        topk = map(
            lambda p: query_plan.add_operator(
                Top(k, sort_expr, use_pandas, 'topk_{}'.format(p), query_plan,
                    False)), range(table_parts_start, table_parts_end + 1))

        # TopK reduce
        topk_reduce = query_plan.add_operator(
            Top(k, sort_expr, use_pandas, 'topk_reduce', query_plan, False))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    map(lambda (p, o): o.connect(sample_project[p]), enumerate(sample_scan))
    map(lambda (p, o): o.connect(sample_topk[p]), enumerate(sample_project))
    map(lambda op: op.connect(sample_topk_reduce), sample_topk)
    sample_topk_reduce.connect(sql_gen)

    if not sampling_only:
        map(lambda (p, o): sql_gen.connect(o), enumerate(scan))
        map(lambda (p, o): o.connect(project[p]), enumerate(scan))
        map(lambda (p, o): o.connect(topk[p]), enumerate(project))
        map(lambda (p, o): o.connect(topk_reduce), enumerate(topk))
        topk_reduce.connect(collate)
    else:
        sql_gen.connect(collate)

    # Start the query
    query_plan.execute()
    print('Done')

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    sampling_time = query_plan.total_elapsed_time
    cost, bytes_scanned, bytes_returned, rows = query_plan.cost()
    computation_cost = query_plan.computation_cost()
    data_cost = query_plan.data_cost()[0]

    stats += [
        sql_gen.threshold, sampling_time, 0, sampling_time, rows,
        bytes_scanned, bytes_returned, data_cost, computation_cost, cost
    ]
示例#6
0
def run_local_indexed_sampling(stats,
                               sort_field_index,
                               sort_field,
                               k,
                               sample_size,
                               batch_size,
                               parallel,
                               use_pandas,
                               sort_order,
                               buffer_size,
                               table_parts_start,
                               table_parts_end,
                               tbl_s3key,
                               shards_path,
                               format_,
                               sampling_only=True):
    """
    Executes the randomly sampled topk query by firstly building a random sample, then extracting the filtering threshold
    Finally scanning the table to retrieve only the records beyond the threshold
    :return:
    """

    secure = False
    use_native = False
    n_threads = multiprocessing.cpu_count()

    print('')
    print("Top K Benchmark, Sampling. Sort Field: {}, Order: {}".format(
        sort_field, sort_order))
    print("----------------------")

    stats += [
        'sampling_{}_{}'.format('indexed', 'non-filtered'), shards_path,
        sort_field, sort_order, k, sample_size, batch_size
    ]

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Sampling
    tbl_smpler = query_plan.add_operator(
        TableRandomSampleGenerator(tbl_s3key, sample_size, batch_size,
                                   "table_sampler", query_plan, False))
    sample_scan = map(
        lambda p: query_plan.add_operator(
            TableRangeAccess(tbl_s3key, use_pandas, secure, use_native,
                             "sample_scan_{}".format(p), query_plan, False)),
        range(table_parts_start, table_parts_end + 1))
    map(lambda (i, op): sample_scan[i].set_nthreads(n_threads),
        enumerate(sample_scan))

    # sample_scan = query_plan.add_operator(
    #     TableRangeAccess(tbl_s3key, use_pandas, secure, use_native, "sample_scan_{}".format(p),
    #                      query_plan, False))
    # sample_scan.set_nthreads(n_threads)

    # Sampling project
    def project_fn(df):
        df.columns = [
            sort_field if x == sort_field_index else x for x in df.columns
        ]
        df = df[[sort_field]].astype(np.float, errors='ignore')
        return df

    project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)]

    sample_project = map(
        lambda p: query_plan.add_operator(
            Project(project_exprs, 'sample_project_{}'.format(p), query_plan,
                    False, project_fn)),
        range(table_parts_start, table_parts_end + 1))
    # sample_project = query_plan.add_operator(
    #                             Project(project_exprs, 'sample_project_{}'.format(p), query_plan, False, project_fn))

    # TopK samples
    sort_expr = [SortExpression(sort_field, float, sort_order)]
    sample_topk = map(
        lambda p: query_plan.add_operator(
            Top(k, sort_expr, use_pandas, 'sample_topk_{}'.format(p),
                query_plan, False)),
        range(table_parts_start, table_parts_end + 1))
    # sample_topk = query_plan.add_operator(
    #                 Top(k, sort_expr, use_pandas, 'sample_topk_{}'.format(p), query_plan, False))

    sample_topk_reduce = query_plan.add_operator(
        Top(k, sort_expr, use_pandas, 'sample_topk_reduce', query_plan, False))

    # Generate SQL command for second scan
    sql_gen = query_plan.add_operator(
        TopKFilterBuild(sort_order, 'float', 'select * from S3object ',
                        ' CAST({} as float) '.format(sort_field), 'sql_gen',
                        query_plan, False))
    if not sampling_only:
        # Scan
        scan = map(
            lambda p: query_plan.add_operator(
                SQLTableScan("{}.{}".format(shards_path, p), "", format_,
                             use_pandas, secure, use_native, 'scan_{}'.format(
                                 p), query_plan, False)),
            range(table_parts_start, table_parts_end + 1))

        # Project
        def project_fn(df):
            df.columns = [
                sort_field if x == sort_field_index else x for x in df.columns
            ]
            df[[sort_field]] = df[[sort_field]].astype(np.float)
            return df

        project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)]

        project = map(
            lambda p: query_plan.add_operator(
                Project(project_exprs, 'project_{}'.format(p), query_plan,
                        False, project_fn)),
            range(table_parts_start, table_parts_end + 1))

        # TopK
        topk = map(
            lambda p: query_plan.add_operator(
                Top(k, sort_expr, use_pandas, 'topk_{}'.format(p), query_plan,
                    False)), range(table_parts_start, table_parts_end + 1))

        # TopK reduce
        topk_reduce = query_plan.add_operator(
            Top(k, sort_expr, use_pandas, 'topk_reduce', query_plan, False))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    map(lambda o: tbl_smpler.connect(o), sample_scan)
    map(lambda (p, o): o.connect(sample_project[p]), enumerate(sample_scan))
    map(lambda (p, o): o.connect(sample_topk[p]), enumerate(sample_project))
    map(lambda o: o.connect(sample_topk_reduce), sample_topk)
    sample_topk_reduce.connect(sql_gen)

    if not sampling_only:
        map(lambda (p, o): sql_gen.connect(o), enumerate(scan))
        map(lambda (p, o): o.connect(project[p]), enumerate(scan))
        map(lambda (p, o): o.connect(topk[p]), enumerate(project))
        map(lambda (p, o): o.connect(topk_reduce), enumerate(topk))
        topk_reduce.connect(collate)
    else:
        sql_gen.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("table parts: {}".format(table_parts_end - table_parts_start))
    print('')

    # Write the plan graph
    # query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(table_parts))

    # Start the query
    query_plan.execute()
    print('Done')
    # tuples = collate.tuples()

    # collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    sampling_time = query_plan.total_elapsed_time
    cost, bytes_scanned, bytes_returned, rows = query_plan.cost()
    computation_cost = query_plan.computation_cost()
    data_cost = query_plan.data_cost()[0]

    stats += [
        sql_gen.threshold, sampling_time, 0, sampling_time, rows,
        bytes_scanned, bytes_returned, data_cost, computation_cost, cost
    ]
示例#7
0
def run_memory_indexed_sampling(stats,
                                sort_field_index,
                                sort_field,
                                k,
                                sample_size,
                                batch_size,
                                parallel,
                                use_pandas,
                                sort_order,
                                buffer_size,
                                table_parts_start,
                                table_parts_end,
                                tbl_s3key,
                                shards_path,
                                format_,
                                sampling_only=True):
    """
    Executes the randomly sampled topk query by firstly building a random sample, then extracting the filtering threshold
    Finally scanning the table to retrieve only the records beyond the threshold
    :return:
    """

    secure = False
    use_native = False
    n_threads = multiprocessing.cpu_count()

    print('')
    print(
        "Top K Benchmark, Memory Indexed Sampling. Sort Field: {}, Order: {}, K: {}, Sample Size: {}, Batch Size: {}"
        .format(sort_field, sort_order, k, sample_size, batch_size))
    print("----------------------")

    stats += [
        'sampling_{}_{}'.format('memory_indexed', 'non-filtered'), shards_path,
        sort_field, sort_order, k, sample_size, batch_size
    ]

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Sampling
    tbl_smpler = query_plan.add_operator(
        TableRandomSampleGenerator(tbl_s3key, sample_size, batch_size,
                                   "table_sampler", query_plan, False))

    sample_scanners = map(
        lambda p: query_plan.add_operator(
            TableRangeAccess(tbl_s3key, use_pandas, secure, use_native,
                             "sample_scan_{}".format(p), query_plan, False)),
        range(table_parts_start, table_parts_end + 1))
    map(lambda op: op.set_nthreads(n_threads), sample_scanners)

    # sample_scan = query_plan.add_operator(
    #     TableRangeAccess(tbl_s3key, use_pandas, secure, use_native, "sample_scan_{}".format(p),
    #                      query_plan, False))
    # sample_scan.set_nthreads(n_threads)

    # Sampling project
    def project_fn(df):
        df.columns = [
            sort_field if x == sort_field_index else x for x in df.columns
        ]
        df = df[[sort_field]].astype(np.float, errors='ignore')
        return df

    project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)]

    sample_project = map(
        lambda p: query_plan.add_operator(
            Project(project_exprs, 'sample_project_{}'.format(p), query_plan,
                    False, project_fn)),
        range(table_parts_start, table_parts_end + 1))
    # sample_project = query_plan.add_operator(
    #                             Project(project_exprs, 'sample_project_{}'.format(p), query_plan, False, project_fn))

    # TopK samples
    sort_expr = [SortExpression(sort_field, float, sort_order)]
    sample_topk = map(
        lambda p: query_plan.add_operator(
            Top(k, sort_expr, use_pandas, 'sample_topk_{}'.format(p),
                query_plan, False)),
        range(table_parts_start, table_parts_end + 1))
    # sample_topk = query_plan.add_operator(
    #                 Top(k, sort_expr, use_pandas, 'sample_topk_{}'.format(p), query_plan, False))

    sample_topk_reduce = query_plan.add_operator(
        Top(k, sort_expr, use_pandas, 'sample_topk_reduce', query_plan, False))

    # Generate SQL command for second scan
    sql_gen = query_plan.add_operator(
        TopKFilterBuild(sort_order, 'float', 'select * from S3object ',
                        ' CAST({} as float) '.format(sort_field),
                        'sample_sql_gen', query_plan, False))

    if not sampling_only:
        # Scan
        scan = map(
            lambda p: query_plan.add_operator(
                SQLTableScan("{}.{}".format(shards_path, p), "", format_,
                             use_pandas, secure, use_native, 'scan_{}'.format(
                                 p), query_plan, False)),
            range(table_parts_start, table_parts_end + 1))

        # Project
        def project_fn(df):
            df.columns = [
                sort_field if x == sort_field_index else x for x in df.columns
            ]
            df[[sort_field]] = df[[sort_field]].astype(np.float)
            return df

        project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)]

        project = map(
            lambda p: query_plan.add_operator(
                Project(project_exprs, 'project_{}'.format(p), query_plan,
                        False, project_fn)),
            range(table_parts_start, table_parts_end + 1))

        # TopK
        topk = map(
            lambda p: query_plan.add_operator(
                Top(k, sort_expr, use_pandas, 'topk_{}'.format(p), query_plan,
                    False)), range(table_parts_start, table_parts_end + 1))

        # TopK reduce
        topk_reduce = query_plan.add_operator(
            Top(k, sort_expr, use_pandas, 'topk_reduce', query_plan, False))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    map(lambda op: tbl_smpler.connect(op), sample_scanners)
    map(lambda (p, o): o.connect(sample_project[p]),
        enumerate(sample_scanners))
    map(lambda (p, o): o.connect(sample_topk[p]), enumerate(sample_project))
    map(lambda o: o.connect(sample_topk_reduce), sample_topk)
    sample_topk_reduce.connect(sql_gen)

    if not sampling_only:
        map(lambda (p, o): sql_gen.connect(o), enumerate(scan))
        map(lambda (p, o): o.connect(project[p]), enumerate(scan))
        map(lambda (p, o): o.connect(topk[p]), enumerate(project))
        map(lambda (p, o): o.connect(topk_reduce), enumerate(topk))
        topk_reduce.connect(collate)
    else:
        sql_gen.connect(collate)

    # Start the query
    query_plan.execute()
    print('Done')
    # tuples = collate.tuples()

    # collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    # query_plan.stop()

    sampling_threshold = query_plan.retrieve_sampling_threshold()
    sampling_runtime = query_plan.get_phase_runtime('sampl')
    sampling_num_http_requests, sampling_requests_cost = query_plan.requests_cost(
        'sampl')
    sampling_returned_bytes, sampling_returned_rows, sampling_transfer_cost = query_plan.data_transfer_cost(
        phase_keyword='sampl')
    sampling_scanned_bytes, sampling_scan_cost = query_plan.data_scanning_cost(
        'sampl')

    total_runtime = query_plan.total_elapsed_time
    total_http_requests, total_requests_cost = query_plan.requests_cost()
    total_returned_bytes, total_returned_rows, total_transfer_cost = query_plan.data_transfer_cost(
    )
    total_scanned_bytes, total_scan_cost = query_plan.data_scanning_cost()

    total_data_cost = query_plan.data_cost()[0]
    total_computation_cost = query_plan.computation_cost()
    total_cost = query_plan.cost()[0]

    stats += [
        sampling_threshold, sampling_runtime, total_runtime - sampling_runtime,
        total_runtime, sampling_returned_rows,
        sampling_scanned_bytes * BYTE_TO_MB,
        sampling_returned_bytes * BYTE_TO_MB, sampling_num_http_requests,
        sampling_requests_cost, sampling_transfer_cost, sampling_scan_cost,
        total_returned_rows, total_scanned_bytes * BYTE_TO_MB,
        total_returned_bytes * BYTE_TO_MB, total_http_requests,
        total_requests_cost, total_transfer_cost, total_scan_cost,
        total_data_cost, total_computation_cost, total_cost
    ]
示例#8
0
def run(parallel, use_pandas, secure, use_native, buffer_size, format_, customer_parts, order_parts, lineitem_parts,
        customer_sharded,
        order_sharded, lineitem_sharded, other_parts, fp_rate, sf, expected_result, customer_filter_sql=None,
        order_filter_sql=None, lineitem_filter_sql=None):
    """

    :return: None
    """

    print('')
    print("TPCH Q3 Bloom Join")
    print("------------------")

    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    customer_scan = map(lambda p:
                        query_plan.add_operator(
                            SQLTableScan(get_file_key('customer', customer_sharded, p, sf, format_),
                                         "select "
                                         "  c_custkey "
                                         "from "
                                         "  S3Object "
                                         "where "
                                         "  c_mktsegment = 'BUILDING' "
                                         "  {} "
                                         "  {} "
                                         .format(
                                             ' and ' + customer_filter_sql if customer_filter_sql is not None else '',
                                             get_sql_suffix('customer', customer_parts, p, customer_sharded,
                                                            add_where=False)),
                                         format_,
                                         use_pandas, secure, use_native,
                                         'customer_scan' + '_{}'.format(p),
                                         query_plan,
                                         False)),
                        range(0, customer_parts))

    def customer_project_fn(df):
        df = df.filter(items=['_0'], axis=1)

        df.rename(columns={'_0': 'c_custkey'},
                  inplace=True)

        return df

    customer_project = map(lambda p:
                           query_plan.add_operator(
                               Project([],
                                       'customer_project' + '_{}'.format(p),
                                       query_plan,
                                       False, customer_project_fn)),
                           range(0, customer_parts))

    customer_bloom_create = query_plan.add_operator(
        BloomCreate('c_custkey', 'customer_bloom_create', query_plan, False,
                    fp_rate))

    customer_map = map(lambda p:
                       query_plan.add_operator(Map('c_custkey', 'customer_map' + '_' + str(p), query_plan, False)),
                       range(0, customer_parts))

    order_scan = map(lambda p:
                     query_plan.add_operator(
                         SQLTableScanBloomUse(get_file_key('orders', order_sharded, p, sf, format_),
                                              "select "
                                              "  o_custkey, o_orderkey, o_orderdate, o_shippriority "
                                              "from "
                                              "  S3Object "
                                              "where "
                                              "  cast(o_orderdate as timestamp) < cast('1995-03-01' as timestamp) "
                                              "  {} "
                                              "  {} "
                                              .format(
                                                  ' and ' + order_filter_sql if order_filter_sql is not None else '',
                                                  get_sql_suffix('orders', order_parts, p, order_sharded,
                                                                 add_where=False)),
                                              'o_custkey',
                                              format_,
                                              use_pandas, secure, use_native,
                                              'order_scan' + '_{}'.format(p),
                                              query_plan,
                                              False)),
                     range(0, order_parts))

    def order_project_fn(df):
        df = df.filter(items=['_0', '_1', '_2', '_3'], axis=1)

        df.rename(columns={'_0': 'o_custkey', '_1': 'o_orderkey', '_2': 'o_orderdate', '_3': 'o_shippriority'},
                  inplace=True)

        return df

    order_project = map(lambda p:
                        query_plan.add_operator(
                            Project([],
                                    'order_project' + '_{}'.format(p),
                                    query_plan,
                                    False, order_project_fn)),
                        range(0, customer_parts))

    order_map_1 = map(lambda p:
                      query_plan.add_operator(Map('o_custkey', 'order_map_1' + '_' + str(p), query_plan, False)),
                      range(0, order_parts))

    customer_order_join_build = map(lambda p:
                                    query_plan.add_operator(
                                        HashJoinBuild('c_custkey',
                                                      'customer_order_join_build' + '_' + str(p), query_plan,
                                                      False)),
                                    range(0, other_parts))

    customer_order_join_probe = map(lambda p:
                                    query_plan.add_operator(
                                        HashJoinProbe(JoinExpression('c_custkey', 'o_custkey'),
                                                      'customer_order_join_probe' + '_' + str(p),
                                                      query_plan, False)),
                                    range(0, other_parts))

    order_bloom_create = query_plan.add_operator(
        BloomCreate('o_orderkey', 'order_bloom_create', query_plan, False,
                    fp_rate))

    lineitem_scan = map(lambda p:
                        query_plan.add_operator(
                            SQLTableScanBloomUse(get_file_key('lineitem', lineitem_sharded, p, sf, format_),
                                                 "select "
                                                 "  l_orderkey, l_extendedprice, l_discount "
                                                 "from "
                                                 "  S3Object "
                                                 "where "
                                                 "  cast(l_shipdate as timestamp) > cast('1995-03-01' as timestamp) "
                                                 "  {} "
                                                 "  {} "
                                                 .format(
                                                     ' and ' + lineitem_filter_sql if lineitem_filter_sql is not None else '',
                                                     get_sql_suffix('lineitem', lineitem_parts, p, lineitem_sharded,
                                                                    add_where=False)),
                                                 'l_orderkey', format_,
                                                 use_pandas, secure, use_native,
                                                 'lineitem_scan' + '_{}'.format(p),
                                                 query_plan,
                                                 False)),
                        range(0, lineitem_parts))

    def lineitem_project_fn(df):
        df = df.filter(items=['_0', '_1', '_2'], axis=1)

        df.rename(columns={'_0': 'l_orderkey', '_1': 'l_extendedprice', '_2': 'l_discount'},
                  inplace=True)

        return df

    lineitem_project = map(lambda p:
                           query_plan.add_operator(
                               Project([],
                                       'lineitem_project' + '_{}'.format(p),
                                       query_plan,
                                       False, lineitem_project_fn)),
                           range(0, lineitem_parts))

    lineitem_map = map(lambda p:
                       query_plan.add_operator(Map('l_orderkey', 'lineitem_map' + '_' + str(p), query_plan, False)),
                       range(0, lineitem_parts))

    order_map_2 = map(lambda p:
                      query_plan.add_operator(Map('o_orderkey', 'order_map_2' + '_' + str(p), query_plan, False)),
                      range(0, other_parts))

    customer_order_lineitem_join_build = map(lambda p:
                                             query_plan.add_operator(
                                                 HashJoinBuild('o_orderkey',
                                                               'customer_order_lineitem_join_build' + '_' + str(p),
                                                               query_plan,
                                                               False)),
                                             range(0, other_parts))

    customer_order_lineitem_join_probe = map(lambda p:
                                             query_plan.add_operator(
                                                 HashJoinProbe(JoinExpression('o_orderkey', 'l_orderkey'),
                                                               'customer_order_lineitem_join_probe' + '_' + str(p),
                                                               query_plan, False)),
                                             range(0, other_parts))

    def groupby_fn(df):
        df['l_extendedprice'] = df['l_extendedprice'].astype(np.float)
        df['l_discount'] = df['l_discount'].astype(np.float)
        df['revenue'] = df['l_extendedprice'] * (1 - df['l_discount'])
        grouped = df.groupby(['l_orderkey', 'o_orderdate', 'o_shippriority'])
        agg_df = grouped['revenue'].sum()
        return agg_df.reset_index()

    group = map(lambda p:
                query_plan.add_operator(
                    Group(
                        ['l_orderkey', 'o_orderdate', 'o_shippriority'],  # l_partkey
                        [
                            AggregateExpression(AggregateExpression.SUM,
                                                lambda t_: float(t_['l_extendedprice'] * (1 - t_['l_discount'])))
                        ],
                        'group' + '_{}'.format(p), query_plan,
                        False, groupby_fn)),
                range(0, other_parts))

    def group_reduce_fn(df):
        grouped = df.groupby(['l_orderkey', 'o_orderdate', 'o_shippriority'])
        agg_df = grouped['revenue'].sum()
        return agg_df.reset_index()

    group_reduce = query_plan.add_operator(
        Group(
            ['l_orderkey', 'o_orderdate', 'o_shippriority'],  # l_partkey
            [
                AggregateExpression(AggregateExpression.SUM,
                                    lambda t_: float(t_['l_extendedprice'] * (1 - t_['l_discount'])))
            ],
            'group_reduce', query_plan,
            False, group_reduce_fn))

    top = query_plan.add_operator(
        Top(10, [SortExpression('revenue', float, 'DESC'), SortExpression('o_orderdate', date, 'ASC')], use_pandas,
            'top', query_plan,
            False))

    collate = query_plan.add_operator(tpch_q19.collate_op('collate', query_plan))

    # Inline what we can
    map(lambda o: o.set_async(False), lineitem_project)
    map(lambda o: o.set_async(False), customer_project)
    map(lambda o: o.set_async(False), order_project)
    map(lambda o: o.set_async(False), lineitem_map)
    map(lambda o: o.set_async(False), customer_map)
    map(lambda o: o.set_async(False), order_map_1)
    map(lambda o: o.set_async(False), order_map_2)

    # Connect the operators
    connect_many_to_many(customer_scan, customer_project)
    connect_many_to_many(customer_project, customer_map)

    connect_many_to_one(customer_project, customer_bloom_create)
    connect_one_to_many(customer_bloom_create, order_scan)

    connect_many_to_many(order_scan, order_project)
    connect_many_to_many(order_project, order_map_1)

    connect_all_to_all(customer_map, customer_order_join_build)
    connect_many_to_many(customer_order_join_build, customer_order_join_probe)
    connect_all_to_all(order_map_1, customer_order_join_probe)

    # connect_many_to_one(customer_order_join_probe, collate)

    connect_many_to_one(order_project, order_bloom_create)
    connect_one_to_many(order_bloom_create, lineitem_scan)

    connect_many_to_many(lineitem_scan, lineitem_project)
    connect_many_to_many(lineitem_project, lineitem_map)

    connect_many_to_many(customer_order_join_probe, order_map_2)
    connect_all_to_all(order_map_2, customer_order_lineitem_join_build)
    connect_many_to_many(customer_order_lineitem_join_build, customer_order_lineitem_join_probe)
    connect_all_to_all(lineitem_map, customer_order_lineitem_join_probe)

    # connect_many_to_one(customer_order_lineitem_join_probe, collate)

    connect_many_to_many(customer_order_lineitem_join_probe, group)
    # connect_many_to_one(group, collate)

    connect_many_to_one(group, group_reduce)
    connect_one_to_one(group_reduce, top)
    connect_one_to_one(top, collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print('secure: {}'.format(secure))
    print('use_native: {}'.format(use_native))
    print("customer_parts: {}".format(customer_parts))
    print("order_parts: {}".format(order_parts))
    print("lineitem_parts: {}".format(lineitem_parts))
    print("customer_sharded: {}".format(customer_sharded))
    print("order_sharded: {}".format(order_sharded))
    print("lineitem_sharded: {}".format(lineitem_sharded))
    print("other_parts: {}".format(other_parts))
    print("fp_rate: {}".format(fp_rate))
    print("format: {}".format(format_))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    field_names = ['l_orderkey', 'o_orderdate', 'o_shippriority', 'revenue']

    assert len(tuples) == 10 + 1

    assert tuples[0] == field_names

    # NOTE: This result has been verified with the equivalent data and query on PostgreSQL
    test_util.assert_tuples(expected_result, tuples)
示例#9
0
def test_group_baseline():
    """TPC-H Q1

    :return: None
    """

    # Query plan

    # select
    #   l_returnflag,
    #   l_linestatus,
    #   sum(l_quantity) as sum_qty,
    #   sum(l_extendedprice) as sum_base_price,
    #   sum(l_extendedprice * (1 - l_discount)) as sum_disc_price,
    #   sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge,
    #   avg(l_quantity) as avg_qty,
    #   avg(l_extendedprice) as avg_price,
    #   avg(l_discount) as avg_disc,
    #   count(*) as count_order
    # from
    #   lineitem
    # where
    #   l_shipdate <= date '1992-04-01' - interval '[DELTA]' day(3)
    # group by
    #   l_returnflag,
    #   l_linestatus
    # order by
    #   l_returnflag,
    #   l_linestatus;

    query_plan = QueryPlan()

    delta_days = 60  # TODO: This is supposed to be randomized I think
    shipped_date = datetime.strptime('1992-04-01', '%Y-%m-%d') - timedelta(days=delta_days)

    ts = query_plan.add_operator(SQLTableScan("lineitem.csv",
                                              "select * from S3Object "
                                              "where cast(l_shipdate as timestamp) <= cast(\'{}\' as timestamp)"
                                              .format(shipped_date.strftime('%Y-%m-%d')),
                                              False, 'lineitem', query_plan,
                                              False))
    g = query_plan.add_operator(Group(
        [
            '_8',  # l_returnflag
            '_9'  # l_linestatus
        ],
        [
            # sum(l_quantity)
            AggregateExpression(AggregateExpression.SUM, lambda t_: float(t_['_4'])),
            # sum(l_extendedprice) as sum_base_price
            AggregateExpression(AggregateExpression.SUM, lambda t_: float(t_['_5'])),
            # sum(l_extendedprice * (1 - l_discount)) as sum_disc_price
            AggregateExpression(AggregateExpression.SUM, lambda t_: float(t_['_5']) * (1 - float(t_['_6']))),
            # sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge
            AggregateExpression(AggregateExpression.SUM,
                                lambda t_: float(t_['_5']) * (1 - float(t_['_6'])) * (1 + float(t_['_7']))),
            # avg(l_quantity)
            AggregateExpression(AggregateExpression.AVG, lambda t_: float(t_['_4'])),
            # avg(l_extendedprice)
            AggregateExpression(AggregateExpression.AVG, lambda t_: float(t_['_5'])),
            # avg(l_discount)
            AggregateExpression(AggregateExpression.AVG, lambda t_: float(t_['_6'])),
            # count(*) as count_order
            AggregateExpression(AggregateExpression.COUNT, lambda t_: t_['_0'])
        ],
        'lineitem_grouped', query_plan,
        False))
    s = query_plan.add_operator(Sort(
        [
            SortExpression('_0', str, 'ASC'),
            SortExpression('_1', str, 'ASC')
        ],
        'lineitem_group_sorted', query_plan,
        False))
    c = query_plan.add_operator(Collate('collation', query_plan, False))

    ts.connect(g)
    g.connect(s)
    s.connect(c)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    # Assert the results
    # num_rows = 0
    # for t in c.tuples():
    #     num_rows += 1
    #     print("{}:{}".format(num_rows, t))

    field_names = ['_0', '_1', '_2', '_3', '_4', '_5', '_6', '_7', '_8', '_9']

    assert c.tuples()[0] == field_names

    assert len(c.tuples()) == 2 + 1

    # These have been verified in Postgres
    assert c.tuples()[1] == ["A", "F", 129850, 194216048.19000033, 184525343.78730044, 191943492.96455324,
                             25.445816186556925,
                             38059.19031746035, 0.050005878894768374, 5103]
    assert c.tuples()[2] == ["R", "F", 129740, 193438367.33999985, 183701990.7670003, 191045646.36937532,
                             25.509241053873353,
                             38033.49731419579, 0.05061541486433399, 5086]

    # Write the metrics
    query_plan.print_metrics()
示例#10
0
def topk_with_sampling(stats,
                       k,
                       k_scale=1,
                       sort_index='_5',
                       col_type=float,
                       sort_field='l_extendedprice',
                       sort_order='DESC',
                       use_pandas=True,
                       filtered=False,
                       conservative=False,
                       table_name='tpch-sf1/lineitem.csv',
                       shards_prefix='tpch-sf1/lineitem_sharded',
                       shards_start=0,
                       shards_end=31):
    """
    Executes the optimized topk query by firstly retrieving the first k tuples.
    Based on the retrieved tuples, table scan operator gets only the tuples larger/less than the most significant
    tuple in the sample
    :return:
    """

    limit = k
    num_rows = 0
    parallel_shards = True
    processes = multiprocessing.cpu_count()

    query_stats = [
        'sampling_{}_{}'.format(
            'conservative' if conservative else 'aggressive',
            'filtered' if filtered else 'non-filtered'), shards_prefix,
        sort_field, sort_order, limit * k_scale, limit * k_scale
    ]

    sql = 'select * from S3Object;'

    if filtered:
        sql = '''select l_extendedprice
              from S3Object;'''
        sort_index = '_0'

    print("\n\nSampling params:")
    print("Scale: {}, Sort Field: {}, Sort Order: {}\n".format(
        k_scale, sort_field, sort_order))

    query_plan = QueryPlan(is_async=True)

    # Query plan
    ts = query_plan.add_operator(
        TopKTableScan(
            table_name, sql, use_pandas, True, False, limit, k_scale,
            SortExpression(sort_index, col_type, sort_order, sort_field),
            conservative, shards_start, shards_end, parallel_shards,
            shards_prefix, processes, 'topk_table_scan', query_plan, False))
    c = query_plan.add_operator(Collate('collate', query_plan, False))

    ts.connect(c)

    # Write the plan graph
    # query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    # Assert the results
    for t in c.tuples():
        num_rows += 1
        # print("{}:{}".format(num_rows, t))

    # assert num_rows == limit + 1

    cost, bytes_scanned, bytes_returned, rows = query_plan.cost()
    computation_cost = query_plan.computation_cost()
    data_cost = query_plan.data_cost()[0]

    query_stats += [
        ts.msv,
        ts.op_metrics.sampling_time,
        query_plan.total_elapsed_time,
        ts.op_metrics.sampling_time + query_plan.total_elapsed_time,
        # 0 if num_rows >= limit else 1,
        rows,
        bytes_scanned,
        bytes_returned,
        data_cost,
        computation_cost,
        cost  #,
        # num_rows == limit + 1
    ]
    stats.append(query_stats)

    # Write the metrics
    query_plan.print_metrics()
    query_plan.stop()
    topk_op = query_plan.operators["topk_table_scan"]
    max_table_scan, _ = max([(op, op.op_metrics.elapsed_time())
                             for op in query_plan.operators.values()],
                            key=lambda tup: tup[1])
    OpMetrics.print_overall_metrics(
        [max_table_scan, topk_op, topk_op.sample_op],
        "TopKTableScan total time")
示例#11
0
def run(sort_field, k, parallel, use_pandas, sort_order, buffer_size, table_parts, path, format_):
    """
    Executes the baseline topk query by scanning a table and keeping track of the max/min records in a heap
    :return:
    """

    secure = False
    use_native = False
    print('')
    print("Top K Benchmark, Filtered. Sort Field: {}, Order: {}".format(sort_field, sort_order))
    print("----------------------")

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)
    
    # Scan
    scan = map(lambda p: 
               query_plan.add_operator(
                    SQLTableScan("{}/topk_data_{}.csv".format(path, p),
                        "select * from S3Object;", format_, use_pandas, secure, use_native,
                        'scan_{}'.format(p), query_plan,
                        False)),
               range(0, table_parts))
  
    # Project
    def project_fn(df):
        df.columns = ['F0', 'F1', 'F2']
        df[ [sort_field] ] = df[ [sort_field] ].astype(np.float)
        return df
   
    project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)] 
    
    project = map(lambda p: 
                  query_plan.add_operator( 
                      Project(project_exprs, 'project_{}'.format(p), query_plan, False, project_fn)),
                  range(0, table_parts))

    # TopK
    sort_expr = SortExpression(sort_field, 'float', sort_order)
    topk = map(lambda p: 
               query_plan.add_operator(
                    Top(k, sort_expr, use_pandas, 'topk_{}'.format(p), query_plan, False)),
               range(0, table_parts))

    # TopK reduce
    topk_reduce = query_plan.add_operator(
                    Top(k, sort_expr, use_pandas, 'topk_reduce', query_plan, False)) 

    collate = query_plan.add_operator(
        Collate('collate', query_plan, False))

    #profile_path = '../benchmark-output/topk/'
    #scan[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_scan_0" + ".prof"))
    #project[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_project_0" + ".prof"))
    #groupby[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_groupby_0" + ".prof"))
    #groupby_reduce.set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_groupby_reduce" + ".prof"))
    #collate.set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_collate" + ".prof"))

    map(lambda (p, o): o.connect(project[p]), enumerate(scan))
    map(lambda (p, o): o.connect(topk[p]), enumerate(project))
    map(lambda (p, o): o.connect(topk_reduce), enumerate(topk))
    topk_reduce.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("table parts: {}".format(table_parts))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(table_parts))

    # Start the query
    query_plan.execute()
    print('Done')
    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()
示例#12
0
def __test_topk_with_sampling(k_scale=1,
                              sort_index='_5',
                              col_type=float,
                              sort_field='l_extendedprice',
                              sort_order='DESC',
                              use_pandas=True,
                              filtered=False):
    """
    Executes the optimized topk query by firstly retrieving the first k tuples.
    Based on the retrieved tuples, table scan operator gets only the tuples larger/less than the most significant
    tuple in the sample
    :return:
    """

    limit = 50
    num_rows = 0
    shards = 32
    parallel_shards = True
    shards_prefix = "sf1000-lineitem"
    processes = multiprocessing.cpu_count()

    sql = 'select * from S3Object;'

    if filtered:
        sql = '''select l_orderkey, l_partkey, l_suppkey, l_linenumber, l_quantity, l_extendedprice, l_comment
              from S3Object;'''

    print("\n\nSampling params:")
    print("Scale: {}, Sort Field: {}, Sort Order: {}\n".format(
        k_scale, sort_field, sort_order))

    query_plan = QueryPlan(is_async=True)

    # Query plan
    ts = query_plan.add_operator(
        TopKTableScan(
            'lineitem.csv', sql, use_pandas, limit, k_scale,
            SortExpression(sort_index, col_type, sort_order,
                           sort_field), shards, parallel_shards, shards_prefix,
            processes, 'topk_table_scan', query_plan, False))
    c = query_plan.add_operator(Collate('collate', query_plan, False))

    ts.connect(c)

    # Write the plan graph
    # query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    # Assert the results
    for t in c.tuples():
        num_rows += 1
        print("{}:{}".format(num_rows, t))

    assert num_rows == limit + 1

    # Write the metrics
    query_plan.print_metrics()
    query_plan.stop()
    topk_op = query_plan.operators["topk_table_scan"]
    max_table_scan, _ = max([(op, op.op_metrics.elapsed_time())
                             for op in query_plan.operators.values()],
                            key=lambda tup: tup[1])
    OpMetrics.print_overall_metrics(
        [max_table_scan, topk_op, topk_op.sample_op],
        "TopKTableScan total time")
示例#13
0
def __test_topk_baseline(sort_index='_5',
                         col_type=float,
                         col_name='l_quantity',
                         sort_order='DESC',
                         use_pandas=True,
                         filtered=False):
    """
    Executes the baseline topk query by scanning a table and keeping track of the max/min records in a heap
    :return:
    """

    limit = 50
    num_rows = 0
    shards = 32
    parallel_shards = True
    shards_prefix = "sf1000-lineitem"
    processes = multiprocessing.cpu_count()

    sql = 'select * from S3Object;'

    if filtered:
        sql = '''select l_orderkey, l_partkey, l_suppkey, l_linenumber, l_quantity, l_extendedprice, l_comment
                  from S3Object;'''

    print("\n\nBaseline TopK with order {} on field {}\n".format(
        sort_order, col_name))

    query_plan = QueryPlan(is_async=True)

    # Query plan
    # ts = query_plan.add_operator(
    #     SQLTableScan('lineitem.csv', 'select * from S3Object limit {};'.format(limit), 'table_scan', query_plan, False))
    sort_exp = SortExpression(sort_index, col_type, sort_order)
    top_op = query_plan.add_operator(
        Top(limit, sort_exp, use_pandas, 'topk', query_plan, False))
    for process in range(processes):
        proc_parts = [x for x in range(shards) if x % processes == process]
        pc = query_plan.add_operator(
            SQLShardedTableScan("lineitem.csv", sql, use_pandas, True,
                                "topk_table_scan_parts_{}".format(proc_parts),
                                proc_parts, shards_prefix, parallel_shards,
                                query_plan, False))
        # pc.set_profiled(True, "topk_table_scan_parts_{}.txt".format(proc_parts))
        pc_top = query_plan.add_operator(
            Top(limit, sort_exp, use_pandas,
                'topk_parts_{}'.format(proc_parts), query_plan, False))
        pc.connect(pc_top)
        pc_top.connect(top_op)

    c = query_plan.add_operator(Collate('collate', query_plan, False))

    top_op.connect(c)

    # Write the plan graph
    # query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    # Assert the results
    for t in c.tuples():
        num_rows += 1
        print("{}:{}".format(num_rows, t))

    assert num_rows == limit + 1

    # Write the metrics
    query_plan.print_metrics()
    query_plan.stop()
示例#14
0
def run_baseline_topk(stats, sort_field_index, sort_field, k, parallel,
                      use_pandas, sort_order, buffer_size, table_parts_start,
                      table_parts_end, tbl_s3key, format_, shards_path):

    secure = False
    use_native = False
    print('')
    print("Top K Benchmark, Baseline. Sort Field: {}, Order: {}, k: {}".format(
        sort_field, sort_order, k))
    print("----------------------")

    stats += ['baseline', shards_path, sort_field, sort_order, k, 0, 0]

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Sampling
    table_parts = table_parts_end - table_parts_start + 1
    per_part_samples = int(sample_size / table_parts)
    table_name = os.path.basename(tbl_s3key)

    # Scan
    scan = map(
        lambda p: query_plan.add_operator(
            SQLTableScan("{}.{}".format(shards_path, p
                                        ), "select * from S3Object;", format_,
                         use_pandas, secure, use_native, 'scan_{}'.format(
                             p), query_plan, False)),
        range(table_parts_start, table_parts_end + 1))

    # Project
    def project_fn(df):
        df.columns = [
            sort_field if x == sort_field_index else x for x in df.columns
        ]
        df[[sort_field]] = df[[sort_field]].astype(np.float)
        return df

    project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)]

    project = map(
        lambda p: query_plan.add_operator(
            Project(project_exprs, 'project_{}'.format(p), query_plan,
                    False, project_fn)),
        range(table_parts_start, table_parts_end + 1))

    # TopK
    sort_expr = [SortExpression(sort_field, float, sort_order)]
    topk = map(
        lambda p: query_plan.add_operator(
            Top(k, sort_expr, use_pandas, 'topk_{}'.format(p), query_plan,
                False)), range(table_parts_start, table_parts_end + 1))

    # TopK reduce
    topk_reduce = query_plan.add_operator(
        Top(k, sort_expr, use_pandas, 'topk_reduce', query_plan, False))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    map(lambda (p, o): o.connect(project[p]), enumerate(scan))
    map(lambda (p, o): o.connect(topk[p]), enumerate(project))
    map(lambda (p, o): o.connect(topk_reduce), enumerate(topk))
    topk_reduce.connect(collate)

    # Start the query
    query_plan.execute()
    print('Done')

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    query_time = query_plan.total_elapsed_time
    cost, bytes_scanned, bytes_returned, rows = query_plan.cost()
    computation_cost = query_plan.computation_cost()
    data_cost = query_plan.data_cost()[0]

    stats += [
        0, 0, 0, query_time, rows, bytes_scanned, bytes_returned, data_cost,
        computation_cost, cost
    ]
示例#15
0
def topk_baseline(stats,
                  k,
                  sort_index='_5',
                  col_type=float,
                  col_name='l_extendedprice',
                  sort_order='DESC',
                  use_pandas=True,
                  filtered=False,
                  table_name='tpch-sf1/lineitem.csv',
                  shards_prefix='tpch-sf1/lineitem_sharded',
                  shards_start=0,
                  shards_end=31):
    """
    Executes the baseline topk query by scanning a table and keeping track of the max/min records in a heap
    :return:
    """

    limit = k
    num_rows = 0
    parallel_shards = True
    processes = multiprocessing.cpu_count()

    query_stats = [
        'baseline' if filtered is False else 'filtered', shards_prefix,
        col_name, sort_order, limit
    ]

    sql = 'select * from S3Object;'

    if filtered:
        sql = '''select l_extendedprice
                  from S3Object;'''
        sort_index = '_0'

    print("\n\nBaseline TopK with order {} on field {}\n".format(
        sort_order, col_name))

    query_plan = QueryPlan(is_async=True)

    # Query plan
    # ts = query_plan.add_operator(
    #     SQLTableScan('lineitem.csv', 'select * from S3Object limit {};'.format(limit), 'table_scan', query_plan, False))
    sort_exp = SortExpression(sort_index, col_type, sort_order)
    top_op = query_plan.add_operator(
        Top(limit, sort_exp, use_pandas, 'topk', query_plan, False))
    for process in range(processes):
        proc_parts = [
            x for x in range(shards_start, shards_end + 1)
            if x % processes == process
        ]
        pc = query_plan.add_operator(
            SQLShardedTableScan(table_name, sql, use_pandas, True, False,
                                "topk_table_scan_parts_{}".format(proc_parts),
                                proc_parts, shards_prefix, parallel_shards,
                                query_plan, False))
        # pc.set_profiled(True, "topk_table_scan_parts_{}.txt".format(proc_parts))
        pc_top = query_plan.add_operator(
            Top(limit, sort_exp, use_pandas,
                'topk_parts_{}'.format(proc_parts), query_plan, False))
        pc.connect(pc_top)
        pc_top.connect(top_op)

    c = query_plan.add_operator(Collate('collate', query_plan, False))

    top_op.connect(c)

    # Write the plan graph
    # query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    # Assert the results
    for t in c.tuples():
        num_rows += 1
        # print("{}:{}".format(num_rows, t))

    # assert num_rows == limit + 1

    cost, bytes_scanned, bytes_returned, rows = query_plan.cost()
    computation_cost = query_plan.computation_cost()
    data_cost = query_plan.data_cost()[0]

    query_stats += [
        0, 0, query_plan.total_elapsed_time, query_plan.total_elapsed_time, 0,
        rows, bytes_scanned, bytes_returned, data_cost, computation_cost, cost,
        num_rows == limit + 1
    ]
    stats.append(query_stats)

    # Write the metrics
    query_plan.print_metrics()
    query_plan.stop()
示例#16
0
def run(sort_field, k, parallel, use_pandas, sort_order, buffer_size,
        table_first_part, table_parts, queried_columns, select_columns, path,
        format_):
    """
    Executes the baseline topk query by scanning a table and keeping track of the max/min records in a heap
    :return:
    """

    secure = False
    use_native = False
    print('')
    print("Top K Benchmark, ColumnScan. Sort Field: {}, Order: {}".format(
        sort_field, sort_order))
    print("----------------------")

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Sampling
    sample_scan = map(
        lambda p: query_plan.add_operator(
            #SQLTableScan("{}/lineitem.snappy.parquet.{}".format(path, p),
            SQLTableScan(
                "{}/lineitem.typed.1RowGroup.parquet.{}".format(
                    path, p), 'select {} from S3Object;'.format(
                        sort_field), format_, use_pandas, secure, use_native,
                'column_scan_{}'.format(p), query_plan, False)),
        range(table_first_part, table_first_part + table_parts))

    # Sampling project
    def project_fn1(df):
        df.columns = [sort_field]
        df[[sort_field]] = df[[sort_field]].astype(np.float)
        return df

    project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)]

    sample_project = map(
        lambda p: query_plan.add_operator(
            Project(project_exprs, 'sample_project_{}'.format(p), query_plan,
                    False, project_fn1)),
        range(table_first_part, table_first_part + table_parts))

    # TopK samples
    sort_expr = SortExpression(sort_field, float, sort_order)

    sample_topk = query_plan.add_operator(
        Top(k, sort_expr, use_pandas, 'sample_topk', query_plan, False))

    # Generate SQL command for second scan
    sql_gen = query_plan.add_operator(
        TopKFilterBuild(
            sort_order,
            'float',
            'select {} from S3object '.format(select_columns),
            #' CAST({} as float) '.format(sort_field), 'sql_gen', query_plan, False ))
            ' {} '.format(sort_field),
            'sql_gen',
            query_plan,
            False))

    # Scan
    scan = map(
        lambda p: query_plan.add_operator(
            #SQLTableScan("{}/lineitem.snappy.parquet.{}".format(path, p),
            SQLTableScan(
                "{}/lineitem.typed.1RowGroup.parquet.{}".format(path, p), "",
                format_, use_pandas, secure, use_native, 'scan_{}'.format(
                    p), query_plan, False)),
        range(table_first_part, table_first_part + table_parts))

    # Project
    def project_fn2(df):
        df.columns = queried_columns
        df[[sort_field]] = df[[sort_field]].astype(np.float)
        return df

    project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)]

    project = map(
        lambda p: query_plan.add_operator(
            Project(project_exprs, 'project_{}'.format(p), query_plan, False,
                    project_fn2)),
        range(table_first_part, table_first_part + table_parts))

    # TopK
    topk = map(
        lambda p: query_plan.add_operator(
            Top(k, sort_expr, use_pandas, 'topk_{}'.format(p),
                query_plan, False)),
        range(table_first_part, table_first_part + table_parts))

    # TopK reduce
    topk_reduce = query_plan.add_operator(
        Top(k, sort_expr, use_pandas, 'topk_reduce', query_plan, False))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    #profile_path = '../benchmark-output/groupby/'
    #scan[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_scan_0" + ".prof"))
    #project[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_project_0" + ".prof"))
    #groupby[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_groupby_0" + ".prof"))
    #groupby_reduce.set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_groupby_reduce" + ".prof"))
    #collate.set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_collate" + ".prof"))

    map(lambda (p, o): o.connect(sample_project[p]), enumerate(sample_scan))
    map(lambda (p, o): o.connect(sample_topk), enumerate(sample_project))
    sample_topk.connect(sql_gen)

    map(lambda (p, o): sql_gen.connect(o), enumerate(scan))
    map(lambda (p, o): o.connect(project[p]), enumerate(scan))
    map(lambda (p, o): o.connect(topk[p]), enumerate(project))
    map(lambda (p, o): o.connect(topk_reduce), enumerate(topk))
    topk_reduce.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("table parts: {}".format(table_parts))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"),
                           gen_test_id() + "-" + str(table_parts))

    # Start the query
    query_plan.execute()
    print('Done')
    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()
示例#17
0
def test_sort_topk():
    """Executes a sorted top k query, which must use the top operator as record sorting can't be pushed into s3. The
    results are collated.

    :return: None
    """

    limit = 5

    query_plan = QueryPlan()

    # Query plan
    ts = query_plan.add_operator(SQLTableScan('supplier.csv',
                                              'select * from S3Object;',
                                              False,
                                              'ts',
                                              query_plan,
                                              False))
    s = query_plan.add_operator(Sort([
        SortExpression('_5', float, 'ASC')
    ], 's', query_plan, False))
    t = query_plan.add_operator(Limit(limit, 't', query_plan, False))
    c = query_plan.add_operator(Collate('c', query_plan, False))

    ts.connect(s)
    s.connect(t)
    t.connect(c)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    # Assert the results
    # num_rows = 0
    # for t in c.tuples():
    #     num_rows += 1
    #     print("{}:{}".format(num_rows, t))

    field_names = ['_0', '_1', '_2', '_3', '_4', '_5', '_6']

    assert len(c.tuples()) == limit + 1

    assert c.tuples()[0] == field_names

    prev = None
    num_rows = 0
    for t in c.tuples():
        num_rows += 1
        # print("{}:{}".format(num_rows, t))
        if num_rows > 1:
            if prev is None:
                prev = t
            else:
                lt = IndexedTuple.build(t, field_names)
                prev_lt = IndexedTuple.build(prev, field_names)
                assert float(lt['_5']) > float(prev_lt['_5'])

    # Write the metrics
    query_plan.print_metrics()