Example #1
def test_scan_empty():
    """Executes a scan where no records are returned. We tst this as it's somewhat peculiar with s3 select, in so much
    as s3 does not return column names when selecting data, meaning, unlike a traditional DBMS, no field names tuple
    should be present in the results.

    :return: None
    """

    query_plan = QueryPlan()

    # Query plan
    ts = query_plan.add_operator(
        SQLTableScan('nation.csv', "select * from s3object limit 0", False,
                     'ts', query_plan, False))
    c = query_plan.add_operator(Collate('c', query_plan, False))

    ts.connect(c)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"),
                           gen_test_id())

    # Start the query
    query_plan.execute()

    # Assert the results
    # num_rows = 0
    # for t in c.tuples():
    #     num_rows += 1
    #     print("{}:{}".format(num_rows, t))

    assert len(c.tuples()) == 0

    # Write the metrics
    query_plan.print_metrics()
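
Because S3 Select omits column names, downstream code that expects a leading field-names tuple (as the local TableScan in the next example produces) has to synthesize one. A minimal sketch of such a helper, using the same '_0', '_1', ... naming convention seen in these tests; the function name is hypothetical and not part of the library:

def with_synthetic_header(tuples, num_cols):
    """Prepend a ['_0', '_1', ...] field-names tuple to S3 Select results.

    Hypothetical helper: S3 Select returns data rows only, so a header has
    to be generated if consumers expect one.
    """
    header = ['_{}'.format(i) for i in range(num_cols)]
    return [header] + list(tuples)

# e.g. with_synthetic_header([['0', 'ALGERIA']], 2)
# -> [['_0', '_1'], ['0', 'ALGERIA']]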
Example #2
def test_scan_simple():
    """Executes a scan. The results are then collated.

    :return: None
    """

    query_plan = QueryPlan()

    # Query plan
    ts = query_plan.add_operator(
        TableScan('nation.csv', 'ts', query_plan, False))
    c = query_plan.add_operator(Collate('c', query_plan, False))

    ts.connect(c)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"),
                           gen_test_id())

    # Start the query
    query_plan.execute()

    # Assert the results
    # num_rows = 0
    # for t in c.tuples():
    #     num_rows += 1
    #     print("{}:{}".format(num_rows, t))

    assert len(c.tuples()) == 25 + 1

    assert c.tuples()[0] == ['_0', '_1', '_2', '_3']

    assert c.tuples()[1] == [
        '0', 'ALGERIA', '0',
        ' haggle. carefully final deposits detect slyly agai'
    ]
    assert c.tuples()[2] == [
        '1', 'ARGENTINA', '1',
        'al foxes promise slyly according to the regular accounts. bold requests alon'
    ]
    assert c.tuples()[3] == [
        '2', 'BRAZIL', '1',
        'y alongside of the pending deposits. carefully special packages are about '
        'the ironic forges. slyly special '
    ]

    # Write the metrics
    query_plan.print_metrics()
Example #3
def test_random_scan_simple():
    """Executes a random scan. The results are then collated.

    :return: None
    """

    query_plan = QueryPlan()

    # Query plan
    random_col_defs = [
        RandomIntColumnDef(0, 9),
        RandomStringColumnDef(10, 20),
        RandomDateColumnDef(datetime.strptime('2017-01-01', '%Y-%m-%d'),
                            datetime.strptime('2018-01-01', '%Y-%m-%d'))
    ]

    random_table_scan = query_plan.add_operator(
        RandomTableScan(10,
                        random_col_defs,
                        'random_table_scan', query_plan,
                        False))

    collate = query_plan.add_operator(
        Collate('collate', query_plan, False))

    random_table_scan.connect(collate)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    collate.print_tuples()

    # Write the metrics
    query_plan.print_metrics()

    # Assert the results
    assert len(collate.tuples()) == 10 + 1

    assert collate.tuples()[0] == ['_0', '_1', '_2']
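
For reference, the three column definitions above presumably generate an integer in [0, 9], a string of 10 to 20 characters, and a date between the two bounds. A rough plain-Python sketch of one such random row, independent of the RandomTableScan operator (the exact semantics of the column defs are an assumption here):

import random
import string
from datetime import datetime, timedelta

def random_row():
    # Sketch only: mimics the three column definitions used above.
    rand_int = random.randint(0, 9)                      # RandomIntColumnDef(0, 9)
    length = random.randint(10, 20)                      # RandomStringColumnDef(10, 20)
    rand_str = ''.join(random.choice(string.ascii_lowercase) for _ in range(length))
    start = datetime.strptime('2017-01-01', '%Y-%m-%d')  # RandomDateColumnDef bounds
    end = datetime.strptime('2018-01-01', '%Y-%m-%d')
    offset = random.randint(0, int((end - start).total_seconds()))
    rand_date = start + timedelta(seconds=offset)
    return [rand_int, rand_str, rand_date]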
Example #4
def test_group_sum():
    """Tests a group by query with a sum aggregate

    :return: None
    """

    query_plan = QueryPlan()

    # Query plan
    # select s_nationkey, sum(float(s_acctbal)) from supplier.csv group by s_nationkey
    ts = query_plan.add_operator(
        SQLTableScan('supplier.csv', 'select * from S3Object;', False, 'ts',
                     query_plan, False))

    g = query_plan.add_operator(
        Group(['_3'], [
            AggregateExpression(AggregateExpression.SUM,
                                lambda t_: float(t_['_5']))
        ], 'g', query_plan, False))

    c = query_plan.add_operator(Collate('c', query_plan, False))

    ts.connect(g)
    g.connect(c)

    # Start the query
    query_plan.execute()

    # Assert the results
    # num_rows = 0
    # for t in c.tuples():
    #     num_rows += 1
    #     print("{}:{}".format(num_rows, t))

    field_names = ['_0', '_1']

    assert c.tuples()[0] == field_names

    assert len(c.tuples()) == 25 + 1

    nation_24 = filter(
        lambda t_: IndexedTuple.build(t_, field_names)['_0'] == '24',
        c.tuples())[0]
    assert round(nation_24[1], 2) == 1833872.56

    # Write the metrics
    query_plan.print_metrics()
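
The Group operator above effectively maintains a dictionary of running sums keyed by s_nationkey. A plain-Python sketch of the same aggregation over raw supplier rows, shown only for comparison (column positions follow the '_3' and '_5' indices used above):

def group_sum(rows):
    # rows are lists of strings as returned by S3 Select; row[3] is
    # s_nationkey and row[5] is s_acctbal, matching '_3' and '_5' above.
    sums = {}
    for row in rows:
        key = row[3]
        sums[key] = sums.get(key, 0.0) + float(row[5])
    return sums

# e.g. group_sum([['0', 'a', 'b', '24', 'c', '100.5'],
#                 ['1', 'a', 'b', '24', 'c', '9.5']])
# -> {'24': 110.0}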
Example #5
def test_filter_1():
    """

    :return:
    """

    query_plan = QueryPlan(buffer_size=64, is_async=True, use_shared_mem=False)

    # Query plan

    ts = query_plan.add_operator(
        SQLTableScan(
            'tpch-sf1/lineitem_sharded/lineitem.csv.0',
            'select * from S3Object where cast(l_extendedprice as float) >= 0 and cast(l_extendedprice  as float)<= 910;',
            Format.CSV, True, False, False, 'ts', query_plan, False))

    # using a 'use_native=True' argument will result in a None object being returned
    '''
    ts = query_plan.add_operator(
        SQLTableScan('tpch-sf1/lineitem_sharded/lineitem.csv.0', 'select l_partkey  from S3Object limit 5;',Format.CSV , True, False,False, 'ts', query_plan, False))
    #random_strings_2.csv
    '''
    c = query_plan.add_operator(Collate('c', query_plan, False))

    ts.connect(c)
    #f.connect(c)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"),
                           gen_test_id())

    # Start the query
    query_plan.execute()

    #assert 2 + 1 == len(c.tuples())
    print("Tuples:")
    print(c.tuples())
    # Write the metrics
    query_plan.print_metrics()
    print(ROOT_DIR)
Example #6
def test_group_empty():
    """Executes a group where no records are returned. We tst this as it's somewhat peculiar with s3 select, in so much
    as s3 does not return column names when selecting data, meaning, unlike a traditional DBMS, no field names tuple
    should be present in the results.

    :return: None
    """

    query_plan = QueryPlan()

    # Query plan
    # select s_nationkey, sum(float(s_acctbal)) from supplier.csv group by s_nationkey
    ts = query_plan.add_operator(
        SQLTableScan('supplier.csv', 'select * from S3Object limit 0;', False,
                     'ts', query_plan, False))

    g = query_plan.add_operator(
        Group(['_3'], [
            AggregateExpression(AggregateExpression.SUM,
                                lambda t_: float(t_['_5']))
        ], 'g', query_plan, False))

    c = query_plan.add_operator(Collate('c', query_plan, False))

    ts.connect(g)
    g.connect(c)

    # Start the query
    query_plan.execute()

    # Assert the results
    # num_rows = 0
    # for t in c.tuples():
    #     num_rows += 1
    #     print("{}:{}".format(num_rows, t))

    field_names = ['_0', '_1']

    assert c.tuples()[0] == field_names

    assert len(c.tuples()) == 0 + 1

    # Write the metrics
    query_plan.print_metrics()
Example #7
def test_filter_arg(object_, sql_query):
    # Let's forget about the local filter for now. The pd._expr field of the PredicateExpression class is not well
    # documented, and it is needed for the Filter class (on line 102).
    """Executes a parameterised S3 Select scan over the given object and SQL query, then collates and prints the
    results.

    :return: None
    """

    query_plan = QueryPlan(buffer_size=64, is_async=True, use_shared_mem=False)

    # Query plan
    '''
    ts = query_plan.add_operator(
        SQLTableScan('lineitem.csv', 'select * from S3Object limit 3;' , False, 'ts', query_plan, False))
    '''
    # using a 'use_native=True' argument will result in a None object being returned
    ts = query_plan.add_operator(
        SQLTableScan(object_, sql_query, Format.CSV, True, False, False, 'ts',
                     query_plan, False))

    c = query_plan.add_operator(Collate('c', query_plan, False))

    ts.connect(c)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"),
                           gen_test_id())

    # Start the query
    query_plan.execute()

    #assert 2 + 1 == len(c.tuples())
    print("Tuples:")
    print(c.tuples())
    # Write the metrics
    query_plan.print_metrics()
    print(ROOT_DIR)
Example #8
    def sample_table(s3key, k, sort_exp):
        """
        Given a table name, return a random sample of records. Currently, the returned records are simply the first k
        tuples.
        :param s3key: the s3 object name
        :param k: the number of tuples to return (the number added to the SQL LIMIT clause)
        :param sort_exp: the sort expression by which the top-k is chosen
        :return: the list of selected keys from the first k tuples in the table
        """
        projection = "CAST({} as {})".format(sort_exp.col_name,
                                             sort_exp.col_type.__name__)

        sql = "SELECT {} FROM S3Object LIMIT {}".format(projection, k)
        q_plan = QueryPlan(None, is_async=False)
        select_op = q_plan.add_operator(
            SQLTableScan(s3key, sql, True, True, False,
                         "sample_{}_scan".format(s3key), q_plan, False))

        from copy import deepcopy
        sample_topk_sort_exp = deepcopy(sort_exp)
        sample_topk_sort_exp.col_index = '_0'
        topk = q_plan.add_operator(
            Top(max_tuples=k,
                sort_expression=sample_topk_sort_exp,
                use_pandas=True,
                name="sampling_topk",
                query_plan=q_plan,
                log_enabled=False))
        collate = q_plan.add_operator(
            Collate("sample_{}_collate".format(s3key), q_plan, False))
        select_op.connect(topk)
        topk.connect(collate)

        q_plan.execute()

        q_plan.print_metrics()

        return collate.tuples(), select_op, q_plan
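
To make the string construction above concrete, this is what the generated sampling SQL looks like for a hypothetical sort expression over a float column; the stand-in class below only mimics the two attributes that sample_table reads from sort_exp:

class FakeSortExpression(object):
    # Stand-in with just the attributes sample_table uses.
    col_name = 'l_extendedprice'
    col_type = float

k = 100
sort_exp = FakeSortExpression()
projection = "CAST({} as {})".format(sort_exp.col_name, sort_exp.col_type.__name__)
sql = "SELECT {} FROM S3Object LIMIT {}".format(projection, k)
print(sql)  # SELECT CAST(l_extendedprice as float) FROM S3Object LIMIT 100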
Example #9
def main():
    parts = 32
    query_plan = QueryPlan(is_async=True, buffer_size=0)

    # Query plan
    lineitem_scan = map(lambda p:
                        query_plan.add_operator(
                            SQLTableScan(get_file_key('lineitem', True, p),
                                         "select * from S3Object;", Format.CSV,
                                         use_pandas=True, secure=False, use_native=False,
                                         name='scan_' + str(p), query_plan=query_plan,
                                         log_enabled=False)),
                        range(0, parts))

    collate = query_plan.add_operator(
        Collate('collate', query_plan, False))

    map(lambda (p, o): o.connect(collate), enumerate(lineitem_scan))

    query_plan.execute()
Example #10
def run_baseline_topk(stats, sort_field_index, sort_field, k, parallel,
                      use_pandas, sort_order, buffer_size, table_parts_start,
                      table_parts_end, tbl_s3key, format_, shards_path):

    secure = False
    use_native = False
    print('')
    print("Top K Benchmark, Baseline. Sort Field: {}, Order: {}, k: {}".format(
        sort_field, sort_order, k))
    print("----------------------")

    stats += ['baseline', shards_path, sort_field, sort_order, k, 0, 0]

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Sampling
    table_parts = table_parts_end - table_parts_start + 1
    per_part_samples = int(sample_size / table_parts)
    table_name = os.path.basename(tbl_s3key)

    # Scan
    scan = map(
        lambda p: query_plan.add_operator(
            SQLTableScan("{}.{}".format(shards_path, p
                                        ), "select * from S3Object;", format_,
                         use_pandas, secure, use_native, 'scan_{}'.format(
                             p), query_plan, False)),
        range(table_parts_start, table_parts_end + 1))

    # Project
    def project_fn(df):
        df.columns = [
            sort_field if x == sort_field_index else x for x in df.columns
        ]
        df[[sort_field]] = df[[sort_field]].astype(np.float)
        return df

    project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)]

    project = map(
        lambda p: query_plan.add_operator(
            Project(project_exprs, 'project_{}'.format(p), query_plan,
                    False, project_fn)),
        range(table_parts_start, table_parts_end + 1))

    # TopK
    sort_expr = [SortExpression(sort_field, float, sort_order)]
    topk = map(
        lambda p: query_plan.add_operator(
            Top(k, sort_expr, use_pandas, 'topk_{}'.format(p), query_plan,
                False)), range(table_parts_start, table_parts_end + 1))

    # TopK reduce
    topk_reduce = query_plan.add_operator(
        Top(k, sort_expr, use_pandas, 'topk_reduce', query_plan, False))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    map(lambda (p, o): o.connect(project[p]), enumerate(scan))
    map(lambda (p, o): o.connect(topk[p]), enumerate(project))
    map(lambda (p, o): o.connect(topk_reduce), enumerate(topk))
    topk_reduce.connect(collate)

    # Start the query
    query_plan.execute()
    print('Done')

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    query_time = query_plan.total_elapsed_time
    cost, bytes_scanned, bytes_returned, rows = query_plan.cost()
    computation_cost = query_plan.computation_cost()
    data_cost = query_plan.data_cost()[0]

    stats += [
        0, 0, 0, query_time, rows, bytes_scanned, bytes_returned, data_cost,
        computation_cost, cost
    ]
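
The per-partition Top(k) operators followed by the single topk_reduce implement a scatter/gather top-k: each partition keeps only its own best k rows, and the reduce step merges those candidate sets, which is sufficient because any global top-k row must also be in the top k of its own partition. A minimal sketch of the same idea with heapq, outside the operator framework:

import heapq

def partial_topk(rows, k, key):
    # Per-partition step: keep only this partition's k best rows.
    return heapq.nlargest(k, rows, key=key)

def topk_reduce(partials, k, key):
    # Reduce step: merge the per-partition candidate sets and take the top k.
    merged = [row for part in partials for row in part]
    return heapq.nlargest(k, merged, key=key)

# e.g. for two partitions of (id, value) pairs and k = 2:
# topk_reduce([partial_topk(p, 2, key=lambda r: r[1]) for p in partitions],
#             2, key=lambda r: r[1])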
Example #11
def run_head_table_sampling(stats,
                            sort_field_index,
                            sort_field,
                            k,
                            sample_size,
                            parallel,
                            use_pandas,
                            sort_order,
                            buffer_size,
                            table_parts_start,
                            table_parts_end,
                            tbl_s3key,
                            shards_path,
                            format_,
                            sampling_only=True):

    secure = False
    use_native = False
    print('')
    print(
        "Top K Benchmark, Head Table Sampling. Sort Field: {}, Order: {}, k: {}, Sample Size:{}"
        .format(sort_field, sort_order, k, sample_size))
    print("----------------------")

    stats += [
        'sampling_{}_{}'.format('head_table', 'non-filtered'), shards_path,
        sort_field, sort_order, k, sample_size, 1
    ]

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Sampling
    table_parts = table_parts_end - table_parts_start + 1
    per_part_samples = int(sample_size / table_parts)
    table_name = os.path.basename(tbl_s3key)
    sample_scan = map(
        lambda p: query_plan.add_operator(
            SQLTableScan(
                "{}.{}".format(shards_path, p),
                'select {} from S3Object limit {};'.format(
                    sort_field, per_part_samples), format_, use_pandas, secure,
                use_native, 'sample_scan_{}'.format(p), query_plan, False)),
        range(table_parts_start, table_parts_end + 1))

    # Sampling project
    def project_fn(df):
        df.columns = [sort_field]
        df[[sort_field]] = df[[sort_field]].astype(np.float)
        return df

    project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)]

    sample_project = map(
        lambda p: query_plan.add_operator(
            Project(project_exprs, 'sample_project_{}'.format(p), query_plan,
                    False, project_fn)),
        range(table_parts_start, table_parts_end + 1))

    # TopK samples
    sort_expr = [SortExpression(sort_field, float, sort_order)]
    sample_topk = map(
        lambda p: query_plan.add_operator(
            Top(k, sort_expr, use_pandas, 'sample_topk_{}'.format(p),
                query_plan, False)),
        range(table_parts_start, table_parts_end + 1))

    sample_topk_reduce = query_plan.add_operator(
        Top(k, sort_expr, use_pandas, 'sample_topk_reduce', query_plan, False))

    # Generate SQL command for second scan
    sql_gen = query_plan.add_operator(
        TopKFilterBuild(sort_order, 'float', 'select * from S3object ',
                        ' CAST({} as float) '.format(sort_field), 'sql_gen',
                        query_plan, False))

    if not sampling_only:
        # Scan
        scan = map(
            lambda p: query_plan.add_operator(
                SQLTableScan("{}.{}".format(shards_path, p), "", format_,
                             use_pandas, secure, use_native, 'scan_{}'.format(
                                 p), query_plan, False)),
            range(table_parts_start, table_parts_end + 1))

        # Project
        def project_fn(df):
            df.columns = [
                sort_field if x == sort_field_index else x for x in df.columns
            ]
            df[[sort_field]] = df[[sort_field]].astype(np.float)
            return df

        project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)]

        project = map(
            lambda p: query_plan.add_operator(
                Project(project_exprs, 'project_{}'.format(p), query_plan,
                        False, project_fn)),
            range(table_parts_start, table_parts_end + 1))

        # TopK
        topk = map(
            lambda p: query_plan.add_operator(
                Top(k, sort_expr, use_pandas, 'topk_{}'.format(p), query_plan,
                    False)), range(table_parts_start, table_parts_end + 1))

        # TopK reduce
        topk_reduce = query_plan.add_operator(
            Top(k, sort_expr, use_pandas, 'topk_reduce', query_plan, False))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    map(lambda (p, o): o.connect(sample_project[p]), enumerate(sample_scan))
    map(lambda (p, o): o.connect(sample_topk[p]), enumerate(sample_project))
    map(lambda op: op.connect(sample_topk_reduce), sample_topk)
    sample_topk_reduce.connect(sql_gen)

    if not sampling_only:
        map(lambda (p, o): sql_gen.connect(o), enumerate(scan))
        map(lambda (p, o): o.connect(project[p]), enumerate(scan))
        map(lambda (p, o): o.connect(topk[p]), enumerate(project))
        map(lambda (p, o): o.connect(topk_reduce), enumerate(topk))
        topk_reduce.connect(collate)
    else:
        sql_gen.connect(collate)

    # Start the query
    query_plan.execute()
    print('Done')

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    sampling_time = query_plan.total_elapsed_time
    cost, bytes_scanned, bytes_returned, rows = query_plan.cost()
    computation_cost = query_plan.computation_cost()
    data_cost = query_plan.data_cost()[0]

    stats += [
        sql_gen.threshold, sampling_time, 0, sampling_time, rows,
        bytes_scanned, bytes_returned, data_cost, computation_cost, cost
    ]
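
Conceptually, the sampling phase above yields a threshold (the k-th best sampled value), and TopKFilterBuild turns it into the WHERE clause for the second scan. How the operator actually formats that SQL is internal to the library; the sketch below only illustrates the idea for a float sort field:

def build_threshold_sql(sort_field, sort_order, threshold):
    # Sketch only: a second-pass query that keeps rows at or beyond the
    # threshold learned from the sample. The real TopKFilterBuild operator
    # may format its SQL differently.
    op = '>=' if sort_order == 'DESC' else '<='
    return "select * from S3object where CAST({} as float) {} {}".format(
        sort_field, op, threshold)

# e.g. build_threshold_sql('l_extendedprice', 'DESC', 104949.5)
# -> "select * from S3object where CAST(l_extendedprice as float) >= 104949.5"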
Example #12
def run_local_indexed_sampling(stats,
                               sort_field_index,
                               sort_field,
                               k,
                               sample_size,
                               batch_size,
                               parallel,
                               use_pandas,
                               sort_order,
                               buffer_size,
                               table_parts_start,
                               table_parts_end,
                               tbl_s3key,
                               shards_path,
                               format_,
                               sampling_only=True):
    """
    Executes the randomly sampled top-k query by first building a random sample, then extracting the filtering
    threshold, and finally scanning the table to retrieve only the records beyond the threshold.
    :return:
    """

    secure = False
    use_native = False
    n_threads = multiprocessing.cpu_count()

    print('')
    print("Top K Benchmark, Sampling. Sort Field: {}, Order: {}".format(
        sort_field, sort_order))
    print("----------------------")

    stats += [
        'sampling_{}_{}'.format('indexed', 'non-filtered'), shards_path,
        sort_field, sort_order, k, sample_size, batch_size
    ]

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Sampling
    tbl_smpler = query_plan.add_operator(
        TableRandomSampleGenerator(tbl_s3key, sample_size, batch_size,
                                   "table_sampler", query_plan, False))
    sample_scan = map(
        lambda p: query_plan.add_operator(
            TableRangeAccess(tbl_s3key, use_pandas, secure, use_native,
                             "sample_scan_{}".format(p), query_plan, False)),
        range(table_parts_start, table_parts_end + 1))
    map(lambda (i, op): sample_scan[i].set_nthreads(n_threads),
        enumerate(sample_scan))

    # sample_scan = query_plan.add_operator(
    #     TableRangeAccess(tbl_s3key, use_pandas, secure, use_native, "sample_scan_{}".format(p),
    #                      query_plan, False))
    # sample_scan.set_nthreads(n_threads)

    # Sampling project
    def project_fn(df):
        df.columns = [
            sort_field if x == sort_field_index else x for x in df.columns
        ]
        df = df[[sort_field]].astype(np.float, errors='ignore')
        return df

    project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)]

    sample_project = map(
        lambda p: query_plan.add_operator(
            Project(project_exprs, 'sample_project_{}'.format(p), query_plan,
                    False, project_fn)),
        range(table_parts_start, table_parts_end + 1))
    # sample_project = query_plan.add_operator(
    #                             Project(project_exprs, 'sample_project_{}'.format(p), query_plan, False, project_fn))

    # TopK samples
    sort_expr = [SortExpression(sort_field, float, sort_order)]
    sample_topk = map(
        lambda p: query_plan.add_operator(
            Top(k, sort_expr, use_pandas, 'sample_topk_{}'.format(p),
                query_plan, False)),
        range(table_parts_start, table_parts_end + 1))
    # sample_topk = query_plan.add_operator(
    #                 Top(k, sort_expr, use_pandas, 'sample_topk_{}'.format(p), query_plan, False))

    sample_topk_reduce = query_plan.add_operator(
        Top(k, sort_expr, use_pandas, 'sample_topk_reduce', query_plan, False))

    # Generate SQL command for second scan
    sql_gen = query_plan.add_operator(
        TopKFilterBuild(sort_order, 'float', 'select * from S3object ',
                        ' CAST({} as float) '.format(sort_field), 'sql_gen',
                        query_plan, False))
    if not sampling_only:
        # Scan
        scan = map(
            lambda p: query_plan.add_operator(
                SQLTableScan("{}.{}".format(shards_path, p), "", format_,
                             use_pandas, secure, use_native, 'scan_{}'.format(
                                 p), query_plan, False)),
            range(table_parts_start, table_parts_end + 1))

        # Project
        def project_fn(df):
            df.columns = [
                sort_field if x == sort_field_index else x for x in df.columns
            ]
            df[[sort_field]] = df[[sort_field]].astype(np.float)
            return df

        project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)]

        project = map(
            lambda p: query_plan.add_operator(
                Project(project_exprs, 'project_{}'.format(p), query_plan,
                        False, project_fn)),
            range(table_parts_start, table_parts_end + 1))

        # TopK
        topk = map(
            lambda p: query_plan.add_operator(
                Top(k, sort_expr, use_pandas, 'topk_{}'.format(p), query_plan,
                    False)), range(table_parts_start, table_parts_end + 1))

        # TopK reduce
        topk_reduce = query_plan.add_operator(
            Top(k, sort_expr, use_pandas, 'topk_reduce', query_plan, False))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    map(lambda o: tbl_smpler.connect(o), sample_scan)
    map(lambda (p, o): o.connect(sample_project[p]), enumerate(sample_scan))
    map(lambda (p, o): o.connect(sample_topk[p]), enumerate(sample_project))
    map(lambda o: o.connect(sample_topk_reduce), sample_topk)
    sample_topk_reduce.connect(sql_gen)

    if not sampling_only:
        map(lambda (p, o): sql_gen.connect(o), enumerate(scan))
        map(lambda (p, o): o.connect(project[p]), enumerate(scan))
        map(lambda (p, o): o.connect(topk[p]), enumerate(project))
        map(lambda (p, o): o.connect(topk_reduce), enumerate(topk))
        topk_reduce.connect(collate)
    else:
        sql_gen.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("table parts: {}".format(table_parts_end - table_parts_start))
    print('')

    # Write the plan graph
    # query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(table_parts))

    # Start the query
    query_plan.execute()
    print('Done')
    # tuples = collate.tuples()

    # collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    sampling_time = query_plan.total_elapsed_time
    cost, bytes_scanned, bytes_returned, rows = query_plan.cost()
    computation_cost = query_plan.computation_cost()
    data_cost = query_plan.data_cost()[0]

    stats += [
        sql_gen.threshold, sampling_time, 0, sampling_time, rows,
        bytes_scanned, bytes_returned, data_cost, computation_cost, cost
    ]
Example #13
def run_memory_indexed_sampling(stats,
                                sort_field_index,
                                sort_field,
                                k,
                                sample_size,
                                batch_size,
                                parallel,
                                use_pandas,
                                sort_order,
                                buffer_size,
                                table_parts_start,
                                table_parts_end,
                                tbl_s3key,
                                shards_path,
                                format_,
                                sampling_only=True):
    """
    Executes the randomly sampled top-k query by first building a random sample, then extracting the filtering
    threshold, and finally scanning the table to retrieve only the records beyond the threshold.
    :return:
    """

    secure = False
    use_native = False
    n_threads = multiprocessing.cpu_count()

    print('')
    print(
        "Top K Benchmark, Memory Indexed Sampling. Sort Field: {}, Order: {}, K: {}, Sample Size: {}, Batch Size: {}"
        .format(sort_field, sort_order, k, sample_size, batch_size))
    print("----------------------")

    stats += [
        'sampling_{}_{}'.format('memory_indexed', 'non-filtered'), shards_path,
        sort_field, sort_order, k, sample_size, batch_size
    ]

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Sampling
    tbl_smpler = query_plan.add_operator(
        TableRandomSampleGenerator(tbl_s3key, sample_size, batch_size,
                                   "table_sampler", query_plan, False))

    sample_scanners = map(
        lambda p: query_plan.add_operator(
            TableRangeAccess(tbl_s3key, use_pandas, secure, use_native,
                             "sample_scan_{}".format(p), query_plan, False)),
        range(table_parts_start, table_parts_end + 1))
    map(lambda op: op.set_nthreads(n_threads), sample_scanners)

    # sample_scan = query_plan.add_operator(
    #     TableRangeAccess(tbl_s3key, use_pandas, secure, use_native, "sample_scan_{}".format(p),
    #                      query_plan, False))
    # sample_scan.set_nthreads(n_threads)

    # Sampling project
    def project_fn(df):
        df.columns = [
            sort_field if x == sort_field_index else x for x in df.columns
        ]
        df = df[[sort_field]].astype(np.float, errors='ignore')
        return df

    project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)]

    sample_project = map(
        lambda p: query_plan.add_operator(
            Project(project_exprs, 'sample_project_{}'.format(p), query_plan,
                    False, project_fn)),
        range(table_parts_start, table_parts_end + 1))
    # sample_project = query_plan.add_operator(
    #                             Project(project_exprs, 'sample_project_{}'.format(p), query_plan, False, project_fn))

    # TopK samples
    sort_expr = [SortExpression(sort_field, float, sort_order)]
    sample_topk = map(
        lambda p: query_plan.add_operator(
            Top(k, sort_expr, use_pandas, 'sample_topk_{}'.format(p),
                query_plan, False)),
        range(table_parts_start, table_parts_end + 1))
    # sample_topk = query_plan.add_operator(
    #                 Top(k, sort_expr, use_pandas, 'sample_topk_{}'.format(p), query_plan, False))

    sample_topk_reduce = query_plan.add_operator(
        Top(k, sort_expr, use_pandas, 'sample_topk_reduce', query_plan, False))

    # Generate SQL command for second scan
    sql_gen = query_plan.add_operator(
        TopKFilterBuild(sort_order, 'float', 'select * from S3object ',
                        ' CAST({} as float) '.format(sort_field),
                        'sample_sql_gen', query_plan, False))

    if not sampling_only:
        # Scan
        scan = map(
            lambda p: query_plan.add_operator(
                SQLTableScan("{}.{}".format(shards_path, p), "", format_,
                             use_pandas, secure, use_native, 'scan_{}'.format(
                                 p), query_plan, False)),
            range(table_parts_start, table_parts_end + 1))

        # Project
        def project_fn(df):
            df.columns = [
                sort_field if x == sort_field_index else x for x in df.columns
            ]
            df[[sort_field]] = df[[sort_field]].astype(np.float)
            return df

        project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)]

        project = map(
            lambda p: query_plan.add_operator(
                Project(project_exprs, 'project_{}'.format(p), query_plan,
                        False, project_fn)),
            range(table_parts_start, table_parts_end + 1))

        # TopK
        topk = map(
            lambda p: query_plan.add_operator(
                Top(k, sort_expr, use_pandas, 'topk_{}'.format(p), query_plan,
                    False)), range(table_parts_start, table_parts_end + 1))

        # TopK reduce
        topk_reduce = query_plan.add_operator(
            Top(k, sort_expr, use_pandas, 'topk_reduce', query_plan, False))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    map(lambda op: tbl_smpler.connect(op), sample_scanners)
    map(lambda (p, o): o.connect(sample_project[p]),
        enumerate(sample_scanners))
    map(lambda (p, o): o.connect(sample_topk[p]), enumerate(sample_project))
    map(lambda o: o.connect(sample_topk_reduce), sample_topk)
    sample_topk_reduce.connect(sql_gen)

    if not sampling_only:
        map(lambda (p, o): sql_gen.connect(o), enumerate(scan))
        map(lambda (p, o): o.connect(project[p]), enumerate(scan))
        map(lambda (p, o): o.connect(topk[p]), enumerate(project))
        map(lambda (p, o): o.connect(topk_reduce), enumerate(topk))
        topk_reduce.connect(collate)
    else:
        sql_gen.connect(collate)

    # Start the query
    query_plan.execute()
    print('Done')
    # tuples = collate.tuples()

    # collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    # query_plan.stop()

    sampling_threshold = query_plan.retrieve_sampling_threshold()
    sampling_runtime = query_plan.get_phase_runtime('sampl')
    sampling_num_http_requests, sampling_requests_cost = query_plan.requests_cost(
        'sampl')
    sampling_returned_bytes, sampling_returned_rows, sampling_transfer_cost = query_plan.data_transfer_cost(
        phase_keyword='sampl')
    sampling_scanned_bytes, sampling_scan_cost = query_plan.data_scanning_cost(
        'sampl')

    total_runtime = query_plan.total_elapsed_time
    total_http_requests, total_requests_cost = query_plan.requests_cost()
    total_returned_bytes, total_returned_rows, total_transfer_cost = query_plan.data_transfer_cost(
    )
    total_scanned_bytes, total_scan_cost = query_plan.data_scanning_cost()

    total_data_cost = query_plan.data_cost()[0]
    total_computation_cost = query_plan.computation_cost()
    total_cost = query_plan.cost()[0]

    stats += [
        sampling_threshold, sampling_runtime, total_runtime - sampling_runtime,
        total_runtime, sampling_returned_rows,
        sampling_scanned_bytes * BYTE_TO_MB,
        sampling_returned_bytes * BYTE_TO_MB, sampling_num_http_requests,
        sampling_requests_cost, sampling_transfer_cost, sampling_scan_cost,
        total_returned_rows, total_scanned_bytes * BYTE_TO_MB,
        total_returned_bytes * BYTE_TO_MB, total_http_requests,
        total_requests_cost, total_transfer_cost, total_scan_cost,
        total_data_cost, total_computation_cost, total_cost
    ]
Example #14
def run(buffer_size):
    """The baseline tst uses hash joins with no projection and no filtering pushed down to s3.

    :return: None
    """

    print('')
    print("TPCH Q14 Baseline Join")
    print("----------------------")

    query_plan = QueryPlan(is_async=False, buffer_size=buffer_size)

    # Query plan
    date = '1993-01-01'
    min_shipped_date = datetime.strptime(date, '%Y-%m-%d')
    max_shipped_date = datetime.strptime(date, '%Y-%m-%d') + timedelta(days=30)

    lineitem_scan = query_plan.add_operator(tpch_q14.sql_scan_lineitem_operator_def('lineitem_scan', query_plan))
    lineitem_project = query_plan.add_operator(
        tpch_q14.project_partkey_extendedprice_discount_shipdate_operator_def('lineitem_project', query_plan))
    part_scan = query_plan.add_operator(tpch_q14.sql_scan_part_operator_def('part_scan', query_plan))
    part_project = query_plan.add_operator(tpch_q14.project_partkey_brand_type_operator_def('part_project', query_plan))
    lineitem_filter = query_plan.add_operator(
        tpch_q14.filter_shipdate_operator_def(min_shipped_date, max_shipped_date, 'lineitem_filter', query_plan))
    part_filter = query_plan.add_operator(tpch_q14.filter_brand12_operator_def('part_filter', query_plan))
    join = query_plan.add_operator(tpch_q14.join_lineitem_part_operator_def('join', query_plan))
    aggregate = query_plan.add_operator(tpch_q14.aggregate_promo_revenue_operator_def('aggregate', query_plan))
    aggregate_project = query_plan.add_operator(
        tpch_q14.project_promo_revenue_operator_def('aggregate_project', query_plan))
    collate = query_plan.add_operator(tpch_q14.collate_operator_def('collate', query_plan))

    lineitem_scan.connect(lineitem_project)
    lineitem_project.connect(lineitem_filter)
    join.connect_left_producer(lineitem_filter)
    part_scan.connect(part_project)
    part_project.connect(part_filter)
    join.connect_right_producer(part_filter)
    join.connect(aggregate)
    aggregate.connect(aggregate_project)
    aggregate_project.connect(collate)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    # Assert the results
    # num_rows = 0
    # for t in collate.tuples():
    #     num_rows += 1
    #     print("{}:{}".format(num_rows, t))

    collate.print_tuples()

    # Write the metrics
    query_plan.print_metrics()

    field_names = ['promo_revenue']

    assert len(collate.tuples()) == 1 + 1

    assert collate.tuples()[0] == field_names

    # NOTE: This result has been verified with the equivalent data and query on PostgreSQL
    assert collate.tuples()[1] == [15.090116526324298]
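
For reference, the asserted value is TPC-H Q14's promo revenue: the percentage of revenue in the shipdate window that comes from promotional parts. A small sketch of the same computation over already-joined rows, independent of the aggregate operator used above:

def promo_revenue(rows):
    # rows are (p_type, l_extendedprice, l_discount) tuples from the
    # lineitem/part join, restricted to the shipdate window.
    promo = sum(p * (1 - d) for t, p, d in rows if t.startswith('PROMO'))
    total = sum(p * (1 - d) for t, p, d in rows)
    return 100.0 * promo / total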
Example #15
def run(parallel, use_pandas, secure, use_native, buffer_size, format_, customer_parts, order_parts, lineitem_parts,
        customer_sharded,
        order_sharded, lineitem_sharded, other_parts, fp_rate, sf, expected_result, customer_filter_sql=None,
        order_filter_sql=None, lineitem_filter_sql=None):
    """

    :return: None
    """

    print('')
    print("TPCH Q3 Bloom Join")
    print("------------------")

    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    customer_scan = map(lambda p:
                        query_plan.add_operator(
                            SQLTableScan(get_file_key('customer', customer_sharded, p, sf, format_),
                                         "select "
                                         "  c_custkey "
                                         "from "
                                         "  S3Object "
                                         "where "
                                         "  c_mktsegment = 'BUILDING' "
                                         "  {} "
                                         "  {} "
                                         .format(
                                             ' and ' + customer_filter_sql if customer_filter_sql is not None else '',
                                             get_sql_suffix('customer', customer_parts, p, customer_sharded,
                                                            add_where=False)),
                                         format_,
                                         use_pandas, secure, use_native,
                                         'customer_scan' + '_{}'.format(p),
                                         query_plan,
                                         False)),
                        range(0, customer_parts))

    def customer_project_fn(df):
        df = df.filter(items=['_0'], axis=1)

        df.rename(columns={'_0': 'c_custkey'},
                  inplace=True)

        return df

    customer_project = map(lambda p:
                           query_plan.add_operator(
                               Project([],
                                       'customer_project' + '_{}'.format(p),
                                       query_plan,
                                       False, customer_project_fn)),
                           range(0, customer_parts))

    customer_bloom_create = query_plan.add_operator(
        BloomCreate('c_custkey', 'customer_bloom_create', query_plan, False,
                    fp_rate))

    customer_map = map(lambda p:
                       query_plan.add_operator(Map('c_custkey', 'customer_map' + '_' + str(p), query_plan, False)),
                       range(0, customer_parts))

    order_scan = map(lambda p:
                     query_plan.add_operator(
                         SQLTableScanBloomUse(get_file_key('orders', order_sharded, p, sf, format_),
                                              "select "
                                              "  o_custkey, o_orderkey, o_orderdate, o_shippriority "
                                              "from "
                                              "  S3Object "
                                              "where "
                                              "  cast(o_orderdate as timestamp) < cast('1995-03-01' as timestamp) "
                                              "  {} "
                                              "  {} "
                                              .format(
                                                  ' and ' + order_filter_sql if order_filter_sql is not None else '',
                                                  get_sql_suffix('orders', order_parts, p, order_sharded,
                                                                 add_where=False)),
                                              'o_custkey',
                                              format_,
                                              use_pandas, secure, use_native,
                                              'order_scan' + '_{}'.format(p),
                                              query_plan,
                                              False)),
                     range(0, order_parts))

    def order_project_fn(df):
        df = df.filter(items=['_0', '_1', '_2', '_3'], axis=1)

        df.rename(columns={'_0': 'o_custkey', '_1': 'o_orderkey', '_2': 'o_orderdate', '_3': 'o_shippriority'},
                  inplace=True)

        return df

    order_project = map(lambda p:
                        query_plan.add_operator(
                            Project([],
                                    'order_project' + '_{}'.format(p),
                                    query_plan,
                                    False, order_project_fn)),
                        range(0, customer_parts))

    order_map_1 = map(lambda p:
                      query_plan.add_operator(Map('o_custkey', 'order_map_1' + '_' + str(p), query_plan, False)),
                      range(0, order_parts))

    customer_order_join_build = map(lambda p:
                                    query_plan.add_operator(
                                        HashJoinBuild('c_custkey',
                                                      'customer_order_join_build' + '_' + str(p), query_plan,
                                                      False)),
                                    range(0, other_parts))

    customer_order_join_probe = map(lambda p:
                                    query_plan.add_operator(
                                        HashJoinProbe(JoinExpression('c_custkey', 'o_custkey'),
                                                      'customer_order_join_probe' + '_' + str(p),
                                                      query_plan, False)),
                                    range(0, other_parts))

    order_bloom_create = query_plan.add_operator(
        BloomCreate('o_orderkey', 'order_bloom_create', query_plan, False,
                    fp_rate))

    lineitem_scan = map(lambda p:
                        query_plan.add_operator(
                            SQLTableScanBloomUse(get_file_key('lineitem', lineitem_sharded, p, sf, format_),
                                                 "select "
                                                 "  l_orderkey, l_extendedprice, l_discount "
                                                 "from "
                                                 "  S3Object "
                                                 "where "
                                                 "  cast(l_shipdate as timestamp) > cast('1995-03-01' as timestamp) "
                                                 "  {} "
                                                 "  {} "
                                                 .format(
                                                     ' and ' + lineitem_filter_sql if lineitem_filter_sql is not None else '',
                                                     get_sql_suffix('lineitem', lineitem_parts, p, lineitem_sharded,
                                                                    add_where=False)),
                                                 'l_orderkey', format_,
                                                 use_pandas, secure, use_native,
                                                 'lineitem_scan' + '_{}'.format(p),
                                                 query_plan,
                                                 False)),
                        range(0, lineitem_parts))

    def lineitem_project_fn(df):
        df = df.filter(items=['_0', '_1', '_2'], axis=1)

        df.rename(columns={'_0': 'l_orderkey', '_1': 'l_extendedprice', '_2': 'l_discount'},
                  inplace=True)

        return df

    lineitem_project = map(lambda p:
                           query_plan.add_operator(
                               Project([],
                                       'lineitem_project' + '_{}'.format(p),
                                       query_plan,
                                       False, lineitem_project_fn)),
                           range(0, lineitem_parts))

    lineitem_map = map(lambda p:
                       query_plan.add_operator(Map('l_orderkey', 'lineitem_map' + '_' + str(p), query_plan, False)),
                       range(0, lineitem_parts))

    order_map_2 = map(lambda p:
                      query_plan.add_operator(Map('o_orderkey', 'order_map_2' + '_' + str(p), query_plan, False)),
                      range(0, other_parts))

    customer_order_lineitem_join_build = map(lambda p:
                                             query_plan.add_operator(
                                                 HashJoinBuild('o_orderkey',
                                                               'customer_order_lineitem_join_build' + '_' + str(p),
                                                               query_plan,
                                                               False)),
                                             range(0, other_parts))

    customer_order_lineitem_join_probe = map(lambda p:
                                             query_plan.add_operator(
                                                 HashJoinProbe(JoinExpression('o_orderkey', 'l_orderkey'),
                                                               'customer_order_lineitem_join_probe' + '_' + str(p),
                                                               query_plan, False)),
                                             range(0, other_parts))

    def groupby_fn(df):
        df['l_extendedprice'] = df['l_extendedprice'].astype(np.float)
        df['l_discount'] = df['l_discount'].astype(np.float)
        df['revenue'] = df['l_extendedprice'] * (1 - df['l_discount'])
        grouped = df.groupby(['l_orderkey', 'o_orderdate', 'o_shippriority'])
        agg_df = grouped['revenue'].sum()
        return agg_df.reset_index()

    group = map(lambda p:
                query_plan.add_operator(
                    Group(
                        ['l_orderkey', 'o_orderdate', 'o_shippriority'],  # l_partkey
                        [
                            AggregateExpression(AggregateExpression.SUM,
                                                lambda t_: float(t_['l_extendedprice'] * (1 - t_['l_discount'])))
                        ],
                        'group' + '_{}'.format(p), query_plan,
                        False, groupby_fn)),
                range(0, other_parts))

    def group_reduce_fn(df):
        grouped = df.groupby(['l_orderkey', 'o_orderdate', 'o_shippriority'])
        agg_df = grouped['revenue'].sum()
        return agg_df.reset_index()

    group_reduce = query_plan.add_operator(
        Group(
            ['l_orderkey', 'o_orderdate', 'o_shippriority'],  # l_partkey
            [
                AggregateExpression(AggregateExpression.SUM,
                                    lambda t_: float(t_['l_extendedprice'] * (1 - t_['l_discount'])))
            ],
            'group_reduce', query_plan,
            False, group_reduce_fn))

    top = query_plan.add_operator(
        Top(10, [SortExpression('revenue', float, 'DESC'), SortExpression('o_orderdate', date, 'ASC')], use_pandas,
            'top', query_plan,
            False))

    collate = query_plan.add_operator(tpch_q19.collate_op('collate', query_plan))

    # Inline what we can
    map(lambda o: o.set_async(False), lineitem_project)
    map(lambda o: o.set_async(False), customer_project)
    map(lambda o: o.set_async(False), order_project)
    map(lambda o: o.set_async(False), lineitem_map)
    map(lambda o: o.set_async(False), customer_map)
    map(lambda o: o.set_async(False), order_map_1)
    map(lambda o: o.set_async(False), order_map_2)

    # Connect the operators
    connect_many_to_many(customer_scan, customer_project)
    connect_many_to_many(customer_project, customer_map)

    connect_many_to_one(customer_project, customer_bloom_create)
    connect_one_to_many(customer_bloom_create, order_scan)

    connect_many_to_many(order_scan, order_project)
    connect_many_to_many(order_project, order_map_1)

    connect_all_to_all(customer_map, customer_order_join_build)
    connect_many_to_many(customer_order_join_build, customer_order_join_probe)
    connect_all_to_all(order_map_1, customer_order_join_probe)

    # connect_many_to_one(customer_order_join_probe, collate)

    connect_many_to_one(order_project, order_bloom_create)
    connect_one_to_many(order_bloom_create, lineitem_scan)

    connect_many_to_many(lineitem_scan, lineitem_project)
    connect_many_to_many(lineitem_project, lineitem_map)

    connect_many_to_many(customer_order_join_probe, order_map_2)
    connect_all_to_all(order_map_2, customer_order_lineitem_join_build)
    connect_many_to_many(customer_order_lineitem_join_build, customer_order_lineitem_join_probe)
    connect_all_to_all(lineitem_map, customer_order_lineitem_join_probe)

    # connect_many_to_one(customer_order_lineitem_join_probe, collate)

    connect_many_to_many(customer_order_lineitem_join_probe, group)
    # connect_many_to_one(group, collate)

    connect_many_to_one(group, group_reduce)
    connect_one_to_one(group_reduce, top)
    connect_one_to_one(top, collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print('secure: {}'.format(secure))
    print('use_native: {}'.format(use_native))
    print("customer_parts: {}".format(customer_parts))
    print("order_parts: {}".format(order_parts))
    print("lineitem_parts: {}".format(lineitem_parts))
    print("customer_sharded: {}".format(customer_sharded))
    print("order_sharded: {}".format(order_sharded))
    print("lineitem_sharded: {}".format(lineitem_sharded))
    print("other_parts: {}".format(other_parts))
    print("fp_rate: {}".format(fp_rate))
    print("format: {}".format(format_))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    field_names = ['l_orderkey', 'o_orderdate', 'o_shippriority', 'revenue']

    assert len(tuples) == 10 + 1

    assert tuples[0] == field_names

    # NOTE: This result has been verified with the equivalent data and query on PostgreSQL
    test_util.assert_tuples(expected_result, tuples)
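
The BloomCreate / SQLTableScanBloomUse pairs above push a Bloom filter of the build-side keys into the probe-side scans, so most non-matching rows are dropped server-side before the hash joins run. A rough sketch of the same idea with a plain set standing in for the Bloom filter (a real Bloom filter admits false positives but never false negatives, so the subsequent join still has to check the key):

def bloom_join(customers, orders):
    # Build side: keys of qualifying customers (a set stands in for the Bloom filter).
    custkeys = set(c['c_custkey'] for c in customers)
    # Probe side: discard orders whose key cannot possibly match before joining.
    candidate_orders = [o for o in orders if o['o_custkey'] in custkeys]
    return [(c, o) for o in candidate_orders
            for c in customers if c['c_custkey'] == o['o_custkey']]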
Example #16
def run(parallel, use_pandas, buffer_size, table_parts, perc, path, nthreads=16, format_=Format.CSV):
    secure = False
    use_native = False
    print('')
    print("Indexing Benchmark")
    print("------------------")

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Scan Index Files
    upper = perc * 100
    index_scan = map(lambda p:
                     query_plan.add_operator(
                         SQLTableScan('{}/index/index_f0_{}.csv'.format(path, p),
                                      "select first_byte, last_byte "
                                      " from S3Object "
                                      " where cast(name as float) < {};".format(upper), format_,
                                      use_pandas, secure, use_native,
                                      'index_scan_{}'.format(p), query_plan,
                                      False)),
                     range(0, table_parts))

    # Range accesses 
    range_access = map(lambda p:
                       query_plan.add_operator(
                           TableRangeAccess('{}/data_{}.csv'.format(path, p),
                                            use_pandas, secure, use_native,
                                            'range_access_{}'.format(p), query_plan,
                                            False)),
                       range(0, table_parts))

    map(lambda o: o.set_nthreads(nthreads), range_access)

    collate = query_plan.add_operator(
        Collate('collate', query_plan, False))

    map(lambda (p, o): o.connect(range_access[p]), enumerate(index_scan))
    map(lambda (p, o): o.connect(collate), enumerate(range_access))

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("table parts: {}".format(table_parts))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(table_parts))

    # Start the query
    query_plan.execute()
    print('Done')
    tuples = collate.tuples()

    # collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()
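
The index scan above implies that each index_f0_<p>.csv row carries at least a name column (castable to float) plus first_byte and last_byte offsets into the matching data_<p>.csv shard; the byte ranges it returns are what drive the TableRangeAccess operators. Purely as an illustration (the actual column order and values are an assumption), an index row might look like:

name,first_byte,last_byte
0.37,0,1048575
12.91,1048576,2097151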
Example #17
def run(parallel, buffer_size):
    """

    :return: None
    """

    print('')
    print("TPCH Q14 Filtered Join")
    print("----------------------")

    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Query plan
    # DATE is the first day of a month randomly selected from a random year within [1993 .. 1997].
    date = '1993-01-01'
    min_shipped_date = datetime.strptime(date, '%Y-%m-%d')
    max_shipped_date = datetime.strptime(date, '%Y-%m-%d') + timedelta(days=30)

    lineitem_scan = query_plan.add_operator(
        tpch_q14.
        sql_scan_lineitem_partkey_extendedprice_discount_where_shipdate_operator_def(
            min_shipped_date, max_shipped_date, 'lineitem_scan', query_plan))
    lineitem_project = query_plan.add_operator(
        tpch_q14.project_partkey_extendedprice_discount_operator_def(
            'lineitem_project', query_plan))
    part_scan = query_plan.add_operator(
        tpch_q14.sql_scan_part_partkey_type_part_where_brand12_operator_def(
            'part_scan', query_plan))
    part_project = query_plan.add_operator(
        tpch_q14.project_partkey_type_operator_def('part_project', query_plan))
    join = query_plan.add_operator(
        tpch_q14.join_lineitem_part_operator_def('join', query_plan))
    aggregate = query_plan.add_operator(
        tpch_q14.aggregate_promo_revenue_operator_def('aggregate', query_plan))
    aggregate_project = query_plan.add_operator(
        tpch_q14.project_promo_revenue_operator_def('aggregate_project',
                                                    query_plan))
    collate = query_plan.add_operator(
        tpch_q14.collate_operator_def('collate', query_plan))

    lineitem_scan.connect(lineitem_project)
    part_scan.connect(part_project)
    join.connect_left_producer(lineitem_project)
    join.connect_right_producer(part_project)
    join.connect(aggregate)
    aggregate.connect(aggregate_project)
    aggregate_project.connect(collate)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"),
                           gen_test_id())

    # Start the query
    query_plan.execute()

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    field_names = ['promo_revenue']

    assert len(tuples) == 1 + 1

    assert tuples[0] == field_names

    # NOTE: This result has been verified with the equivalent data and query on PostgreSQL
    assert tuples[1] == [15.090116526324298]
Example #18
def run(sf, parallel, use_pandas, secure, use_native, buffer_size,
        lineitem_parts, sharded, format_):
    """

    :return: None
    """

    print('')
    print("TPCH Q1 Baseline Group By")
    print("----------------------")

    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Query plan
    lineitem_scan = map(
        lambda p: query_plan.add_operator(
            tpch_q1.sql_scan_lineitem_operator_def(
                sharded, p, sf, use_pandas, secure, use_native,
                'lineitem_scan' + '_' + str(p), query_plan, format_)),
        range(0, lineitem_parts))
    '''
    lineitem_project = map(lambda p:
                           query_plan.add_operator(
                               tpch_q1.project_lineitem_operator_def(
                                   'lineitem_project' + '_' + str(p),
                                   query_plan)),
                           range(0, lineitem_parts))

    date = '1998-12-01'
    max_shipped_date = datetime.strptime(date, '%Y-%m-%d') - timedelta(days=117)

    lineitem_filter = map(lambda p:
                          query_plan.add_operator(
                              tpch_q1.filter_shipdate_operator_def(
                                  max_shipped_date,
                                  'lineitem_filter' + '_' + str(p),
                                  query_plan)),
                          range(0, lineitem_parts))

    profile_file_name = os.path.join(ROOT_DIR, "../tests-output/" + gen_test_id() + ".prof")

    groupby = map(lambda p:
                  query_plan.add_operator(
                      tpch_q1.groupby_returnflag_linestatus_operator_def(
                          'groupby' + '_' + str(p),
                          query_plan)),
                  range(0, lineitem_parts))

    # groupby[0].set_profiled(True, profile_file_name)

    groupby_reduce = query_plan.add_operator(
        tpch_q1.groupby_reduce_returnflag_linestatus_operator_def(
            'groupby_reduce',
            query_plan, use_pandas))
    # lineitem_scan[0].set_profiled(True,
    #                               os.path.join(ROOT_DIR, "../benchmark-output/", gen_test_id() + "_scan_0" + ".prof"))
    # lineitem_project[0].set_profiled(True, os.path.join(ROOT_DIR, "../benchmark-output/",
    #                                                     gen_test_id() + "_project_0" + ".prof"))
    # groupby[0].set_profiled(True,
    #                         os.path.join(ROOT_DIR, "../benchmark-output/", gen_test_id() + "_groupby_0" + ".prof"))
    # groupby_reduce.set_profiled(True, os.path.join(ROOT_DIR, "../benchmark-output/",
    #                                                gen_test_id() + "_groupby_reduce" + ".prof"))

    collate = query_plan.add_operator(
        Collate('collate', query_plan, False))

    map(lambda o: o.set_async(False), lineitem_project)
    map(lambda o: o.set_async(False), lineitem_filter)
    # map(lambda o: o.set_async(False), groupby)

    map(lambda (p, o): o.connect(lineitem_project[p]), enumerate(lineitem_scan))
    map(lambda (p, o): o.connect(lineitem_filter[p]), enumerate(lineitem_project))
    map(lambda (p, o): o.connect(groupby[p]), enumerate(lineitem_filter))
    map(lambda (p, o): o.connect(groupby_reduce), enumerate(groupby))
    groupby_reduce.connect(collate)
    '''
    groupby = map(
        lambda p: query_plan.add_operator(
            tpch_q1.groupby_returnflag_linestatus_operator_def(
                'groupby' + '_' + str(p), query_plan)),
        range(0, lineitem_parts))
    collate = query_plan.add_operator(Collate('collate', query_plan, False))
    map(lambda o: o.connect(collate), lineitem_scan)
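
    # In this baseline variant the scans are wired straight to collate; the groupby
    # operators created above (and the project/filter/groupby pipeline in the
    # commented-out block) are not connected into the executed plan.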

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("lineitem parts: {}".format(lineitem_parts))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"),
                           gen_test_id() + "-" + str(lineitem_parts))

    # Start the query
    query_plan.execute()
    print('Done')
    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()
Example #19
def run(sort_field, k, parallel, use_pandas, sort_order, buffer_size,
        table_first_part, table_parts, queried_columns, select_columns, path,
        format_):
    """
    Executes the baseline topk query by scanning a table and keeping track of the max/min records in a heap
    :return:
    """

    secure = False
    use_native = False
    print('')
    print("Top K Benchmark, ColumnScan. Sort Field: {}, Order: {}".format(
        sort_field, sort_order))
    print("----------------------")

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Sampling
    sample_scan = map(
        lambda p: query_plan.add_operator(
            #SQLTableScan("{}/lineitem.snappy.parquet.{}".format(path, p),
            SQLTableScan(
                "{}/lineitem.typed.1RowGroup.parquet.{}".format(
                    path, p), 'select {} from S3Object;'.format(
                        sort_field), format_, use_pandas, secure, use_native,
                'column_scan_{}'.format(p), query_plan, False)),
        range(table_first_part, table_first_part + table_parts))
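
    # The sampling phase scans only the sort column from every partition; its top-k
    # result is used below to build a filtering predicate for the full scan.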

    # Sampling project
    def project_fn1(df):
        df.columns = [sort_field]
        df[[sort_field]] = df[[sort_field]].astype(np.float)
        return df

    project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)]

    sample_project = map(
        lambda p: query_plan.add_operator(
            Project(project_exprs, 'sample_project_{}'.format(p), query_plan,
                    False, project_fn1)),
        range(table_first_part, table_first_part + table_parts))

    # TopK samples
    sort_expr = SortExpression(sort_field, float, sort_order)

    sample_topk = query_plan.add_operator(
        Top(k, sort_expr, use_pandas, 'sample_topk', query_plan, False))

    # Generate SQL command for second scan
    sql_gen = query_plan.add_operator(
        TopKFilterBuild(
            sort_order,
            'float',
            'select {} from S3object '.format(select_columns),
            #' CAST({} as float) '.format(sort_field), 'sql_gen', query_plan, False ))
            ' {} '.format(sort_field),
            'sql_gen',
            query_plan,
            False))

    # Scan
    scan = map(
        lambda p: query_plan.add_operator(
            #SQLTableScan("{}/lineitem.snappy.parquet.{}".format(path, p),
            SQLTableScan(
                "{}/lineitem.typed.1RowGroup.parquet.{}".format(path, p), "",
                format_, use_pandas, secure, use_native, 'scan_{}'.format(
                    p), query_plan, False)),
        range(table_first_part, table_first_part + table_parts))
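
    # The query string is left empty here; it is presumably filled in at run time by
    # sql_gen (connected to each scan below) once the sampling cutoff is known.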

    # Project
    def project_fn2(df):
        df.columns = queried_columns
        df[[sort_field]] = df[[sort_field]].astype(np.float)
        return df

    project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)]

    project = map(
        lambda p: query_plan.add_operator(
            Project(project_exprs, 'project_{}'.format(p), query_plan, False,
                    project_fn2)),
        range(table_first_part, table_first_part + table_parts))

    # TopK
    topk = map(
        lambda p: query_plan.add_operator(
            Top(k, sort_expr, use_pandas, 'topk_{}'.format(p),
                query_plan, False)),
        range(table_first_part, table_first_part + table_parts))

    # TopK reduce
    topk_reduce = query_plan.add_operator(
        Top(k, sort_expr, use_pandas, 'topk_reduce', query_plan, False))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    #profile_path = '../benchmark-output/groupby/'
    #scan[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_scan_0" + ".prof"))
    #project[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_project_0" + ".prof"))
    #groupby[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_groupby_0" + ".prof"))
    #groupby_reduce.set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_groupby_reduce" + ".prof"))
    #collate.set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_collate" + ".prof"))

    map(lambda (p, o): o.connect(sample_project[p]), enumerate(sample_scan))
    map(lambda (p, o): o.connect(sample_topk), enumerate(sample_project))
    sample_topk.connect(sql_gen)

    map(lambda (p, o): sql_gen.connect(o), enumerate(scan))
    map(lambda (p, o): o.connect(project[p]), enumerate(scan))
    map(lambda (p, o): o.connect(topk[p]), enumerate(project))
    map(lambda (p, o): o.connect(topk_reduce), enumerate(topk))
    topk_reduce.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("table parts: {}".format(table_parts))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"),
                           gen_test_id() + "-" + str(table_parts))

    # Start the query
    query_plan.execute()
    print('Done')
    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()
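
# Hedged sketch (not part of the original benchmark): the two-phase top-k idea used
# above, in plain Python. Phase one samples the sort column and derives a cutoff from
# the k-th best sampled value; phase two only considers rows that can still beat that
# cutoff (in the real plan this predicate is generated as SQL and pushed into the
# second scan). The names below are illustrative, not the s3filter API.
import heapq

def two_phase_topk(sample_values, all_rows, sort_key, k, descending=True):
    # Phase 1: the k-th best sampled value is a conservative cutoff, because the
    # sampled rows themselves are guaranteed to satisfy it.
    best_sampled = (heapq.nlargest(k, sample_values) if descending
                    else heapq.nsmallest(k, sample_values))
    cutoff = best_sampled[-1]
    # Phase 2: keep only candidate rows, then take the exact top k among them.
    if descending:
        candidates = [r for r in all_rows if sort_key(r) >= cutoff]
        return heapq.nlargest(k, candidates, key=sort_key)
    candidates = [r for r in all_rows if sort_key(r) <= cutoff]
    return heapq.nsmallest(k, candidates, key=sort_key)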
Example #20
def test_filter_baseline():
    """

    :return:
    """

    query_plan = QueryPlan(buffer_size=64, is_async=True, use_shared_mem=False)

    # Query plan
    '''
    ts = query_plan.add_operator(
        SQLTableScan('lineitem.csv', 'select * from S3Object limit 3;' , False, 'ts', query_plan, False))
    '''
    # using a 'use_native=True' argument will result in a None object being returned
    ts = query_plan.add_operator(
        SQLTableScan('random_strings_2.csv', 'select * from S3Object limit 3;',
                     Format.CSV, True, False, False, 'ts', query_plan, False))
    f = query_plan.add_operator(
        Filter(
            PredicateExpression(lambda t_: cast(t_['_10'], timestamp) >= cast(
                '1996-03-01', timestamp)), 'f', query_plan, False))

    c = query_plan.add_operator(Collate('c', query_plan, False))

    ts.connect(f)
    f.connect(c)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"),
                           gen_test_id())

    # Start the query
    query_plan.execute()

    # Assert the results
    # num_rows = 0
    # for t in c.tuples():
    #     num_rows += 1
    #     print("{}:{}".format(num_rows, t))
    print(len(c.tuples()))
    assert 2 + 1 == len(c.tuples())

    field_names = [
        '_0', '_1', '_2', '_3', '_4', '_5', '_6', '_7', '_8', '_9', '_10',
        '_11', '_12', '_13', '_14', '_15'
    ]

    assert c.tuples()[0] == field_names

    assert c.tuples()[1] == [
        '1', '155190', '7706', '1', '17', '21168.23', '0.04', '0.02', 'N', 'O',
        '1996-03-13', '1996-02-12', '1996-03-22', 'DELIVER IN PERSON', 'TRUCK',
        'egular courts above the'
    ]

    assert c.tuples()[2] == [
        '1', '67310', '7311', '2', '36', '45983.16', '0.09', '0.06', 'N', 'O',
        '1996-04-12', '1996-02-28', '1996-04-20', 'TAKE BACK RETURN', 'MAIL',
        'ly final dependencies: slyly bold '
    ]

    # Write the metrics
    query_plan.print_metrics()
Example #21
def run(parallel,
        use_pandas,
        buffer_size,
        table_parts,
        lower,
        upper,
        sf,
        format_=Format.CSV):
    secure = False
    use_native = False
    print('')
    print("Indexing Benchmark")
    print("------------------")

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # SQL scan the file

    scan = map(
        lambda p: query_plan.add_operator(
            SQLTableScan(
                get_file_key('lineitem', True, p, sf=sf, format_=format_
                             ), "select * from S3Object "
                "where cast(l_extendedprice as float) >= {} and cast(l_extendedprice as float) <= {};"
                .format(lower, upper), format_, use_pandas, secure,
                use_native, 'scan_{}'.format(p), query_plan, False)),
        range(0, table_parts))
    '''
    scan = map(lambda p:
               query_plan.add_operator(
                   SQLTableScan(get_file_key('lineitem', True, p, sf=sf, format_=format_),
                                "select * from S3Object "
                                "where l_extendedprice  >= {} and l_extendedprice  <= {};".format(
                                    lower, upper), format_,
                                use_pandas, secure, use_native,
                                'scan_{}'.format(p), query_plan,
                                False)),
               range(0, table_parts))
    '''

    # project
    def fn(df):
        df.columns = [
            'l_orderkey', 'l_partkey', 'l_suppkey', 'l_linenumber',
            'l_quantity', 'l_extendedprice', 'l_discount', 'l_tax',
            'l_returnflag', 'l_linestatus', 'l_shipdate', 'l_commitdate',
            'l_receiptdate', 'l_shipinstruct', 'l_shipmode', 'l_comment'
        ]
        df[['l_extendedprice']] = df[['l_extendedprice']].astype(np.float)
        return df

    project = map(
        lambda p: query_plan.add_operator(
            Project([], 'project_{}'.format(p), query_plan, False, fn)),
        range(0, table_parts))

    # aggregation
    def agg_fun(df):
        return pd.DataFrame({'count': [len(df)]})

    aggregate = query_plan.add_operator(
        Aggregate([], True, 'agg', query_plan, False, agg_fun))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    map(lambda (p, o): o.connect(project[p]), enumerate(scan))
    map(lambda (p, o): o.connect(aggregate), enumerate(project))
    aggregate.connect(collate)
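
    # Each scan pushes the l_extendedprice range predicate into S3 Select; the
    # matching rows from all partitions are then counted by the aggregate operator,
    # i.e. the plan computes the equivalent of
    #   select count(*) from lineitem where l_extendedprice between {lower} and {upper}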

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("table parts: {}".format(table_parts))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"),
                           gen_test_id() + "-" + str(table_parts))

    # Start the query
    query_plan.execute()
    print('Done')
    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()
Example #22
def test_pandas_filter_baseline():
    """

    :return:
    """

    query_plan = QueryPlan(buffer_size=8192)

    # Query plan
    ts = query_plan.add_operator(
        SQLTableScan('lineitem.csv', 'select * from S3Object limit 3;', True,
                     'ts', query_plan, False))

    # f = query_plan.add_operator(
    #     Filter(PredicateExpression(lambda t_: cast(t_['_10'], timestamp) >= cast('1996-03-01', timestamp)),
    #            'f', query_plan,
    #            False))

    def pd_expr(df):
        # df['_10'] = pd.to_datetime(df['_10'])
        return pd.to_datetime(df['_10']) >= '1996-03-01'
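
    # pd_expr returns a boolean Series selecting rows whose '_10' column
    # (l_shipdate in the lineitem schema) parses to a date on or after 1996-03-01.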

    f = query_plan.add_operator(
        Filter(PredicateExpression(None, pd_expr), 'f', query_plan, True))

    c = query_plan.add_operator(Collate('c', query_plan, False))

    ts.connect(f)
    f.connect(c)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"),
                           gen_test_id())

    # Start the query
    query_plan.execute()

    # Assert the results
    # num_rows = 0
    # for t in c.tuples():
    #     num_rows += 1
    #     print("{}:{}".format(num_rows, t))

    assert 2 + 1 == len(c.tuples())

    field_names = [
        '_0', '_1', '_2', '_3', '_4', '_5', '_6', '_7', '_8', '_9', '_10',
        '_11', '_12', '_13', '_14', '_15'
    ]

    assert c.tuples()[0] == field_names

    assert c.tuples()[1] == [
        '1', '155190', '7706', '1', '17', '21168.23', '0.04', '0.02', 'N', 'O',
        '1996-03-13', '1996-02-12', '1996-03-22', 'DELIVER IN PERSON', 'TRUCK',
        'egular courts above the'
    ]

    assert c.tuples()[2] == [
        '1', '67310', '7311', '2', '36', '45983.16', '0.09', '0.06', 'N', 'O',
        '1996-04-12', '1996-02-28', '1996-04-20', 'TAKE BACK RETURN', 'MAIL',
        'ly final dependencies: slyly bold '
    ]

    # Write the metrics
    query_plan.print_metrics()
Example #23
def run(parallel,
        use_pandas,
        buffer_size,
        table_parts,
        lower,
        upper,
        sf,
        nthreads=16,
        format_=Format.CSV):
    secure = False
    use_native = False
    print('')
    print("Indexing Benchmark")
    print("------------------")

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)
    assert sf == 1 or sf == 10

    # Scan Index Files
    index_scan = map(
        lambda p: query_plan.add_operator(
            SQLTableScan(
                'tpch-sf{}/lineitem_sharded/index/index_l_extendedprice.csv.{}'
                .format(sf, p if sf == 1 else p + 1),
                "select first_byte, last_byte "
                " from S3Object "
                " where col_values  >= {} and col_values  <= {};".format(
                    lower, upper), format_, use_pandas, secure, use_native,
                'index_scan_{}'.format(p), query_plan, False)),
        range(0, table_parts))
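
    # Each index shard maps l_extendedprice values to (first_byte, last_byte) ranges
    # in the corresponding lineitem shard, so only the byte ranges that can contain
    # matching rows are fetched by the range-access operators below.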

    # Range accesses
    range_access = map(
        lambda p: query_plan.add_operator(
            TableRangeAccess(
                get_file_key('lineitem', True, p, sf=sf, format_=Format.CSV),
                use_pandas, secure, use_native, 'range_access_{}'.format(
                    p), query_plan, False)), range(0, table_parts))

    map(lambda o: o.set_nthreads(nthreads), range_access)

    # Aggregation
    def agg_fun(df):
        return pd.DataFrame({'count': [len(df)]})

    aggregate = query_plan.add_operator(
        Aggregate([], True, 'agg', query_plan, False, agg_fun))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    map(lambda (p, o): o.connect(range_access[p]), enumerate(index_scan))
    map(lambda (p, o): o.connect(aggregate), enumerate(range_access))
    aggregate.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("table parts: {}".format(table_parts))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"),
                           gen_test_id() + "-" + str(table_parts))

    # Start the query
    query_plan.execute()
    print('Done')
    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()
Example #24
def query_plan(settings):
    # type: (SyntheticSemiJoinSettings) -> QueryPlan
    """

    :return: None
    """

    if settings.use_shared_mem:
        system = WorkerSystem(settings.shared_memory_size)
    else:
        system = None

    query_plan = QueryPlan(system,
                           is_async=settings.parallel,
                           buffer_size=settings.buffer_size,
                           use_shared_mem=settings.use_shared_mem)

    # Define the operators
    scan_a = \
        map(lambda p:
            query_plan.add_operator(
                SQLTableScan(get_file_key(settings.table_A_key, settings.table_A_sharded, p, settings.sf),
                             "select "
                             "  {} "
                             "from "
                             "  S3Object "
                             "where "
                             "  {} "
                             "  {} "
                             .format(settings.table_A_AB_join_key,
                                     settings.table_A_filter_sql,
                                     get_sql_suffix(settings.table_A_key, settings.table_A_parts, p,
                                                    settings.table_A_sharded)), settings.format_,
                             settings.use_pandas,
                             settings.secure,
                             settings.use_native,
                             'scan_a' + '_{}'.format(p),
                             query_plan,
                             False)),
            range(0, settings.table_A_parts))

    field_names_map_a = OrderedDict(
        zip([
            '_{}'.format(i)
            for i, name in enumerate(settings.table_A_field_names)
        ], settings.table_A_field_names))
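
    # S3 Select returns anonymous columns ('_0', '_1', ...); this maps them back to
    # the configured field names, e.g. ['a_key', 'a_value'] would give
    # {'_0': 'a_key', '_1': 'a_value'} (illustrative names).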

    def project_fn_a(df):
        df = df.rename(columns=field_names_map_a, copy=False)
        return df

    project_a = map(
        lambda p: query_plan.add_operator(
            Project([
                ProjectExpression(k, v)
                for k, v in field_names_map_a.iteritems()
            ], 'project_a' + '_{}'.format(p), query_plan, False, project_fn_a)
        ), range(0, settings.table_A_parts))

    bloom_create_ab_join_key = map(
        lambda p: query_plan.add_operator(
            BloomCreate(settings.table_A_AB_join_key,
                        'bloom_create_ab_join_key' + '_{}'.format(p),
                        query_plan,
                        False,
                        fp_rate=settings.fp_rate)),
        range(0, settings.table_A_parts))
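
    # The bloom filters built over table A's join key are consumed by the
    # SQLTableScanBloomUse operators below, so the scan of table B only returns rows
    # whose join key probably appears in A (a bloom-join semi-join push-down).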

    scan_b_on_ab_join_key = \
        map(lambda p:
            query_plan.add_operator(
                SQLTableScanBloomUse(get_file_key(settings.table_B_key, settings.table_B_sharded, p, settings.sf),
                                     "select "
                                     "  {},{} "
                                     "from "
                                     "  S3Object "
                                     "where "
                                     "  {} "
                                     "  {} "
                                     .format(settings.table_B_BC_join_key,
                                             settings.table_B_AB_join_key,
                                             settings.table_B_filter_sql,
                                             get_sql_suffix(settings.table_B_key, settings.table_B_parts, p,
                                                            settings.table_B_sharded, add_where=False)), settings.format_,
                                     settings.table_B_AB_join_key,
                                     settings.use_pandas,
                                     settings.secure,
                                     settings.use_native,
                                     'scan_b_on_ab_join_key' + '_{}'.format(p),
                                     query_plan,
                                     False)),
            range(0, settings.table_B_parts))

    if settings.table_C_key is None:

        scan_b_detail_on_b_pk = \
            map(lambda p:
                query_plan.add_operator(
                    SQLTableScanBloomUse(get_file_key(settings.table_B_key, settings.table_B_sharded, p, settings.sf),
                                         "select "
                                         "  {},{} "
                                         "from "
                                         "  S3Object "
                                         "where "
                                         "  {} "
                                         "  {} "
                                         .format(settings.table_B_primary_key,
                                                 settings.table_B_detail_field_name,
                                                 settings.table_B_filter_sql,
                                                 get_sql_suffix(settings.table_B_key, settings.table_B_parts, p,
                                                                settings.table_B_sharded, add_where=False)), settings.format_,
                                         settings.table_B_primary_key,
                                         settings.use_pandas,
                                         settings.secure,
                                         settings.use_native,
                                         'scan_c_detail_on_b_pk' + '_{}'.format(p),
                                         query_plan,
                                         False)),
                range(0, settings.table_B_parts))

        field_names_map_b_detail = OrderedDict([
            ('_0', settings.table_B_primary_key),
            ('_1', settings.table_B_detail_field_name)
        ])

        def project_fn_b_detail(df):
            df.rename(columns=field_names_map_b_detail, inplace=True)
            return df

        project_b_detail = map(
            lambda p: query_plan.add_operator(
                Project([
                    ProjectExpression(k, v)
                    for k, v in field_names_map_b_detail.iteritems()
                ], 'project_b_detail' + '_{}'.format(p), query_plan,
                        False, project_fn_b_detail)),
            range(0, settings.table_B_parts))

        map_b_pk_1 = map(
            lambda p: query_plan.add_operator(
                Map(settings.table_B_primary_key, 'map_b_pk_1' + '_{}'.format(
                    p), query_plan, False)), range(0, settings.table_B_parts))

        map_b_pk_2 = map(
            lambda p: query_plan.add_operator(
                Map(settings.table_B_primary_key, 'map_b_pk_2' + '_{}'.format(
                    p), query_plan, False)), range(0, settings.table_B_parts))

        bloom_create_b_pk = map(
            lambda p: query_plan.add_operator(
                BloomCreate(settings.table_B_primary_key,
                            'bloom_create_b_pk' + '_{}'.format(p),
                            query_plan,
                            False,
                            fp_rate=settings.fp_rate)),
            range(0, settings.table_B_parts))

        join_probe_ab_and_b_on_b_pk = map(
            lambda p: query_plan.add_operator(
                HashJoinProbe(
                    JoinExpression(settings.table_B_primary_key, settings.
                                   table_B_primary_key),
                    'join_probe_ab_and_b_on_b_pk' + '_{}'.format(
                        p), query_plan, False)),
            range(0, settings.table_B_parts))

        join_build_ab_and_b_on_b_pk = map(
            lambda p: query_plan.add_operator(
                HashJoinBuild(settings.table_B_primary_key,
                              'join_build_ab_and_b_on_b_pk' + '_{}'.format(
                                  p), query_plan, False)),
            range(0, settings.table_B_parts))

    else:
        scan_c_on_bc_join_key = \
            map(lambda p:
                query_plan.add_operator(
                    SQLTableScanBloomUse(get_file_key(settings.table_C_key, settings.table_C_sharded, p, settings.sf),
                                         "select "
                                         "  {}, {} "
                                         "from "
                                         "  S3Object "
                                         "where "
                                         "  {} "
                                         "  {} "
                                         .format(settings.table_C_primary_key,
                                                 settings.table_C_BC_join_key,
                                                 settings.table_C_filter_sql,
                                                 get_sql_suffix(settings.table_C_key, settings.table_C_parts, p,
                                                                settings.table_C_sharded, add_where=False)),
                                         settings.table_C_BC_join_key, settings.format_,
                                         settings.use_pandas,
                                         settings.secure,
                                         settings.use_native,
                                         'scan_c_on_bc_join_key' + '_{}'.format(p),
                                         query_plan,
                                         False)),
                range(0, settings.table_C_parts))

        field_names_map_c = OrderedDict(
            zip([
                '_{}'.format(i)
                for i, name in enumerate(settings.table_C_field_names)
            ], settings.table_C_field_names))

        def project_fn_c(df):
            df.rename(columns=field_names_map_c, inplace=True)
            return df

        project_c = map(
            lambda p: query_plan.add_operator(
                Project([
                    ProjectExpression(k, v)
                    for k, v in field_names_map_c.iteritems()
                ], 'project_c' + '_{}'.format(p), query_plan, False,
                        project_fn_c)), range(0, settings.table_C_parts))

        scan_c_detail_on_c_pk = \
            map(lambda p:
                query_plan.add_operator(
                    SQLTableScanBloomUse(get_file_key(settings.table_C_key, settings.table_C_sharded, p, settings.sf),
                                         "select "
                                         "  {},{} "
                                         "from "
                                         "  S3Object "
                                         "where "
                                         "  {} "
                                         "  {} "
                                         .format(settings.table_C_primary_key,
                                                 settings.table_C_detail_field_name,
                                                 settings.table_C_filter_sql,
                                                 get_sql_suffix(settings.table_C_key, settings.table_C_parts, p,
                                                                settings.table_C_sharded, add_where=False)),
                                         settings.table_C_primary_key, settings.format_,
                                         settings.use_pandas,
                                         settings.secure,
                                         settings.use_native,
                                         'scan_c_detail_on_c_pk' + '_{}'.format(p),
                                         query_plan,
                                         False)),
                range(0, settings.table_C_parts))

        field_names_map_c_detail = OrderedDict([
            ('_0', settings.table_C_primary_key),
            ('_1', settings.table_C_detail_field_name)
        ])

        def project_fn_c_detail(df):
            df.rename(columns=field_names_map_c_detail, inplace=True)
            return df

        project_c_detail = map(
            lambda p: query_plan.add_operator(
                Project([
                    ProjectExpression(k, v)
                    for k, v in field_names_map_c_detail.iteritems()
                ], 'project_c_detail' + '_{}'.format(p), query_plan,
                        False, project_fn_c_detail)),
            range(0, settings.table_C_parts))

        map_bc_b_join_key = map(
            lambda p: query_plan.add_operator(
                Map(settings.table_B_BC_join_key, 'map_bc_b_join_key' + '_{}'.
                    format(p), query_plan, False)),
            range(0, settings.table_C_parts))

        map_c_pk_1 = map(
            lambda p: query_plan.add_operator(
                Map(settings.table_C_primary_key, 'map_c_pk_1' + '_{}'.format(
                    p), query_plan, False)), range(0, settings.table_C_parts))

        map_c_pk_2 = map(
            lambda p: query_plan.add_operator(
                Map(settings.table_C_primary_key, 'map_c_pk_2' + '_{}'.format(
                    p), query_plan, False)), range(0, settings.table_C_parts))

        bloom_create_c_pk = map(
            lambda p: query_plan.add_operator(
                BloomCreate(settings.table_C_primary_key,
                            'bloom_create_bc_b_to_c_join_key_{}'.format(p),
                            query_plan,
                            False,
                            fp_rate=settings.fp_rate)),
            range(0, settings.table_C_parts))

        join_build_ab_and_c_on_bc_join_key = map(
            lambda p: query_plan.add_operator(
                HashJoinBuild(
                    settings.table_B_BC_join_key,
                    'join_build_ab_and_c_on_bc_join_key' + '_{}'.format(
                        p), query_plan, False)),
            range(0, settings.table_C_parts))

        join_probe_ab_and_c_on_bc_join_key = map(
            lambda p: query_plan.add_operator(
                HashJoinProbe(
                    JoinExpression(settings.table_B_BC_join_key, settings.
                                   table_C_BC_join_key),
                    'join_probe_ab_and_c_on_bc_join_key' + '_{}'.format(
                        p), query_plan, False)),
            range(0, settings.table_C_parts))

        join_build_abc_and_c_on_c_pk = map(
            lambda p: query_plan.add_operator(
                HashJoinBuild(settings.table_C_primary_key,
                              'join_build_abc_and_c_on_c_pk' + '_{}'.format(
                                  p), query_plan, False)),
            range(0, settings.table_C_parts))

        join_probe_abc_and_c_on_c_pk = map(
            lambda p: query_plan.add_operator(
                HashJoinProbe(
                    JoinExpression(settings.table_C_primary_key, settings.
                                   table_C_primary_key),
                    'join_probe_abc_and_c_on_c_pk' + '_{}'.format(
                        p), query_plan, False)),
            range(0, settings.table_C_parts))

        bloom_create_bc_join_key = map(
            lambda p: query_plan.add_operator(
                BloomCreate(settings.table_B_BC_join_key,
                            'bloom_create_bc_join_key' + '_{}'.format(
                                p), query_plan, False)),
            range(0, settings.table_B_parts))

        map_bc_c_join_key = map(
            lambda p: query_plan.add_operator(
                Map(settings.table_C_BC_join_key, 'map_bc_c_join_key' + '_{}'.
                    format(p), query_plan, False)),
            range(0, settings.table_B_parts))

    field_names_map_b = OrderedDict(
        zip([
            '_{}'.format(i)
            for i, name in enumerate(settings.table_B_field_names)
        ], settings.table_B_field_names))

    def project_fn_b(df):
        df.rename(columns=field_names_map_b, inplace=True)
        return df

    project_b = map(
        lambda p: query_plan.add_operator(
            Project([
                ProjectExpression(k, v)
                for k, v in field_names_map_b.iteritems()
            ], 'project_b' + '_{}'.format(p), query_plan, False, project_fn_b)
        ), range(0, settings.table_B_parts))

    map_ab_a_join_key = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_A_AB_join_key, 'map_ab_a_join_key' + '_{}'
                .format(p), query_plan, False)),
        range(0, settings.table_A_parts))

    map_ab_b_join_key = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_B_AB_join_key, 'map_ab_b_join_key' + '_{}'
                .format(p), query_plan, False)),
        range(0, settings.table_B_parts))

    join_build_a_and_b_on_ab_join_key = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild(
                settings.table_A_AB_join_key,
                'join_build_a_and_b_on_ab_join_key' + '_{}'.format(
                    p), query_plan, False)), range(0, settings.table_B_parts))

    join_probe_a_and_b_on_ab_join_key = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(
                JoinExpression(settings.table_A_AB_join_key, settings.
                               table_B_AB_join_key),
                'join_probe_a_and_b_on_ab_join_key' + '_{}'.format(
                    p), query_plan, False)), range(0, settings.table_B_parts))

    if settings.table_C_key is None:

        def part_aggregate_fn(df):
            sum_ = df[settings.table_B_detail_field_name].astype(
                np.float).sum()
            return pd.DataFrame({'_0': [sum_]})

        part_aggregate = map(
            lambda p: query_plan.add_operator(
                Aggregate([
                    AggregateExpression(
                        AggregateExpression.SUM, lambda t: float(t[
                            settings.table_B_detail_field_name]))
                ], settings.use_pandas, 'part_aggregate_{}'.format(p),
                          query_plan, False, part_aggregate_fn)),
            range(0, settings.table_B_parts))

    else:

        def part_aggregate_fn(df):
            sum_ = df[settings.table_C_detail_field_name].astype(
                np.float).sum()
            return pd.DataFrame({'_0': [sum_]})

        part_aggregate = map(
            lambda p: query_plan.add_operator(
                Aggregate([
                    AggregateExpression(
                        AggregateExpression.SUM, lambda t: float(t[
                            settings.table_C_detail_field_name]))
                ], settings.use_pandas, 'part_aggregate_{}'.format(p),
                          query_plan, False, part_aggregate_fn)),
            range(0, settings.table_C_parts))

    def aggregate_reduce_fn(df):
        sum_ = df['_0'].astype(np.float).sum()
        return pd.DataFrame({'_0': [sum_]})

    aggregate_reduce = query_plan.add_operator(
        Aggregate([
            AggregateExpression(AggregateExpression.SUM,
                                lambda t: float(t['_0']))
        ], settings.use_pandas, 'aggregate_reduce', query_plan, False,
                  aggregate_reduce_fn))

    aggregate_project = query_plan.add_operator(
        Project([ProjectExpression(lambda t: t['_0'], 'total_balance')],
                'aggregate_project', query_plan, False))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    # Inline some of the operators
    map(lambda o: o.set_async(False), project_a)
    map(lambda o: o.set_async(False), project_b)
    map(lambda o: o.set_async(False), map_ab_a_join_key)
    map(lambda o: o.set_async(False), map_ab_b_join_key)
    if settings.table_C_key is None:
        map(lambda o: o.set_async(False), map_b_pk_1)
        map(lambda o: o.set_async(False), map_b_pk_2)
        map(lambda o: o.set_async(False), project_b_detail)
    else:
        map(lambda o: o.set_async(False), map_bc_b_join_key)
        map(lambda o: o.set_async(False), map_bc_c_join_key)
        map(lambda o: o.set_async(False), map_c_pk_1)
        map(lambda o: o.set_async(False), map_c_pk_2)
        map(lambda o: o.set_async(False), project_c)
        map(lambda o: o.set_async(False), project_c_detail)
    aggregate_project.set_async(False)

    # Connect the operators
    connect_many_to_many(scan_a, project_a)

    connect_many_to_many(project_a, map_ab_a_join_key)

    connect_all_to_all(map_ab_a_join_key, join_build_a_and_b_on_ab_join_key)
    connect_all_to_all(project_a, bloom_create_ab_join_key)
    # connect_all_to_all(map_A_to_B, join_build_a_and_b_on_ab_join_key)
    connect_many_to_many(join_build_a_and_b_on_ab_join_key,
                         join_probe_a_and_b_on_ab_join_key)

    # connect_all_to_all(map_bloom_A_to_B, bloom_create_ab_join_key)
    connect_many_to_many(bloom_create_ab_join_key, scan_b_on_ab_join_key)
    connect_many_to_many(scan_b_on_ab_join_key, project_b)
    # connect_many_to_many(project_b, join_probe_a_and_b_on_ab_join_key)
    # connect_all_to_all(map_B_to_B, join_probe_a_and_b_on_ab_join_key)

    connect_many_to_many(project_b, map_ab_b_join_key)
    connect_all_to_all(map_ab_b_join_key, join_probe_a_and_b_on_ab_join_key)

    # connect_many_to_many(join_probe_a_and_b_on_ab_join_key, map_bloom_B_to_B)

    if settings.table_C_key is None:
        # connect_all_to_all(join_probe_a_and_b_on_ab_join_key, part_aggregate)
        connect_many_to_many(scan_b_detail_on_b_pk, project_b_detail)
        connect_many_to_many(project_b_detail, map_b_pk_2)
        connect_many_to_many(bloom_create_b_pk, scan_b_detail_on_b_pk)
        connect_all_to_all(join_probe_a_and_b_on_ab_join_key,
                           bloom_create_b_pk)
        connect_all_to_all(map_b_pk_2, join_probe_ab_and_b_on_b_pk)
        connect_many_to_many(join_probe_ab_and_b_on_b_pk, part_aggregate)
        connect_many_to_many(join_build_ab_and_b_on_b_pk,
                             join_probe_ab_and_b_on_b_pk)
        connect_many_to_many(join_probe_a_and_b_on_ab_join_key, map_b_pk_1)
        connect_all_to_all(map_b_pk_1, join_build_ab_and_b_on_b_pk)

    else:
        connect_all_to_all(join_probe_a_and_b_on_ab_join_key,
                           bloom_create_bc_join_key)
        connect_many_to_many(bloom_create_bc_join_key, scan_c_on_bc_join_key)
        connect_many_to_many(scan_c_on_bc_join_key, project_c)
        # connect_many_to_many(project_c, join_probe_ab_and_c_on_bc_join_key)
        connect_all_to_all(map_bc_c_join_key,
                           join_probe_ab_and_c_on_bc_join_key)
        # connect_many_to_many(join_probe_a_and_b_on_ab_join_key, join_build_ab_and_c_on_bc_join_key)
        connect_many_to_many(join_probe_a_and_b_on_ab_join_key,
                             map_bc_b_join_key)
        connect_all_to_all(map_bc_b_join_key,
                           join_build_ab_and_c_on_bc_join_key)
        connect_all_to_all(join_probe_ab_and_c_on_bc_join_key,
                           bloom_create_c_pk)
        # connect_many_to_many(join_probe_ab_and_c_on_bc_join_key, join_build_abc_and_c_on_c_pk)
        connect_many_to_many(join_probe_ab_and_c_on_bc_join_key, map_c_pk_1)
        connect_all_to_all(map_c_pk_1, join_build_abc_and_c_on_c_pk)
        connect_many_to_many(bloom_create_c_pk, scan_c_detail_on_c_pk)
        # connect_all_to_all(bloom_create_bc_join_key, scan_c_detail_on_c_pk)
        connect_many_to_many(join_build_abc_and_c_on_c_pk,
                             join_probe_abc_and_c_on_c_pk)
        # connect_many_to_many(join_probe_a_and_b_on_ab_join_key, map_B_to_C)
        # connect_all_to_all(join_probe_a_and_b_on_ab_join_key, join_build_abc_and_c_on_c_pk)
        connect_many_to_many(scan_c_detail_on_c_pk, project_c_detail)
        # connect_many_to_many(project_c_detail, map_C_to_C)
        # connect_all_to_all(project_c_detail, join_probe_abc_and_c_on_c_pk)
        connect_many_to_many(project_c_detail, map_c_pk_2)

        connect_many_to_many(project_c, map_bc_c_join_key)
        connect_many_to_many(join_build_ab_and_c_on_bc_join_key,
                             join_probe_ab_and_c_on_bc_join_key)
        connect_all_to_all(map_c_pk_2, join_probe_abc_and_c_on_c_pk)

        connect_many_to_many(join_probe_abc_and_c_on_c_pk, part_aggregate)

    connect_many_to_one(part_aggregate, aggregate_reduce)
    connect_one_to_one(aggregate_reduce, aggregate_project)
    connect_one_to_one(aggregate_project, collate)

    return query_plan
Example #25
def run(parallel, use_pandas, secure, use_native, buffer_size, lineitem_parts,
        part_parts, lineitem_sharded, part_sharded, other_parts, sf, fp_rate,
        expected_result, format_):
    """
    :return: None
    """

    print('')
    print("TPCH Q17 Bloom Join")
    print("-------------------")

    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Query plan
    part_scan = map(
        lambda p: query_plan.add_operator(
            tpch_q17.sql_scan_select_partkey_where_brand_container_op(
                part_sharded, p, part_parts, use_pandas, secure, use_native,
                'part_scan' + '_' + str(p), query_plan, sf, format_)),
        range(0, part_parts))

    part_project = map(
        lambda p: query_plan.add_operator(
            tpch_q17.project_partkey_op('part_project' + '_' + str(p),
                                        query_plan)), range(0, part_parts))

    part_lineitem_join_build_map = map(
        lambda p: query_plan.add_operator(
            Map('p_partkey', 'part_lineitem_join_build_map' + '_' + str(p),
                query_plan, False)), range(0, part_parts))

    part_bloom_create = query_plan.add_operator(
        tpch_q17.bloom_create_partkey_op(fp_rate, 'part_bloom_create',
                                         query_plan))

    lineitem_bloom_use = \
        map(lambda p:
            query_plan.add_operator(
                tpch_q17.bloom_scan_lineitem_select_orderkey_partkey_quantity_extendedprice_bloom_partkey_op(
                    lineitem_sharded,
                    p,
                    part_parts,
                    use_pandas,
                    secure,
                    use_native,
                    'lineitem_bloom_use' + '_' + str(p),
                    query_plan,
                    sf, format_)),
            range(0, lineitem_parts))

    lineitem_project = map(
        lambda p: query_plan.add_operator(
            tpch_q17.
            project_lineitem_filtered_orderkey_partkey_quantity_extendedprice_op(
                'lineitem_project' + '_' + str(p), query_plan)),
        range(0, lineitem_parts))

    part_lineitem_join_probe_map = map(
        lambda p: query_plan.add_operator(
            Map('l_partkey', 'part_lineitem_join_probe_map' + '_' + str(p),
                query_plan, False)), range(0, lineitem_parts))

    part_lineitem_join_build = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild('p_partkey', 'part_lineitem_join_build' + '_' + str(
                p), query_plan, False)), range(0, other_parts))

    part_lineitem_join_probe = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'),
                          'part_lineitem_join_probe' + '_' + str(
                              p), query_plan, False)), range(0, other_parts))

    lineitem_part_avg_group = map(
        lambda p: query_plan.add_operator(
            tpch_q17.group_partkey_avg_quantity_op(
                'lineitem_part_avg_group' + '_' + str(p), query_plan)),
        range(0, other_parts))

    lineitem_part_avg_group_project = map(
        lambda p: query_plan.add_operator(
            tpch_q17.project_partkey_avg_quantity_op(
                'lineitem_part_avg_group_project' + '_' + str(p), query_plan)),
        range(0, other_parts))

    part_lineitem_join_avg_group_join_build = \
        map(lambda p:
            query_plan.add_operator(
                HashJoinBuild('l_partkey',
                              'part_lineitem_join_avg_group_join_build' + '_' + str(p),
                              query_plan,
                              False)),
            range(0, other_parts))

    part_lineitem_join_avg_group_join_probe = \
        map(lambda p:
            query_plan.add_operator(
                HashJoinProbe(JoinExpression('l_partkey', 'l_partkey'),
                              'part_lineitem_join_avg_group_join_probe' + '_' + str(p),
                              query_plan,
                              False)),
            range(0, other_parts))

    lineitem_filter = map(
        lambda p: query_plan.add_operator(
            tpch_q17.filter_lineitem_quantity_op(
                'lineitem_filter' + '_' + str(p), query_plan)),
        range(0, other_parts))

    extendedprice_sum_aggregate = map(
        lambda p: query_plan.add_operator(
            tpch_q17.aggregate_sum_extendedprice_op(
                use_pandas, 'extendedprice_sum_aggregate' + '_' + str(p),
                query_plan)), range(0, other_parts))

    def aggregate_reduce_fn(df):
        sum1_ = df['_0'].astype(np.float).sum()
        return pd.DataFrame({'_0': [sum1_]})

    aggregate_reduce = query_plan.add_operator(
        Aggregate([
            AggregateExpression(AggregateExpression.SUM,
                                lambda t: float(t['_0']))
        ], use_pandas, 'aggregate_reduce', query_plan, False,
                  aggregate_reduce_fn))

    extendedprice_sum_aggregate_project = query_plan.add_operator(
        tpch_q17.project_avg_yearly_op('extendedprice_sum_aggregate_project',
                                       query_plan))

    collate = query_plan.add_operator(
        tpch_q17.collate_op('collate', query_plan))

    # Inline what we can
    map(lambda o: o.set_async(False), lineitem_project)
    map(lambda o: o.set_async(False), part_project)
    map(lambda o: o.set_async(False), lineitem_filter)
    map(lambda o: o.set_async(False), part_lineitem_join_probe_map)
    map(lambda o: o.set_async(False), part_lineitem_join_build_map)
    map(lambda o: o.set_async(False), lineitem_part_avg_group)
    map(lambda o: o.set_async(False), lineitem_part_avg_group_project)
    map(lambda o: o.set_async(False), extendedprice_sum_aggregate)
    extendedprice_sum_aggregate_project.set_async(False)

    # Connect the operators
    # part_scan.connect(part_project)
    map(lambda (p, o): o.connect(part_project[p]), enumerate(part_scan))
    # map(lambda (p, o): o.connect(part_bloom_create_map[p]), enumerate(part_project))
    map(lambda (p, o): o.connect(part_lineitem_join_build_map[p]),
        enumerate(part_project))

    connect_many_to_one(part_project, part_bloom_create)
    connect_one_to_many(part_bloom_create, lineitem_bloom_use)

    # part_project.connect(part_bloom_create)
    # map(lambda (p1, o1): map(lambda (p2, o2): o1.connect(o2), enumerate(part_bloom_create)),
    #     enumerate(part_bloom_create_map))

    # part_bloom_create.connect(lineitem_bloom_use)
    # map(lambda (p1, o1): map(lambda (p2, o2): o1.connect(o2), enumerate(lineitem_bloom_use)),
    #     enumerate(part_bloom_create))

    # lineitem_bloom_use.connect(lineitem_project)
    map(lambda (p, o): o.connect(lineitem_project[p]),
        enumerate(lineitem_bloom_use))

    # part_lineitem_join.connect_left_producer(part_project)
    map(
        lambda (p1, o1): map(lambda (p2, o2): o1.connect(o2),
                             enumerate(part_lineitem_join_build)),
        enumerate(part_lineitem_join_build_map))
    map(lambda (p, o): part_lineitem_join_probe[p].connect_build_producer(o),
        enumerate(part_lineitem_join_build))

    # part_lineitem_join.connect_right_producer(lineitem_project)
    # map(lambda (p, o): o.connect(part_lineitem_join_probe_map[p]), enumerate(lineitem_project))
    connect_many_to_many(lineitem_project, part_lineitem_join_probe_map)

    map(
        lambda (p1, o1): map(lambda (p2, o2): o2.connect_tuple_producer(o1),
                             enumerate(part_lineitem_join_probe)),
        enumerate(part_lineitem_join_probe_map))

    # part_lineitem_join.connect(lineitem_part_avg_group)
    map(lambda (p, o): o.connect(lineitem_part_avg_group[p]),
        enumerate(part_lineitem_join_probe))

    # lineitem_part_avg_group.connect(lineitem_part_avg_group_project)
    map(lambda (p, o): o.connect(lineitem_part_avg_group_project[p]),
        enumerate(lineitem_part_avg_group))

    # part_lineitem_join_avg_group_join.connect_left_producer(lineitem_part_avg_group_project)
    map(lambda (p, o): o.connect(part_lineitem_join_avg_group_join_build[p]),
        enumerate(lineitem_part_avg_group_project))

    # part_lineitem_join_avg_group_join.connect_right_producer(part_lineitem_join)
    map(
        lambda (p, o): part_lineitem_join_avg_group_join_probe[p].
        connect_build_producer(o),
        enumerate(part_lineitem_join_avg_group_join_build))
    map(
        lambda (p, o): part_lineitem_join_avg_group_join_probe[p].
        connect_tuple_producer(o), enumerate(part_lineitem_join_probe))

    # part_lineitem_join_avg_group_join.connect(lineitem_filter)
    map(lambda (p, o): o.connect(lineitem_filter[p]),
        enumerate(part_lineitem_join_avg_group_join_probe))

    # lineitem_filter.connect(extendedprice_sum_aggregate)
    map(lambda (p, o): o.connect(extendedprice_sum_aggregate[p]),
        enumerate(lineitem_filter))

    # extendedprice_sum_aggregate.connect(extendedprice_sum_aggregate_project)
    map(lambda (p, o): o.connect(aggregate_reduce),
        enumerate(extendedprice_sum_aggregate))
    aggregate_reduce.connect(extendedprice_sum_aggregate_project)

    # extendedprice_sum_aggregate_project.connect(collate)
    extendedprice_sum_aggregate_project.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print('secure: {}'.format(secure))
    print('use_native: {}'.format(use_native))
    print("lineitem parts: {}".format(lineitem_parts))
    print("part_parts: {}".format(part_parts))
    print("lineitem_sharded: {}".format(lineitem_sharded))
    print("part_sharded: {}".format(part_sharded))
    print("other_parts: {}".format(other_parts))
    print("fp_rate: {}".format(fp_rate))
    print("format: {}".format(format_))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"),
                           gen_test_id())

    # Start the query
    query_plan.execute()

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    field_names = ['avg_yearly']

    assert len(tuples) == 1 + 1

    assert tuples[0] == field_names

    # NOTE: This result has been verified with the equivalent data and query on PostgreSQL
    if s3filter.util.constants.TPCH_SF == 10:
        assert round(
            float(tuples[1][0]), 10
        ) == 372414.2899999995  # TODO: This isn't correct but haven't checked tpch17 on 10 sf yet
    elif s3filter.util.constants.TPCH_SF == 1:
        numpy.testing.assert_approx_equal(float(tuples[1][0]), expected_result)
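
    # For reference, TPC-H Q17 computes (standard definition, not taken from this repo):
    #   select sum(l_extendedprice) / 7.0 as avg_yearly
    #   from lineitem, part
    #   where p_partkey = l_partkey
    #     and p_brand = 'Brand#23' and p_container = 'MED BOX'
    #     and l_quantity < (select 0.2 * avg(l_quantity)
    #                       from lineitem where l_partkey = p_partkey)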
Example #26
def test_sort_desc():
    """Executes a sorted query. The results are collated.

    :return: None
    """

    query_plan = QueryPlan()

    # Query plan
    ts = query_plan.add_operator(
        SQLTableScan('supplier.csv', 'select * from S3Object '
                     'limit 3;', False, 'ts', query_plan, False))

    s = query_plan.add_operator(
        Sort([SortExpression('_5', float, 'DESC')], 's', query_plan, False))
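
    # '_5' is the sixth supplier column, i.e. s_acctbal in the TPC-H supplier schema,
    # so the rows below come back ordered by account balance, highest first.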

    c = query_plan.add_operator(Collate('c', query_plan, False))

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"),
                           gen_test_id())

    ts.connect(s)
    s.connect(c)

    # Start the query
    query_plan.execute()

    # Assert the results
    # num_rows = 0
    # for t in c.tuples():
    #     num_rows += 1
    #     print("{}:{}".format(num_rows, t))

    assert len(c.tuples()) == 3 + 1

    assert c.tuples()[0] == ['_0', '_1', '_2', '_3', '_4', '_5', '_6']

    assert c.tuples()[1] == [
        '1', 'Supplier#000000001', ' N kD4on9OM Ipw3,gf0JBoQDd7tgrzrddZ', '17',
        '27-918-335-1736', '5755.94', 'each slyly above the careful'
    ]
    assert c.tuples()[2] == [
        '3', 'Supplier#000000003', 'q1,G3Pj6OjIuUYfUoH18BFTKP5aU9bEV3', '1',
        '11-383-516-1199', '4192.40',
        'blithely silent requests after the express dependencies are sl'
    ]
    assert c.tuples()[3] == [
        '2', 'Supplier#000000002', '89eJ5ksX3ImxJQBvxObC,', '5',
        '15-679-861-2259', '4032.68', ' slyly bold instructions. idle dependen'
    ]

    # Write the metrics
    query_plan.print_metrics()
Example #27
def query_plan(settings):
    # type: (SyntheticBaselineJoinSettings) -> QueryPlan
    """

    :type settings:
    :return: None
    """

    query_plan = QueryPlan(is_async=settings.parallel,
                           buffer_size=settings.buffer_size)

    def scan_A_fun(df):
        df.columns = settings.table_A_field_names
        criterion = settings.table_A_filter_fn(df)
        df = df[criterion]
        return df
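
    # Baseline variant: the whole shard is fetched ("select * from S3Object") and the
    # table-A filter is applied client side on the returned DataFrame.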

    # Define the operators
    scan_A = \
        map(lambda p:
            query_plan.add_operator(
                SQLTableScan(get_file_key(settings.table_A_key, settings.table_A_sharded, p),
                             "select "
                             "  * "
                             "from "
                             "  S3Object "
                             "{}"
                             .format(
                                 get_sql_suffix(settings.table_A_key, settings.table_A_parts, p,
                                                settings.table_A_sharded, add_where=True)), settings.format_,
                             settings.use_pandas,
                             settings.secure,
                             settings.use_native,
                             'scan_A_{}'.format(p),
                             query_plan,
                             False, fn=scan_A_fun)),
            range(0, settings.table_A_parts))

    def scan_B_fun(df):
        df.columns = settings.table_B_field_names
        criterion = settings.table_B_filter_fn(df)
        return df[criterion]

    scan_B = \
        map(lambda p:
            query_plan.add_operator(
                SQLTableScan(get_file_key(settings.table_B_key, settings.table_B_sharded, p),
                             "select "
                             "  * "
                             "from "
                             "  S3Object "
                             "{}"
                             .format(
                                 get_sql_suffix(settings.table_B_key, settings.table_B_parts, p,
                                                settings.table_B_sharded, add_where=True)), settings.format_,
                             settings.use_pandas,
                             settings.secure,
                             settings.use_native,
                             'scan_B_{}'.format(p),
                             query_plan,
                             False, fn=scan_B_fun)),
            range(0, settings.table_B_parts))
    """
    field_names_map_B = OrderedDict(
        zip(['_{}'.format(i) for i, name in enumerate(settings.table_B_field_names)], settings.table_B_field_names))

    def project_fn_B(df):
        df.rename(columns=field_names_map_B, inplace=True)
        return df

    project_B = map(lambda p:
                    query_plan.add_operator(Project(
                        [ProjectExpression(k, v) for k, v in field_names_map_B.iteritems()],
                        'project_B_{}'.format(p),
                        query_plan,
                        True,
                        project_fn_B)),
                    range(0, settings.table_B_parts))

    filter_b = map(lambda p:
                   query_plan.add_operator(Filter(
                       PredicateExpression(None, pd_expr=settings.table_B_filter_fn), 'filter_b' + '_{}'.format(p), query_plan,
                       False)),
                   range(0, settings.table_B_parts))
    """

    def scan_C_fun(df):
        df.columns = settings.table_C_field_names
        criterion = settings.table_C_filter_fn(df)
        return df[criterion]

    scan_C = \
        map(lambda p:
            query_plan.add_operator(
                SQLTableScan(get_file_key(settings.table_C_key, settings.table_C_sharded, p),
                             "select "
                             "  * "
                             "from "
                             "  S3Object "
                             "{}"
                             .format(
                                 get_sql_suffix(settings.table_C_key, settings.table_C_parts, p,
                                                settings.table_C_sharded, add_where=True)), settings.format_,
                             settings.use_pandas,
                             settings.secure,
                             settings.use_native,
                             'scan_C_{}'.format(p),
                             query_plan,
                             False, fn=scan_C_fun)),
            range(0, settings.table_C_parts))
    """
    field_names_map_C = OrderedDict(
        zip(['_{}'.format(i) for i, name in enumerate(settings.table_C_field_names)], settings.table_C_field_names))

    def project_fn_C(df):
        df.rename(columns=field_names_map_C, inplace=True)
        return df

    project_C = map(lambda p:
                    query_plan.add_operator(Project(
                        [ProjectExpression(k, v) for k, v in field_names_map_C.iteritems()],
                        'project_C_{}'.format(p),
                        query_plan,
                        True,
                        project_fn_C)),
                    range(0, settings.table_C_parts))

    filter_c = map(lambda p:
                   query_plan.add_operator(Filter(
                       PredicateExpression(None, pd_expr=settings.table_C_filter_fn), 'filter_c' + '_{}'.format(p), query_plan,
                       False)),
                   range(0, settings.table_C_parts))
    """

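    # The Map operators partition tuples by the given join key; combined with
    # connect_all_to_all below they act as a shuffle, routing matching keys from
    # both sides of a join to the same build/probe pair.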
    map_A_to_B = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_A_AB_join_key, 'map_A_to_B_{}'.format(p),
                query_plan, False)), range(0, settings.table_A_parts))

    map_B_to_B = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_B_AB_join_key, 'map_B_to_B_{}'.format(p),
                query_plan, False)), range(0, settings.table_B_parts))

    map_B_to_C = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_B_BC_join_key, 'map_B_to_C_{}'.format(p),
                query_plan, False)), range(0, settings.table_B_parts))

    map_C_to_C = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_C_BC_join_key, 'map_C_to_C_{}'.format(p),
                query_plan, False)), range(0, settings.table_C_parts))

    join_build_A_B = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild(settings.table_A_AB_join_key, 'join_build_A_B_{}'.
                          format(p), query_plan, False)),
        range(0, settings.table_B_parts))

    join_probe_A_B = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(
                JoinExpression(settings.table_A_AB_join_key, settings.
                               table_B_AB_join_key), 'join_probe_A_B_{}'
                .format(p), query_plan, False)),
        range(0, settings.table_B_parts))

    join_build_AB_C = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild(settings.table_B_BC_join_key, 'join_build_AB_C_{}'.
                          format(p), query_plan, False)),
        range(0, settings.table_C_parts))

    join_probe_AB_C = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(
                JoinExpression(settings.table_B_BC_join_key, settings.
                               table_C_BC_join_key), 'join_probe_AB_C_{}'
                .format(p), query_plan, False)),
        range(0, settings.table_C_parts))

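    # Each partition computes a partial SUM over the detail column of C; the single
    # aggregate_reduce operator below then sums those partial results.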
    def agg_fun(df):
        return pd.DataFrame({
            'sum':
            [df[settings.table_C_detail_field_name].astype(float).sum()]
        })
        #return pd.DataFrame( { 'sum' : [ len(df) ] } )

    part_aggregate = map(
        lambda p: query_plan.add_operator(
            Aggregate([
                AggregateExpression(
                    AggregateExpression.SUM, lambda t: float(t[
                        settings.table_C_detail_field_name]))
            ], settings.use_pandas, 'part_aggregate_{}'.format(p), query_plan,
                      False, agg_fun)), range(0, settings.table_C_parts))

    def agg_reduce_fun(df):
        return pd.DataFrame({'sum': [df['sum'].sum()]})

    aggregate_reduce = query_plan.add_operator(
        Aggregate([
            AggregateExpression(AggregateExpression.SUM,
                                lambda t: float(t['_0']))
        ], settings.use_pandas, 'aggregate_reduce', query_plan, False,
                  agg_reduce_fun))
    """
    aggregate_project = query_plan.add_operator(Project(
        [
            ProjectExpression(lambda t: t['_0'], 'total_balance')
        ],
        'aggregate_project', query_plan,
        False))
    """

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    # Connect the operators
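    # Dataflow: A builds and B probes the first hash join, the A-B output is
    # re-partitioned on the BC join key and joined with C, and the per-partition
    # aggregates are finally reduced and collated.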
    connect_many_to_many(scan_A, map_A_to_B)
    #connect_many_to_many(project_A, filter_A)
    #connect_many_to_many(filter_A, map_A_to_B)
    connect_all_to_all(map_A_to_B, join_build_A_B)
    connect_many_to_many(join_build_A_B, join_probe_A_B)

    connect_many_to_many(scan_B, map_B_to_B)
    #connect_many_to_many(project_B, filter_b)
    #connect_many_to_many(filter_b, map_B_to_B)
    connect_all_to_all(map_B_to_B, join_probe_A_B)

    #connect_many_to_many(join_probe_A_B, part_aggregate)
    #connect_many_to_one(part_aggregate, aggregate_reduce)
    #connect_one_to_one(aggregate_reduce, collate)

    connect_many_to_many(join_build_AB_C, join_probe_AB_C)

    connect_many_to_many(join_probe_A_B, map_B_to_C)
    connect_all_to_all(map_B_to_C, join_build_AB_C)

    connect_many_to_many(scan_C, map_C_to_C)
    #connect_many_to_many(project_C, filter_c)
    #connect_many_to_many(filter_c, map_C_to_C)
    connect_all_to_all(map_C_to_C, join_probe_AB_C)

    connect_many_to_many(join_probe_AB_C, part_aggregate)

    connect_many_to_one(part_aggregate, aggregate_reduce)
    connect_one_to_one(aggregate_reduce, collate)
    #connect_one_to_one(aggregate_project, collate)

    return query_plan
Example #28
0
def query_plan(settings):
    # type: (SyntheticFilteredJoinSettings) -> QueryPlan
    """

    :return: None
    """

    if settings.use_shared_mem:
        system = WorkerSystem(settings.shared_memory_size)
    else:
        system = None

    query_plan = QueryPlan(system,
                           is_async=settings.parallel,
                           buffer_size=settings.buffer_size,
                           use_shared_mem=settings.use_shared_mem)

    # Define the operators
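    # Where a filter_sql is configured for a table it is pushed down into the
    # S3 Select statement; otherwise the sharding suffix supplies the WHERE clause.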
    scan_A = \
        map(lambda p:
            query_plan.add_operator(
                SQLTableScan(get_file_key(settings.table_A_key, settings.table_A_sharded, p, settings.sf),
                             "select "
                             "  {} "
                             "from "
                             "  S3Object "
                             "  {} "
                             "  {} "
                             .format(','.join(settings.table_A_field_names),
                                     ' where {} '.format(
                                         settings.table_A_filter_sql) if settings.table_A_filter_sql is not None else '',
                                     get_sql_suffix(settings.table_A_key, settings.table_A_parts, p,
                                                    settings.table_A_sharded,
                                                    add_where=settings.table_A_filter_sql is None)), settings.format_,
                             settings.use_pandas,
                             settings.secure,
                             settings.use_native,
                             'scan_A_{}'.format(p),
                             query_plan,
                             False)),
            range(0, settings.table_A_parts))

    field_names_map_A = OrderedDict(
        zip([
            '_{}'.format(i)
            for i, name in enumerate(settings.table_A_field_names)
        ], settings.table_A_field_names))

    def project_fn_A(df):
        df = df.rename(columns=field_names_map_A, copy=False)
        return df

    project_A = map(
        lambda p: query_plan.add_operator(
            Project([
                ProjectExpression(k, v)
                for k, v in field_names_map_A.iteritems()
            ], 'project_A_{}'.format(p), query_plan, False, project_fn_A)),
        range(0, settings.table_A_parts))

    scan_B = \
        map(lambda p:
            query_plan.add_operator(
                SQLTableScan(
                    get_file_key(settings.table_B_key, settings.table_B_sharded, p, settings.sf),
                    "select "
                    "  {} "
                    "from "
                    "  S3Object "
                    "  {} "
                    "  {} "
                        .format(','.join(settings.table_B_field_names),
                                ' where {} '.format(
                                    settings.table_B_filter_sql) if settings.table_B_filter_sql is not None else '',
                                get_sql_suffix(settings.table_B_key, settings.table_B_parts, p,
                                               settings.table_B_sharded,
                                               add_where=settings.table_B_filter_sql is None)), settings.format_,
                    settings.use_pandas,
                    settings.secure,
                    settings.use_native,
                    'scan_B_{}'.format(p),
                    query_plan,
                    False)),
            range(0, settings.table_B_parts))

    field_names_map_B = OrderedDict(
        zip([
            '_{}'.format(i)
            for i, name in enumerate(settings.table_B_field_names)
        ], settings.table_B_field_names))

    def project_fn_B(df):
        df.rename(columns=field_names_map_B, inplace=True)
        return df

    project_B = map(
        lambda p: query_plan.add_operator(
            Project([
                ProjectExpression(k, v)
                for k, v in field_names_map_B.iteritems()
            ], 'project_B_{}'.format(p), query_plan, False, project_fn_B)),
        range(0, settings.table_B_parts))

    if settings.table_C_key is not None:
        scan_C = \
            map(lambda p:
                query_plan.add_operator(
                    SQLTableScan(
                        get_file_key(settings.table_C_key, settings.table_C_sharded, p, settings.sf),
                        "select "
                        "  {} "
                        "from "
                        "  S3Object "
                        "where "
                        "  {} "
                        "  {} "
                            .format(','.join(settings.table_C_field_names),
                                    settings.table_C_filter_sql,
                                    get_sql_suffix(settings.table_C_key, settings.table_C_parts, p,
                                                   settings.table_C_sharded, add_where=False)), settings.format_,
                        settings.use_pandas,
                        settings.secure,
                        settings.use_native,
                        'scan_C_{}'.format(p),
                        query_plan,
                        False)),
                range(0, settings.table_C_parts))

        field_names_map_C = OrderedDict(
            zip([
                '_{}'.format(i)
                for i, name in enumerate(settings.table_C_field_names)
            ], settings.table_C_field_names))

        def project_fn_C(df):
            df = df.rename(columns=field_names_map_C, copy=False)
            return df

        project_C = map(
            lambda p: query_plan.add_operator(
                Project([
                    ProjectExpression(k, v)
                    for k, v in field_names_map_C.iteritems()
                ], 'project_C_{}'.format(p), query_plan, False, project_fn_C)),
            range(0, settings.table_C_parts))

        map_B_to_C = map(
            lambda p: query_plan.add_operator(
                Map(settings.table_B_BC_join_key, 'map_B_to_C_{}'.format(p),
                    query_plan, False)), range(0, settings.table_B_parts))

        map_C_to_C = map(
            lambda p: query_plan.add_operator(
                Map(settings.table_C_BC_join_key, 'map_C_to_C_{}'.format(p),
                    query_plan, False)), range(0, settings.table_C_parts))

        join_build_AB_C = map(
            lambda p: query_plan.add_operator(
                HashJoinBuild(settings.table_B_BC_join_key,
                              'join_build_AB_C_{}'.format(
                                  p), query_plan, False)),
            range(0, settings.table_C_parts))

        join_probe_AB_C = map(
            lambda p: query_plan.add_operator(
                HashJoinProbe(
                    JoinExpression(settings.table_B_BC_join_key, settings.
                                   table_C_BC_join_key), 'join_probe_AB_C_{}'.
                    format(p), query_plan, False)),
            range(0, settings.table_C_parts))

    map_A_to_B = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_A_AB_join_key, 'map_A_to_B_{}'.format(p),
                query_plan, False)), range(0, settings.table_A_parts))

    map_B_to_B = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_B_AB_join_key, 'map_B_to_B_{}'.format(p),
                query_plan, False)), range(0, settings.table_B_parts))

    join_build_A_B = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild(settings.table_A_AB_join_key, 'join_build_A_B_{}'.
                          format(p), query_plan, False)),
        range(0, settings.other_parts))

    join_probe_A_B = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(
                JoinExpression(settings.table_A_AB_join_key, settings.
                               table_B_AB_join_key), 'join_probe_A_B_{}'.
                format(p), query_plan, False)), range(0, settings.other_parts))

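    # With no table C configured, aggregate directly after the A-B join;
    # otherwise add a second shuffle and hash join with C first.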
    if settings.table_C_key is None:

        def part_aggregate_fn(df):
            sum_ = df[settings.table_B_detail_field_name].astype(
                np.float).sum()
            return pd.DataFrame({'_0': [sum_]})

        part_aggregate = map(
            lambda p: query_plan.add_operator(
                Aggregate([
                    AggregateExpression(
                        AggregateExpression.SUM, lambda t: float(t[
                            settings.table_B_detail_field_name]))
                ], settings.use_pandas, 'part_aggregate_{}'.format(p),
                          query_plan, False, part_aggregate_fn)),
            range(0, settings.other_parts))

    else:

        def part_aggregate_fn(df):
            sum_ = df[settings.table_C_detail_field_name].astype(
                np.float).sum()
            return pd.DataFrame({'_0': [sum_]})

        part_aggregate = map(
            lambda p: query_plan.add_operator(
                Aggregate([
                    AggregateExpression(
                        AggregateExpression.SUM, lambda t: float(t[
                            settings.table_C_detail_field_name]))
                ], settings.use_pandas, 'part_aggregate_{}'.format(p),
                          query_plan, False, part_aggregate_fn)),
            range(0, settings.table_C_parts))

    def aggregate_reduce_fn(df):
        sum_ = df['_0'].astype(np.float).sum()
        return pd.DataFrame({'_0': [sum_]})

    aggregate_reduce = query_plan.add_operator(
        Aggregate([
            AggregateExpression(AggregateExpression.SUM,
                                lambda t: float(t['_0']))
        ], settings.use_pandas, 'aggregate_reduce', query_plan, False,
                  aggregate_reduce_fn))

    aggregate_project = query_plan.add_operator(
        Project([ProjectExpression(lambda t: t['_0'], 'total_balance')],
                'aggregate_project', query_plan, False))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    # Inline some of the operators
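    # set_async(False) runs these lightweight operators synchronously (inlined)
    # rather than as separate asynchronous workers.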
    map(lambda o: o.set_async(False), project_A)
    map(lambda o: o.set_async(False), project_B)
    map(lambda o: o.set_async(False), map_A_to_B)
    map(lambda o: o.set_async(False), map_B_to_B)
    if settings.table_C_key is not None:
        map(lambda o: o.set_async(False), map_B_to_C)
        map(lambda o: o.set_async(False), map_C_to_C)
        map(lambda o: o.set_async(False), project_C)
    map(lambda o: o.set_async(False), part_aggregate)
    aggregate_project.set_async(False)

    # Connect the operators
    connect_many_to_many(scan_A, project_A)
    connect_many_to_many(project_A, map_A_to_B)
    connect_all_to_all(map_A_to_B, join_build_A_B)
    connect_many_to_many(join_build_A_B, join_probe_A_B)

    connect_many_to_many(scan_B, project_B)
    connect_many_to_many(project_B, map_B_to_B)
    connect_all_to_all(map_B_to_B, join_probe_A_B)

    if settings.table_C_key is None:
        connect_many_to_many(join_probe_A_B, part_aggregate)
    else:
        connect_many_to_many(join_probe_A_B, map_B_to_C)
        connect_all_to_all(map_B_to_C, join_build_AB_C)
        connect_many_to_many(join_build_AB_C, join_probe_AB_C)
        connect_many_to_many(scan_C, project_C)
        connect_many_to_many(project_C, map_C_to_C)
        connect_all_to_all(map_C_to_C, join_probe_AB_C)
        connect_many_to_many(join_probe_AB_C, part_aggregate)

    connect_many_to_one(part_aggregate, aggregate_reduce)
    connect_one_to_one(aggregate_reduce, aggregate_project)
    connect_one_to_one(aggregate_project, collate)

    return query_plan
Example #29
0
def run(parallel,
        use_pandas,
        buffer_size,
        table_first_part,
        table_parts,
        queried_columns,
        queried_aliases,
        castable_aliases,
        select_str,
        aggregate_column,
        filter_str,
        path,
        format_=Format.PARQUET):
    secure = False
    use_native = False
    print('')
    print("Parquet Aggregate Benchmark")
    print("------------------")

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # SQL scan the file
    scan = map(
        lambda p: query_plan.add_operator(
            SQLTableScan(
                "{}/lineitem.{}.parquet".format(path, p
                                                ), "select {} from S3Object "
                "{};".format(select_str, filter_str), format_, use_pandas,
                secure, use_native, 'scan_{}'.format(p), query_plan, False)),
        range(table_first_part, table_first_part + table_parts))

    # project
    def fn(df):
        df.columns = queried_aliases
        df[castable_aliases] = df[castable_aliases].astype(np.double)
        return df

    project = map(
        lambda p: query_plan.add_operator(
            Project([], 'project_{}'.format(p), query_plan, False, fn)),
        range(table_first_part, table_first_part + table_parts))

    # aggregation
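    # Sum the aggregate column if it survived the projection; otherwise fall back
    # to a row count.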
    def agg_fun(df):
        if aggregate_column in df:
            return pd.DataFrame({'sum': [sum(df[aggregate_column])]})
        else:
            return pd.DataFrame({'count': len(df)}, index=[0])

    aggregate = query_plan.add_operator(
        Aggregate([], True, 'agg', query_plan, False, agg_fun))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

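    # Connect the operators: each scan feeds its matching project, every project
    # feeds the single aggregate, and the aggregate feeds collate.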
    map(lambda (p, o): o.connect(project[p]), enumerate(scan))
    map(lambda (p, o): o.connect(aggregate), enumerate(project))
    aggregate.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("table parts: {}".format(table_parts))
    print('')

    # Write the plan graph
    # query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(table_parts))

    # Start the query
    query_plan.execute()
    print('Done')
    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()
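# A minimal sketch of how run() might be invoked. The key prefix, buffer size and
# the choice of lineitem column are illustrative assumptions, not values taken
# from this repository.
if __name__ == '__main__':
    run(parallel=True,
        use_pandas=True,
        buffer_size=1024,
        table_first_part=0,
        table_parts=4,
        queried_columns=['l_extendedprice'],
        queried_aliases=['l_extendedprice'],
        castable_aliases=['l_extendedprice'],
        select_str='l_extendedprice',
        aggregate_column='l_extendedprice',
        filter_str='',  # no pushed-down filter: scan all rows
        path='parquet/tpch-sf1')  # hypothetical key prefix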
Example #30
0
def test_filter_empty():
    """Executes a filter where no records are returned. We tst this as it's somewhat peculiar with s3 select, in so much
    as s3 does not return column names when selecting data, meaning, unlike a traditional DBMS, no field names tuple
    should be present in the results.

    :return: None
    """

    query_plan = QueryPlan()

    # Query plan
    ts = query_plan.add_operator(
        SQLTableScan('lineitem.csv', 'select * from S3Object limit 0;', False,
                     'ts', query_plan, False))

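    # Filter on column '_10' (l_shipdate in TPC-H lineitem), cast to timestamp.
    # No rows reach it here because the scan is limited to 0 rows.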
    f = query_plan.add_operator(
        Filter(
            PredicateExpression(lambda t_: cast(t_['_10'], timestamp) >= cast(
                '1996-03-01', timestamp)), 'f', query_plan, False))

    c = query_plan.add_operator(Collate('c', query_plan, False))

    ts.connect(f)
    f.connect(c)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"),
                           gen_test_id())

    # Start the query
    query_plan.execute()

    # Assert the results
    # num_rows = 0
    # for t in c.tuples():
    #     num_rows += 1
    #     print("{}:{}".format(num_rows, t))

    assert len(c.tuples()) == 0

    # Write the metrics
    query_plan.print_metrics()