Example #1
def test_aggregate_count():
    """Tests a group by query with a count aggregate

    :return: None
    """

    query_plan = QueryPlan()

    # Query plan
    # select count(*) from supplier.csv
    ts = query_plan.add_operator(
        SQLTableScan('supplier.csv', 'select * from S3Object;', False, 'ts',
                     query_plan, False))

    a = query_plan.add_operator(
        Aggregate(
            [
                AggregateExpression(AggregateExpression.COUNT,
                                    lambda t_: t_['_0'])
                # count(s_suppkey)
            ],
            'a',
            query_plan,
            False))

    c = query_plan.add_operator(Collate('c', query_plan, False))

    ts.connect(a)
    a.connect(c)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"),
                           gen_test_id())

    # Start the query
    query_plan.execute()

    # Assert the results
    # num_rows = 0
    # for t in c.tuples():
    #     num_rows += 1
    #     print("{}:{}".format(num_rows, t))

    c.print_tuples()

    field_names = ['_0']

    tuples = c.tuples()

    assert tuples[0] == field_names
    assert IndexedTuple.build(tuples[1], field_names)['_0'] == 10000
    assert len(tuples) == 1 + 1

    # Write the metrics
    query_plan.print_metrics()
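
A note on the assertion style above: IndexedTuple.build pairs a positional result row with the generated field names so values can be read by name. The snippet below is a minimal stand-in for that behaviour, based only on how it is used in these tests (not on the library's source):

# Hypothetical stand-in for IndexedTuple.build(row, field_names)['_0']
field_names = ['_0']
row = [10000]
indexed = dict(zip(field_names, row))  # name -> value lookup, like IndexedTuple
assert indexed['_0'] == 10000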
Example #2
def aggregate_def(name, query_plan, use_pandas):
    def fn(df):
        sum1_ = (df['l_extendedprice'].astype(np.float) *
                 (1 - df['l_discount'].astype(np.float))).sum()
        return pd.DataFrame({'_0': [sum1_]})

    return Aggregate([
        AggregateExpression(
            AggregateExpression.SUM,
            lambda t_: float(t_['l_extendedprice']) * float(
                (1 - float(t_['l_discount']))))
    ], use_pandas, name, query_plan, False, fn)
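
The aggregate above supplies both a per-tuple expression and a DataFrame function. Judging by the call pattern in these examples, the per-tuple lambda is used when use_pandas is False and fn when it is True (an assumption, not verified against the Aggregate source). Either path computes the same discounted revenue, as this self-contained check illustrates:

import pandas as pd

rows = [{'l_extendedprice': '100.0', 'l_discount': '0.1'},
        {'l_extendedprice': '50.0', 'l_discount': '0.0'}]
df = pd.DataFrame(rows)

# per-tuple path (mirrors the lambda above)
tuple_sum = sum(float(r['l_extendedprice']) * (1 - float(r['l_discount'])) for r in rows)

# pandas path (mirrors fn above)
pandas_sum = (df['l_extendedprice'].astype(float) * (1 - df['l_discount'].astype(float))).sum()

assert abs(tuple_sum - pandas_sum) < 1e-9  # both are 140.0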
Example #3
def test_aggregate_empty():
    """Executes an aggregate query with no results returned. We tst this as it's somewhat peculiar with s3 select,
    in so much as s3 does not return column names when selecting data, though being an aggregate query we can generate
    the tuple field names based on the expressions supplied.

    TODO: Unsure whether the aggregate operator should return field names. It makes sense in one way, but is different
    to how all the other operators behave.

    :return: None
    """

    query_plan = QueryPlan()

    # Query plan
    # select sum(float(s_acctbal)) from supplier.csv limit 0
    ts = query_plan.add_operator(
        SQLTableScan('supplier.csv', 'select * from S3Object limit 0;', False,
                     'ts', query_plan, False))

    a = query_plan.add_operator(
        Aggregate([
            AggregateExpression(AggregateExpression.SUM,
                                lambda t_: float(t_['_5']))
        ], 'a', query_plan, False))

    c = query_plan.add_operator(Collate('c', query_plan, False))

    ts.connect(a)
    a.connect(c)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"),
                           gen_test_id())

    # Start the query
    query_plan.execute()

    # Assert the results
    # num_rows = 0
    # for t in c.tuples():
    #     num_rows += 1
    #     print("{}:{}".format(num_rows, t))

    field_names = ['_0']

    assert c.tuples()[0] == field_names
    assert len(c.tuples()) == 0 + 1

    # Write the metrics
    query_plan.print_metrics()
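
The field names asserted above follow the positional convention '_0', '_1', ..., one per aggregate expression. A minimal sketch of that generation, assuming the convention is purely positional (which is all these tests rely on):

# stand-ins for the AggregateExpression list passed to Aggregate
aggregate_expressions = ['sum(s_acctbal)']

field_names = ['_{}'.format(i) for i, _ in enumerate(aggregate_expressions)]
assert field_names == ['_0']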
Example #4
def test_aggregate_sum():
    """Tests a group by query with a sum aggregate

    :return: None
    """

    query_plan = QueryPlan()

    # Query plan
    # select sum(float(s_acctbal)) from supplier.csv
    ts = query_plan.add_operator(
        SQLTableScan('supplier.csv', 'select * from S3Object;', False, 'ts',
                     query_plan, False))

    a = query_plan.add_operator(
        Aggregate([
            AggregateExpression(AggregateExpression.SUM,
                                lambda t_: float(t_['_5']))
        ], 'a', query_plan, False))

    c = query_plan.add_operator(Collate('c', query_plan, False))

    ts.connect(a)
    a.connect(c)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"),
                           gen_test_id())

    # Start the query
    query_plan.execute()

    # Assert the results
    # num_rows = 0
    # for t in c.tuples():
    #     num_rows += 1
    #     print("{}:{}".format(num_rows, t))

    field_names = ['_0']

    assert c.tuples()[0] == field_names
    assert round(IndexedTuple.build(c.tuples()[1], field_names)['_0'],
                 2) == 45103548.65
    assert len(c.tuples()) == 1 + 1

    # Write the metrics
    query_plan.print_metrics()
Example #5
def aggregate_sum_extendedprice_op(use_pandas, name, query_plan):
    """with extendedprice_sum_aggregate as (
        select sum(l_extendedprice) from filter_join_2
    )

    :param use_pandas:
    :param query_plan:
    :param name:
    :return:
    """
    def fn(df):
        sum1_ = df['l_extendedprice'].astype(np.float).sum()
        return pd.DataFrame({'_0': [sum1_]})

    return Aggregate([
        AggregateExpression(AggregateExpression.SUM,
                            lambda t_: float(t_['l_extendedprice']))
    ], use_pandas, name, query_plan, False, fn)
Example #6
def aggregate_promo_revenue_operator_def(name, query_plan):
    # type: (str, QueryPlan) -> Aggregate
    def ex1(t_):

        v1 = float(t_['l_extendedprice']) * (1.0 - float(t_['l_discount']))

        rx = re.compile('^PROMO.*$')

        if rx.search(t_['p_type']):  # p_type
            v2 = v1
        else:
            v2 = 0.0

        return v2

    def ex2(t_):

        v1 = float(t_['l_extendedprice']) * (1.0 - float(t_['l_discount']))

        return v1

    def agg_fn(df):

        mask = df['p_type'].str.startswith('PROMO')

        df1 = df[mask]['l_extendedprice'].astype(
            np.float) * (1.0 - df[mask]['l_discount'].astype(np.float))
        df2 = df['l_extendedprice'].astype(
            np.float) * (1.0 - df['l_discount'].astype(np.float))

        sum1 = df1.sum()
        sum2 = df2.sum()

        df5 = pd.DataFrame({'_0': [sum1], '_1': [sum2]})

        return df5

    return Aggregate([
        AggregateExpression(AggregateExpression.SUM, ex1),
        AggregateExpression(AggregateExpression.SUM, ex2)
    ], True, name, query_plan, False, agg_fn)
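
The two sums produced above ('_0' promo revenue, '_1' total revenue) correspond to the numerator and denominator of TPC-H Q14's promo_revenue. Below is a hedged sketch of a downstream projection that would combine them; the actual project operator for this query is not shown here and promo_revenue_project_fn is a hypothetical name:

import pandas as pd

def promo_revenue_project_fn(df):
    # 100 * promo revenue / total revenue, per TPC-H Q14
    return pd.DataFrame({'promo_revenue': 100.0 * df['_0'] / df['_1']})

# example: 30.0 promo revenue out of 120.0 total revenue -> 25.0
print(promo_revenue_project_fn(pd.DataFrame({'_0': [30.0], '_1': [120.0]})))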
Example #7
def run(parallel, use_pandas, buffer_size, lineitem_parts, part_parts):
    """The baseline tst uses nested loop joins with no projection and no filtering pushed down to s3.

    This works by:

    1. Scanning part and filtering on brand and container
    2. It then scans lineitem
    3. It then joins the two tables (essentially filtering out lineitems that dont include parts that we filtered out in
       step 1)
    4. It then computes the average of l_quantity from the joined table and groups the results by partkey
    5. It then joins these computed averages with the joined table in step 3
    6. It then filters out any rows where l_quantity is less than the computed average

    TODO: There are few ways this can be done, the above is just one.

    :return: None
    """

    print('')
    print("TPCH Q17 Baseline Join")
    print("----------------------")

    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Query plan
    part_scan = map(
        lambda p: query_plan.add_operator(
            tpch_q17.sql_scan_part_select_all_where_brand_and_container_op(
                part_parts != 1, p, part_parts, use_pandas, 'part_scan' + '_' +
                str(p), query_plan)), range(0, part_parts))

    lineitem_scan = map(
        lambda p: query_plan.add_operator(
            tpch_q17.sql_scan_lineitem_select_all_where_partkey_op(
                lineitem_parts != 1, p, lineitem_parts, use_pandas,
                'lineitem_scan' + '_' + str(p), query_plan)),
        range(0, lineitem_parts))

    part_project = map(
        lambda p: query_plan.add_operator(
            tpch_q17.project_partkey_brand_container_op(
                'part_project' + '_' + str(p), query_plan)),
        range(0, part_parts))

    part_filter = map(
        lambda p: query_plan.add_operator(
            tpch_q17.filter_brand_container_op('part_filter' + '_' + str(p),
                                               query_plan)),
        range(0, part_parts))

    part_map = map(
        lambda p: query_plan.add_operator(
            Map('p_partkey', 'part_map' + '_' + str(p), query_plan, True)),
        range(0, part_parts))

    lineitem_map = map(
        lambda p: query_plan.add_operator(
            Map('l_partkey', 'lineitem_map' + '_' + str(p), query_plan, True)),
        range(0, lineitem_parts))

    lineitem_project = map(
        lambda p: query_plan.add_operator(
            tpch_q17.
            project_lineitem_orderkey_partkey_quantity_extendedprice_op(
                'lineitem_project' + '_' + str(p), query_plan)),
        range(0, lineitem_parts))

    # part_lineitem_join = map(lambda p:
    #                          query_plan.add_operator(
    #                              tpch_q17.join_p_partkey_l_partkey_op('part_lineitem_join', query_plan)),
    #                          range(0, lineitem_parts))

    part_lineitem_join_build = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild('p_partkey', 'part_lineitem_join_build' + '_' + str(
                p), query_plan, False)), range(0, part_parts))

    part_lineitem_join_probe = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'),
                          'part_lineitem_join_probe' + '_' + str(
                              p), query_plan, True)), range(0, part_parts))

    lineitem_part_avg_group = map(
        lambda p: query_plan.add_operator(
            tpch_q17.group_partkey_avg_quantity_5_op(
                'lineitem_part_avg_group' + '_' + str(p), query_plan)),
        range(0, part_parts))

    lineitem_part_avg_group_project = map(
        lambda p: query_plan.add_operator(
            tpch_q17.project_partkey_avg_quantity_op(
                'lineitem_part_avg_group_project' + '_' + str(p), query_plan)),
        range(0, part_parts))

    # part_lineitem_join_avg_group_join = map(lambda p:
    #                                         query_plan.add_operator(
    #                                             tpch_q17.join_l_partkey_p_partkey_op(
    #                                                 'part_lineitem_join_avg_group_join', query_plan)),
    #                                         range(0, lineitem_parts))

    part_lineitem_join_avg_group_join_build = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild(
                'l_partkey', 'part_lineitem_join_avg_group_join_build' + '_' +
                str(p), query_plan, False)), range(0, part_parts))

    part_lineitem_join_avg_group_join_probe = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(
                JoinExpression('l_partkey', 'p_partkey'),
                'part_lineitem_join_avg_group_join_probe' + '_' + str(
                    p), query_plan, True)), range(0, part_parts))

    lineitem_filter = map(
        lambda p: query_plan.add_operator(
            tpch_q17.filter_lineitem_quantity_op(
                'lineitem_filter' + '_' + str(p), query_plan)),
        range(0, part_parts))

    extendedprice_sum_aggregate = map(
        lambda p: query_plan.add_operator(
            tpch_q17.aggregate_sum_extendedprice_op(
                'extendedprice_sum_aggregate' + '_' + str(p), query_plan)),
        range(0, part_parts))

    aggregate_reduce = query_plan.add_operator(
        Aggregate([
            AggregateExpression(AggregateExpression.SUM,
                                lambda t: float(t['_0']))
        ], 'aggregate_reduce', query_plan, False))

    extendedprice_sum_aggregate_project = query_plan.add_operator(
        tpch_q17.project_avg_yearly_op('extendedprice_sum_aggregate_project',
                                       query_plan))

    collate = query_plan.add_operator(
        tpch_q17.collate_op('collate', query_plan))

    # Connect the operators
    # part_scan.connect(part_project)
    map(lambda (p, o): o.connect(part_project[p]), enumerate(part_scan))
    map(lambda (p, o): o.connect(part_filter[p]), enumerate(part_project))
    map(lambda (p, o): o.connect(part_map[p]), enumerate(part_filter))

    # lineitem_scan.connect(lineitem_project)
    map(lambda (p, o): o.connect(lineitem_project[p]),
        enumerate(lineitem_scan))
    map(lambda (p, o): o.connect(lineitem_map[p]), enumerate(lineitem_project))

    # part_lineitem_join.connect_left_producer(part_project)
    # part_lineitem_join.connect_right_producer(lineitem_project)
    # part_lineitem_join.connect(lineitem_part_avg_group)
    map(
        lambda (p1, o1): map(lambda (p2, o2): o1.connect(o2),
                             enumerate(part_lineitem_join_build)),
        enumerate(part_map))
    map(lambda (p, o): part_lineitem_join_probe[p].connect_build_producer(o),
        enumerate(part_lineitem_join_build))
    map(
        lambda (p1, o1): map(lambda (p2, o2): o2.connect_tuple_producer(o1),
                             enumerate(part_lineitem_join_probe)),
        enumerate(lineitem_map))
    map(lambda (p, o): o.connect(lineitem_part_avg_group[p]),
        enumerate(part_lineitem_join_probe))

    # map(lambda (p, o): o.map(Mapper('_1', 1, part_lineitem_join_probe)), enumerate(lineitem_scan))

    # lineitem_part_avg_group.connect(lineitem_part_avg_group_project)
    map(lambda (p, o): o.connect(lineitem_part_avg_group_project[p]),
        enumerate(lineitem_part_avg_group))

    # part_lineitem_join_avg_group_join.connect_left_producer(lineitem_part_avg_group_project)
    # part_lineitem_join_avg_group_join.connect_right_producer(part_lineitem_join)
    # part_lineitem_join_avg_group_join.connect(lineitem_filter)

    map(lambda (p, o): o.connect(part_lineitem_join_avg_group_join_build[p]),
        enumerate(lineitem_part_avg_group_project))

    # map(lambda (p, o): map(lambda (bp, bo): o.connect_build_producer(bo),
    #                        enumerate(part_lineitem_join_avg_group_join_build)),
    #     enumerate(part_lineitem_join_avg_group_join_probe))
    map(
        lambda (p, o): part_lineitem_join_avg_group_join_probe[p].
        connect_build_producer(o),
        enumerate(part_lineitem_join_avg_group_join_build))
    map(
        lambda (p, o): part_lineitem_join_avg_group_join_probe[p % part_parts].
        connect_tuple_producer(o), enumerate(part_lineitem_join_probe))
    map(lambda (p, o): o.connect(lineitem_filter[p]),
        enumerate(part_lineitem_join_avg_group_join_probe))

    # lineitem_filter.connect(extendedprice_sum_aggregate)
    map(lambda (p, o): o.connect(extendedprice_sum_aggregate[p]),
        enumerate(lineitem_filter))
    # extendedprice_sum_aggregate.connect(extendedprice_sum_aggregate_project)
    # extendedprice_sum_aggregate_project.connect(collate)
    map(lambda (p, o): o.connect(aggregate_reduce),
        enumerate(extendedprice_sum_aggregate))
    aggregate_reduce.connect(extendedprice_sum_aggregate_project)
    extendedprice_sum_aggregate_project.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("lineitem parts: {}".format(lineitem_parts))
    print("part_parts: {}".format(part_parts))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"),
                           gen_test_id())

    # Start the query
    query_plan.execute()

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    field_names = ['avg_yearly']

    assert len(tuples) == 1 + 1

    assert tuples[0] == field_names

    # NOTE: This result has been verified with the equivalent data and query on PostgreSQL
    assert round(float(tuples[1][0]), 10) == 4632.1085714286
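
As referenced in the docstring, here is a compact, self-contained pandas sketch of the six steps, using toy data rather than the repository's operators. It mirrors TPC-H Q17: sum l_extendedprice over rows whose l_quantity falls below 0.2 times the per-part average, then divide by 7 to obtain avg_yearly.

import pandas as pd

part = pd.DataFrame({'p_partkey': [1, 2],
                     'p_brand': ['Brand#41', 'Brand#12'],
                     'p_container': ['SM PACK', 'MED BOX']})
lineitem = pd.DataFrame({'l_partkey': [1, 1, 2],
                         'l_quantity': [1.0, 10.0, 5.0],
                         'l_extendedprice': [100.0, 1000.0, 500.0]})

# Steps 1-3: filter part on brand/container, then join with lineitem
filtered_part = part[(part['p_brand'] == 'Brand#41') & (part['p_container'] == 'SM PACK')]
joined = lineitem.merge(filtered_part, left_on='l_partkey', right_on='p_partkey')

# Step 4: per-part average quantity, scaled by 0.2 as in Q17
threshold = joined.groupby('l_partkey')['l_quantity'].mean().mul(0.2).rename('qty_threshold')

# Steps 5-6: join the thresholds back and keep only rows below them
joined = joined.join(threshold, on='l_partkey')
avg_yearly = joined[joined['l_quantity'] < joined['qty_threshold']]['l_extendedprice'].sum() / 7.0
print(avg_yearly)  # 100.0 / 7 for this toy data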
Example #8
def run(parallel,
        use_pandas,
        buffer_size,
        table_first_part,
        table_parts,
        queried_columns,
        select_columns,
        lower,
        upper,
        path,
        format_=Format.CSV):
    secure = False
    use_native = False
    print('')
    print("CSV Filter+Project Benchmark")
    print("------------------")

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # SQL scan the file
    scan = map(
        lambda p: query_plan.add_operator(
            SQLTableScan(
                "{}/lineitem.tbl.{}".format(path, p
                                            ), "select {} from S3Object "
                "where cast(l_orderkey as int) >= {} and cast(l_orderkey as int) <= {};"
                .format(select_columns, lower, upper), format_, use_pandas,
                secure, use_native, 'scan_{}'.format(p), query_plan, False)),
        range(table_first_part, table_first_part + table_parts))

    # project
    def fn(df):
        df.columns = queried_columns
        df[['l_orderkey']] = df[['l_orderkey']].astype(np.int64)
        return df

    project = map(
        lambda p: query_plan.add_operator(
            Project([], 'project_{}'.format(p), query_plan, False, fn)),
        range(table_first_part, table_first_part + table_parts))

    # aggregation
    def agg_fun(df):
        return pd.DataFrame({'count': [len(df)]})

    aggregate = query_plan.add_operator(
        Aggregate([], True, 'agg', query_plan, False, agg_fun))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    map(lambda (p, o): o.connect(project[p]), enumerate(scan))
    map(lambda (p, o): o.connect(aggregate), enumerate(project))
    aggregate.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("table parts: {}".format(table_parts))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"),
                           gen_test_id() + "-" + str(table_parts))

    # Start the query
    query_plan.execute()
    print('Done')
    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()
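
For reference, the format string used in the scan above expands to an S3 Select statement of the following shape (the column list and bounds below are illustrative values, not ones taken from the benchmark):

select_columns, lower, upper = 'l_orderkey,l_extendedprice', 1, 6000000
sql = ("select {} from S3Object "
       "where cast(l_orderkey as int) >= {} and cast(l_orderkey as int) <= {};"
       .format(select_columns, lower, upper))
print(sql)
# select l_orderkey,l_extendedprice from S3Object where cast(l_orderkey as int) >= 1 and cast(l_orderkey as int) <= 6000000;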
Example #9
def run(parallel, use_pandas, secure, use_native, buffer_size, lineitem_parts,
        part_parts, lineitem_sharded, part_sharded, other_parts, sf, fp_rate,
        expected_result, format_):
    """

    :return: None
    """

    print('')
    print("TPCH Q19 Bloom Join")
    print("-------------------")

    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Define the operators
    lineitem_bloom_use = \
        map(lambda p:
            query_plan.add_operator(
                tpch_q19.bloom_scan_partkey_quantity_extendedprice_discount_shipinstruct_shipmode_where_filtered_op(
                    lineitem_sharded,
                    p,
                    lineitem_parts,
                    use_pandas,
                    secure,
                    use_native,
                    'lineitem_bloom_use' + '_' + str(p), query_plan, sf, format_)),
            range(0, lineitem_parts))

    part_bloom_create = query_plan.add_operator(
        tpch_q19.bloom_create_partkey_op('part_bloom_create', query_plan,
                                         fp_rate))

    part_scan = map(
        lambda p: query_plan.add_operator(
            tpch_q19.
            sql_scan_part_partkey_brand_size_container_where_filtered_op(
                part_sharded, p, part_parts, use_pandas, secure, use_native,
                'part_scan' + '_' + str(p), query_plan, sf, format_)),
        range(0, part_parts))

    lineitem_project = \
        map(lambda p:
            query_plan.add_operator(
                tpch_q19.project_partkey_quantity_extendedprice_discount_shipinstruct_shipmode_filtered_op(
                    'lineitem_project' + '_' + str(p),
                    query_plan)),
            range(0, lineitem_parts))

    lineitem_map = map(
        lambda p: query_plan.add_operator(
            Map('l_partkey', 'lineitem_map' + '_' + str(p), query_plan, False)
        ), range(0, lineitem_parts))

    part_project = map(
        lambda p: query_plan.add_operator(
            tpch_q19.project_partkey_brand_size_container_filtered_op(
                'part_project' + '_' + str(p), query_plan)),
        range(0, part_parts))

    part_map = map(
        lambda p: query_plan.add_operator(
            Map('p_partkey', 'part_map' + '_' + str(p), query_plan, False)),
        range(0, part_parts))

    # part_bloom_create_map = map(lambda p:
    #                query_plan.add_operator(
    #                    Map('p_partkey', 'part_bloom_create_map' + '_' + str(p), query_plan, False)),
    #                range(0, part_parts))

    # lineitem_part_join = map(lambda p:
    #                          query_plan.add_operator(tpch_q19.join_op(query_plan)),
    #                          range(0, part_parts))

    lineitem_part_join_build = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild('p_partkey', 'lineitem_part_join_build' + '_' + str(
                p), query_plan, False)), range(0, other_parts))

    lineitem_part_join_probe = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'),
                          'lineitem_part_join_probe' + '_' + str(
                              p), query_plan, False)), range(0, other_parts))

    filter_op = map(
        lambda p: query_plan.add_operator(
            tpch_q19.filter_def('filter_op' + '_' + str(p), query_plan)),
        range(0, other_parts))
    aggregate = map(
        lambda p: query_plan.add_operator(
            tpch_q19.aggregate_def('aggregate' + '_' + str(p), query_plan,
                                   use_pandas)), range(0, other_parts))

    def aggregate_reduce_fn(df):
        sum1_ = df['_0'].astype(np.float).sum()
        return pd.DataFrame({'_0': [sum1_]})

    aggregate_reduce = query_plan.add_operator(
        Aggregate([
            AggregateExpression(AggregateExpression.SUM,
                                lambda t: float(t['_0']))
        ], use_pandas, 'aggregate_reduce', query_plan, False,
                  aggregate_reduce_fn))

    aggregate_project = query_plan.add_operator(
        tpch_q19.aggregate_project_def('aggregate_project', query_plan))
    collate = query_plan.add_operator(
        tpch_q19.collate_op('collate', query_plan))

    map(lambda o: o.set_async(False), lineitem_project)
    map(lambda o: o.set_async(False), part_project)
    map(lambda o: o.set_async(False), lineitem_map)
    map(lambda o: o.set_async(False), part_map)
    map(lambda o: o.set_async(False), filter_op)
    map(lambda o: o.set_async(False), aggregate)
    aggregate_project.set_async(False)

    # Connect the operators
    # part_scan.connect(part_project)
    connect_many_to_many(part_scan, part_project)
    connect_many_to_many(part_project, part_map)

    # part_project.connect(part_bloom_create)
    connect_many_to_one(part_project, part_bloom_create)

    # part_bloom_create.connect(lineitem_bloom_use)
    connect_one_to_many(part_bloom_create, lineitem_bloom_use)

    # lineitem_bloom_use.connect(lineitem_project)
    connect_many_to_many(lineitem_bloom_use, lineitem_project)
    connect_many_to_many(lineitem_project, lineitem_map)

    # lineitem_part_join.connect_left_producer(lineitem_project)
    connect_all_to_all(part_map, lineitem_part_join_build)

    # lineitem_part_join.connect_right_producer(part_project)
    connect_many_to_many(lineitem_part_join_build, lineitem_part_join_probe)
    connect_all_to_all(lineitem_map, lineitem_part_join_probe)

    # lineitem_part_join.connect(filter_op)
    connect_many_to_many(lineitem_part_join_probe, filter_op)

    # filter_op.connect(aggregate)
    connect_many_to_many(filter_op, aggregate)

    # aggregate.connect(aggregate_project)
    connect_many_to_one(aggregate, aggregate_reduce)
    connect_one_to_one(aggregate_reduce, aggregate_project)

    # aggregate_project.connect(collate)
    connect_one_to_one(aggregate_project, collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print('secure: {}'.format(secure))
    print('use_native: {}'.format(use_native))
    print("lineitem parts: {}".format(lineitem_parts))
    print("part_parts: {}".format(part_parts))
    print("lineitem_sharded: {}".format(lineitem_sharded))
    print("part_sharded: {}".format(part_sharded))
    print("other_parts: {}".format(other_parts))
    print("fp_rate: {}".format(fp_rate))
    print("format: {}".format(format_))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"),
                           gen_test_id())

    # Start the query
    query_plan.execute()

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    field_names = ['revenue']

    assert len(tuples) == 1 + 1

    assert tuples[0] == field_names

    # NOTE: This result has been verified with the equivalent data and query on PostgreSQL
    numpy.testing.assert_approx_equal(float(tuples[1][0]), expected_result)
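
The connect_* helpers used above replace the explicit map(...) wiring seen in the other examples. A rough sketch of their semantics, inferred from that wiring and assuming plain connect() calls (the real helpers may special-case join build/probe operators):

def connect_many_to_many(producers, consumers):
    # pairwise: producer i feeds consumer i
    for i, producer in enumerate(producers):
        producer.connect(consumers[i])

def connect_all_to_all(producers, consumers):
    # full fan-out: every producer feeds every consumer
    for producer in producers:
        for consumer in consumers:
            producer.connect(consumer)

def connect_many_to_one(producers, consumer):
    for producer in producers:
        producer.connect(consumer)

def connect_one_to_many(producer, consumers):
    for consumer in consumers:
        producer.connect(consumer)

def connect_one_to_one(producer, consumer):
    producer.connect(consumer)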
Example #10
def query_plan(settings):
    # type: (SyntheticBaselineJoinSettings) -> QueryPlan
    """

    :type settings:
    :return: None
    """

    query_plan = QueryPlan(is_async=settings.parallel,
                           buffer_size=settings.buffer_size)

    def scan_A_fun(df):
        df.columns = settings.table_A_field_names
        criterion = settings.table_A_filter_fn(df)
        df = df[criterion]
        return df

    # Define the operators
    scan_A = \
        map(lambda p:
            query_plan.add_operator(
                SQLTableScan(get_file_key(settings.table_A_key, settings.table_A_sharded, p),
                             "select "
                             "  * "
                             "from "
                             "  S3Object "
                             "{}"
                             .format(
                                 get_sql_suffix(settings.table_A_key, settings.table_A_parts, p,
                                                settings.table_A_sharded, add_where=True)), settings.format_,
                             settings.use_pandas,
                             settings.secure,
                             settings.use_native,
                             'scan_A_{}'.format(p),
                             query_plan,
                             False, fn=scan_A_fun)),
            range(0, settings.table_A_parts))

    def scan_B_fun(df):
        df.columns = settings.table_B_field_names
        criterion = settings.table_B_filter_fn(df)
        return df[criterion]

    scan_B = \
        map(lambda p:
            query_plan.add_operator(
                SQLTableScan(get_file_key(settings.table_B_key, settings.table_B_sharded, p),
                             "select "
                             "  * "
                             "from "
                             "  S3Object "
                             "{}"
                             .format(
                                 get_sql_suffix(settings.table_B_key, settings.table_B_parts, p,
                                                settings.table_B_sharded, add_where=True)), settings.format_,
                             settings.use_pandas,
                             settings.secure,
                             settings.use_native,
                             'scan_B_{}'.format(p),
                             query_plan,
                             False, fn=scan_B_fun)),
            range(0, settings.table_B_parts))
    """
    field_names_map_B = OrderedDict(
        zip(['_{}'.format(i) for i, name in enumerate(settings.table_B_field_names)], settings.table_B_field_names))

    def project_fn_B(df):
        df.rename(columns=field_names_map_B, inplace=True)
        return df

    project_B = map(lambda p:
                    query_plan.add_operator(Project(
                        [ProjectExpression(k, v) for k, v in field_names_map_B.iteritems()],
                        'project_B_{}'.format(p),
                        query_plan,
                        True,
                        project_fn_B)),
                    range(0, settings.table_B_parts))

    filter_b = map(lambda p:
                   query_plan.add_operator(Filter(
                       PredicateExpression(None, pd_expr=settings.table_B_filter_fn), 'filter_b' + '_{}'.format(p), query_plan,
                       False)),
                   range(0, settings.table_B_parts))
    """

    def scan_C_fun(df):
        df.columns = settings.table_C_field_names
        criterion = settings.table_C_filter_fn(df)
        return df[criterion]

    scan_C = \
        map(lambda p:
            query_plan.add_operator(
                SQLTableScan(get_file_key(settings.table_C_key, settings.table_C_sharded, p),
                             "select "
                             "  * "
                             "from "
                             "  S3Object "
                             "{}"
                             .format(
                                 get_sql_suffix(settings.table_C_key, settings.table_C_parts, p,
                                                settings.table_C_sharded, add_where=True)), settings.format_,
                             settings.use_pandas,
                             settings.secure,
                             settings.use_native,
                             'scan_C_{}'.format(p),
                             query_plan,
                             False, fn=scan_C_fun)),
            range(0, settings.table_C_parts))
    """
    field_names_map_C = OrderedDict(
        zip(['_{}'.format(i) for i, name in enumerate(settings.table_C_field_names)], settings.table_C_field_names))

    def project_fn_C(df):
        df.rename(columns=field_names_map_C, inplace=True)
        return df

    project_C = map(lambda p:
                    query_plan.add_operator(Project(
                        [ProjectExpression(k, v) for k, v in field_names_map_C.iteritems()],
                        'project_C_{}'.format(p),
                        query_plan,
                        True,
                        project_fn_C)),
                    range(0, settings.table_C_parts))

    filter_c = map(lambda p:
                   query_plan.add_operator(Filter(
                       PredicateExpression(None, pd_expr=settings.table_C_filter_fn), 'filter_c' + '_{}'.format(p), query_plan,
                       False)),
                   range(0, settings.table_C_parts))
    """

    map_A_to_B = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_A_AB_join_key, 'map_A_to_B_{}'.format(p),
                query_plan, False)), range(0, settings.table_A_parts))

    map_B_to_B = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_B_AB_join_key, 'map_B_to_B_{}'.format(p),
                query_plan, False)), range(0, settings.table_B_parts))

    map_B_to_C = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_B_BC_join_key, 'map_B_to_C_{}'.format(p),
                query_plan, False)), range(0, settings.table_B_parts))

    map_C_to_C = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_C_BC_join_key, 'map_C_to_C_{}'.format(p),
                query_plan, False)), range(0, settings.table_C_parts))

    join_build_A_B = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild(settings.table_A_AB_join_key, 'join_build_A_B_{}'.
                          format(p), query_plan, False)),
        range(0, settings.table_B_parts))

    join_probe_A_B = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(
                JoinExpression(settings.table_A_AB_join_key, settings.
                               table_B_AB_join_key), 'join_probe_A_B_{}'
                .format(p), query_plan, False)),
        range(0, settings.table_B_parts))

    join_build_AB_C = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild(settings.table_B_BC_join_key, 'join_build_AB_C_{}'.
                          format(p), query_plan, False)),
        range(0, settings.table_C_parts))

    join_probe_AB_C = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(
                JoinExpression(settings.table_B_BC_join_key, settings.
                               table_C_BC_join_key), 'join_probe_AB_C_{}'
                .format(p), query_plan, False)),
        range(0, settings.table_C_parts))

    def agg_fun(df):
        return pd.DataFrame({
            'sum':
            [df[settings.table_C_detail_field_name].astype(float).sum()]
        })
        #return pd.DataFrame( { 'sum' : [ len(df) ] } )

    part_aggregate = map(
        lambda p: query_plan.add_operator(
            Aggregate([
                AggregateExpression(
                    AggregateExpression.SUM, lambda t: float(t[
                        settings.table_C_detail_field_name]))
            ], settings.use_pandas, 'part_aggregate_{}'.format(p), query_plan,
                      False, agg_fun)), range(0, settings.table_C_parts))

    def agg_reduce_fun(df):
        return pd.DataFrame({'sum': [df['sum'].sum()]})

    aggregate_reduce = query_plan.add_operator(
        Aggregate([
            AggregateExpression(AggregateExpression.SUM,
                                lambda t: float(t['_0']))
        ], settings.use_pandas, 'aggregate_reduce', query_plan, False,
                  agg_reduce_fun))
    """
    aggregate_project = query_plan.add_operator(Project(
        [
            ProjectExpression(lambda t: t['_0'], 'total_balance')
        ],
        'aggregate_project', query_plan,
        False))
    """

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    # Connect the operators
    connect_many_to_many(scan_A, map_A_to_B)
    #connect_many_to_many(project_A, filter_A)
    #connect_many_to_many(filter_A, map_A_to_B)
    connect_all_to_all(map_A_to_B, join_build_A_B)
    connect_many_to_many(join_build_A_B, join_probe_A_B)

    connect_many_to_many(scan_B, map_B_to_B)
    #connect_many_to_many(project_B, filter_b)
    #connect_many_to_many(filter_b, map_B_to_B)
    connect_all_to_all(map_B_to_B, join_probe_A_B)

    #connect_many_to_many(join_probe_A_B, part_aggregate)
    #connect_many_to_one(part_aggregate, aggregate_reduce)
    #connect_one_to_one(aggregate_reduce, collate)

    connect_many_to_many(join_build_AB_C, join_probe_AB_C)

    connect_many_to_many(join_probe_A_B, map_B_to_C)
    connect_all_to_all(map_B_to_C, join_build_AB_C)

    connect_many_to_many(scan_C, map_C_to_C)
    #connect_many_to_many(project_C, filter_c)
    #connect_many_to_many(filter_c, map_C_to_C)
    connect_all_to_all(map_C_to_C, join_probe_AB_C)

    connect_many_to_many(join_probe_AB_C, part_aggregate)

    connect_many_to_one(part_aggregate, aggregate_reduce)
    connect_one_to_one(aggregate_reduce, collate)
    #connect_one_to_one(aggregate_project, collate)

    return query_plan
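
Summarising the wiring above, the plan forms a two-stage shuffle join followed by a partial and then final aggregation. The annotation below is derived from the connect_* calls and is not code from the repository:

# scan_A -> map_A_to_B  --(all-to-all on A's AB join key)--> join_build_A_B -> join_probe_A_B
# scan_B -> map_B_to_B  --(all-to-all)----------------------------------------^
# join_probe_A_B -> map_B_to_C  --(all-to-all on B's BC join key)--> join_build_AB_C -> join_probe_AB_C
# scan_C -> map_C_to_C  --(all-to-all)------------------------------------------------------^
# join_probe_AB_C -> part_aggregate -> aggregate_reduce -> collate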
Example #11
def run(parallel, use_pandas, secure, use_native, buffer_size, lineitem_parts,
        part_parts, lineitem_sharded, part_sharded, other_parts, sf, fp_rate,
        expected_result, format_):
    """
    :return: None
    """

    print('')
    print("TPCH Q17 Bloom Join")
    print("-------------------")

    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Query plan
    part_scan = map(
        lambda p: query_plan.add_operator(
            tpch_q17.sql_scan_select_partkey_where_brand_container_op(
                part_sharded, p, part_parts, use_pandas, secure, use_native,
                'part_scan' + '_' + str(p), query_plan, sf, format_)),
        range(0, part_parts))

    part_project = map(
        lambda p: query_plan.add_operator(
            tpch_q17.project_partkey_op('part_project' + '_' + str(p),
                                        query_plan)), range(0, part_parts))

    part_lineitem_join_build_map = map(
        lambda p: query_plan.add_operator(
            Map('p_partkey', 'part_lineitem_join_build_map' + '_' + str(p),
                query_plan, False)), range(0, part_parts))

    part_bloom_create = query_plan.add_operator(
        tpch_q17.bloom_create_partkey_op(fp_rate, 'part_bloom_create',
                                         query_plan))

    lineitem_bloom_use = \
        map(lambda p:
            query_plan.add_operator(
                tpch_q17.bloom_scan_lineitem_select_orderkey_partkey_quantity_extendedprice_bloom_partkey_op(
                    lineitem_sharded,
                    p,
                    part_parts,
                    use_pandas,
                    secure,
                    use_native,
                    'lineitem_bloom_use' + '_' + str(p),
                    query_plan,
                    sf, format_)),
            range(0, lineitem_parts))

    lineitem_project = map(
        lambda p: query_plan.add_operator(
            tpch_q17.
            project_lineitem_filtered_orderkey_partkey_quantity_extendedprice_op(
                'lineitem_project' + '_' + str(p), query_plan)),
        range(0, lineitem_parts))

    part_lineitem_join_probe_map = map(
        lambda p: query_plan.add_operator(
            Map('l_partkey', 'part_lineitem_join_probe_map' + '_' + str(p),
                query_plan, False)), range(0, lineitem_parts))

    part_lineitem_join_build = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild('p_partkey', 'part_lineitem_join_build' + '_' + str(
                p), query_plan, False)), range(0, other_parts))

    part_lineitem_join_probe = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'),
                          'part_lineitem_join_probe' + '_' + str(
                              p), query_plan, False)), range(0, other_parts))

    lineitem_part_avg_group = map(
        lambda p: query_plan.add_operator(
            tpch_q17.group_partkey_avg_quantity_op(
                'lineitem_part_avg_group' + '_' + str(p), query_plan)),
        range(0, other_parts))

    lineitem_part_avg_group_project = map(
        lambda p: query_plan.add_operator(
            tpch_q17.project_partkey_avg_quantity_op(
                'lineitem_part_avg_group_project' + '_' + str(p), query_plan)),
        range(0, other_parts))

    part_lineitem_join_avg_group_join_build = \
        map(lambda p:
            query_plan.add_operator(
                HashJoinBuild('l_partkey',
                              'part_lineitem_join_avg_group_join_build' + '_' + str(p),
                              query_plan,
                              False)),
            range(0, other_parts))

    part_lineitem_join_avg_group_join_probe = \
        map(lambda p:
            query_plan.add_operator(
                HashJoinProbe(JoinExpression('l_partkey', 'l_partkey'),
                              'part_lineitem_join_avg_group_join_probe' + '_' + str(p),
                              query_plan,
                              False)),
            range(0, other_parts))

    lineitem_filter = map(
        lambda p: query_plan.add_operator(
            tpch_q17.filter_lineitem_quantity_op(
                'lineitem_filter' + '_' + str(p), query_plan)),
        range(0, other_parts))

    extendedprice_sum_aggregate = map(
        lambda p: query_plan.add_operator(
            tpch_q17.aggregate_sum_extendedprice_op(
                use_pandas, 'extendedprice_sum_aggregate' + '_' + str(p),
                query_plan)), range(0, other_parts))

    def aggregate_reduce_fn(df):
        sum1_ = df['_0'].astype(np.float).sum()
        return pd.DataFrame({'_0': [sum1_]})

    aggregate_reduce = query_plan.add_operator(
        Aggregate([
            AggregateExpression(AggregateExpression.SUM,
                                lambda t: float(t['_0']))
        ], use_pandas, 'aggregate_reduce', query_plan, False,
                  aggregate_reduce_fn))

    extendedprice_sum_aggregate_project = query_plan.add_operator(
        tpch_q17.project_avg_yearly_op('extendedprice_sum_aggregate_project',
                                       query_plan))

    collate = query_plan.add_operator(
        tpch_q17.collate_op('collate', query_plan))

    # Inline what we can
    map(lambda o: o.set_async(False), lineitem_project)
    map(lambda o: o.set_async(False), part_project)
    map(lambda o: o.set_async(False), lineitem_filter)
    map(lambda o: o.set_async(False), part_lineitem_join_probe_map)
    map(lambda o: o.set_async(False), part_lineitem_join_build_map)
    map(lambda o: o.set_async(False), lineitem_part_avg_group)
    map(lambda o: o.set_async(False), lineitem_part_avg_group_project)
    map(lambda o: o.set_async(False), extendedprice_sum_aggregate)
    extendedprice_sum_aggregate_project.set_async(False)

    # Connect the operators
    # part_scan.connect(part_project)
    map(lambda (p, o): o.connect(part_project[p]), enumerate(part_scan))
    # map(lambda (p, o): o.connect(part_bloom_create_map[p]), enumerate(part_project))
    map(lambda (p, o): o.connect(part_lineitem_join_build_map[p]),
        enumerate(part_project))

    connect_many_to_one(part_project, part_bloom_create)
    connect_one_to_many(part_bloom_create, lineitem_bloom_use)

    # part_project.connect(part_bloom_create)
    # map(lambda (p1, o1): map(lambda (p2, o2): o1.connect(o2), enumerate(part_bloom_create)),
    #     enumerate(part_bloom_create_map))

    # part_bloom_create.connect(lineitem_bloom_use)
    # map(lambda (p1, o1): map(lambda (p2, o2): o1.connect(o2), enumerate(lineitem_bloom_use)),
    #     enumerate(part_bloom_create))

    # lineitem_bloom_use.connect(lineitem_project)
    map(lambda (p, o): o.connect(lineitem_project[p]),
        enumerate(lineitem_bloom_use))

    # part_lineitem_join.connect_left_producer(part_project)
    map(
        lambda (p1, o1): map(lambda (p2, o2): o1.connect(o2),
                             enumerate(part_lineitem_join_build)),
        enumerate(part_lineitem_join_build_map))
    map(lambda (p, o): part_lineitem_join_probe[p].connect_build_producer(o),
        enumerate(part_lineitem_join_build))

    # part_lineitem_join.connect_right_producer(lineitem_project)
    # map(lambda (p, o): o.connect(part_lineitem_join_probe_map[p]), enumerate(lineitem_project))
    connect_many_to_many(lineitem_project, part_lineitem_join_probe_map)

    map(
        lambda (p1, o1): map(lambda (p2, o2): o2.connect_tuple_producer(o1),
                             enumerate(part_lineitem_join_probe)),
        enumerate(part_lineitem_join_probe_map))

    # part_lineitem_join.connect(lineitem_part_avg_group)
    map(lambda (p, o): o.connect(lineitem_part_avg_group[p]),
        enumerate(part_lineitem_join_probe))

    # lineitem_part_avg_group.connect(lineitem_part_avg_group_project)
    map(lambda (p, o): o.connect(lineitem_part_avg_group_project[p]),
        enumerate(lineitem_part_avg_group))

    # part_lineitem_join_avg_group_join.connect_left_producer(lineitem_part_avg_group_project)
    map(lambda (p, o): o.connect(part_lineitem_join_avg_group_join_build[p]),
        enumerate(lineitem_part_avg_group_project))

    # part_lineitem_join_avg_group_join.connect_right_producer(part_lineitem_join)
    map(
        lambda (p, o): part_lineitem_join_avg_group_join_probe[p].
        connect_build_producer(o),
        enumerate(part_lineitem_join_avg_group_join_build))
    map(
        lambda (p, o): part_lineitem_join_avg_group_join_probe[p].
        connect_tuple_producer(o), enumerate(part_lineitem_join_probe))

    # part_lineitem_join_avg_group_join.connect(lineitem_filter)
    map(lambda (p, o): o.connect(lineitem_filter[p]),
        enumerate(part_lineitem_join_avg_group_join_probe))

    # lineitem_filter.connect(extendedprice_sum_aggregate)
    map(lambda (p, o): o.connect(extendedprice_sum_aggregate[p]),
        enumerate(lineitem_filter))

    # extendedprice_sum_aggregate.connect(extendedprice_sum_aggregate_project)
    map(lambda (p, o): o.connect(aggregate_reduce),
        enumerate(extendedprice_sum_aggregate))
    aggregate_reduce.connect(extendedprice_sum_aggregate_project)

    # extendedprice_sum_aggregate_project.connect(collate)
    extendedprice_sum_aggregate_project.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print('secure: {}'.format(secure))
    print('use_native: {}'.format(use_native))
    print("lineitem parts: {}".format(lineitem_parts))
    print("part_parts: {}".format(part_parts))
    print("lineitem_sharded: {}".format(lineitem_sharded))
    print("part_sharded: {}".format(part_sharded))
    print("other_parts: {}".format(other_parts))
    print("fp_rate: {}".format(fp_rate))
    print("format: {}".format(format_))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"),
                           gen_test_id())

    # Start the query
    query_plan.execute()

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    field_names = ['avg_yearly']

    assert len(tuples) == 1 + 1

    assert tuples[0] == field_names

    # NOTE: This result has been verified with the equivalent data and query on PostgreSQL
    if s3filter.util.constants.TPCH_SF == 10:
        assert round(
            float(tuples[1][0]), 10
        ) == 372414.2899999995  # TODO: This isn't correct but haven't checked tpch17 on 10 sf yet
    elif s3filter.util.constants.TPCH_SF == 1:
        numpy.testing.assert_approx_equal(float(tuples[1][0]), expected_result)
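
The key idea in this variant is the Bloom filter: partkeys that survive the part scan populate a filter (part_bloom_create), which the lineitem scan (lineitem_bloom_use) appears to apply so that only probably-matching rows are pulled from S3. A minimal, self-contained stand-in for that idea, independent of the repository's Bloom operators:

import hashlib

class TinyBloom(object):
    """Toy Bloom filter: m-bit array, k hash functions."""

    def __init__(self, m=1024, k=3):
        self.m, self.k, self.bits = m, k, 0

    def _positions(self, key):
        for i in range(self.k):
            digest = hashlib.md5('{}:{}'.format(i, key).encode()).hexdigest()
            yield int(digest, 16) % self.m

    def add(self, key):
        for pos in self._positions(key):
            self.bits |= 1 << pos

    def might_contain(self, key):
        return all(self.bits & (1 << pos) for pos in self._positions(key))

bloom = TinyBloom()
for partkey in (1, 7, 42):          # keys that survived the part filter
    bloom.add(partkey)

assert bloom.might_contain(7)        # added keys always pass
# other keys usually fail; false positives are bounded by fp_rate in the real operator
print(bloom.might_contain(99999))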
Example #12
def run(parallel, use_pandas, secure, use_native, buffer_size, lineitem_parts, part_parts, lineitem_sharded,
        part_sharded, other_parts, sf, expected_result, format_):
    """The baseline tst uses nested loop joins with no projection and no filtering pushed down to s3.

    This works by:

    1. Scanning part and filtering on brand and container
    2. It then scans lineitem
    3. It then joins the two tables (essentially filtering out lineitems that dont include parts that we filtered out in
       step 1)
    4. It then computes the average of l_quantity from the joined table and groups the results by partkey
    5. It then joins these computed averages with the joined table in step 3
    6. It then filters out any rows where l_quantity is less than the computed average

    TODO: There are few ways this can be done, the above is just one.

    :return: None
    """

    print('')
    print("TPCH Q17 Baseline Join")
    print("----------------------")

    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Query plan
    part_scan = map(lambda p:
                    query_plan.add_operator(
                        tpch_q17.sql_scan_part_select_all_op(
                            part_sharded,
                            p,
                            part_parts,
                            use_pandas,
                            secure,
                            use_native,
                            'part_scan' + '_' + str(p),
                            query_plan,
                            sf, format_)),
                    range(0, part_parts))

    lineitem_scan = map(lambda p:
                        query_plan.add_operator(
                            tpch_q17.sql_scan_lineitem_select_all_op(
                                lineitem_sharded,
                                p,
                                lineitem_parts,
                                use_pandas,
                                secure,
                                use_native,
                                'lineitem_scan' + '_' + str(p),
                                query_plan,
                                sf, format_)),
                        range(0, lineitem_parts))

    part_project = map(lambda p:
                       query_plan.add_operator(
                           tpch_q17.project_partkey_brand_container_op(
                               'part_project' + '_' + str(p),
                               query_plan)),
                       range(0, part_parts))

    part_filter = map(lambda p:
                      query_plan.add_operator(tpch_q17.filter_brand_container_op(
                          'part_filter' + '_' + str(p),
                          query_plan)),
                      range(0, part_parts))

    part_map = map(lambda p:
                   query_plan.add_operator(Map('p_partkey', 'part_map' + '_' + str(p), query_plan, False)),
                   range(0, part_parts))

    lineitem_project = map(lambda p:
                           query_plan.add_operator(
                               tpch_q17.project_lineitem_orderkey_partkey_quantity_extendedprice_op(
                                   'lineitem_project' + '_' + str(p), query_plan)),
                           range(0, lineitem_parts))

    lineitem_map = map(lambda p:
                       query_plan.add_operator(Map('l_partkey', 'lineitem_map' + '_' + str(p), query_plan, False)),
                       range(0, lineitem_parts))

    part_lineitem_join_build = map(lambda p:
                                   query_plan.add_operator(
                                       HashJoinBuild('p_partkey',
                                                     'part_lineitem_join_build' + '_' + str(p), query_plan,
                                                     False)),
                                   range(0, other_parts))

    part_lineitem_join_probe = map(lambda p:
                                   query_plan.add_operator(
                                       HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'),
                                                     'part_lineitem_join_probe' + '_' + str(p),
                                                     query_plan, False)),
                                   range(0, other_parts))

    lineitem_part_avg_group = map(lambda p:
                                  query_plan.add_operator(
                                      tpch_q17.group_partkey_avg_quantity_5_op('lineitem_part_avg_group' + '_' + str(p),
                                                                               query_plan)),
                                  range(0, other_parts))

    lineitem_part_avg_group_project = map(lambda p:
                                          query_plan.add_operator(
                                              tpch_q17.project_partkey_avg_quantity_op(
                                                  'lineitem_part_avg_group_project' + '_' + str(p), query_plan)),
                                          range(0, other_parts))

    part_lineitem_join_avg_group_join_build = \
        map(lambda p:
            query_plan.add_operator(
                HashJoinBuild('l_partkey',
                              'part_lineitem_join_avg_group_join_build' + '_' + str(p),
                              query_plan,
                              False)),
            range(0, other_parts))

    part_lineitem_join_avg_group_join_probe = \
        map(lambda p:
            query_plan.add_operator(
                HashJoinProbe(JoinExpression('l_partkey', 'l_partkey'),
                              'part_lineitem_join_avg_group_join_probe' + '_' + str(p),
                              query_plan,
                              False)),
            range(0, other_parts))

    lineitem_filter = map(lambda p:
                          query_plan.add_operator(
                              tpch_q17.filter_lineitem_quantity_op('lineitem_filter' + '_' + str(p), query_plan)),
                          range(0, other_parts))

    extendedprice_sum_aggregate = map(lambda p:
                                      query_plan.add_operator(
                                          tpch_q17.aggregate_sum_extendedprice_op(
                                              use_pandas,
                                              'extendedprice_sum_aggregate' + '_' + str(p),
                                              query_plan)),
                                      range(0, other_parts))

    def aggregate_reduce_fn(df):
        sum1_ = df['_0'].astype(np.float).sum()
        return pd.DataFrame({'_0': [sum1_]})

    aggregate_reduce = query_plan.add_operator(
        Aggregate(
            [
                AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0']))
            ],
            use_pandas,
            'aggregate_reduce',
            query_plan,
            False, aggregate_reduce_fn))

    extendedprice_sum_aggregate_project = query_plan.add_operator(
        tpch_q17.project_avg_yearly_op('extendedprice_sum_aggregate_project', query_plan))

    collate = query_plan.add_operator(tpch_q17.collate_op('collate', query_plan))

    # Inline what we can
    map(lambda o: o.set_async(False), lineitem_project)
    map(lambda o: o.set_async(False), part_project)
    map(lambda o: o.set_async(False), lineitem_filter)
    map(lambda o: o.set_async(False), part_filter)
    map(lambda o: o.set_async(False), lineitem_map)
    map(lambda o: o.set_async(False), part_map)
    map(lambda o: o.set_async(False), lineitem_part_avg_group)
    map(lambda o: o.set_async(False), lineitem_part_avg_group_project)
    map(lambda o: o.set_async(False), extendedprice_sum_aggregate)
    extendedprice_sum_aggregate_project.set_async(False)

    # Connect the operators
    # part_scan.connect(part_project)
    map(lambda (p, o): o.connect(part_project[p]), enumerate(part_scan))
    map(lambda (p, o): o.connect(part_filter[p]), enumerate(part_project))
    map(lambda (p, o): o.connect(part_map[p]), enumerate(part_filter))

    # lineitem_scan.connect(lineitem_project)
    map(lambda (p, o): o.connect(lineitem_project[p]), enumerate(lineitem_scan))
    map(lambda (p, o): o.connect(lineitem_map[p]), enumerate(lineitem_project))

    # part_lineitem_join.connect_left_producer(part_project)
    # part_lineitem_join.connect_right_producer(lineitem_project)
    # part_lineitem_join.connect(lineitem_part_avg_group)
    map(lambda (p1, o1): map(lambda (p2, o2): o1.connect(o2), enumerate(part_lineitem_join_build)), enumerate(part_map))
    map(lambda (p, o): part_lineitem_join_probe[p].connect_build_producer(o), enumerate(part_lineitem_join_build))
    map(lambda (p1, o1): map(lambda (p2, o2): o2.connect_tuple_producer(o1), enumerate(part_lineitem_join_probe)),
        enumerate(lineitem_map))
    map(lambda (p, o): o.connect(lineitem_part_avg_group[p]), enumerate(part_lineitem_join_probe))

    # map(lambda (p, o): o.map(Mapper('_1', 1, part_lineitem_join_probe)), enumerate(lineitem_scan))

    # lineitem_part_avg_group.connect(lineitem_part_avg_group_project)
    map(lambda (p, o): o.connect(lineitem_part_avg_group_project[p]), enumerate(lineitem_part_avg_group))

    # part_lineitem_join_avg_group_join.connect_left_producer(lineitem_part_avg_group_project)
    # part_lineitem_join_avg_group_join.connect_right_producer(part_lineitem_join)
    # part_lineitem_join_avg_group_join.connect(lineitem_filter)

    map(lambda (p, o): o.connect(part_lineitem_join_avg_group_join_build[p]),
        enumerate(lineitem_part_avg_group_project))

    # map(lambda (p, o): map(lambda (bp, bo): o.connect_build_producer(bo),
    #                        enumerate(part_lineitem_join_avg_group_join_build)),
    #     enumerate(part_lineitem_join_avg_group_join_probe))
    map(lambda (p, o): part_lineitem_join_avg_group_join_probe[p].connect_build_producer(o),
        enumerate(part_lineitem_join_avg_group_join_build))
    map(lambda (p, o): part_lineitem_join_avg_group_join_probe[p % part_parts].connect_tuple_producer(o),
        enumerate(part_lineitem_join_probe))
    map(lambda (p, o): o.connect(lineitem_filter[p]), enumerate(part_lineitem_join_avg_group_join_probe))

    # lineitem_filter.connect(extendedprice_sum_aggregate)
    map(lambda (p, o): o.connect(extendedprice_sum_aggregate[p]), enumerate(lineitem_filter))
    # extendedprice_sum_aggregate.connect(extendedprice_sum_aggregate_project)
    # extendedprice_sum_aggregate_project.connect(collate)
    map(lambda (p, o): o.connect(aggregate_reduce), enumerate(extendedprice_sum_aggregate))
    aggregate_reduce.connect(extendedprice_sum_aggregate_project)
    extendedprice_sum_aggregate_project.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print('secure: {}'.format(secure))
    print('use_native: {}'.format(use_native))
    print("lineitem parts: {}".format(lineitem_parts))
    print("part_parts: {}".format(part_parts))
    print("lineitem_sharded: {}".format(lineitem_sharded))
    print("part_sharded: {}".format(part_sharded))
    print("other_parts: {}".format(other_parts))
    print("format: {}".format(format_))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    field_names = ['avg_yearly']

    assert len(tuples) == 1 + 1

    assert tuples[0] == field_names

    # NOTE: This result has been verified with the equivalent data and query on PostgreSQL
    if s3filter.util.constants.TPCH_SF == 10:
        assert round(float(tuples[1][0]),
                     10) == 372414.2899999995  # TODO: This isn't correct but haven't checked tpch17 on 10 sf yet
    elif s3filter.util.constants.TPCH_SF == 1:
        numpy.testing.assert_approx_equal(float(tuples[1][0]), expected_result)
Example #13
def query_plan(settings):
    # type: (SyntheticFilteredJoinSettings) -> QueryPlan
    """

    :return: The query plan
    """

    if settings.use_shared_mem:
        system = WorkerSystem(settings.shared_memory_size)
    else:
        system = None

    query_plan = QueryPlan(system,
                           is_async=settings.parallel,
                           buffer_size=settings.buffer_size,
                           use_shared_mem=settings.use_shared_mem)

    # Define the operators
    scan_A = \
        map(lambda p:
            query_plan.add_operator(
                SQLTableScan(get_file_key(settings.table_A_key, settings.table_A_sharded, p, settings.sf),
                             "select "
                             "  {} "
                             "from "
                             "  S3Object "
                             "  {} "
                             "  {} "
                             .format(','.join(settings.table_A_field_names),
                                     ' where {} '.format(
                                         settings.table_A_filter_sql) if settings.table_A_filter_sql is not None else '',
                                     get_sql_suffix(settings.table_A_key, settings.table_A_parts, p,
                                                    settings.table_A_sharded,
                                                    add_where=settings.table_A_filter_sql is None)), settings.format_,
                             settings.use_pandas,
                             settings.secure,
                             settings.use_native,
                             'scan_A_{}'.format(p),
                             query_plan,
                             False)),
            range(0, settings.table_A_parts))

    field_names_map_A = OrderedDict(
        zip([
            '_{}'.format(i)
            for i, name in enumerate(settings.table_A_field_names)
        ], settings.table_A_field_names))

    def project_fn_A(df):
        df = df.rename(columns=field_names_map_A, copy=False)
        return df

    project_A = map(
        lambda p: query_plan.add_operator(
            Project([
                ProjectExpression(k, v)
                for k, v in field_names_map_A.iteritems()
            ], 'project_A_{}'.format(p), query_plan, False, project_fn_A)),
        range(0, settings.table_A_parts))

    scan_B = \
        map(lambda p:
            query_plan.add_operator(
                SQLTableScan(
                    get_file_key(settings.table_B_key, settings.table_B_sharded, p, settings.sf),
                    "select "
                    "  {} "
                    "from "
                    "  S3Object "
                    "  {} "
                    "  {} "
                        .format(','.join(settings.table_B_field_names),
                                ' where {} '.format(
                                    settings.table_B_filter_sql) if settings.table_B_filter_sql is not None else '',
                                get_sql_suffix(settings.table_B_key, settings.table_B_parts, p,
                                               settings.table_B_sharded,
                                               add_where=settings.table_B_filter_sql is None)), settings.format_,
                    settings.use_pandas,
                    settings.secure,
                    settings.use_native,
                    'scan_B_{}'.format(p),
                    query_plan,
                    False)),
            range(0, settings.table_B_parts))

    field_names_map_B = OrderedDict(
        zip([
            '_{}'.format(i)
            for i, name in enumerate(settings.table_B_field_names)
        ], settings.table_B_field_names))

    def project_fn_B(df):
        df.rename(columns=field_names_map_B, inplace=True)
        return df

    project_B = map(
        lambda p: query_plan.add_operator(
            Project([
                ProjectExpression(k, v)
                for k, v in field_names_map_B.iteritems()
            ], 'project_B_{}'.format(p), query_plan, False, project_fn_B)),
        range(0, settings.table_B_parts))

    if settings.table_C_key is not None:
        scan_C = \
            map(lambda p:
                query_plan.add_operator(
                    SQLTableScan(
                        get_file_key(settings.table_C_key, settings.table_C_sharded, p, settings.sf),
                        "select "
                        "  {} "
                        "from "
                        "  S3Object "
                        "where "
                        "  {} "
                        "  {} "
                            .format(','.join(settings.table_C_field_names),
                                    settings.table_C_filter_sql,
                                    get_sql_suffix(settings.table_C_key, settings.table_C_parts, p,
                                                   settings.table_C_sharded, add_where=False)), settings.format_,
                        settings.use_pandas,
                        settings.secure,
                        settings.use_native,
                        'scan_C_{}'.format(p),
                        query_plan,
                        False)),
                range(0, settings.table_C_parts))

        field_names_map_C = OrderedDict(
            zip([
                '_{}'.format(i)
                for i, name in enumerate(settings.table_C_field_names)
            ], settings.table_C_field_names))

        def project_fn_C(df):
            df = df.rename(columns=field_names_map_C, copy=False)
            return df

        project_C = map(
            lambda p: query_plan.add_operator(
                Project([
                    ProjectExpression(k, v)
                    for k, v in field_names_map_C.iteritems()
                ], 'project_C_{}'.format(p), query_plan, False, project_fn_C)),
            range(0, settings.table_C_parts))

        map_B_to_C = map(
            lambda p: query_plan.add_operator(
                Map(settings.table_B_BC_join_key, 'map_B_to_C_{}'.format(p),
                    query_plan, False)), range(0, settings.table_B_parts))

        map_C_to_C = map(
            lambda p: query_plan.add_operator(
                Map(settings.table_C_BC_join_key, 'map_C_to_C_{}'.format(p),
                    query_plan, False)), range(0, settings.table_C_parts))

        join_build_AB_C = map(
            lambda p: query_plan.add_operator(
                HashJoinBuild(settings.table_B_BC_join_key,
                              'join_build_AB_C_{}'.format(
                                  p), query_plan, False)),
            range(0, settings.table_C_parts))

        join_probe_AB_C = map(
            lambda p: query_plan.add_operator(
                HashJoinProbe(
                    JoinExpression(settings.table_B_BC_join_key, settings.
                                   table_C_BC_join_key), 'join_probe_AB_C_{}'.
                    format(p), query_plan, False)),
            range(0, settings.table_C_parts))

    map_A_to_B = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_A_AB_join_key, 'map_A_to_B_{}'.format(p),
                query_plan, False)), range(0, settings.table_A_parts))

    map_B_to_B = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_B_AB_join_key, 'map_B_to_B_{}'.format(p),
                query_plan, False)), range(0, settings.table_B_parts))

    join_build_A_B = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild(settings.table_A_AB_join_key, 'join_build_A_B_{}'.
                          format(p), query_plan, False)),
        range(0, settings.other_parts))

    join_probe_A_B = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(
                JoinExpression(settings.table_A_AB_join_key, settings.
                               table_B_AB_join_key), 'join_probe_A_B_{}'.
                format(p), query_plan, False)), range(0, settings.other_parts))

    if settings.table_C_key is None:

        def part_aggregate_fn(df):
            sum_ = df[settings.table_B_detail_field_name].astype(
                np.float).sum()
            return pd.DataFrame({'_0': [sum_]})

        part_aggregate = map(
            lambda p: query_plan.add_operator(
                Aggregate([
                    AggregateExpression(
                        AggregateExpression.SUM, lambda t: float(t[
                            settings.table_B_detail_field_name]))
                ], settings.use_pandas, 'part_aggregate_{}'.format(p),
                          query_plan, False, part_aggregate_fn)),
            range(0, settings.other_parts))

    else:

        def part_aggregate_fn(df):
            sum_ = df[settings.table_C_detail_field_name].astype(
                np.float).sum()
            return pd.DataFrame({'_0': [sum_]})

        part_aggregate = map(
            lambda p: query_plan.add_operator(
                Aggregate([
                    AggregateExpression(
                        AggregateExpression.SUM, lambda t: float(t[
                            settings.table_C_detail_field_name]))
                ], settings.use_pandas, 'part_aggregate_{}'.format(p),
                          query_plan, False, part_aggregate_fn)),
            range(0, settings.table_C_parts))

    def aggregate_reduce_fn(df):
        sum_ = df['_0'].astype(np.float).sum()
        return pd.DataFrame({'_0': [sum_]})

    aggregate_reduce = query_plan.add_operator(
        Aggregate([
            AggregateExpression(AggregateExpression.SUM,
                                lambda t: float(t['_0']))
        ], settings.use_pandas, 'aggregate_reduce', query_plan, False,
                  aggregate_reduce_fn))
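    # (Two-phase aggregation: each part_aggregate emits a one-row partial sum in
    # column '_0', and aggregate_reduce sums those partials into the final total.
    # The pandas functions and the AggregateExpressions are assumed to be the
    # vectorised and row-at-a-time variants of the same sum, chosen by use_pandas.)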

    aggregate_project = query_plan.add_operator(
        Project([ProjectExpression(lambda t: t['_0'], 'total_balance')],
                'aggregate_project', query_plan, False))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    # Inline some of the operators
    map(lambda o: o.set_async(False), project_A)
    map(lambda o: o.set_async(False), project_B)
    map(lambda o: o.set_async(False), map_A_to_B)
    map(lambda o: o.set_async(False), map_B_to_B)
    if settings.table_C_key is not None:
        map(lambda o: o.set_async(False), map_B_to_C)
        map(lambda o: o.set_async(False), map_C_to_C)
        map(lambda o: o.set_async(False), project_C)
    map(lambda o: o.set_async(False), part_aggregate)
    aggregate_project.set_async(False)

    # Connect the operators
    connect_many_to_many(scan_A, project_A)
    connect_many_to_many(project_A, map_A_to_B)
    connect_all_to_all(map_A_to_B, join_build_A_B)
    connect_many_to_many(join_build_A_B, join_probe_A_B)

    connect_many_to_many(scan_B, project_B)
    connect_many_to_many(project_B, map_B_to_B)
    connect_all_to_all(map_B_to_B, join_probe_A_B)

    if settings.table_C_key is None:
        connect_many_to_many(join_probe_A_B, part_aggregate)
    else:
        connect_many_to_many(join_probe_A_B, map_B_to_C)
        connect_all_to_all(map_B_to_C, join_build_AB_C)
        connect_many_to_many(join_build_AB_C, join_probe_AB_C)
        connect_many_to_many(scan_C, project_C)
        connect_many_to_many(project_C, map_C_to_C)
        connect_all_to_all(map_C_to_C, join_probe_AB_C)
        connect_many_to_many(join_probe_AB_C, part_aggregate)

    connect_many_to_one(part_aggregate, aggregate_reduce)
    connect_one_to_one(aggregate_reduce, aggregate_project)
    connect_one_to_one(aggregate_project, collate)
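    # (Assumed semantics of the connect helpers, going by their names:
    # connect_many_to_many wires producer i to consumer i, connect_all_to_all wires
    # every producer to every consumer (a shuffle when combined with Map),
    # connect_many_to_one fans all partitions into a single operator, and
    # connect_one_to_one links two single operators.)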

    return query_plan
Example #14
def run(parallel, use_pandas, buffer_size, selectivity1not0, typed, path, format_):
    secure = False
    use_native = False
    print('')
    print("CSV/Parquet Filter Return 1 Column Benchmark")
    print("------------------")

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    if selectivity1not0:
        if typed:
            # SQL scan the file
            scan = map(lambda p:
                       query_plan.add_operator(
                           SQLTableScan(path,
                                        "select F0 from S3Object "
                                        "where F0 >= 0;", format_,
                                        use_pandas, secure, use_native,
                                        'scan_{}'.format(p), query_plan,
                                        False)),
                       range(0, 1))

        else:
            # SQL scan the file
            scan = map(lambda p:
                       query_plan.add_operator(
                           SQLTableScan(path,
                                        "select F0 from S3Object "
                                        "where cast(F0 as int) >= 0;", format_,
                                        use_pandas, secure, use_native,
                                        'scan_{}'.format(p), query_plan,
                                        False)),
                       range(0, 1))
    else: # selectivity = 0
        if typed:
            scan = map(lambda p:
                       query_plan.add_operator(
                           SQLTableScan(path,
                                        "select F0 from S3Object "
                                        "where F0 < 0;", format_,
                                        use_pandas, secure, use_native,
                                        'scan_{}'.format(p), query_plan,
                                        False)),
                       range(0, 1))

        else:
            scan = map(lambda p:
                       query_plan.add_operator(
                           SQLTableScan(path,
                                        "select F0 from S3Object "
                                        "where cast(F0 as int) < 0;", format_,
                                        use_pandas, secure, use_native,
                                        'scan_{}'.format(p), query_plan,
                                        False)),
                       range(0, 1))

    # project
    def fn(df):
        df.columns = ["F0"]
        return df

    project = map(lambda p:
                  query_plan.add_operator(
                      Project([], 'project_{}'.format(p), query_plan, False, fn)),
                  range(0, 1))

    # aggregation
    def agg_fun(df):
        return pd.DataFrame({'count': [len(df)]})
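    # (With an empty expression list and use_pandas=True, the Aggregate created below
    # is assumed to delegate entirely to agg_fun, which simply counts the rows that
    # survived the S3 Select predicate.)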

    aggregate = query_plan.add_operator(
        Aggregate([], True, 'agg', query_plan, False, agg_fun))

    collate = query_plan.add_operator(
        Collate('collate', query_plan, False))

    map(lambda (p, o): o.connect(project[p]), enumerate(scan))
    map(lambda (p, o): o.connect(aggregate), enumerate(project))
    aggregate.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("table parts: {}".format(1))
    print('')

    # Write the plan graph
    # query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(table_parts))

    # Start the query
    query_plan.execute()
    print('Done')
    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()
Example #15
def run(parallel, buffer_size, parts):
    """

    :return: None
    """

    print('')
    print("TPCH Q14 Partitioned Bloom Join")
    print("-------------------------------")

    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Query plan
    # DATE is the first day of a month randomly selected from a random year within [1993 .. 1997].
    date = '1993-01-01'
    min_shipped_date = datetime.strptime(date, '%Y-%m-%d')
    max_shipped_date = datetime.strptime(date, '%Y-%m-%d') + timedelta(days=30)

    collate = query_plan.add_operator(tpch_q14.collate_operator_def('collate', query_plan))
    sum_aggregate = query_plan.add_operator(
        Aggregate(
            [
                AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0'])),
                AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_1']))
            ],
            'sum_aggregate',
            query_plan,
            False))
    aggregate_project = query_plan.add_operator(
        tpch_q14.project_promo_revenue_operator_def('aggregate_project', query_plan))

    hash_join_build_ops = []
    hash_join_probe_ops = []
    part_bloom_create_ops = []
    line_item_scan_ops = []
    for p in range(0, parts):
        part_scan = query_plan.add_operator(
            tpch_q14.sql_scan_part_partkey_type_part_where_brand12_sharded_operator_def(p, parts,
                                                                                        'part_scan' + '_' + str(p),
                                                                                        query_plan))
        part_scan_project = query_plan.add_operator(
            tpch_q14.project_partkey_type_operator_def('part_scan_project' + '_' + str(p), query_plan))

        part_bloom_create = query_plan.add_operator(
            tpch_q14.bloom_create_p_partkey_operator_def('part_bloom_create' + '_' + str(p), query_plan))
        part_bloom_create_ops.append(part_bloom_create)

        lineitem_scan = query_plan.add_operator(
            tpch_q14.bloom_scan_lineitem_where_shipdate_sharded_operator_def(min_shipped_date, max_shipped_date, p,
                                                                             parts, 'lineitem_scan' + '_' + str(p),
                                                                             query_plan))
        line_item_scan_ops.append(lineitem_scan)

        lineitem_scan_project = query_plan.add_operator(
            tpch_q14.project_partkey_extendedprice_discount_operator_def('lineitem_scan_project' + '_' + str(p),
                                                                         query_plan))

        join_build = query_plan.add_operator(
            HashJoinBuild('p_partkey', 'join_build' + '_' + str(p), query_plan, False))
        hash_join_build_ops.append(join_build)

        join_probe = query_plan.add_operator(
            HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'), 'join_probe' + '_' + str(p),
                          query_plan, False))
        hash_join_probe_ops.append(join_probe)

        aggregate = query_plan.add_operator(
            tpch_q14.aggregate_promo_revenue_operator_def('aggregate' + '_' + str(p), query_plan))

        part_scan.connect(part_scan_project)
        lineitem_scan.connect(lineitem_scan_project)

        part_scan_project.connect(part_bloom_create)
        # part_bloom_create.connect(lineitem_scan)

        part_scan_project.connect(join_build)

        join_probe.connect(aggregate)
        # aggregate.connect(aggregate_project)
        aggregate.connect(sum_aggregate)

        join_probe.connect_tuple_producer(lineitem_scan_project)
        # join_probe.connect(merge)

    for bloom_create_op in part_bloom_create_ops:
        for scan_op in line_item_scan_ops:
            bloom_create_op.connect(scan_op)

    for probe_op in hash_join_probe_ops:
        for build_op in hash_join_build_ops:
            probe_op.connect_build_producer(build_op)
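    # (Every per-partition bloom filter built from part is pushed to every sharded
    # lineitem scan, presumably so each scan's S3 Select request can skip lineitem
    # rows whose l_partkey misses all filters; likewise every probe registers every
    # build as a producer so it sees the complete part-side hash table.)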

    sum_aggregate.connect(aggregate_project)
    aggregate_project.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print("parts: {}".format(parts))
    print('')

    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(parts))

    # Start the query
    query_plan.execute()

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    field_names = ['promo_revenue']

    assert len(tuples) == 1 + 1

    assert tuples[0] == field_names

    # NOTE: This result has been verified with the equivalent data and query on PostgreSQL
    # assert tuples[1] == [15.090116526324298]
    assert round(float(tuples[1][0]), 10) == 15.0901165263
Example #16
def run(parallel, use_pandas, buffer_size, lineitem_parts, part_parts):
    """
    :return: None
    """

    print('')
    print("TPCH Q17 Bloom Join")
    print("-------------------")

    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Query plan
    part_scan = map(lambda p:
                    query_plan.add_operator(
                        tpch_q17.sql_scan_select_partkey_where_brand_container_op(
                            part_parts != 1,
                            p,
                            part_parts,
                            use_pandas,
                            'part_scan' + '_' + str(p),
                            query_plan)),
                    range(0, part_parts))

    part_project = map(lambda p:
                       query_plan.add_operator(
                           tpch_q17.project_partkey_op(
                               'part_project' + '_' + str(p),
                               query_plan)),
                       range(0, part_parts))

    part_bloom_create_map = map(lambda p:
                                query_plan.add_operator(
                                    Map('p_partkey', 'part_bloom_create_map' + '_' + str(p), query_plan, True)),
                                range(0, part_parts))

    part_lineitem_join_build_map = map(lambda p:
                                       query_plan.add_operator(
                                           Map('p_partkey', 'part_lineitem_join_build_map' + '_' + str(p), query_plan,
                                               True)),
                                       range(0, part_parts))

    part_bloom_create = map(lambda p:
                            query_plan.add_operator(
                                tpch_q17.bloom_create_partkey_op('part_bloom_create' + '_' + str(p), query_plan)),
                            range(0, part_parts))

    lineitem_bloom_use = map(lambda p:
                             query_plan.add_operator(
                                 tpch_q17.bloom_scan_lineitem_select_orderkey_partkey_quantity_extendedprice_where_partkey_bloom_partkey_op(
                                     part_parts != 1,
                                     p,
                                     part_parts,
                                     use_pandas,
                                     'lineitem_bloom_use' + '_' + str(p),
                                     query_plan)),
                             range(0, lineitem_parts))

    lineitem_project = map(lambda p:
                           query_plan.add_operator(
                               tpch_q17.project_orderkey_partkey_quantity_extendedprice_op(
                                   'lineitem_project' + '_' + str(p),
                                   query_plan)),
                           range(0, lineitem_parts))

    part_lineitem_join_probe_map = map(lambda p:
                                       query_plan.add_operator(
                                           Map('l_partkey', 'part_lineitem_join_probe_map' + '_' + str(p), query_plan,
                                               True)),
                                       range(0, lineitem_parts))

    # part_lineitem_join = map(lambda p:
    #                          query_plan.add_operator(
    #                              tpch_q17.join_p_partkey_l_partkey_op('part_lineitem_join' + '_' + str(p), query_plan)),
    #                          range(0, part_parts))

    part_lineitem_join_build = map(lambda p:
                                   query_plan.add_operator(
                                       HashJoinBuild('p_partkey',
                                                     'part_lineitem_join_build' + '_' + str(p), query_plan,
                                                     False)),
                                   range(0, part_parts))

    part_lineitem_join_probe = map(lambda p:
                                   query_plan.add_operator(
                                       HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'),
                                                     'part_lineitem_join_probe' + '_' + str(p),
                                                     query_plan, True)),
                                   range(0, part_parts))

    lineitem_part_avg_group = map(lambda p:
                                  query_plan.add_operator(
                                      tpch_q17.group_partkey_avg_quantity_op('lineitem_part_avg_group' + '_' + str(p),
                                                                             query_plan)),
                                  range(0, part_parts))

    lineitem_part_avg_group_project = map(lambda p:
                                          query_plan.add_operator(
                                              tpch_q17.project_partkey_avg_quantity_op(
                                                  'lineitem_part_avg_group_project' + '_' + str(p), query_plan)),
                                          range(0, part_parts))

    # part_lineitem_join_avg_group_join = map(lambda p:
    #                                         query_plan.add_operator(
    #                                             tpch_q17.join_l_partkey_p_partkey_op(
    #                                                 'part_lineitem_join_avg_group_join' + '_' + str(p), query_plan)),
    #                                         range(0, part_parts))

    part_lineitem_join_avg_group_join_build = map(lambda p:
                                                  query_plan.add_operator(
                                                      HashJoinBuild('l_partkey',
                                                                    'part_lineitem_join_avg_group_join_build' + '_' + str(
                                                                        p), query_plan, False)),
                                                  range(0, part_parts))

    part_lineitem_join_avg_group_join_probe = map(lambda p:
                                                  query_plan.add_operator(
                                                      HashJoinProbe(JoinExpression('l_partkey', 'p_partkey'),
                                                                    'part_lineitem_join_avg_group_join_probe' + '_' + str(
                                                                        p),
                                                                    query_plan, True)),
                                                  range(0, part_parts))

    lineitem_filter = map(lambda p:
                          query_plan.add_operator(
                              tpch_q17.filter_lineitem_quantity_op('lineitem_filter' + '_' + str(p), query_plan)),
                          range(0, part_parts))

    extendedprice_sum_aggregate = map(lambda p:
                                      query_plan.add_operator(
                                          tpch_q17.aggregate_sum_extendedprice_op(
                                              'extendedprice_sum_aggregate' + '_' + str(p),
                                              query_plan)),
                                      range(0, part_parts))

    aggregate_reduce = query_plan.add_operator(
        Aggregate(
            [
                AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0']))
            ],
            'aggregate_reduce',
            query_plan,
            False))

    extendedprice_sum_aggregate_project = query_plan.add_operator(
        tpch_q17.project_avg_yearly_op('extendedprice_sum_aggregate_project', query_plan))

    collate = query_plan.add_operator(tpch_q17.collate_op('collate', query_plan))

    # Connect the operators
    # part_scan.connect(part_project)
    map(lambda (p, o): o.connect(part_project[p]), enumerate(part_scan))
    map(lambda (p, o): o.connect(part_bloom_create_map[p]), enumerate(part_project))
    map(lambda (p, o): o.connect(part_lineitem_join_build_map[p]), enumerate(part_project))

    # part_project.connect(part_bloom_create)
    map(lambda (p1, o1): map(lambda (p2, o2): o1.connect(o2), enumerate(part_bloom_create)),
        enumerate(part_bloom_create_map))

    # part_bloom_create.connect(lineitem_bloom_use)
    map(lambda (p1, o1): map(lambda (p2, o2): o1.connect(o2), enumerate(lineitem_bloom_use)),
        enumerate(part_bloom_create))

    # lineitem_bloom_use.connect(lineitem_project)
    map(lambda (p, o): o.connect(lineitem_project[p]), enumerate(lineitem_bloom_use))

    # part_lineitem_join.connect_left_producer(part_project)
    map(lambda (p1, o1): map(lambda (p2, o2): o1.connect(o2), enumerate(part_lineitem_join_build)),
        enumerate(part_lineitem_join_build_map))
    map(lambda (p, o): part_lineitem_join_probe[p].connect_build_producer(o), enumerate(part_lineitem_join_build))

    # part_lineitem_join.connect_right_producer(lineitem_project)
    map(lambda (p, o): o.connect(part_lineitem_join_probe_map[p]), enumerate(lineitem_project))
    map(lambda (p1, o1): map(lambda (p2, o2): o2.connect_tuple_producer(o1), enumerate(part_lineitem_join_probe)),
        enumerate(part_lineitem_join_probe_map))

    # part_lineitem_join.connect(lineitem_part_avg_group)
    map(lambda (p, o): o.connect(lineitem_part_avg_group[p]), enumerate(part_lineitem_join_probe))

    # lineitem_part_avg_group.connect(lineitem_part_avg_group_project)
    map(lambda (p, o): o.connect(lineitem_part_avg_group_project[p]), enumerate(lineitem_part_avg_group))

    # part_lineitem_join_avg_group_join.connect_left_producer(lineitem_part_avg_group_project)
    map(lambda (p, o): o.connect(part_lineitem_join_avg_group_join_build[p]),
        enumerate(lineitem_part_avg_group_project))

    # part_lineitem_join_avg_group_join.connect_right_producer(part_lineitem_join)
    map(lambda (p, o): part_lineitem_join_avg_group_join_probe[p].connect_build_producer(o),
        enumerate(part_lineitem_join_avg_group_join_build))
    map(lambda (p, o): part_lineitem_join_avg_group_join_probe[p].connect_tuple_producer(o),
        enumerate(part_lineitem_join_probe))

    # part_lineitem_join_avg_group_join.connect(lineitem_filter)
    map(lambda (p, o): o.connect(lineitem_filter[p]), enumerate(part_lineitem_join_avg_group_join_probe))

    # lineitem_filter.connect(extendedprice_sum_aggregate)
    map(lambda (p, o): o.connect(extendedprice_sum_aggregate[p]), enumerate(lineitem_filter))

    # extendedprice_sum_aggregate.connect(extendedprice_sum_aggregate_project)
    map(lambda (p, o): o.connect(aggregate_reduce), enumerate(extendedprice_sum_aggregate))
    aggregate_reduce.connect(extendedprice_sum_aggregate_project)

    # extendedprice_sum_aggregate_project.connect(collate)
    extendedprice_sum_aggregate_project.connect(collate)
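    # (Overall shape of this plan: the part scans build a bloom filter of qualifying
    # p_partkey values that is broadcast to the lineitem bloom scans, pushing the
    # semi-join into S3 Select; surviving lineitem rows are hash-joined with part,
    # grouped to get the average quantity per part, joined back, filtered on
    # quantity, and summed, with aggregate_reduce combining the per-partition sums
    # before the final avg_yearly projection.)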

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("lineitem parts: {}".format(lineitem_parts))
    print("part_parts: {}".format(part_parts))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    field_names = ['avg_yearly']

    assert len(tuples) == 1 + 1

    assert tuples[0] == field_names

    # NOTE: This result has been verified with the equivalent data and query on PostgreSQL
    assert round(float(tuples[1][0]), 10) == 4632.1085714286
Example #17
def query_plan(settings):
    # type: (SyntheticBloomJoinSettings) -> QueryPlan
    """

    :return: The query plan
    """

    query_plan = QueryPlan(is_async=settings.parallel,
                           buffer_size=settings.buffer_size)

    def scan_A_fn(df):
        df.columns = settings.table_A_field_names
        return df

    # Define the operators
    scan_A = \
        map(lambda p:
            query_plan.add_operator(
                SQLTableScan(get_file_key(settings.table_A_key, settings.table_A_sharded, p),
                             "select "
                             "  {} "
                             "from "
                             "  S3Object "
                             "where "
                             "  {} "
                             "  {} "
                             .format(','.join(settings.table_A_field_names),
                                     settings.table_A_filter_sql,
                                     get_sql_suffix(settings.table_A_key, settings.table_A_parts, p,
                                                    settings.table_A_sharded)), settings.format_,
                             settings.use_pandas,
                             settings.secure,
                             settings.use_native,
                             'scan_A_{}'.format(p),
                             query_plan,
                             False,
                             fn=scan_A_fn)),
            range(0, settings.table_A_parts))
    """
    field_names_map_A = OrderedDict(
        zip(['_{}'.format(i) for i, name in enumerate(settings.table_A_field_names)], settings.table_A_field_names))

    def project_fn_A(df):
        df.rename(columns=field_names_map_A, inplace=True)
        return df

    project_A = map(lambda p:
                    query_plan.add_operator(Project(
                        [ProjectExpression(k, v) for k, v in field_names_map_A.iteritems()],
                        'project_A_{}'.format(p),
                        query_plan,
                        True,
                        project_fn_A)),
                    range(0, settings.table_A_parts))
    """

    bloom_create_a = map(
        lambda p: query_plan.add_operator(
            BloomCreate(settings.table_A_AB_join_key, 'bloom_create_a_{}'.
                        format(p), query_plan, False)),
        range(0, settings.table_A_parts))

    def scan_B_fn(df):
        df.columns = settings.table_B_field_names
        return df

    scan_B = \
        map(lambda p:
            query_plan.add_operator(
                SQLTableScanBloomUse(get_file_key(settings.table_B_key, settings.table_B_sharded, p),
                                     "select "
                                     "  {} "
                                     "from "
                                     "  S3Object "
                                     "where "
                                     "  {} "
                                     "  {} "
                                     .format(','.join(settings.table_B_field_names),
                                             settings.table_B_filter_sql,
                                             get_sql_suffix(settings.table_B_key, settings.table_B_parts, p,
                                                            settings.table_B_sharded, add_where=False)),
                                     settings.table_B_AB_join_key, settings.format_,
                                     settings.use_pandas,
                                     settings.secure,
                                     settings.use_native,
                                     'scan_B_{}'.format(p),
                                     query_plan,
                                     False,
                                     fn=scan_B_fn)),
            range(0, settings.table_B_parts))
    """
    field_names_map_B = OrderedDict(
        zip(['_{}'.format(i) for i, name in enumerate(settings.table_B_field_names)], settings.table_B_field_names))

    def project_fn_B(df):
        df.rename(columns=field_names_map_B, inplace=True)
        return df

    project_B = map(lambda p:
                    query_plan.add_operator(Project(
                        [ProjectExpression(k, v) for k, v in field_names_map_B.iteritems()],
                        'project_B_{}'.format(p),
                        query_plan,
                        True,
                        project_fn_B)),
                    range(0, settings.table_B_parts))
    """

    def scan_C_fn(df):
        df.columns = settings.table_C_field_names
        return df

    scan_C = \
        map(lambda p:
            query_plan.add_operator(
                SQLTableScanBloomUse(get_file_key(settings.table_C_key, settings.table_C_sharded, p),
                                     "select "
                                     "  {} "
                                     "from "
                                     "  S3Object "
                                     "where "
                                     "  {} "
                                     "  {} "
                                     .format(','.join(settings.table_C_field_names),
                                             settings.table_C_filter_sql,
                                             get_sql_suffix(settings.table_C_key, settings.table_C_parts, p,
                                                            settings.table_C_sharded, add_where=False)),
                                     settings.table_C_BC_join_key, settings.format_,
                                     settings.use_pandas,
                                     settings.secure,
                                     settings.use_native,
                                     'scan_C_{}'.format(p),
                                     query_plan,
                                     False,
                                     fn=scan_C_fn)),
            range(0, settings.table_C_parts))
    """
    field_names_map_C = OrderedDict(
        zip(['_{}'.format(i) for i, name in enumerate(settings.table_C_field_names)], settings.table_C_field_names))

    def project_fn_C(df):
        df.rename(columns=field_names_map_C, inplace=True)
        return df

    project_C = map(lambda p:
                    query_plan.add_operator(Project(
                        [ProjectExpression(k, v) for k, v in field_names_map_C.iteritems()],
                        'project_C_{}'.format(p),
                        query_plan,
                        True,
                        project_fn_C)),
                    range(0, settings.table_C_parts))
    """

    map_A_to_B = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_A_AB_join_key, 'map_A_to_B_{}'.format(p),
                query_plan, False)), range(0, settings.table_A_parts))

    map_B_to_B = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_B_AB_join_key, 'map_B_to_B_{}'.format(p),
                query_plan, False)), range(0, settings.table_B_parts))

    map_B_to_C = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_B_BC_join_key, 'map_B_to_C_{}'.format(p),
                query_plan, False)), range(0, settings.table_B_parts))

    map_C_to_C = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_C_BC_join_key, 'map_C_to_C_{}'.format(p),
                query_plan, False)), range(0, settings.table_C_parts))

    join_build_A_B = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild(settings.table_A_AB_join_key, 'join_build_A_B_{}'.
                          format(p), query_plan, False)),
        range(0, settings.table_B_parts))

    join_probe_A_B = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(
                JoinExpression(settings.table_A_AB_join_key, settings.
                               table_B_AB_join_key), 'join_probe_A_B_{}'
                .format(p), query_plan, False)),
        range(0, settings.table_B_parts))

    bloom_create_ab = map(
        lambda p: query_plan.add_operator(
            BloomCreate(settings.table_B_BC_join_key, 'bloom_create_ab_{}'.
                        format(p), query_plan, False)),
        range(0, settings.table_B_parts))

    join_build_AB_C = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild(settings.table_B_BC_join_key, 'join_build_AB_C_{}'.
                          format(p), query_plan, False)),
        range(0, settings.table_C_parts))

    join_probe_AB_C = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(
                JoinExpression(settings.table_B_BC_join_key, settings.
                               table_C_BC_join_key), 'join_probe_AB_C_{}'
                .format(p), query_plan, False)),
        range(0, settings.table_C_parts))

    def part_aggregate_fn(df):
        sum_ = df[settings.table_C_detail_field_name].astype(float).sum()
        return pd.DataFrame({'_0': [sum_]})

    part_aggregate = map(
        lambda p: query_plan.add_operator(
            Aggregate([
                AggregateExpression(
                    AggregateExpression.SUM, lambda t: float(t[
                        settings.table_C_detail_field_name]))
            ], settings.use_pandas, 'part_aggregate_{}'.format(p), query_plan,
                      False, part_aggregate_fn)),
        range(0, settings.table_C_parts))

    def aggregate_reduce_fn(df):
        sum_ = df['_0'].astype(np.float).sum()
        return pd.DataFrame({'_0': [sum_]})

    aggregate_reduce = query_plan.add_operator(
        Aggregate([
            AggregateExpression(AggregateExpression.SUM,
                                lambda t: float(t['_0']))
        ], settings.use_pandas, 'aggregate_reduce', query_plan, False,
                  aggregate_reduce_fn))

    aggregate_project = query_plan.add_operator(
        Project([ProjectExpression(lambda t: t['_0'], 'total_balance')],
                'aggregate_project', query_plan, False))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    # Connect the operators
    connect_many_to_many(scan_A, map_A_to_B)
    #connect_many_to_many(scan_A, project_A)
    #connect_many_to_many(project_A, map_A_to_B)
    connect_all_to_all(map_A_to_B, join_build_A_B)
    connect_many_to_many(join_build_A_B, join_probe_A_B)

    #connect_many_to_many(project_A, bloom_create_a)
    connect_many_to_many(scan_A, bloom_create_a)
    connect_all_to_all(bloom_create_a, scan_B)
    connect_many_to_many(scan_B, map_B_to_B)
    #connect_many_to_many(scan_B, project_B)
    #connect_many_to_many(project_B, map_B_to_B)
    connect_all_to_all(map_B_to_B, join_probe_A_B)
    connect_many_to_many(join_probe_A_B, bloom_create_ab)
    connect_all_to_all(bloom_create_ab, scan_C)
    connect_many_to_many(join_build_AB_C, join_probe_AB_C)

    connect_many_to_many(join_probe_A_B, map_B_to_C)
    connect_all_to_all(map_B_to_C, join_build_AB_C)

    connect_many_to_many(scan_C, map_C_to_C)
    #connect_many_to_many(scan_C, project_C)
    #connect_many_to_many(project_C, map_C_to_C)
    connect_all_to_all(map_C_to_C, join_probe_AB_C)

    connect_many_to_many(join_probe_AB_C, part_aggregate)

    connect_many_to_one(part_aggregate, aggregate_reduce)
    connect_one_to_one(aggregate_reduce, aggregate_project)
    connect_one_to_one(aggregate_project, collate)
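    # (This variant chains bloom filters: scan_A feeds both the A-B join build and
    # bloom_create_a, whose filter on the A-B key prunes scan_B via its S3 Select
    # request; the A-B join output then feeds bloom_create_ab, whose filter on the
    # B-C key prunes scan_C before the AB-C join, so each successive scan fetches
    # only rows that can still join.)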

    return query_plan
Example #18
def run(parallel, use_pandas, buffer_size, table_parts, lower, upper, sf):
    secure = False
    use_native = False
    print('')
    print("Indexing Benchmark")
    print("------------------")

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    def fn(df):
        # df.columns = ['l_orderkey', 'l_partkey', 'l_suppkey', 'l_linenumber', 'l_quantity', 'l_extendedprice', 'l_discount', 'l_tax', 'l_returnflag', 'l_linestatus', 'l_shipdate', 'l_commitdate', 'l_receiptdate', 'l_shipinstruct', 'l_shipmode', 'l_comment']
        # df[ ['l_extendedprice'] ] = df[ ['l_extendedprice'] ].astype(np.float)
        # criterion = (df['l_extendedprice'] >= lower) & (
        #        df['l_extendedprice'] <= upper)
        df[['_5']] = df[['_5']].astype(np.float)
        criterion = (df['_5'] >= lower) & (df['_5'] <= upper)
        return df[criterion]
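    # (Per the commented-out mapping above, '_5' is the sixth lineitem column,
    # l_extendedprice; the filter keeps rows whose extended price lies in
    # [lower, upper], and the aggregate below reports how many rows survive.)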

    # scan the file
    scan_filter = map(
        lambda p: query_plan.add_operator(
            TableScan(get_file_key('lineitem', True, p, sf=sf),
                      use_pandas,
                      secure,
                      use_native,
                      'scan_{}'.format(p),
                      query_plan,
                      False,
                      fn=fn)), range(0, table_parts))

    # aggregation
    def agg_fun(df):
        return pd.DataFrame({'count': [len(df)]})

    aggregate = query_plan.add_operator(
        Aggregate([], True, 'agg', query_plan, False, agg_fun))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    map(lambda (p, o): o.connect(aggregate), enumerate(scan_filter))
    aggregate.connect(collate)

    # scan_filter[0].set_profiled(True, os.path.join(ROOT_DIR, "../benchmark-output/", gen_test_id() + "_scan_0" + ".prof"))
    # aggregate.set_profiled(True, os.path.join(ROOT_DIR, "../benchmark-output/", gen_test_id() + "_aggregate" + ".prof"))
    # collate.set_profiled(True, os.path.join(ROOT_DIR, "../benchmark-output/", gen_test_id() + "_collate" + ".prof"))

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("table parts: {}".format(table_parts))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"),
                           gen_test_id() + "-" + str(table_parts))

    # Start the query
    query_plan.execute()
    print('Done')
    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()
Example #19
def run(parallel, use_pandas, secure, use_native, buffer_size, lineitem_parts,
        part_parts, lineitem_sharded, part_sharded):
    """

    :return: None
    """

    print('')
    print("TPCH Q14 Semi Join")
    print("------------------")

    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Query plan
    # DATE is the first day of a month randomly selected from a random year within [1993 .. 1997].
    date = '1993-01-01'
    min_shipped_date = datetime.strptime(date, '%Y-%m-%d')
    max_shipped_date = datetime.strptime(date, '%Y-%m-%d') + timedelta(days=30)

    part_scan_1 = map(
        lambda p: query_plan.add_operator(
            tpch_q14.sql_scan_part_partkey_where_brand12_operator_def(
                part_sharded, p, part_parts, use_pandas, secure, use_native,
                'part_table_scan_1' + '_' + str(p), query_plan)),
        range(0, part_parts))
    part_1_project = map(
        lambda p: query_plan.add_operator(
            tpch_q14.project_p_partkey_operator_def(
                'part_1_project' + '_' + str(p), query_plan)),
        range(0, part_parts))
    part_bloom_create = map(
        lambda p: query_plan.add_operator(
            tpch_q14.bloom_create_p_partkey_operator_def(
                'part_bloom_create' + '_' + str(p), query_plan)),
        range(0, part_parts))
    lineitem_scan_1 = map(
        lambda p: query_plan.add_operator(
            tpch_q14.bloom_scan_lineitem_partkey_where_shipdate_operator_def(
                min_shipped_date, max_shipped_date, lineitem_sharded, p,
                use_pandas, secure, use_native, 'lineitem_scan_1' + '_' + str(
                    p), query_plan)), range(0, lineitem_parts))
    lineitem_1_project = map(
        lambda p: query_plan.add_operator(
            tpch_q14.project_l_partkey_operator_def(
                'lineitem_1_project' + '_' + str(p), query_plan)),
        range(0, lineitem_parts))
    part_lineitem_join_1_build = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild('p_partkey', 'part_lineitem_join_1_build' + '_' +
                          str(p), query_plan, False)), range(0, part_parts))
    part_lineitem_join_1_probe = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'),
                          'part_lineitem_join_1_probe' + '_' + str(
                              p), query_plan, False)), range(0, part_parts))
    join_bloom_create = map(
        lambda p: query_plan.add_operator(
            tpch_q14.bloom_create_l_partkey_operator_def(
                'join_bloom_create' + '_' + str(p), query_plan)),
        range(0, part_parts))
    part_scan_2 = map(
        lambda p: query_plan.add_operator(
            tpch_q14.bloom_scan_part_partkey_type_brand12_operator_def(
                part_sharded, p, part_parts, use_pandas, secure, use_native,
                'part_table_scan_2' + '_' + str(p), query_plan)),
        range(0, part_parts))
    part_2_project = map(
        lambda p: query_plan.add_operator(
            tpch_q14.project_partkey_type_operator_def(
                'part_2_project' + '_' + str(p), query_plan)),
        range(0, part_parts))
    lineitem_scan_2 = map(
        lambda p: query_plan.add_operator(
            tpch_q14.bloom_scan_lineitem_where_shipdate_operator_def(
                min_shipped_date, max_shipped_date, lineitem_sharded, p,
                use_pandas, secure, use_native, 'lineitem_scan_2' + '_' + str(
                    p), query_plan)), range(0, lineitem_parts))
    lineitem_2_project = map(
        lambda p: query_plan.add_operator(
            tpch_q14.project_partkey_extendedprice_discount_operator_def(
                'lineitem_2_project' + '_' + str(p), query_plan)),
        range(0, lineitem_parts))

    part_lineitem_join_2_build = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild('p_partkey', 'part_lineitem_join_2_build' + '_' +
                          str(p), query_plan, False)), range(0, part_parts))
    part_lineitem_join_2_probe = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'),
                          'part_lineitem_join_2_probe' + '_' + str(
                              p), query_plan, False)), range(0, part_parts))

    part_aggregate = map(
        lambda p: query_plan.add_operator(
            tpch_q14.aggregate_promo_revenue_operator_def(
                'part_aggregate' + '_' + str(p), query_plan)),
        range(0, part_parts))

    aggregate_reduce = query_plan.add_operator(
        Aggregate([
            AggregateExpression(AggregateExpression.SUM,
                                lambda t: float(t['_0'])),
            AggregateExpression(AggregateExpression.SUM,
                                lambda t: float(t['_1']))
        ], 'aggregate_reduce', query_plan, False))

    aggregate_project = query_plan.add_operator(
        tpch_q14.project_promo_revenue_operator_def('aggregate_project',
                                                    query_plan))

    collate = query_plan.add_operator(
        tpch_q14.collate_operator_def('collate', query_plan))

    # Connect the operators
    map(lambda (p, o): o.connect(part_1_project[p]), enumerate(part_scan_1))
    map(lambda (p, o): o.connect(part_bloom_create[p]),
        enumerate(part_1_project))
    map(lambda (p, o): map(lambda (bp, bo): bo.connect(o), enumerate(part_bloom_create)),
        enumerate(lineitem_scan_1))
    map(lambda (p, o): o.connect(part_lineitem_join_1_build[p]),
        enumerate(part_1_project))
    map(
        lambda (p, o): map(lambda (bp, bo): o.connect_build_producer(bo),
                           enumerate(part_lineitem_join_1_build)),
        enumerate(part_lineitem_join_1_probe))
    map(lambda (p, o): o.connect(lineitem_1_project[p]),
        enumerate(lineitem_scan_1))
    map(lambda (p, o): part_lineitem_join_1_probe[p % part_parts].connect_tuple_producer(o),
        enumerate(lineitem_1_project))
    map(lambda (p, o): o.connect(join_bloom_create[p]),
        enumerate(part_lineitem_join_1_probe))
    map(lambda (p, o): map(lambda (bp, bo): o.connect(bo), enumerate(part_scan_2)),
        enumerate(join_bloom_create))
    map(lambda (p, o): map(lambda (bp, bo): bo.connect(o), enumerate(join_bloom_create)),
        enumerate(lineitem_scan_2))
    map(lambda (p, o): o.connect(part_2_project[p]), enumerate(part_scan_2))
    map(lambda (p, o): o.connect(part_lineitem_join_2_build[p]),
        enumerate(part_2_project))
    map(
        lambda (p, o): map(lambda (bp, bo): o.connect_build_producer(bo),
                           enumerate(part_lineitem_join_2_build)),
        enumerate(part_lineitem_join_2_probe))
    map(lambda (p, o): o.connect(lineitem_2_project[p]),
        enumerate(lineitem_scan_2))
    map(lambda (p, o): part_lineitem_join_2_probe[p % part_parts].connect_tuple_producer(o),
        enumerate(lineitem_2_project))
    map(lambda (p, o): o.connect(part_aggregate[p]),
        enumerate(part_lineitem_join_2_probe))
    map(lambda (p, o): o.connect(aggregate_reduce), enumerate(part_aggregate))
    aggregate_reduce.connect(aggregate_project)
    aggregate_project.connect(collate)
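    # (Semi-join strategy, as the operator names suggest: the first part/lineitem
    # join only establishes which l_partkey values match a Brand#12 part within the
    # shipdate window; join_bloom_create turns those keys into bloom filters that
    # prune both the second part scan and the second lineitem scan, and the second
    # join over the pruned inputs feeds the promo_revenue aggregation.)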

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print('secure: {}'.format(secure))
    print('use_native: {}'.format(use_native))
    print("lineitem parts: {}".format(lineitem_parts))
    print("part_parts: {}".format(part_parts))
    print("lineitem_sharded: {}".format(lineitem_sharded))
    print("part_sharded: {}".format(part_sharded))
    print('')

    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"),
                           gen_test_id())

    # Start the query
    query_plan.execute()

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    field_names = ['promo_revenue']

    assert len(tuples) == 1 + 1

    assert tuples[0] == field_names

    # NOTE: This result has been verified with the equivalent data and query on PostgreSQL
    if s3filter.util.constants.TPCH_SF == 10:
        assert round(float(tuples[1][0]), 10) == 15.4488836202
    elif s3filter.util.constants.TPCH_SF == 1:
        numpy.testing.assert_almost_equal(float(tuples[1][0]), 15.0901165263)
Example #20
def run(parallel, use_pandas, buffer_size, lineitem_parts, part_parts, sharded):
    """
    :return: None
    """

    print('')
    print("TPCH Q19 Baseline Join")
    print("----------------------")

    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Define the operators
    lineitem_scan = map(lambda p:
                        query_plan.add_operator(
                            tpch_q19.sql_scan_lineitem_select_all_where_partkey_op(sharded,
                                                                                   p,
                                                                                   lineitem_parts,
                                                                                   use_pandas,
                                                                                   'lineitem_scan' + '_' + str(p),
                                                                                   query_plan)),
                        range(0, lineitem_parts))

    part_scan = map(lambda p:
                    query_plan.add_operator(
                        tpch_q19.sql_scan_part_select_all_where_partkey_op(sharded,
                                                                           p,
                                                                           part_parts,
                                                                           use_pandas,
                                                                           'part_scan' + '_' + str(p), query_plan)),
                    range(0, part_parts))

    lineitem_project = map(lambda p:
                           query_plan.add_operator(
                               tpch_q19.project_partkey_quantity_extendedprice_discount_shipinstruct_shipmode_op(
                                   'lineitem_project' + '_' + str(p), query_plan)),
                           range(0, lineitem_parts))

    lineitem_map = map(lambda p:
                       query_plan.add_operator(Map('l_partkey', 'lineitem_map' + '_' + str(p), query_plan, True)),
                       range(0, part_parts))

    part_project = map(lambda p:
                       query_plan.add_operator(
                           tpch_q19.project_partkey_brand_size_container_op('part_project' + '_' + str(p),
                                                                            query_plan)),
                       range(0, part_parts))

    part_map = map(lambda p:
                   query_plan.add_operator(Map('p_partkey', 'part_map' + '_' + str(p), query_plan, True)),
                   range(0, part_parts))

    # lineitem_part_join = map(lambda p:
    #                          query_plan.add_operator(tpch_q19.join_op(query_plan)),
    #                          range(0, part_parts))

    lineitem_part_join_build = map(lambda p:
                                   query_plan.add_operator(
                                       HashJoinBuild('p_partkey',
                                                     'lineitem_part_join_build' + '_' + str(p), query_plan,
                                                     False)),
                                   range(0, part_parts))

    lineitem_part_join_probe = map(lambda p:
                                   query_plan.add_operator(
                                       HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'),
                                                     'lineitem_part_join_probe' + '_' + str(p),
                                                     query_plan, True)),
                                   range(0, part_parts))

    filter_op = map(lambda p:
                    query_plan.add_operator(tpch_q19.filter_def('filter_op' + '_' + str(p), query_plan)),
                    range(0, part_parts))
    aggregate = map(lambda p:
                    query_plan.add_operator(tpch_q19.aggregate_def('aggregate' + '_' + str(p), query_plan)),
                    range(0, part_parts))

    aggregate_reduce = query_plan.add_operator(
        Aggregate(
            [
                AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0']))
            ],
            'aggregate_reduce',
            query_plan,
            False))

    aggregate_project = query_plan.add_operator(
        tpch_q19.aggregate_project_def('aggregate_project', query_plan))
    collate = query_plan.add_operator(tpch_q19.collate_op('collate', query_plan))

    # Connect the operators
    # lineitem_scan.connect(lineitem_project)
    connect_many_to_many(lineitem_scan, lineitem_project)
    connect_all_to_all(lineitem_project, lineitem_map)

    # part_scan.connect(part_project)
    connect_many_to_many(part_scan, part_project)
    connect_many_to_many(part_project, part_map)

    # lineitem_part_join.connect_left_producer(lineitem_project)
    connect_all_to_all(part_map, lineitem_part_join_build)

    # lineitem_part_join.connect_right_producer(part_project)
    connect_many_to_many(lineitem_part_join_build, lineitem_part_join_probe)
    connect_many_to_many(lineitem_map, lineitem_part_join_probe)

    # lineitem_part_join.connect(filter_op)
    connect_many_to_many(lineitem_part_join_probe, filter_op)

    # filter_op.connect(aggregate)
    connect_many_to_many(filter_op, aggregate)

    # aggregate.connect(aggregate_project)
    connect_many_to_one(aggregate, aggregate_reduce)
    connect_one_to_one(aggregate_reduce, aggregate_project)

    # aggregate_project.connect(collate)
    connect_one_to_one(aggregate_project, collate)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    field_names = ['revenue']

    assert len(tuples) == 1 + 1

    assert tuples[0] == field_names

    # NOTE: This result has been verified with the equivalent data and query on PostgreSQL
    numpy.testing.assert_almost_equal(float(tuples[1][0]), 92403.0667)
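
The connect_many_to_many, connect_all_to_all, connect_many_to_one and connect_one_to_one helpers are imported from the framework; the sketch below shows the wiring each is assumed to perform (an illustration, not the project's actual implementation):

def connect_one_to_one(producer, consumer):
    # Single producer feeds a single consumer.
    producer.connect(consumer)


def connect_many_to_one(producers, consumer):
    # Fan-in: every producer feeds the same consumer.
    for producer in producers:
        producer.connect(consumer)


def connect_many_to_many(producers, consumers):
    # Pairwise wiring; assumes both lists have the same length.
    for producer, consumer in zip(producers, consumers):
        producer.connect(consumer)


def connect_all_to_all(producers, consumers):
    # Full shuffle: every producer feeds every consumer.
    for producer in producers:
        for consumer in consumers:
            producer.connect(consumer)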
Example #21
def run(parallel,
        use_pandas,
        buffer_size,
        table_parts,
        lower,
        upper,
        sf,
        format_=Format.CSV):
    secure = False
    use_native = False
    print('')
    print("Indexing Benchmark")
    print("------------------")

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # SQL scan the file

    scan = map(
        lambda p: query_plan.add_operator(
            SQLTableScan(
                get_file_key('lineitem', True, p, sf=sf, format_=format_
                             ), "select * from S3Object "
                "where cast(l_extendedprice as float) >= {} and cast(l_extendedprice as float) <= {};"
                .format(lower, upper), format_, use_pandas, secure,
                use_native, 'scan_{}'.format(p), query_plan, False)),
        range(0, table_parts))
    '''
    scan = map(lambda p:
               query_plan.add_operator(
                   SQLTableScan(get_file_key('lineitem', True, p, sf=sf, format_=format_),
                                "select * from S3Object "
                                "where l_extendedprice  >= {} and l_extendedprice  <= {};".format(
                                    lower, upper), format_,
                                use_pandas, secure, use_native,
                                'scan_{}'.format(p), query_plan,
                                False)),
               range(0, table_parts))
    '''

    # project
    def fn(df):
        df.columns = [
            'l_orderkey', 'l_partkey', 'l_suppkey', 'l_linenumber',
            'l_quantity', 'l_extendedprice', 'l_discount', 'l_tax',
            'l_returnflag', 'l_linestatus', 'l_shipdate', 'l_commitdate',
            'l_receiptdate', 'l_shipinstruct', 'l_shipmode', 'l_comment'
        ]
        df[['l_extendedprice']] = df[['l_extendedprice']].astype(np.float)
        return df

    project = map(
        lambda p: query_plan.add_operator(
            Project([], 'project_{}'.format(p), query_plan, False, fn)),
        range(0, table_parts))

    # aggregation
    def agg_fun(df):
        return pd.DataFrame({'count': [len(df)]})

    aggregate = query_plan.add_operator(
        Aggregate([], True, 'agg', query_plan, False, agg_fun))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    map(lambda (p, o): o.connect(project[p]), enumerate(scan))
    map(lambda (p, o): o.connect(aggregate), enumerate(project))
    aggregate.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("table parts: {}".format(table_parts))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"),
                           gen_test_id() + "-" + str(table_parts))

    # Start the query
    query_plan.execute()
    print('Done')
    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()
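
A hypothetical invocation of this benchmark (every value below is illustrative and depends on the data staged in S3) pushes the l_extendedprice range predicate into S3 Select across all table parts:

if __name__ == '__main__':
    run(parallel=True,
        use_pandas=True,
        buffer_size=0,
        table_parts=32,
        lower=0,
        upper=910,
        sf=1,
        format_=Format.CSV)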
Example #22
def separate_query_plan(shared_mem_buffer_size, parallel, use_pandas,
                        buffer_size, table_parts, lower, upper, sharded, sf,
                        use_shared_mem, inline_ops, merge_ops):
    secure = False
    use_native = False

    if use_shared_mem:
        system = WorkerSystem(shared_mem_buffer_size)
    else:
        system = None

    query_plan = QueryPlan(system,
                           is_async=parallel,
                           buffer_size=buffer_size,
                           use_shared_mem=use_shared_mem)

    # scan the file
    scan = map(
        lambda p: query_plan.add_operator(
            TableScan(get_file_key('lineitem', sharded, p, sf=sf), use_pandas,
                      secure, use_native, 'scan_{}'.format(
                          p), query_plan, False)), range(0, table_parts))

    def filter_fn(df):
        return (df['_5'].astype(np.float) >= lower) & (df['_5'].astype(
            np.float) <= upper)

    filter_ = map(
        lambda p: query_plan.add_operator(
            Filter(PredicateExpression(None, filter_fn), 'filter_{}'.format(p),
                   query_plan, False)), range(0, table_parts))

    # aggregation
    def agg_fun(df):
        return pd.DataFrame({'count': [len(df)]})

    aggregate = query_plan.add_operator(
        Aggregate([], use_pandas, 'agg', query_plan, False, agg_fun))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    map(lambda o: o.set_async(not inline_ops), filter_)

    connect_many_to_many(scan, filter_)
    connect_many_to_one(filter_, aggregate)
    connect_one_to_one(aggregate, collate)

    profile_file_suffix = get_profile_file_suffix(inline_ops, merge_ops,
                                                  use_shared_mem)

    scan[0].set_profiled(
        True,
        os.path.join(ROOT_DIR, "../benchmark-output/",
                     gen_test_id() + "_scan_0" + profile_file_suffix +
                     ".prof"))
    filter_[0].set_profiled(
        True,
        os.path.join(
            ROOT_DIR, "../benchmark-output/",
            gen_test_id() + "_filter_0" + profile_file_suffix + ".prof"))
    aggregate.set_profiled(
        True,
        os.path.join(
            ROOT_DIR, "../benchmark-output/",
            gen_test_id() + "_aggregate" + profile_file_suffix + ".prof"))
    collate.set_profiled(
        True,
        os.path.join(
            ROOT_DIR, "../benchmark-output/",
            gen_test_id() + "_collate" + profile_file_suffix + ".prof"))
    return query_plan
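
filter_fn above returns a boolean Series that the Filter operator presumably applies as a row mask. A small standalone check of that behaviour, with a column named '_5' to mirror the positional CSV fields and arbitrary values:

import pandas as pd

df = pd.DataFrame({'_5': ['100.0', '905.5', '1200.0']})
lower, upper = 0, 910
# Same comparison filter_fn performs (the operator code uses the older np.float alias).
mask = (df['_5'].astype(float) >= lower) & (df['_5'].astype(float) <= upper)
print(df[mask])  # keeps the first two rows; 1200.0 falls outside [lower, upper]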
Example #23
def run(parallel,
        use_pandas,
        buffer_size,
        table_parts,
        lower,
        upper,
        sf,
        nthreads=16,
        format_=Format.CSV):
    secure = False
    use_native = False
    print('')
    print("Indexing Benchmark")
    print("------------------")

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)
    assert sf == 1 or sf == 10

    # Scan Index Files
    index_scan = map(
        lambda p: query_plan.add_operator(
            SQLTableScan(
                'tpch-sf{}/lineitem_sharded/index/index_l_extendedprice.csv.{}'
                .format(sf, p
                        if sf == 1 else p + 1), "select first_byte, last_byte "
                " from S3Object "
                " where col_values  >= {} and col_values  <= {};".format(
                    lower, upper), format_, use_pandas, secure, use_native,
                'index_scan_{}'.format(p), query_plan, False)),
        range(0, table_parts))

    # Range accesses
    range_access = map(
        lambda p: query_plan.add_operator(
            TableRangeAccess(
                get_file_key('lineitem', True, p, sf=sf, format_=Format.CSV),
                use_pandas, secure, use_native, 'range_access_{}'.format(
                    p), query_plan, False)), range(0, table_parts))

    map(lambda o: o.set_nthreads(nthreads), range_access)

    # Aggregation
    def agg_fun(df):
        return pd.DataFrame({'count': [len(df)]})

    aggregate = query_plan.add_operator(
        Aggregate([], True, 'agg', query_plan, False, agg_fun))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    map(lambda (p, o): o.connect(range_access[p]), enumerate(index_scan))
    map(lambda (p, o): o.connect(aggregate), enumerate(range_access))
    aggregate.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("table parts: {}".format(table_parts))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"),
                           gen_test_id() + "-" + str(table_parts))

    # Start the query
    query_plan.execute()
    print('Done')
    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()
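
TableRangeAccess consumes the (first_byte, last_byte) pairs emitted by the index scan. A rough sketch of the underlying idea, fetching only those byte ranges with plain boto3 ranged GETs (bucket and key are placeholders; this is not the operator's actual code):

import boto3

s3 = boto3.client('s3')


def fetch_ranges(bucket, key, ranges):
    # ranges: iterable of (first_byte, last_byte) pairs, e.g. rows from the index scan.
    for first_byte, last_byte in ranges:
        resp = s3.get_object(Bucket=bucket, Key=key,
                             Range='bytes={}-{}'.format(int(first_byte), int(last_byte)))
        yield resp['Body'].read()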
Example #24
def run(parallel, use_pandas, buffer_size, lineitem_parts, part_parts):
    """

    :return: None
    """

    print('')
    print("TPCH Q14 Baseline Join")
    print("----------------------")

    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Query plan
    # This date is chosen because it causes the shipdate filter to exclude one of the rows in the root data set.
    date = '1996-03-13'
    min_shipped_date = datetime.strptime(date, '%Y-%m-%d')
    max_shipped_date = datetime.strptime(date, '%Y-%m-%d') + timedelta(days=30)

    lineitem_scan = map(
        lambda p: query_plan.add_operator(
            tpch_q14.sql_scan_lineitem_extra_filtered_operator_def(
                lineitem_parts != 1, p, use_pandas, 'lineitem_scan' + '_' +
                str(p), query_plan)), range(0, lineitem_parts))

    lineitem_project = map(
        lambda p: query_plan.add_operator(
            tpch_q14.
            project_partkey_extendedprice_discount_shipdate_operator_def(
                'lineitem_project' + '_' + str(p), query_plan)),
        range(0, lineitem_parts))

    part_scan = map(
        lambda p: query_plan.add_operator(
            tpch_q14.sql_scan_part_operator_def(part_parts != 1, p, part_parts,
                                                use_pandas, 'part_scan' + '_' +
                                                str(p), query_plan)),
        range(0, part_parts))

    part_project = map(
        lambda p: query_plan.add_operator(
            tpch_q14.project_partkey_brand_type_operator_def(
                'part_project' + '_' + str(p), query_plan)),
        range(0, part_parts))

    lineitem_filter = map(
        lambda p: query_plan.add_operator(
            tpch_q14.filter_shipdate_operator_def(
                min_shipped_date, max_shipped_date, 'lineitem_filter' + '_' +
                str(p), query_plan)), range(0, lineitem_parts))

    part_filter = map(
        lambda p: query_plan.add_operator(
            tpch_q14.filter_brand12_operator_def('part_filter' + '_' + str(p),
                                                 query_plan)),
        range(0, part_parts))

    join_build = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild('p_partkey', 'join_build' + '_' + str(p), query_plan,
                          False)), range(0, part_parts))

    join_probe = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'),
                          'join_probe' + '_' + str(p), query_plan, False)),
        range(0, part_parts))

    part_aggregate = map(
        lambda p: query_plan.add_operator(
            tpch_q14.aggregate_promo_revenue_operator_def(
                'part_aggregate' + '_' + str(p), query_plan)),
        range(0, part_parts))

    aggregate_reduce = query_plan.add_operator(
        Aggregate([
            AggregateExpression(AggregateExpression.SUM,
                                lambda t: float(t['_0'])),
            AggregateExpression(AggregateExpression.SUM,
                                lambda t: float(t['_1']))
        ], 'aggregate_reduce', query_plan, False))

    aggregate_project = query_plan.add_operator(
        tpch_q14.project_promo_revenue_operator_def('aggregate_project',
                                                    query_plan))

    collate = query_plan.add_operator(
        tpch_q14.collate_operator_def('collate', query_plan))

    # Connect the operators
    map(lambda (p, o): o.connect(lineitem_project[p]),
        enumerate(lineitem_scan))
    map(lambda (p, o): o.connect(lineitem_filter[p]),
        enumerate(lineitem_project))
    map(lambda (p, o): o.connect(part_project[p]), enumerate(part_scan))
    map(lambda (p, o): o.connect(part_filter[p]), enumerate(part_project))
    map(lambda (p, o): o.connect(join_build[p]), enumerate(part_filter))
    map(
        lambda (p, o): map(lambda (bp, bo): o.connect_build_producer(bo),
                           enumerate(join_build)), enumerate(join_probe))
    map(lambda (p, o): join_probe[p % part_parts].connect_tuple_producer(o),
        enumerate(lineitem_filter))
    map(lambda (p, o): o.connect(part_aggregate[p]), enumerate(join_probe))
    map(lambda (p, o): o.connect(aggregate_reduce), enumerate(part_aggregate))
    aggregate_reduce.connect(aggregate_project)
    aggregate_project.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("lineitem parts: {}".format(lineitem_parts))
    print("part_parts: {}".format(part_parts))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"),
                           gen_test_id())

    # Start the query
    query_plan.execute()

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    field_names = ['promo_revenue']

    assert len(tuples) == 1 + 1

    assert tuples[0] == field_names

    # NOTE: This result has been verified with the equivalent data and query on PostgreSQL
    assert round(float(tuples[1][0]), 10) == 33.4262326420
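
aggregate_reduce emits two sums ('_0' and '_1'); aggregate_project (tpch_q14.project_promo_revenue_operator_def) presumably combines them into TPC-H Q14's promo_revenue. A sketch of that final projection:

def promo_revenue(promo_sum, total_sum):
    # TPC-H Q14: 100.00 * sum(l_extendedprice * (1 - l_discount)) over PROMO parts
    #            / sum(l_extendedprice * (1 - l_discount)) over all matched parts
    return 100.0 * promo_sum / total_sum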
Example #25
def query_plan(settings):
    # type: (SyntheticSemiJoinSettings) -> QueryPlan
    """

    :return: None
    """

    if settings.use_shared_mem:
        system = WorkerSystem(settings.shared_memory_size)
    else:
        system = None

    query_plan = QueryPlan(system,
                           is_async=settings.parallel,
                           buffer_size=settings.buffer_size,
                           use_shared_mem=settings.use_shared_mem)

    # Define the operators
    scan_a = \
        map(lambda p:
            query_plan.add_operator(
                SQLTableScan(get_file_key(settings.table_A_key, settings.table_A_sharded, p, settings.sf),
                             "select "
                             "  {} "
                             "from "
                             "  S3Object "
                             "where "
                             "  {} "
                             "  {} "
                             .format(settings.table_A_AB_join_key,
                                     settings.table_A_filter_sql,
                                     get_sql_suffix(settings.table_A_key, settings.table_A_parts, p,
                                                    settings.table_A_sharded)), settings.format_,
                             settings.use_pandas,
                             settings.secure,
                             settings.use_native,
                             'scan_a' + '_{}'.format(p),
                             query_plan,
                             False)),
            range(0, settings.table_A_parts))

    field_names_map_a = OrderedDict(
        zip([
            '_{}'.format(i)
            for i, name in enumerate(settings.table_A_field_names)
        ], settings.table_A_field_names))

    def project_fn_a(df):
        df = df.rename(columns=field_names_map_a, copy=False)
        return df

    project_a = map(
        lambda p: query_plan.add_operator(
            Project([
                ProjectExpression(k, v)
                for k, v in field_names_map_a.iteritems()
            ], 'project_a' + '_{}'.format(p), query_plan, False, project_fn_a)
        ), range(0, settings.table_A_parts))

    bloom_create_ab_join_key = map(
        lambda p: query_plan.add_operator(
            BloomCreate(settings.table_A_AB_join_key,
                        'bloom_create_ab_join_key' + '_{}'.format(p),
                        query_plan,
                        False,
                        fp_rate=settings.fp_rate)),
        range(0, settings.table_A_parts))

    scan_b_on_ab_join_key = \
        map(lambda p:
            query_plan.add_operator(
                SQLTableScanBloomUse(get_file_key(settings.table_B_key, settings.table_B_sharded, p, settings.sf),
                                     "select "
                                     "  {},{} "
                                     "from "
                                     "  S3Object "
                                     "where "
                                     "  {} "
                                     "  {} "
                                     .format(settings.table_B_BC_join_key,
                                             settings.table_B_AB_join_key,
                                             settings.table_B_filter_sql,
                                             get_sql_suffix(settings.table_B_key, settings.table_B_parts, p,
                                                            settings.table_B_sharded, add_where=False)), settings.format_,
                                     settings.table_B_AB_join_key,
                                     settings.use_pandas,
                                     settings.secure,
                                     settings.use_native,
                                     'scan_b_on_ab_join_key' + '_{}'.format(p),
                                     query_plan,
                                     False)),
            range(0, settings.table_B_parts))

    if settings.table_C_key is None:

        scan_b_detail_on_b_pk = \
            map(lambda p:
                query_plan.add_operator(
                    SQLTableScanBloomUse(get_file_key(settings.table_B_key, settings.table_B_sharded, p, settings.sf),
                                         "select "
                                         "  {},{} "
                                         "from "
                                         "  S3Object "
                                         "where "
                                         "  {} "
                                         "  {} "
                                         .format(settings.table_B_primary_key,
                                                 settings.table_B_detail_field_name,
                                                 settings.table_B_filter_sql,
                                                 get_sql_suffix(settings.table_B_key, settings.table_B_parts, p,
                                                                settings.table_B_sharded, add_where=False)), settings.format_,
                                         settings.table_B_primary_key,
                                         settings.use_pandas,
                                         settings.secure,
                                         settings.use_native,
                                         'scan_b_detail_on_b_pk' + '_{}'.format(p),
                                         query_plan,
                                         False)),
                range(0, settings.table_B_parts))

        field_names_map_b_detail = OrderedDict([
            ('_0', settings.table_B_primary_key),
            ('_1', settings.table_B_detail_field_name)
        ])

        def project_fn_b_detail(df):
            df.rename(columns=field_names_map_b_detail, inplace=True)
            return df

        project_b_detail = map(
            lambda p: query_plan.add_operator(
                Project([
                    ProjectExpression(k, v)
                    for k, v in field_names_map_b_detail.iteritems()
                ], 'project_b_detail' + '_{}'.format(p), query_plan,
                        False, project_fn_b_detail)),
            range(0, settings.table_B_parts))

        map_b_pk_1 = map(
            lambda p: query_plan.add_operator(
                Map(settings.table_B_primary_key, 'map_b_pk_1' + '_{}'.format(
                    p), query_plan, False)), range(0, settings.table_B_parts))

        map_b_pk_2 = map(
            lambda p: query_plan.add_operator(
                Map(settings.table_B_primary_key, 'map_b_pk_2' + '_{}'.format(
                    p), query_plan, False)), range(0, settings.table_B_parts))

        bloom_create_b_pk = map(
            lambda p: query_plan.add_operator(
                BloomCreate(settings.table_B_primary_key,
                            'bloom_create_b_pk' + '_{}'.format(p),
                            query_plan,
                            False,
                            fp_rate=settings.fp_rate)),
            range(0, settings.table_B_parts))

        join_probe_ab_and_b_on_b_pk = map(
            lambda p: query_plan.add_operator(
                HashJoinProbe(
                    JoinExpression(settings.table_B_primary_key, settings.
                                   table_B_primary_key),
                    'join_probe_ab_and_b_on_b_pk' + '_{}'.format(
                        p), query_plan, False)),
            range(0, settings.table_B_parts))

        join_build_ab_and_b_on_b_pk = map(
            lambda p: query_plan.add_operator(
                HashJoinBuild(settings.table_B_primary_key,
                              'join_build_ab_and_b_on_b_pk' + '_{}'.format(
                                  p), query_plan, False)),
            range(0, settings.table_B_parts))

    else:
        scan_c_on_bc_join_key = \
            map(lambda p:
                query_plan.add_operator(
                    SQLTableScanBloomUse(get_file_key(settings.table_C_key, settings.table_C_sharded, p, settings.sf),
                                         "select "
                                         "  {}, {} "
                                         "from "
                                         "  S3Object "
                                         "where "
                                         "  {} "
                                         "  {} "
                                         .format(settings.table_C_primary_key,
                                                 settings.table_C_BC_join_key,
                                                 settings.table_C_filter_sql,
                                                 get_sql_suffix(settings.table_C_key, settings.table_C_parts, p,
                                                                settings.table_C_sharded, add_where=False)),
                                         settings.table_C_BC_join_key, settings.format_,
                                         settings.use_pandas,
                                         settings.secure,
                                         settings.use_native,
                                         'scan_c_on_bc_join_key' + '_{}'.format(p),
                                         query_plan,
                                         False)),
                range(0, settings.table_C_parts))

        field_names_map_c = OrderedDict(
            zip([
                '_{}'.format(i)
                for i, name in enumerate(settings.table_C_field_names)
            ], settings.table_C_field_names))

        def project_fn_c(df):
            df.rename(columns=field_names_map_c, inplace=True)
            return df

        project_c = map(
            lambda p: query_plan.add_operator(
                Project([
                    ProjectExpression(k, v)
                    for k, v in field_names_map_c.iteritems()
                ], 'project_c' + '_{}'.format(p), query_plan, False,
                        project_fn_c)), range(0, settings.table_C_parts))

        scan_c_detail_on_c_pk = \
            map(lambda p:
                query_plan.add_operator(
                    SQLTableScanBloomUse(get_file_key(settings.table_C_key, settings.table_C_sharded, p, settings.sf),
                                         "select "
                                         "  {},{} "
                                         "from "
                                         "  S3Object "
                                         "where "
                                         "  {} "
                                         "  {} "
                                         .format(settings.table_C_primary_key,
                                                 settings.table_C_detail_field_name,
                                                 settings.table_C_filter_sql,
                                                 get_sql_suffix(settings.table_C_key, settings.table_C_parts, p,
                                                                settings.table_C_sharded, add_where=False)),
                                         settings.table_C_primary_key, settings.format_,
                                         settings.use_pandas,
                                         settings.secure,
                                         settings.use_native,
                                         'scan_c_detail_on_c_pk' + '_{}'.format(p),
                                         query_plan,
                                         False)),
                range(0, settings.table_C_parts))

        field_names_map_c_detail = OrderedDict([
            ('_0', settings.table_C_primary_key),
            ('_1', settings.table_C_detail_field_name)
        ])

        def project_fn_c_detail(df):
            df.rename(columns=field_names_map_c_detail, inplace=True)
            return df

        project_c_detail = map(
            lambda p: query_plan.add_operator(
                Project([
                    ProjectExpression(k, v)
                    for k, v in field_names_map_c_detail.iteritems()
                ], 'project_c_detail' + '_{}'.format(p), query_plan,
                        False, project_fn_c_detail)),
            range(0, settings.table_C_parts))

        map_bc_b_join_key = map(
            lambda p: query_plan.add_operator(
                Map(settings.table_B_BC_join_key, 'map_bc_b_join_key' + '_{}'.
                    format(p), query_plan, False)),
            range(0, settings.table_C_parts))

        map_c_pk_1 = map(
            lambda p: query_plan.add_operator(
                Map(settings.table_C_primary_key, 'map_c_pk_1' + '_{}'.format(
                    p), query_plan, False)), range(0, settings.table_C_parts))

        map_c_pk_2 = map(
            lambda p: query_plan.add_operator(
                Map(settings.table_C_primary_key, 'map_c_pk_2' + '_{}'.format(
                    p), query_plan, False)), range(0, settings.table_C_parts))

        bloom_create_c_pk = map(
            lambda p: query_plan.add_operator(
                BloomCreate(settings.table_C_primary_key,
                            'bloom_create_bc_b_to_c_join_key_{}'.format(p),
                            query_plan,
                            False,
                            fp_rate=settings.fp_rate)),
            range(0, settings.table_C_parts))

        join_build_ab_and_c_on_bc_join_key = map(
            lambda p: query_plan.add_operator(
                HashJoinBuild(
                    settings.table_B_BC_join_key,
                    'join_build_ab_and_c_on_bc_join_key' + '_{}'.format(
                        p), query_plan, False)),
            range(0, settings.table_C_parts))

        join_probe_ab_and_c_on_bc_join_key = map(
            lambda p: query_plan.add_operator(
                HashJoinProbe(
                    JoinExpression(settings.table_B_BC_join_key, settings.
                                   table_C_BC_join_key),
                    'join_probe_ab_and_c_on_bc_join_key' + '_{}'.format(
                        p), query_plan, False)),
            range(0, settings.table_C_parts))

        join_build_abc_and_c_on_c_pk = map(
            lambda p: query_plan.add_operator(
                HashJoinBuild(settings.table_C_primary_key,
                              'join_build_abc_and_c_on_c_pk' + '_{}'.format(
                                  p), query_plan, False)),
            range(0, settings.table_C_parts))

        join_probe_abc_and_c_on_c_pk = map(
            lambda p: query_plan.add_operator(
                HashJoinProbe(
                    JoinExpression(settings.table_C_primary_key, settings.
                                   table_C_primary_key),
                    'join_probe_abc_and_c_on_c_pk' + '_{}'.format(
                        p), query_plan, False)),
            range(0, settings.table_C_parts))

        bloom_create_bc_join_key = map(
            lambda p: query_plan.add_operator(
                BloomCreate(settings.table_B_BC_join_key,
                            'bloom_create_bc_join_key' + '_{}'.format(
                                p), query_plan, False)),
            range(0, settings.table_B_parts))

        map_bc_c_join_key = map(
            lambda p: query_plan.add_operator(
                Map(settings.table_C_BC_join_key, 'map_bc_c_join_key' + '_{}'.
                    format(p), query_plan, False)),
            range(0, settings.table_B_parts))

    field_names_map_b = OrderedDict(
        zip([
            '_{}'.format(i)
            for i, name in enumerate(settings.table_B_field_names)
        ], settings.table_B_field_names))

    def project_fn_b(df):
        df.rename(columns=field_names_map_b, inplace=True)
        return df

    project_b = map(
        lambda p: query_plan.add_operator(
            Project([
                ProjectExpression(k, v)
                for k, v in field_names_map_b.iteritems()
            ], 'project_b' + '_{}'.format(p), query_plan, False, project_fn_b)
        ), range(0, settings.table_B_parts))

    map_ab_a_join_key = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_A_AB_join_key, 'map_ab_a_join_key' + '_{}'
                .format(p), query_plan, False)),
        range(0, settings.table_A_parts))

    map_ab_b_join_key = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_B_AB_join_key, 'map_ab_b_join_key' + '_{}'
                .format(p), query_plan, False)),
        range(0, settings.table_B_parts))

    join_build_a_and_b_on_ab_join_key = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild(
                settings.table_A_AB_join_key,
                'join_build_a_and_b_on_ab_join_key' + '_{}'.format(
                    p), query_plan, False)), range(0, settings.table_B_parts))

    join_probe_a_and_b_on_ab_join_key = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(
                JoinExpression(settings.table_A_AB_join_key, settings.
                               table_B_AB_join_key),
                'join_probe_a_and_b_on_ab_join_key' + '_{}'.format(
                    p), query_plan, False)), range(0, settings.table_B_parts))

    if settings.table_C_key is None:

        def part_aggregate_fn(df):
            sum_ = df[settings.table_B_detail_field_name].astype(
                np.float).sum()
            return pd.DataFrame({'_0': [sum_]})

        part_aggregate = map(
            lambda p: query_plan.add_operator(
                Aggregate([
                    AggregateExpression(
                        AggregateExpression.SUM, lambda t: float(t[
                            settings.table_B_detail_field_name]))
                ], settings.use_pandas, 'part_aggregate_{}'.format(p),
                          query_plan, False, part_aggregate_fn)),
            range(0, settings.table_B_parts))

    else:

        def part_aggregate_fn(df):
            sum_ = df[settings.table_C_detail_field_name].astype(
                np.float).sum()
            return pd.DataFrame({'_0': [sum_]})

        part_aggregate = map(
            lambda p: query_plan.add_operator(
                Aggregate([
                    AggregateExpression(
                        AggregateExpression.SUM, lambda t: float(t[
                            settings.table_C_detail_field_name]))
                ], settings.use_pandas, 'part_aggregate_{}'.format(p),
                          query_plan, False, part_aggregate_fn)),
            range(0, settings.table_C_parts))

    def aggregate_reduce_fn(df):
        sum_ = df['_0'].astype(np.float).sum()
        return pd.DataFrame({'_0': [sum_]})

    aggregate_reduce = query_plan.add_operator(
        Aggregate([
            AggregateExpression(AggregateExpression.SUM,
                                lambda t: float(t['_0']))
        ], settings.use_pandas, 'aggregate_reduce', query_plan, False,
                  aggregate_reduce_fn))

    aggregate_project = query_plan.add_operator(
        Project([ProjectExpression(lambda t: t['_0'], 'total_balance')],
                'aggregate_project', query_plan, False))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    # Inline some of the operators
    map(lambda o: o.set_async(False), project_a)
    map(lambda o: o.set_async(False), project_b)
    map(lambda o: o.set_async(False), map_ab_a_join_key)
    map(lambda o: o.set_async(False), map_ab_b_join_key)
    if settings.table_C_key is None:
        map(lambda o: o.set_async(False), map_b_pk_1)
        map(lambda o: o.set_async(False), map_b_pk_2)
        map(lambda o: o.set_async(False), project_b_detail)
    else:
        map(lambda o: o.set_async(False), map_bc_b_join_key)
        map(lambda o: o.set_async(False), map_bc_c_join_key)
        map(lambda o: o.set_async(False), map_c_pk_1)
        map(lambda o: o.set_async(False), map_c_pk_2)
        map(lambda o: o.set_async(False), project_c)
        map(lambda o: o.set_async(False), project_c_detail)
    aggregate_project.set_async(False)

    # Connect the operators
    connect_many_to_many(scan_a, project_a)

    connect_many_to_many(project_a, map_ab_a_join_key)

    connect_all_to_all(map_ab_a_join_key, join_build_a_and_b_on_ab_join_key)
    connect_all_to_all(project_a, bloom_create_ab_join_key)
    # connect_all_to_all(map_A_to_B, join_build_a_and_b_on_ab_join_key)
    connect_many_to_many(join_build_a_and_b_on_ab_join_key,
                         join_probe_a_and_b_on_ab_join_key)

    # connect_all_to_all(map_bloom_A_to_B, bloom_create_ab_join_key)
    connect_many_to_many(bloom_create_ab_join_key, scan_b_on_ab_join_key)
    connect_many_to_many(scan_b_on_ab_join_key, project_b)
    # connect_many_to_many(project_b, join_probe_a_and_b_on_ab_join_key)
    # connect_all_to_all(map_B_to_B, join_probe_a_and_b_on_ab_join_key)

    connect_many_to_many(project_b, map_ab_b_join_key)
    connect_all_to_all(map_ab_b_join_key, join_probe_a_and_b_on_ab_join_key)

    # connect_many_to_many(join_probe_a_and_b_on_ab_join_key, map_bloom_B_to_B)

    if settings.table_C_key is None:
        # connect_all_to_all(join_probe_a_and_b_on_ab_join_key, part_aggregate)
        connect_many_to_many(scan_b_detail_on_b_pk, project_b_detail)
        connect_many_to_many(project_b_detail, map_b_pk_2)
        connect_many_to_many(bloom_create_b_pk, scan_b_detail_on_b_pk)
        connect_all_to_all(join_probe_a_and_b_on_ab_join_key,
                           bloom_create_b_pk)
        connect_all_to_all(map_b_pk_2, join_probe_ab_and_b_on_b_pk)
        connect_many_to_many(join_probe_ab_and_b_on_b_pk, part_aggregate)
        connect_many_to_many(join_build_ab_and_b_on_b_pk,
                             join_probe_ab_and_b_on_b_pk)
        connect_many_to_many(join_probe_a_and_b_on_ab_join_key, map_b_pk_1)
        connect_all_to_all(map_b_pk_1, join_build_ab_and_b_on_b_pk)

    else:
        connect_all_to_all(join_probe_a_and_b_on_ab_join_key,
                           bloom_create_bc_join_key)
        connect_many_to_many(bloom_create_bc_join_key, scan_c_on_bc_join_key)
        connect_many_to_many(scan_c_on_bc_join_key, project_c)
        # connect_many_to_many(project_c, join_probe_ab_and_c_on_bc_join_key)
        connect_all_to_all(map_bc_c_join_key,
                           join_probe_ab_and_c_on_bc_join_key)
        # connect_many_to_many(join_probe_a_and_b_on_ab_join_key, join_build_ab_and_c_on_bc_join_key)
        connect_many_to_many(join_probe_a_and_b_on_ab_join_key,
                             map_bc_b_join_key)
        connect_all_to_all(map_bc_b_join_key,
                           join_build_ab_and_c_on_bc_join_key)
        connect_all_to_all(join_probe_ab_and_c_on_bc_join_key,
                           bloom_create_c_pk)
        # connect_many_to_many(join_probe_ab_and_c_on_bc_join_key, join_build_abc_and_c_on_c_pk)
        connect_many_to_many(join_probe_ab_and_c_on_bc_join_key, map_c_pk_1)
        connect_all_to_all(map_c_pk_1, join_build_abc_and_c_on_c_pk)
        connect_many_to_many(bloom_create_c_pk, scan_c_detail_on_c_pk)
        # connect_all_to_all(bloom_create_bc_join_key, scan_c_detail_on_c_pk)
        connect_many_to_many(join_build_abc_and_c_on_c_pk,
                             join_probe_abc_and_c_on_c_pk)
        # connect_many_to_many(join_probe_a_and_b_on_ab_join_key, map_B_to_C)
        # connect_all_to_all(join_probe_a_and_b_on_ab_join_key, join_build_abc_and_c_on_c_pk)
        connect_many_to_many(scan_c_detail_on_c_pk, project_c_detail)
        # connect_many_to_many(project_c_detail, map_C_to_C)
        # connect_all_to_all(project_c_detail, join_probe_abc_and_c_on_c_pk)
        connect_many_to_many(project_c_detail, map_c_pk_2)

        connect_many_to_many(project_c, map_bc_c_join_key)
        connect_many_to_many(join_build_ab_and_c_on_bc_join_key,
                             join_probe_ab_and_c_on_bc_join_key)
        connect_all_to_all(map_c_pk_2, join_probe_abc_and_c_on_c_pk)

        connect_many_to_many(join_probe_abc_and_c_on_c_pk, part_aggregate)

    connect_many_to_one(part_aggregate, aggregate_reduce)
    connect_one_to_one(aggregate_reduce, aggregate_project)
    connect_one_to_one(aggregate_project, collate)

    return query_plan
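
The project_fn_* functions above all follow the same pattern: rename the positional S3 Select columns ('_0', '_1', ...) to real field names via an OrderedDict. A small self-contained illustration with made-up field names:

from collections import OrderedDict

import pandas as pd

# Made-up names standing in for settings.table_A_field_names.
field_names = ['a_pk', 'a_detail']
field_names_map = OrderedDict(
    zip(['_{}'.format(i) for i, _ in enumerate(field_names)], field_names))

df = pd.DataFrame({'_0': [1, 2], '_1': ['x', 'y']})
df = df.rename(columns=field_names_map, copy=False)
print(list(df.columns))  # ['a_pk', 'a_detail']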
Example #26
def run(parallel, use_pandas, secure, use_native, buffer_size, lineitem_parts,
        part_parts, lineitem_sharded, part_sharded, other_parts, sf,
        expected_result, format_):
    """

    :return: None
    """

    print('')
    print("TPCH Q14 Baseline Join")
    print("----------------------")

    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Query plan
    # DATE is the first day of a month randomly selected from a random year within [1993 .. 1997].
    date = '1993-01-01'
    min_shipped_date = datetime.strptime(date, '%Y-%m-%d')
    max_shipped_date = datetime.strptime(date, '%Y-%m-%d') + timedelta(days=30)

    lineitem_scan = map(
        lambda p: query_plan.add_operator(
            tpch_q14.sql_scan_lineitem_operator_def(
                lineitem_sharded, p, lineitem_parts, use_pandas, secure,
                use_native, 'lineitem_scan' + '_' + str(
                    p), query_plan, sf, format_)), range(0, lineitem_parts))

    lineitem_project = map(
        lambda p: query_plan.add_operator(
            tpch_q14.
            project_partkey_extendedprice_discount_shipdate_operator_def(
                'lineitem_project' + '_' + str(p), query_plan)),
        range(0, lineitem_parts))

    part_scan = map(
        lambda p: query_plan.add_operator(
            tpch_q14.sql_scan_part_operator_def(
                part_sharded, p, part_parts, use_pandas, secure, use_native,
                'part_scan' + '_' + str(p), query_plan, sf, format_)),
        range(0, part_parts))

    part_project = map(
        lambda p: query_plan.add_operator(
            tpch_q14.project_partkey_brand_type_operator_def(
                'part_project' + '_' + str(p), query_plan)),
        range(0, part_parts))

    lineitem_filter = map(
        lambda p: query_plan.add_operator(
            tpch_q14.filter_shipdate_operator_def(
                min_shipped_date, max_shipped_date, 'lineitem_filter' + '_' +
                str(p), query_plan)), range(0, lineitem_parts))

    lineitem_map = map(
        lambda p: query_plan.add_operator(
            Map('l_partkey', 'lineitem_map' + '_' + str(p), query_plan, False)
        ), range(0, lineitem_parts))

    part_map = map(
        lambda p: query_plan.add_operator(
            Map('p_partkey', 'part_map' + '_' + str(p), query_plan, False)),
        range(0, part_parts))

    # part_filter = map(lambda p:
    #                   query_plan.add_operator(
    #                       tpch_q14.filter_brand12_operator_def('part_filter' + '_' + str(p), query_plan)),
    #                   range(0, part_parts))

    join_build = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild('p_partkey', 'join_build' + '_' + str(p), query_plan,
                          False)), range(0, other_parts))

    join_probe = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'),
                          'join_probe' + '_' + str(p), query_plan, False)),
        range(0, other_parts))

    part_aggregate = map(
        lambda p: query_plan.add_operator(
            tpch_q14.aggregate_promo_revenue_operator_def(
                'part_aggregate' + '_' + str(p), query_plan)),
        range(0, other_parts))

    def aggregate_reduce_fn(df):
        sum1_ = df['_0'].astype(np.float).sum()
        sum2_ = df['_1'].astype(np.float).sum()
        return pd.DataFrame({'_0': [sum1_], '_1': [sum2_]})

    aggregate_reduce = query_plan.add_operator(
        Aggregate([
            AggregateExpression(AggregateExpression.SUM,
                                lambda t: float(t['_0'])),
            AggregateExpression(AggregateExpression.SUM,
                                lambda t: float(t['_1']))
        ], use_pandas, 'aggregate_reduce', query_plan, False,
                  aggregate_reduce_fn))

    aggregate_project = query_plan.add_operator(
        tpch_q14.project_promo_revenue_operator_def('aggregate_project',
                                                    query_plan))

    collate = query_plan.add_operator(
        tpch_q14.collate_operator_def('collate', query_plan))

    # Inline what we can
    map(lambda o: o.set_async(False), lineitem_project)
    map(lambda o: o.set_async(False), part_project)
    map(lambda o: o.set_async(False), lineitem_filter)
    map(lambda o: o.set_async(False), part_map)
    map(lambda o: o.set_async(False), lineitem_map)
    map(lambda o: o.set_async(False), part_aggregate)
    aggregate_project.set_async(False)

    # Connect the operators
    connect_many_to_many(lineitem_scan, lineitem_project)
    connect_many_to_many(lineitem_project, lineitem_filter)
    connect_many_to_many(part_scan, part_project)
    connect_many_to_many(part_project, part_map)
    connect_all_to_all(part_map, join_build)
    connect_many_to_many(join_build, join_probe)
    connect_many_to_many(lineitem_filter, lineitem_map)
    connect_all_to_all(lineitem_map, join_probe)
    connect_many_to_many(join_probe, part_aggregate)
    connect_many_to_one(part_aggregate, aggregate_reduce)
    connect_one_to_one(aggregate_reduce, aggregate_project)
    connect_one_to_one(aggregate_project, collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print('secure: {}'.format(secure))
    print('use_native: {}'.format(use_native))
    print("lineitem parts: {}".format(lineitem_parts))
    print("part_parts: {}".format(part_parts))
    print("lineitem_sharded: {}".format(lineitem_sharded))
    print("part_sharded: {}".format(part_sharded))
    print("other_parts: {}".format(other_parts))
    print("format: {}".format(format_))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"),
                           gen_test_id() + "-" + str(lineitem_parts))

    # Start the query
    query_plan.execute()

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    field_names = ['promo_revenue']

    assert len(tuples) == 1 + 1

    assert tuples[0] == field_names

    # NOTE: This result has been verified with the equivalent data and query on PostgreSQL
    if s3filter.util.constants.TPCH_SF == 10:
        assert round(float(tuples[1][0]), 10) == 15.4488836202
    elif s3filter.util.constants.TPCH_SF == 1:
        numpy.testing.assert_approx_equal(float(tuples[1][0]), expected_result)
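
The Map('p_partkey', ...) and Map('l_partkey', ...) operators feeding connect_all_to_all are assumed to hash-partition tuples on the named key, so the build and probe sides of the join agree on which of the other_parts consumers receives a given key. A rough sketch of that routing decision (not the framework's actual code):

def route(record, key_field, num_consumers):
    # record is assumed to be dict-like (e.g. an IndexedTuple); identical keys from
    # the part and lineitem sides hash to the same consumer index.
    return hash(record[key_field]) % num_consumers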
Example #27
def run(parallel,
        use_pandas,
        buffer_size,
        table_first_part,
        table_parts,
        queried_columns,
        queried_aliases,
        castable_aliases,
        select_str,
        aggregate_column,
        filter_str,
        path,
        format_=Format.PARQUET):
    secure = False
    use_native = False
    print('')
    print("Parquet Aggregate Benchmark")
    print("------------------")

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # SQL scan the file
    scan = map(
        lambda p: query_plan.add_operator(
            SQLTableScan(
                "{}/lineitem.{}.parquet".format(path, p
                                                ), "select {} from S3Object "
                "{};".format(select_str, filter_str), format_, use_pandas,
                secure, use_native, 'scan_{}'.format(p), query_plan, False)),
        range(table_first_part, table_first_part + table_parts))

    # project
    def fn(df):
        df.columns = queried_aliases
        df[castable_aliases] = df[castable_aliases].astype(np.double)
        return df

    project = map(
        lambda p: query_plan.add_operator(
            Project([], 'project_{}'.format(p), query_plan, False, fn)),
        range(table_first_part, table_first_part + table_parts))

    # aggregation
    def agg_fun(df):
        if aggregate_column in df:
            return pd.DataFrame({'sum': [sum(df[aggregate_column])]})
        else:
            return pd.DataFrame({'count': len(df)}, index=[0])

    aggregate = query_plan.add_operator(
        Aggregate([], True, 'agg', query_plan, False, agg_fun))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    map(lambda (p, o): o.connect(project[p]), enumerate(scan))
    map(lambda (p, o): o.connect(aggregate), enumerate(project))
    aggregate.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("table parts: {}".format(table_parts))
    print('')

    # Write the plan graph
    # query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(table_parts))

    # Start the query
    query_plan.execute()
    print('Done')
    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()
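
A hypothetical invocation of this benchmark (all argument values, including the S3 path, are illustrative only) that sums l_extendedprice over a range of Parquet parts:

run(parallel=True,
    use_pandas=True,
    buffer_size=0,
    table_first_part=0,
    table_parts=8,
    queried_columns=['l_extendedprice'],
    queried_aliases=['l_extendedprice'],
    castable_aliases=['l_extendedprice'],
    select_str='l_extendedprice',
    aggregate_column='l_extendedprice',
    filter_str='',
    path='parquet/tpch-sf1/lineitem_sharded')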
Example #28
def run(parallel, use_pandas, buffer_size, table_first_part, table_parts,
        queried_columns, select_columns, filterOutNothing, path,
        format_=Format.PARQUET):
    secure = False
    use_native = False
    print('')
    print("Parquet Filter+Project All or None Benchmark Vary Columns")
    print("------------------")

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # SQL scan the file
    if filterOutNothing: # filter out nothing
        scan = map(lambda p:
                   query_plan.add_operator(
                       SQLTableScan("{}/lineitem.{}.parquet".format(path, p),
                                    "select {} from S3Object;".format(select_columns),
                                    format_, use_pandas, secure, use_native,
                                    'scan_{}'.format(p), query_plan,
                                    False)),
                   range(table_first_part, table_first_part + table_parts))
    else:
        scan = map(lambda p:
                   query_plan.add_operator(
                       SQLTableScan("{}/lineitem.{}.parquet".format(path, p),
                                    "select {} from S3Object where l_orderkey < 1;".format(select_columns),
                                    format_, use_pandas, secure, use_native,
                                    'scan_{}'.format(p), query_plan,
                                    False)),
                   range(table_first_part, table_first_part + table_parts))

    # project
    def fn(df):
        df.columns = queried_columns
        return df

    project = map(lambda p:
                  query_plan.add_operator(
                      Project([], 'project_{}'.format(p), query_plan, False, fn)),
                  range(table_first_part, table_first_part + table_parts))

    # aggregation
    def agg_fun(df):
        return pd.DataFrame({'count': [len(df)]})

    aggregate = query_plan.add_operator(
        Aggregate([], True, 'agg', query_plan, False, agg_fun))

    collate = query_plan.add_operator(
        Collate('collate', query_plan, False))

    map(lambda (p, o): o.connect(project[p]), enumerate(scan))
    map(lambda (p, o): o.connect(aggregate), enumerate(project))
    aggregate.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("table parts: {}".format(table_parts))
    print('')

    # Write the plan graph
    # query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(table_parts))

    # Start the query
    query_plan.execute()
    print('Done')
    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()
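
As a local sanity check on the row counts this plan reports, the same count can be computed directly with pandas against one downloaded part (file name and column choice are placeholders):

import pandas as pd

df = pd.read_parquet('lineitem.0.parquet', columns=['l_orderkey'])
print(len(df))  # rows in this part; summing across parts gives the full count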