Пример #1
0
def run(parallel, use_pandas, secure, use_native, buffer_size, format_, customer_parts, order_parts, lineitem_parts,
        customer_sharded,
        order_sharded, lineitem_sharded, other_parts, fp_rate, sf, expected_result, customer_filter_sql=None,
        order_filter_sql=None, lineitem_filter_sql=None):
    """

    :return: None
    """

    print('')
    print("TPCH Q3 Bloom Join")
    print("------------------")

    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    customer_scan = map(lambda p:
                        query_plan.add_operator(
                            SQLTableScan(get_file_key('customer', customer_sharded, p, sf, format_),
                                         "select "
                                         "  c_custkey "
                                         "from "
                                         "  S3Object "
                                         "where "
                                         "  c_mktsegment = 'BUILDING' "
                                         "  {} "
                                         "  {} "
                                         .format(
                                             ' and ' + customer_filter_sql if customer_filter_sql is not None else '',
                                             get_sql_suffix('customer', customer_parts, p, customer_sharded,
                                                            add_where=False)),
                                         format_,
                                         use_pandas, secure, use_native,
                                         'customer_scan' + '_{}'.format(p),
                                         query_plan,
                                         False)),
                        range(0, customer_parts))

    def customer_project_fn(df):
        df = df.filter(items=['_0'], axis=1)

        df.rename(columns={'_0': 'c_custkey'},
                  inplace=True)

        return df

    customer_project = map(lambda p:
                           query_plan.add_operator(
                               Project([],
                                       'customer_project' + '_{}'.format(p),
                                       query_plan,
                                       False, customer_project_fn)),
                           range(0, customer_parts))

    customer_bloom_create = query_plan.add_operator(
        BloomCreate('c_custkey', 'customer_bloom_create', query_plan, False,
                    fp_rate))

    customer_map = map(lambda p:
                       query_plan.add_operator(Map('c_custkey', 'customer_map' + '_' + str(p), query_plan, False)),
                       range(0, customer_parts))

    order_scan = map(lambda p:
                     query_plan.add_operator(
                         SQLTableScanBloomUse(get_file_key('orders', order_sharded, p, sf, format_),
                                              "select "
                                              "  o_custkey, o_orderkey, o_orderdate, o_shippriority "
                                              "from "
                                              "  S3Object "
                                              "where "
                                              "  cast(o_orderdate as timestamp) < cast('1995-03-01' as timestamp) "
                                              "  {} "
                                              "  {} "
                                              .format(
                                                  ' and ' + order_filter_sql if order_filter_sql is not None else '',
                                                  get_sql_suffix('orders', order_parts, p, order_sharded,
                                                                 add_where=False)),
                                              'o_custkey',
                                              format_,
                                              use_pandas, secure, use_native,
                                              'order_scan' + '_{}'.format(p),
                                              query_plan,
                                              False)),
                     range(0, order_parts))

    def order_project_fn(df):
        df = df.filter(items=['_0', '_1', '_2', '_3'], axis=1)

        df.rename(columns={'_0': 'o_custkey', '_1': 'o_orderkey', '_2': 'o_orderdate', '_3': 'o_shippriority'},
                  inplace=True)

        return df

    order_project = map(lambda p:
                        query_plan.add_operator(
                            Project([],
                                    'order_project' + '_{}'.format(p),
                                    query_plan,
                                    False, order_project_fn)),
                        range(0, customer_parts))

    order_map_1 = map(lambda p:
                      query_plan.add_operator(Map('o_custkey', 'order_map_1' + '_' + str(p), query_plan, False)),
                      range(0, order_parts))

    customer_order_join_build = map(lambda p:
                                    query_plan.add_operator(
                                        HashJoinBuild('c_custkey',
                                                      'customer_order_join_build' + '_' + str(p), query_plan,
                                                      False)),
                                    range(0, other_parts))

    customer_order_join_probe = map(lambda p:
                                    query_plan.add_operator(
                                        HashJoinProbe(JoinExpression('c_custkey', 'o_custkey'),
                                                      'customer_order_join_probe' + '_' + str(p),
                                                      query_plan, False)),
                                    range(0, other_parts))

    order_bloom_create = query_plan.add_operator(
        BloomCreate('o_orderkey', 'order_bloom_create', query_plan, False,
                    fp_rate))

    lineitem_scan = map(lambda p:
                        query_plan.add_operator(
                            SQLTableScanBloomUse(get_file_key('lineitem', lineitem_sharded, p, sf, format_),
                                                 "select "
                                                 "  l_orderkey, l_extendedprice, l_discount "
                                                 "from "
                                                 "  S3Object "
                                                 "where "
                                                 "  cast(l_shipdate as timestamp) > cast('1995-03-01' as timestamp) "
                                                 "  {} "
                                                 "  {} "
                                                 .format(
                                                     ' and ' + lineitem_filter_sql if lineitem_filter_sql is not None else '',
                                                     get_sql_suffix('lineitem', lineitem_parts, p, lineitem_sharded,
                                                                    add_where=False)),
                                                 'l_orderkey', format_,
                                                 use_pandas, secure, use_native,
                                                 'lineitem_scan' + '_{}'.format(p),
                                                 query_plan,
                                                 False)),
                        range(0, lineitem_parts))

    def lineitem_project_fn(df):
        df = df.filter(items=['_0', '_1', '_2'], axis=1)

        df.rename(columns={'_0': 'l_orderkey', '_1': 'l_extendedprice', '_2': 'l_discount'},
                  inplace=True)

        return df

    lineitem_project = map(lambda p:
                           query_plan.add_operator(
                               Project([],
                                       'lineitem_project' + '_{}'.format(p),
                                       query_plan,
                                       False, lineitem_project_fn)),
                           range(0, lineitem_parts))

    lineitem_map = map(lambda p:
                       query_plan.add_operator(Map('l_orderkey', 'lineitem_map' + '_' + str(p), query_plan, False)),
                       range(0, lineitem_parts))

    order_map_2 = map(lambda p:
                      query_plan.add_operator(Map('o_orderkey', 'order_map_2' + '_' + str(p), query_plan, False)),
                      range(0, other_parts))

    customer_order_lineitem_join_build = map(lambda p:
                                             query_plan.add_operator(
                                                 HashJoinBuild('o_orderkey',
                                                               'customer_order_lineitem_join_build' + '_' + str(p),
                                                               query_plan,
                                                               False)),
                                             range(0, other_parts))

    customer_order_lineitem_join_probe = map(lambda p:
                                             query_plan.add_operator(
                                                 HashJoinProbe(JoinExpression('o_orderkey', 'l_orderkey'),
                                                               'customer_order_lineitem_join_probe' + '_' + str(p),
                                                               query_plan, False)),
                                             range(0, other_parts))

    def groupby_fn(df):
        df['l_extendedprice'] = df['l_extendedprice'].astype(np.float)
        df['l_discount'] = df['l_discount'].astype(np.float)
        df['revenue'] = df['l_extendedprice'] * (1 - df['l_discount'])
        grouped = df.groupby(['l_orderkey', 'o_orderdate', 'o_shippriority'])
        agg_df = grouped['revenue'].sum()
        return agg_df.reset_index()

    group = map(lambda p:
                query_plan.add_operator(
                    Group(
                        ['l_orderkey', 'o_orderdate', 'o_shippriority'],  # l_partkey
                        [
                            AggregateExpression(AggregateExpression.SUM,
                                                lambda t_: float(t_['l_extendedprice'] * (1 - t_['l_discount'])))
                        ],
                        'group' + '_{}'.format(p), query_plan,
                        False, groupby_fn)),
                range(0, other_parts))

    def group_reduce_fn(df):
        grouped = df.groupby(['l_orderkey', 'o_orderdate', 'o_shippriority'])
        agg_df = grouped['revenue'].sum()
        return agg_df.reset_index()

    group_reduce = query_plan.add_operator(
        Group(
            ['l_orderkey', 'o_orderdate', 'o_shippriority'],  # l_partkey
            [
                AggregateExpression(AggregateExpression.SUM,
                                    lambda t_: float(t_['l_extendedprice'] * (1 - t_['l_discount'])))
            ],
            'group_reduce', query_plan,
            False, group_reduce_fn))

    top = query_plan.add_operator(
        Top(10, [SortExpression('revenue', float, 'DESC'), SortExpression('o_orderdate', date, 'ASC')], use_pandas,
            'top', query_plan,
            False))

    collate = query_plan.add_operator(tpch_q19.collate_op('collate', query_plan))

    # Inline what we can
    map(lambda o: o.set_async(False), lineitem_project)
    map(lambda o: o.set_async(False), customer_project)
    map(lambda o: o.set_async(False), order_project)
    map(lambda o: o.set_async(False), lineitem_map)
    map(lambda o: o.set_async(False), customer_map)
    map(lambda o: o.set_async(False), order_map_1)
    map(lambda o: o.set_async(False), order_map_2)

    # Connect the operators
    connect_many_to_many(customer_scan, customer_project)
    connect_many_to_many(customer_project, customer_map)

    connect_many_to_one(customer_project, customer_bloom_create)
    connect_one_to_many(customer_bloom_create, order_scan)

    connect_many_to_many(order_scan, order_project)
    connect_many_to_many(order_project, order_map_1)

    connect_all_to_all(customer_map, customer_order_join_build)
    connect_many_to_many(customer_order_join_build, customer_order_join_probe)
    connect_all_to_all(order_map_1, customer_order_join_probe)

    # connect_many_to_one(customer_order_join_probe, collate)

    connect_many_to_one(order_project, order_bloom_create)
    connect_one_to_many(order_bloom_create, lineitem_scan)

    connect_many_to_many(lineitem_scan, lineitem_project)
    connect_many_to_many(lineitem_project, lineitem_map)

    connect_many_to_many(customer_order_join_probe, order_map_2)
    connect_all_to_all(order_map_2, customer_order_lineitem_join_build)
    connect_many_to_many(customer_order_lineitem_join_build, customer_order_lineitem_join_probe)
    connect_all_to_all(lineitem_map, customer_order_lineitem_join_probe)

    # connect_many_to_one(customer_order_lineitem_join_probe, collate)

    connect_many_to_many(customer_order_lineitem_join_probe, group)
    # connect_many_to_one(group, collate)

    connect_many_to_one(group, group_reduce)
    connect_one_to_one(group_reduce, top)
    connect_one_to_one(top, collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print('secure: {}'.format(secure))
    print('use_native: {}'.format(use_native))
    print("customer_parts: {}".format(customer_parts))
    print("order_parts: {}".format(order_parts))
    print("lineitem_parts: {}".format(lineitem_parts))
    print("customer_sharded: {}".format(customer_sharded))
    print("order_sharded: {}".format(order_sharded))
    print("lineitem_sharded: {}".format(lineitem_sharded))
    print("other_parts: {}".format(other_parts))
    print("fp_rate: {}".format(fp_rate))
    print("format: {}".format(format_))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    field_names = ['l_orderkey', 'o_orderdate', 'o_shippriority', 'revenue']

    assert len(tuples) == 10 + 1

    assert tuples[0] == field_names

    # NOTE: This result has been verified with the equivalent data and query on PostgreSQL
    test_util.assert_tuples(expected_result, tuples)
Пример #2
0
def query_plan(settings):
    # type: (SyntheticBaselineJoinSettings) -> QueryPlan
    """

    :type settings:
    :return: None
    """

    query_plan = QueryPlan(is_async=settings.parallel,
                           buffer_size=settings.buffer_size)

    def scan_A_fun(df):
        df.columns = settings.table_A_field_names
        criterion = settings.table_A_filter_fn(df)
        df = df[criterion]
        return df

    # Define the operators
    scan_A = \
        map(lambda p:
            query_plan.add_operator(
                SQLTableScan(get_file_key(settings.table_A_key, settings.table_A_sharded, p),
                             "select "
                             "  * "
                             "from "
                             "  S3Object "
                             "{}"
                             .format(
                                 get_sql_suffix(settings.table_A_key, settings.table_A_parts, p,
                                                settings.table_A_sharded, add_where=True)), settings.format_,
                             settings.use_pandas,
                             settings.secure,
                             settings.use_native,
                             'scan_A_{}'.format(p),
                             query_plan,
                             False, fn=scan_A_fun)),
            range(0, settings.table_A_parts))

    def scan_B_fun(df):
        df.columns = settings.table_B_field_names
        criterion = settings.table_B_filter_fn(df)
        return df[criterion]

    scan_B = \
        map(lambda p:
            query_plan.add_operator(
                SQLTableScan(get_file_key(settings.table_B_key, settings.table_B_sharded, p),
                             "select "
                             "  * "
                             "from "
                             "  S3Object "
                             "{}"
                             .format(
                                 get_sql_suffix(settings.table_B_key, settings.table_B_parts, p,
                                                settings.table_B_sharded, add_where=True)), settings.format_,
                             settings.use_pandas,
                             settings.secure,
                             settings.use_native,
                             'scan_B_{}'.format(p),
                             query_plan,
                             False, fn=scan_B_fun)),
            range(0, settings.table_B_parts))
    """
    field_names_map_B = OrderedDict(
        zip(['_{}'.format(i) for i, name in enumerate(settings.table_B_field_names)], settings.table_B_field_names))

    def project_fn_B(df):
        df.rename(columns=field_names_map_B, inplace=True)
        return df

    project_B = map(lambda p:
                    query_plan.add_operator(Project(
                        [ProjectExpression(k, v) for k, v in field_names_map_B.iteritems()],
                        'project_B_{}'.format(p),
                        query_plan,
                        True,
                        project_fn_B)),
                    range(0, settings.table_B_parts))

    filter_b = map(lambda p:
                   query_plan.add_operator(Filter(
                       PredicateExpression(None, pd_expr=settings.table_B_filter_fn), 'filter_b' + '_{}'.format(p), query_plan,
                       False)),
                   range(0, settings.table_B_parts))
    """

    def scan_C_fun(df):
        df.columns = settings.table_C_field_names
        criterion = settings.table_C_filter_fn(df)
        return df[criterion]

    scan_C = \
        map(lambda p:
            query_plan.add_operator(
                SQLTableScan(get_file_key(settings.table_C_key, settings.table_C_sharded, p),
                             "select "
                             "  * "
                             "from "
                             "  S3Object "
                             "{}"
                             .format(
                                 get_sql_suffix(settings.table_C_key, settings.table_C_parts, p,
                                                settings.table_C_sharded, add_where=True)), settings.format_,
                             settings.use_pandas,
                             settings.secure,
                             settings.use_native,
                             'scan_C_{}'.format(p),
                             query_plan,
                             False, fn=scan_C_fun)),
            range(0, settings.table_C_parts))
    """
    field_names_map_C = OrderedDict(
        zip(['_{}'.format(i) for i, name in enumerate(settings.table_C_field_names)], settings.table_C_field_names))

    def project_fn_C(df):
        df.rename(columns=field_names_map_C, inplace=True)
        return df

    project_C = map(lambda p:
                    query_plan.add_operator(Project(
                        [ProjectExpression(k, v) for k, v in field_names_map_C.iteritems()],
                        'project_C_{}'.format(p),
                        query_plan,
                        True,
                        project_fn_C)),
                    range(0, settings.table_C_parts))

    filter_c = map(lambda p:
                   query_plan.add_operator(Filter(
                       PredicateExpression(None, pd_expr=settings.table_C_filter_fn), 'filter_c' + '_{}'.format(p), query_plan,
                       False)),
                   range(0, settings.table_C_parts))
    """

    map_A_to_B = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_A_AB_join_key, 'map_A_to_B_{}'.format(p),
                query_plan, False)), range(0, settings.table_A_parts))

    map_B_to_B = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_B_AB_join_key, 'map_B_to_B_{}'.format(p),
                query_plan, False)), range(0, settings.table_B_parts))

    map_B_to_C = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_B_BC_join_key, 'map_B_to_C_{}'.format(p),
                query_plan, False)), range(0, settings.table_B_parts))

    map_C_to_C = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_C_BC_join_key, 'map_C_to_C_{}'.format(p),
                query_plan, False)), range(0, settings.table_C_parts))

    join_build_A_B = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild(settings.table_A_AB_join_key, 'join_build_A_B_{}'.
                          format(p), query_plan, False)),
        range(0, settings.table_B_parts))

    join_probe_A_B = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(
                JoinExpression(settings.table_A_AB_join_key, settings.
                               table_B_AB_join_key), 'join_probe_A_B_{}'
                .format(p), query_plan, False)),
        range(0, settings.table_B_parts))

    join_build_AB_C = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild(settings.table_B_BC_join_key, 'join_build_AB_C_{}'.
                          format(p), query_plan, False)),
        range(0, settings.table_C_parts))

    join_probe_AB_C = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(
                JoinExpression(settings.table_B_BC_join_key, settings.
                               table_C_BC_join_key), 'join_probe_AB_C_{}'
                .format(p), query_plan, False)),
        range(0, settings.table_C_parts))

    def agg_fun(df):
        return pd.DataFrame({
            'sum':
            [df[settings.table_C_detail_field_name].astype(float).sum()]
        })
        #return pd.DataFrame( { 'sum' : [ len(df) ] } )

    part_aggregate = map(
        lambda p: query_plan.add_operator(
            Aggregate([
                AggregateExpression(
                    AggregateExpression.SUM, lambda t: float(t[
                        settings.table_C_detail_field_name]))
            ], settings.use_pandas, 'part_aggregate_{}'.format(p), query_plan,
                      False, agg_fun)), range(0, settings.table_C_parts))

    def agg_reduce_fun(df):
        return pd.DataFrame({'sum': [df['sum'].sum()]})

    aggregate_reduce = query_plan.add_operator(
        Aggregate([
            AggregateExpression(AggregateExpression.SUM,
                                lambda t: float(t['_0']))
        ], settings.use_pandas, 'aggregate_reduce', query_plan, False,
                  agg_reduce_fun))
    """
    aggregate_project = query_plan.add_operator(Project(
        [
            ProjectExpression(lambda t: t['_0'], 'total_balance')
        ],
        'aggregate_project', query_plan,
        False))
    """

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    # Connect the operators
    connect_many_to_many(scan_A, map_A_to_B)
    #connect_many_to_many(project_A, filter_A)
    #connect_many_to_many(filter_A, map_A_to_B)
    connect_all_to_all(map_A_to_B, join_build_A_B)
    connect_many_to_many(join_build_A_B, join_probe_A_B)

    connect_many_to_many(scan_B, map_B_to_B)
    #connect_many_to_many(project_B, filter_b)
    #connect_many_to_many(filter_b, map_B_to_B)
    connect_all_to_all(map_B_to_B, join_probe_A_B)

    #connect_many_to_many(join_probe_A_B, part_aggregate)
    #connect_many_to_one(part_aggregate, aggregate_reduce)
    #connect_one_to_one(aggregate_reduce, collate)

    connect_many_to_many(join_build_AB_C, join_probe_AB_C)

    connect_many_to_many(join_probe_A_B, map_B_to_C)
    connect_all_to_all(map_B_to_C, join_build_AB_C)

    connect_many_to_many(scan_C, map_C_to_C)
    #connect_many_to_many(project_C, filter_c)
    #connect_many_to_many(filter_c, map_C_to_C)
    connect_all_to_all(map_C_to_C, join_probe_AB_C)

    connect_many_to_many(join_probe_AB_C, part_aggregate)

    connect_many_to_one(part_aggregate, aggregate_reduce)
    connect_one_to_one(aggregate_reduce, collate)
    #connect_one_to_one(aggregate_project, collate)

    return query_plan
Пример #3
0
def run(parallel, use_pandas, secure, use_native, buffer_size, lineitem_parts, part_parts, lineitem_sharded,
        part_sharded, other_parts, sf, expected_result, format_):
    """The baseline tst uses nested loop joins with no projection and no filtering pushed down to s3.

    This works by:

    1. Scanning part and filtering on brand and container
    2. It then scans lineitem
    3. It then joins the two tables (essentially filtering out lineitems that dont include parts that we filtered out in
       step 1)
    4. It then computes the average of l_quantity from the joined table and groups the results by partkey
    5. It then joins these computed averages with the joined table in step 3
    6. It then filters out any rows where l_quantity is less than the computed average

    TODO: There are few ways this can be done, the above is just one.

    :return: None
    """

    print('')
    print("TPCH Q17 Baseline Join")
    print("----------------------")

    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Query plan
    part_scan = map(lambda p:
                    query_plan.add_operator(
                        tpch_q17.sql_scan_part_select_all_op(
                            part_sharded,
                            p,
                            part_parts,
                            use_pandas,
                            secure,
                            use_native,
                            'part_scan' + '_' + str(p),
                            query_plan,
                            sf, format_)),
                    range(0, part_parts))

    lineitem_scan = map(lambda p:
                        query_plan.add_operator(
                            tpch_q17.sql_scan_lineitem_select_all_op(
                                lineitem_sharded,
                                p,
                                lineitem_parts,
                                use_pandas,
                                secure,
                                use_native,
                                'lineitem_scan' + '_' + str(p),
                                query_plan,
                                sf, format_)),
                        range(0, lineitem_parts))

    part_project = map(lambda p:
                       query_plan.add_operator(
                           tpch_q17.project_partkey_brand_container_op(
                               'part_project' + '_' + str(p),
                               query_plan)),
                       range(0, part_parts))

    part_filter = map(lambda p:
                      query_plan.add_operator(tpch_q17.filter_brand_container_op(
                          'part_filter' + '_' + str(p),
                          query_plan)),
                      range(0, part_parts))

    part_map = map(lambda p:
                   query_plan.add_operator(Map('p_partkey', 'part_map' + '_' + str(p), query_plan, False)),
                   range(0, part_parts))

    lineitem_project = map(lambda p:
                           query_plan.add_operator(
                               tpch_q17.project_lineitem_orderkey_partkey_quantity_extendedprice_op(
                                   'lineitem_project' + '_' + str(p), query_plan)),
                           range(0, lineitem_parts))

    lineitem_map = map(lambda p:
                       query_plan.add_operator(Map('l_partkey', 'lineitem_map' + '_' + str(p), query_plan, False)),
                       range(0, lineitem_parts))

    part_lineitem_join_build = map(lambda p:
                                   query_plan.add_operator(
                                       HashJoinBuild('p_partkey',
                                                     'part_lineitem_join_build' + '_' + str(p), query_plan,
                                                     False)),
                                   range(0, other_parts))

    part_lineitem_join_probe = map(lambda p:
                                   query_plan.add_operator(
                                       HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'),
                                                     'part_lineitem_join_probe' + '_' + str(p),
                                                     query_plan, False)),
                                   range(0, other_parts))

    lineitem_part_avg_group = map(lambda p:
                                  query_plan.add_operator(
                                      tpch_q17.group_partkey_avg_quantity_5_op('lineitem_part_avg_group' + '_' + str(p),
                                                                               query_plan)),
                                  range(0, other_parts))

    lineitem_part_avg_group_project = map(lambda p:
                                          query_plan.add_operator(
                                              tpch_q17.project_partkey_avg_quantity_op(
                                                  'lineitem_part_avg_group_project' + '_' + str(p), query_plan)),
                                          range(0, other_parts))

    part_lineitem_join_avg_group_join_build = \
        map(lambda p:
            query_plan.add_operator(
                HashJoinBuild('l_partkey',
                              'part_lineitem_join_avg_group_join_build' + '_' + str(p),
                              query_plan,
                              False)),
            range(0, other_parts))

    part_lineitem_join_avg_group_join_probe = \
        map(lambda p:
            query_plan.add_operator(
                HashJoinProbe(JoinExpression('l_partkey', 'l_partkey'),
                              'part_lineitem_join_avg_group_join_probe' + '_' + str(p),
                              query_plan,
                              False)),
            range(0, other_parts))

    lineitem_filter = map(lambda p:
                          query_plan.add_operator(
                              tpch_q17.filter_lineitem_quantity_op('lineitem_filter' + '_' + str(p), query_plan)),
                          range(0, other_parts))

    extendedprice_sum_aggregate = map(lambda p:
                                      query_plan.add_operator(
                                          tpch_q17.aggregate_sum_extendedprice_op(
                                              use_pandas,
                                              'extendedprice_sum_aggregate' + '_' + str(p),
                                              query_plan)),
                                      range(0, other_parts))

    def aggregate_reduce_fn(df):
        sum1_ = df['_0'].astype(np.float).sum()
        return pd.DataFrame({'_0': [sum1_]})

    aggregate_reduce = query_plan.add_operator(
        Aggregate(
            [
                AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0']))
            ],
            use_pandas,
            'aggregate_reduce',
            query_plan,
            False, aggregate_reduce_fn))

    extendedprice_sum_aggregate_project = query_plan.add_operator(
        tpch_q17.project_avg_yearly_op('extendedprice_sum_aggregate_project', query_plan))

    collate = query_plan.add_operator(tpch_q17.collate_op('collate', query_plan))

    # Inline what we can
    map(lambda o: o.set_async(False), lineitem_project)
    map(lambda o: o.set_async(False), part_project)
    map(lambda o: o.set_async(False), lineitem_filter)
    map(lambda o: o.set_async(False), part_filter)
    map(lambda o: o.set_async(False), lineitem_map)
    map(lambda o: o.set_async(False), part_map)
    map(lambda o: o.set_async(False), lineitem_part_avg_group)
    map(lambda o: o.set_async(False), lineitem_part_avg_group_project)
    map(lambda o: o.set_async(False), extendedprice_sum_aggregate)
    extendedprice_sum_aggregate_project.set_async(False)

    # Connect the operators
    # part_scan.connect(part_project)
    map(lambda (p, o): o.connect(part_project[p]), enumerate(part_scan))
    map(lambda (p, o): o.connect(part_filter[p]), enumerate(part_project))
    map(lambda (p, o): o.connect(part_map[p]), enumerate(part_filter))

    # lineitem_scan.connect(lineitem_project)
    map(lambda (p, o): o.connect(lineitem_project[p]), enumerate(lineitem_scan))
    map(lambda (p, o): o.connect(lineitem_map[p]), enumerate(lineitem_project))

    # part_lineitem_join.connect_left_producer(part_project)
    # part_lineitem_join.connect_right_producer(lineitem_project)
    # part_lineitem_join.connect(lineitem_part_avg_group)
    map(lambda (p1, o1): map(lambda (p2, o2): o1.connect(o2), enumerate(part_lineitem_join_build)), enumerate(part_map))
    map(lambda (p, o): part_lineitem_join_probe[p].connect_build_producer(o), enumerate(part_lineitem_join_build))
    map(lambda (p1, o1): map(lambda (p2, o2): o2.connect_tuple_producer(o1), enumerate(part_lineitem_join_probe)),
        enumerate(lineitem_map))
    map(lambda (p, o): o.connect(lineitem_part_avg_group[p]), enumerate(part_lineitem_join_probe))

    # map(lambda (p, o): o.map(Mapper('_1', 1, part_lineitem_join_probe)), enumerate(lineitem_scan))

    # lineitem_part_avg_group.connect(lineitem_part_avg_group_project)
    map(lambda (p, o): o.connect(lineitem_part_avg_group_project[p]), enumerate(lineitem_part_avg_group))

    # part_lineitem_join_avg_group_join.connect_left_producer(lineitem_part_avg_group_project)
    # part_lineitem_join_avg_group_join.connect_right_producer(part_lineitem_join)
    # part_lineitem_join_avg_group_join.connect(lineitem_filter)

    map(lambda (p, o): o.connect(part_lineitem_join_avg_group_join_build[p]),
        enumerate(lineitem_part_avg_group_project))

    # map(lambda (p, o): map(lambda (bp, bo): o.connect_build_producer(bo),
    #                        enumerate(part_lineitem_join_avg_group_join_build)),
    #     enumerate(part_lineitem_join_avg_group_join_probe))
    map(lambda (p, o): part_lineitem_join_avg_group_join_probe[p].connect_build_producer(o),
        enumerate(part_lineitem_join_avg_group_join_build))
    map(lambda (p, o): part_lineitem_join_avg_group_join_probe[p % part_parts].connect_tuple_producer(o),
        enumerate(part_lineitem_join_probe))
    map(lambda (p, o): o.connect(lineitem_filter[p]), enumerate(part_lineitem_join_avg_group_join_probe))

    # lineitem_filter.connect(extendedprice_sum_aggregate)
    map(lambda (p, o): o.connect(extendedprice_sum_aggregate[p]), enumerate(lineitem_filter))
    # extendedprice_sum_aggregate.connect(extendedprice_sum_aggregate_project)
    # extendedprice_sum_aggregate_project.connect(collate)
    map(lambda (p, o): o.connect(aggregate_reduce), enumerate(extendedprice_sum_aggregate))
    aggregate_reduce.connect(extendedprice_sum_aggregate_project)
    extendedprice_sum_aggregate_project.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print('secure: {}'.format(secure))
    print('use_native: {}'.format(use_native))
    print("lineitem parts: {}".format(lineitem_parts))
    print("part_parts: {}".format(part_parts))
    print("lineitem_sharded: {}".format(lineitem_sharded))
    print("part_sharded: {}".format(part_sharded))
    print("other_parts: {}".format(other_parts))
    print("format: {}".format(format_))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    field_names = ['avg_yearly']

    assert len(tuples) == 1 + 1

    assert tuples[0] == field_names

    # NOTE: This result has been verified with the equivalent data and query on PostgreSQL
    if s3filter.util.constants.TPCH_SF == 10:
        assert round(float(tuples[1][0]),
                     10) == 372414.2899999995  # TODO: This isn't correct but haven't checked tpch17 on 10 sf yet
    elif s3filter.util.constants.TPCH_SF == 1:
        numpy.testing.assert_approx_equal(float(tuples[1][0]), expected_result)
Пример #4
0
def query_plan(settings):
    # type: (SyntheticFilteredJoinSettings) -> QueryPlan
    """

    :return: None
    """

    if settings.use_shared_mem:
        system = WorkerSystem(settings.shared_memory_size)
    else:
        system = None

    query_plan = QueryPlan(system,
                           is_async=settings.parallel,
                           buffer_size=settings.buffer_size,
                           use_shared_mem=settings.use_shared_mem)

    # Define the operators
    scan_A = \
        map(lambda p:
            query_plan.add_operator(
                SQLTableScan(get_file_key(settings.table_A_key, settings.table_A_sharded, p, settings.sf),
                             "select "
                             "  {} "
                             "from "
                             "  S3Object "
                             "  {} "
                             "  {} "
                             .format(','.join(settings.table_A_field_names),
                                     ' where {} '.format(
                                         settings.table_A_filter_sql) if settings.table_A_filter_sql is not None else '',
                                     get_sql_suffix(settings.table_A_key, settings.table_A_parts, p,
                                                    settings.table_A_sharded,
                                                    add_where=settings.table_A_filter_sql is None)), settings.format_,
                             settings.use_pandas,
                             settings.secure,
                             settings.use_native,
                             'scan_A_{}'.format(p),
                             query_plan,
                             False)),
            range(0, settings.table_A_parts))

    field_names_map_A = OrderedDict(
        zip([
            '_{}'.format(i)
            for i, name in enumerate(settings.table_A_field_names)
        ], settings.table_A_field_names))

    def project_fn_A(df):
        df = df.rename(columns=field_names_map_A, copy=False)
        return df

    project_A = map(
        lambda p: query_plan.add_operator(
            Project([
                ProjectExpression(k, v)
                for k, v in field_names_map_A.iteritems()
            ], 'project_A_{}'.format(p), query_plan, False, project_fn_A)),
        range(0, settings.table_A_parts))

    scan_B = \
        map(lambda p:
            query_plan.add_operator(
                SQLTableScan(
                    get_file_key(settings.table_B_key, settings.table_B_sharded, p, settings.sf),
                    "select "
                    "  {} "
                    "from "
                    "  S3Object "
                    "  {} "
                    "  {} "
                        .format(','.join(settings.table_B_field_names),
                                ' where {} '.format(
                                    settings.table_B_filter_sql) if settings.table_B_filter_sql is not None else '',
                                get_sql_suffix(settings.table_B_key, settings.table_B_parts, p,
                                               settings.table_B_sharded,
                                               add_where=settings.table_B_filter_sql is None)), settings.format_,
                    settings.use_pandas,
                    settings.secure,
                    settings.use_native,
                    'scan_B_{}'.format(p),
                    query_plan,
                    False)),
            range(0, settings.table_B_parts))

    field_names_map_B = OrderedDict(
        zip([
            '_{}'.format(i)
            for i, name in enumerate(settings.table_B_field_names)
        ], settings.table_B_field_names))

    def project_fn_B(df):
        df.rename(columns=field_names_map_B, inplace=True)
        return df

    project_B = map(
        lambda p: query_plan.add_operator(
            Project([
                ProjectExpression(k, v)
                for k, v in field_names_map_B.iteritems()
            ], 'project_B_{}'.format(p), query_plan, False, project_fn_B)),
        range(0, settings.table_B_parts))

    if settings.table_C_key is not None:
        scan_C = \
            map(lambda p:
                query_plan.add_operator(
                    SQLTableScan(
                        get_file_key(settings.table_C_key, settings.table_C_sharded, p, settings.sf),
                        "select "
                        "  {} "
                        "from "
                        "  S3Object "
                        "where "
                        "  {} "
                        "  {} "
                            .format(','.join(settings.table_C_field_names),
                                    settings.table_C_filter_sql,
                                    get_sql_suffix(settings.table_C_key, settings.table_C_parts, p,
                                                   settings.table_C_sharded, add_where=False)), settings.format_,
                        settings.use_pandas,
                        settings.secure,
                        settings.use_native,
                        'scan_C_{}'.format(p),
                        query_plan,
                        False)),
                range(0, settings.table_C_parts))

        field_names_map_C = OrderedDict(
            zip([
                '_{}'.format(i)
                for i, name in enumerate(settings.table_C_field_names)
            ], settings.table_C_field_names))

        def project_fn_C(df):
            df = df.rename(columns=field_names_map_C, copy=False)
            return df

        project_C = map(
            lambda p: query_plan.add_operator(
                Project([
                    ProjectExpression(k, v)
                    for k, v in field_names_map_C.iteritems()
                ], 'project_C_{}'.format(p), query_plan, False, project_fn_C)),
            range(0, settings.table_C_parts))

        map_B_to_C = map(
            lambda p: query_plan.add_operator(
                Map(settings.table_B_BC_join_key, 'map_B_to_C_{}'.format(p),
                    query_plan, False)), range(0, settings.table_B_parts))

        map_C_to_C = map(
            lambda p: query_plan.add_operator(
                Map(settings.table_C_BC_join_key, 'map_C_to_C_{}'.format(p),
                    query_plan, False)), range(0, settings.table_C_parts))

        join_build_AB_C = map(
            lambda p: query_plan.add_operator(
                HashJoinBuild(settings.table_B_BC_join_key,
                              'join_build_AB_C_{}'.format(
                                  p), query_plan, False)),
            range(0, settings.table_C_parts))

        join_probe_AB_C = map(
            lambda p: query_plan.add_operator(
                HashJoinProbe(
                    JoinExpression(settings.table_B_BC_join_key, settings.
                                   table_C_BC_join_key), 'join_probe_AB_C_{}'.
                    format(p), query_plan, False)),
            range(0, settings.table_C_parts))

    map_A_to_B = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_A_AB_join_key, 'map_A_to_B_{}'.format(p),
                query_plan, False)), range(0, settings.table_A_parts))

    map_B_to_B = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_B_AB_join_key, 'map_B_to_B_{}'.format(p),
                query_plan, False)), range(0, settings.table_B_parts))

    join_build_A_B = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild(settings.table_A_AB_join_key, 'join_build_A_B_{}'.
                          format(p), query_plan, False)),
        range(0, settings.other_parts))

    join_probe_A_B = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(
                JoinExpression(settings.table_A_AB_join_key, settings.
                               table_B_AB_join_key), 'join_probe_A_B_{}'.
                format(p), query_plan, False)), range(0, settings.other_parts))

    if settings.table_C_key is None:

        def part_aggregate_fn(df):
            sum_ = df[settings.table_B_detail_field_name].astype(
                np.float).sum()
            return pd.DataFrame({'_0': [sum_]})

        part_aggregate = map(
            lambda p: query_plan.add_operator(
                Aggregate([
                    AggregateExpression(
                        AggregateExpression.SUM, lambda t: float(t[
                            settings.table_B_detail_field_name]))
                ], settings.use_pandas, 'part_aggregate_{}'.format(p),
                          query_plan, False, part_aggregate_fn)),
            range(0, settings.other_parts))

    else:

        def part_aggregate_fn(df):
            sum_ = df[settings.table_C_detail_field_name].astype(
                np.float).sum()
            return pd.DataFrame({'_0': [sum_]})

        part_aggregate = map(
            lambda p: query_plan.add_operator(
                Aggregate([
                    AggregateExpression(
                        AggregateExpression.SUM, lambda t: float(t[
                            settings.table_C_detail_field_name]))
                ], settings.use_pandas, 'part_aggregate_{}'.format(p),
                          query_plan, False, part_aggregate_fn)),
            range(0, settings.table_C_parts))

    def aggregate_reduce_fn(df):
        sum_ = df['_0'].astype(np.float).sum()
        return pd.DataFrame({'_0': [sum_]})

    aggregate_reduce = query_plan.add_operator(
        Aggregate([
            AggregateExpression(AggregateExpression.SUM,
                                lambda t: float(t['_0']))
        ], settings.use_pandas, 'aggregate_reduce', query_plan, False,
                  aggregate_reduce_fn))

    aggregate_project = query_plan.add_operator(
        Project([ProjectExpression(lambda t: t['_0'], 'total_balance')],
                'aggregate_project', query_plan, False))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    # Inline some of the operators
    map(lambda o: o.set_async(False), project_A)
    map(lambda o: o.set_async(False), project_B)
    map(lambda o: o.set_async(False), map_A_to_B)
    map(lambda o: o.set_async(False), map_B_to_B)
    if settings.table_C_key is not None:
        map(lambda o: o.set_async(False), map_B_to_C)
        map(lambda o: o.set_async(False), map_C_to_C)
        map(lambda o: o.set_async(False), project_C)
    map(lambda o: o.set_async(False), part_aggregate)
    aggregate_project.set_async(False)

    # Connect the operators
    connect_many_to_many(scan_A, project_A)
    connect_many_to_many(project_A, map_A_to_B)
    connect_all_to_all(map_A_to_B, join_build_A_B)
    connect_many_to_many(join_build_A_B, join_probe_A_B)

    connect_many_to_many(scan_B, project_B)
    connect_many_to_many(project_B, map_B_to_B)
    connect_all_to_all(map_B_to_B, join_probe_A_B)

    if settings.table_C_key is None:
        connect_many_to_many(join_probe_A_B, part_aggregate)
    else:
        connect_many_to_many(join_probe_A_B, map_B_to_C)
        connect_all_to_all(map_B_to_C, join_build_AB_C)
        connect_many_to_many(join_build_AB_C, join_probe_AB_C)
        connect_many_to_many(scan_C, project_C)
        connect_many_to_many(project_C, map_C_to_C)
        connect_all_to_all(map_C_to_C, join_probe_AB_C)
        connect_many_to_many(join_probe_AB_C, part_aggregate)

    connect_many_to_one(part_aggregate, aggregate_reduce)
    connect_one_to_one(aggregate_reduce, aggregate_project)
    connect_one_to_one(aggregate_project, collate)

    return query_plan
Пример #5
0
def run(parallel, use_pandas, buffer_size, lineitem_parts, part_parts):
    """The baseline tst uses nested loop joins with no projection and no filtering pushed down to s3.

    This works by:

    1. Scanning part and filtering on brand and container
    2. It then scans lineitem
    3. It then joins the two tables (essentially filtering out lineitems that dont include parts that we filtered out in
       step 1)
    4. It then computes the average of l_quantity from the joined table and groups the results by partkey
    5. It then joins these computed averages with the joined table in step 3
    6. It then filters out any rows where l_quantity is less than the computed average

    TODO: There are few ways this can be done, the above is just one.

    :return: None
    """

    print('')
    print("TPCH Q17 Baseline Join")
    print("----------------------")

    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Query plan
    part_scan = map(
        lambda p: query_plan.add_operator(
            tpch_q17.sql_scan_part_select_all_where_brand_and_container_op(
                part_parts != 1, p, part_parts, use_pandas, 'part_scan' + '_' +
                str(p), query_plan)), range(0, part_parts))

    lineitem_scan = map(
        lambda p: query_plan.add_operator(
            tpch_q17.sql_scan_lineitem_select_all_where_partkey_op(
                lineitem_parts != 1, p, lineitem_parts, use_pandas,
                'lineitem_scan' + '_' + str(p), query_plan)),
        range(0, lineitem_parts))

    part_project = map(
        lambda p: query_plan.add_operator(
            tpch_q17.project_partkey_brand_container_op(
                'part_project' + '_' + str(p), query_plan)),
        range(0, part_parts))

    part_filter = map(
        lambda p: query_plan.add_operator(
            tpch_q17.filter_brand_container_op('part_filter' + '_' + str(p),
                                               query_plan)),
        range(0, part_parts))

    part_map = map(
        lambda p: query_plan.add_operator(
            Map('p_partkey', 'part_map' + '_' + str(p), query_plan, True)),
        range(0, part_parts))

    lineitem_map = map(
        lambda p: query_plan.add_operator(
            Map('l_partkey', 'lineitem_map' + '_' + str(p), query_plan, True)),
        range(0, lineitem_parts))

    lineitem_project = map(
        lambda p: query_plan.add_operator(
            tpch_q17.
            project_lineitem_orderkey_partkey_quantity_extendedprice_op(
                'lineitem_project' + '_' + str(p), query_plan)),
        range(0, lineitem_parts))

    # part_lineitem_join = map(lambda p:
    #                          query_plan.add_operator(
    #                              tpch_q17.join_p_partkey_l_partkey_op('part_lineitem_join', query_plan)),
    #                          range(0, lineitem_parts))

    part_lineitem_join_build = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild('p_partkey', 'part_lineitem_join_build' + '_' + str(
                p), query_plan, False)), range(0, part_parts))

    part_lineitem_join_probe = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'),
                          'part_lineitem_join_probe' + '_' + str(
                              p), query_plan, True)), range(0, part_parts))

    lineitem_part_avg_group = map(
        lambda p: query_plan.add_operator(
            tpch_q17.group_partkey_avg_quantity_5_op(
                'lineitem_part_avg_group' + '_' + str(p), query_plan)),
        range(0, part_parts))

    lineitem_part_avg_group_project = map(
        lambda p: query_plan.add_operator(
            tpch_q17.project_partkey_avg_quantity_op(
                'lineitem_part_avg_group_project' + '_' + str(p), query_plan)),
        range(0, part_parts))

    # part_lineitem_join_avg_group_join = map(lambda p:
    #                                         query_plan.add_operator(
    #                                             tpch_q17.join_l_partkey_p_partkey_op(
    #                                                 'part_lineitem_join_avg_group_join', query_plan)),
    #                                         range(0, lineitem_parts))

    part_lineitem_join_avg_group_join_build = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild(
                'l_partkey', 'part_lineitem_join_avg_group_join_build' + '_' +
                str(p), query_plan, False)), range(0, part_parts))

    part_lineitem_join_avg_group_join_probe = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(
                JoinExpression('l_partkey', 'p_partkey'),
                'part_lineitem_join_avg_group_join_probe' + '_' + str(
                    p), query_plan, True)), range(0, part_parts))

    lineitem_filter = map(
        lambda p: query_plan.add_operator(
            tpch_q17.filter_lineitem_quantity_op(
                'lineitem_filter' + '_' + str(p), query_plan)),
        range(0, part_parts))

    extendedprice_sum_aggregate = map(
        lambda p: query_plan.add_operator(
            tpch_q17.aggregate_sum_extendedprice_op(
                'extendedprice_sum_aggregate' + '_' + str(p), query_plan)),
        range(0, part_parts))

    aggregate_reduce = query_plan.add_operator(
        Aggregate([
            AggregateExpression(AggregateExpression.SUM,
                                lambda t: float(t['_0']))
        ], 'aggregate_reduce', query_plan, False))

    extendedprice_sum_aggregate_project = query_plan.add_operator(
        tpch_q17.project_avg_yearly_op('extendedprice_sum_aggregate_project',
                                       query_plan))

    collate = query_plan.add_operator(
        tpch_q17.collate_op('collate', query_plan))

    # Connect the operators
    # part_scan.connect(part_project)
    map(lambda (p, o): o.connect(part_project[p]), enumerate(part_scan))
    map(lambda (p, o): o.connect(part_filter[p]), enumerate(part_project))
    map(lambda (p, o): o.connect(part_map[p]), enumerate(part_filter))

    # lineitem_scan.connect(lineitem_project)
    map(lambda (p, o): o.connect(lineitem_project[p]),
        enumerate(lineitem_scan))
    map(lambda (p, o): o.connect(lineitem_map[p]), enumerate(lineitem_project))

    # part_lineitem_join.connect_left_producer(part_project)
    # part_lineitem_join.connect_right_producer(lineitem_project)
    # part_lineitem_join.connect(lineitem_part_avg_group)
    map(
        lambda (p1, o1): map(lambda (p2, o2): o1.connect(o2),
                             enumerate(part_lineitem_join_build)),
        enumerate(part_map))
    map(lambda (p, o): part_lineitem_join_probe[p].connect_build_producer(o),
        enumerate(part_lineitem_join_build))
    map(
        lambda (p1, o1): map(lambda (p2, o2): o2.connect_tuple_producer(o1),
                             enumerate(part_lineitem_join_probe)),
        enumerate(lineitem_map))
    map(lambda (p, o): o.connect(lineitem_part_avg_group[p]),
        enumerate(part_lineitem_join_probe))

    # map(lambda (p, o): o.map(Mapper('_1', 1, part_lineitem_join_probe)), enumerate(lineitem_scan))

    # lineitem_part_avg_group.connect(lineitem_part_avg_group_project)
    map(lambda (p, o): o.connect(lineitem_part_avg_group_project[p]),
        enumerate(lineitem_part_avg_group))

    # part_lineitem_join_avg_group_join.connect_left_producer(lineitem_part_avg_group_project)
    # part_lineitem_join_avg_group_join.connect_right_producer(part_lineitem_join)
    # part_lineitem_join_avg_group_join.connect(lineitem_filter)

    map(lambda (p, o): o.connect(part_lineitem_join_avg_group_join_build[p]),
        enumerate(lineitem_part_avg_group_project))

    # map(lambda (p, o): map(lambda (bp, bo): o.connect_build_producer(bo),
    #                        enumerate(part_lineitem_join_avg_group_join_build)),
    #     enumerate(part_lineitem_join_avg_group_join_probe))
    map(
        lambda (p, o): part_lineitem_join_avg_group_join_probe[p].
        connect_build_producer(o),
        enumerate(part_lineitem_join_avg_group_join_build))
    map(
        lambda (p, o): part_lineitem_join_avg_group_join_probe[p % part_parts].
        connect_tuple_producer(o), enumerate(part_lineitem_join_probe))
    map(lambda (p, o): o.connect(lineitem_filter[p]),
        enumerate(part_lineitem_join_avg_group_join_probe))

    # lineitem_filter.connect(extendedprice_sum_aggregate)
    map(lambda (p, o): o.connect(extendedprice_sum_aggregate[p]),
        enumerate(lineitem_filter))
    # extendedprice_sum_aggregate.connect(extendedprice_sum_aggregate_project)
    # extendedprice_sum_aggregate_project.connect(collate)
    map(lambda (p, o): o.connect(aggregate_reduce),
        enumerate(extendedprice_sum_aggregate))
    aggregate_reduce.connect(extendedprice_sum_aggregate_project)
    extendedprice_sum_aggregate_project.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("lineitem parts: {}".format(lineitem_parts))
    print("part_parts: {}".format(part_parts))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"),
                           gen_test_id())

    # Start the query
    query_plan.execute()

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    field_names = ['avg_yearly']

    assert len(tuples) == 1 + 1

    assert tuples[0] == field_names

    # NOTE: This result has been verified with the equivalent data and query on PostgreSQL
    assert round(float(tuples[1][0]), 10) == 4632.1085714286
Пример #6
0
def run(parallel, use_pandas, secure, use_native, buffer_size, lineitem_parts,
        part_parts, lineitem_sharded, part_sharded):
    """

    :return: None
    """

    print('')
    print("TPCH Q14 Semi Join")
    print("------------------")

    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Query plan
    # DATE is the first day of a month randomly selected from a random year within [1993 .. 1997].
    date = '1993-01-01'
    min_shipped_date = datetime.strptime(date, '%Y-%m-%d')
    max_shipped_date = datetime.strptime(date, '%Y-%m-%d') + timedelta(days=30)

    part_scan_1 = map(
        lambda p: query_plan.add_operator(
            tpch_q14.sql_scan_part_partkey_where_brand12_operator_def(
                part_sharded, p, part_parts, use_pandas, secure, use_native,
                'part_table_scan_1' + '_' + str(p), query_plan)),
        range(0, part_parts))
    part_1_project = map(
        lambda p: query_plan.add_operator(
            tpch_q14.project_p_partkey_operator_def(
                'part_1_project' + '_' + str(p), query_plan)),
        range(0, part_parts))
    part_bloom_create = map(
        lambda p: query_plan.add_operator(
            tpch_q14.bloom_create_p_partkey_operator_def(
                'part_bloom_create' + '_' + str(p), query_plan)),
        range(0, part_parts))
    lineitem_scan_1 = map(
        lambda p: query_plan.add_operator(
            tpch_q14.bloom_scan_lineitem_partkey_where_shipdate_operator_def(
                min_shipped_date, max_shipped_date, lineitem_sharded, p,
                use_pandas, secure, use_native, 'lineitem_scan_1' + '_' + str(
                    p), query_plan)), range(0, lineitem_parts))
    lineitem_1_project = map(
        lambda p: query_plan.add_operator(
            tpch_q14.project_l_partkey_operator_def(
                'lineitem_1_project' + '_' + str(p), query_plan)),
        range(0, lineitem_parts))
    part_lineitem_join_1_build = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild('p_partkey', 'part_lineitem_join_1_build' + '_' +
                          str(p), query_plan, False)), range(0, part_parts))
    part_lineitem_join_1_probe = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'),
                          'part_lineitem_join_1_probe' + '_' + str(
                              p), query_plan, False)), range(0, part_parts))
    join_bloom_create = map(
        lambda p: query_plan.add_operator(
            tpch_q14.bloom_create_l_partkey_operator_def(
                'join_bloom_create' + '_' + str(p), query_plan)),
        range(0, part_parts))
    part_scan_2 = map(
        lambda p: query_plan.add_operator(
            tpch_q14.bloom_scan_part_partkey_type_brand12_operator_def(
                part_sharded, p, part_parts, use_pandas, secure, use_native,
                'part_table_scan_2' + '_' + str(p), query_plan)),
        range(0, part_parts))
    part_2_project = map(
        lambda p: query_plan.add_operator(
            tpch_q14.project_partkey_type_operator_def(
                'part_2_project' + '_' + str(p), query_plan)),
        range(0, part_parts))
    lineitem_scan_2 = map(
        lambda p: query_plan.add_operator(
            tpch_q14.bloom_scan_lineitem_where_shipdate_operator_def(
                min_shipped_date, max_shipped_date, lineitem_sharded, p,
                use_pandas, secure, use_native, 'lineitem_scan_2' + '_' + str(
                    p), query_plan)), range(0, lineitem_parts))
    lineitem_2_project = map(
        lambda p: query_plan.add_operator(
            tpch_q14.project_partkey_extendedprice_discount_operator_def(
                'lineitem_2_project' + '_' + str(p), query_plan)),
        range(0, lineitem_parts))

    part_lineitem_join_2_build = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild('p_partkey', 'part_lineitem_join_2_build' + '_' +
                          str(p), query_plan, False)), range(0, part_parts))
    part_lineitem_join_2_probe = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'),
                          'part_lineitem_join_2_probe' + '_' + str(
                              p), query_plan, False)), range(0, part_parts))

    part_aggregate = map(
        lambda p: query_plan.add_operator(
            tpch_q14.aggregate_promo_revenue_operator_def(
                'part_aggregate' + '_' + str(p), query_plan)),
        range(0, part_parts))

    aggregate_reduce = query_plan.add_operator(
        Aggregate([
            AggregateExpression(AggregateExpression.SUM,
                                lambda t: float(t['_0'])),
            AggregateExpression(AggregateExpression.SUM,
                                lambda t: float(t['_1']))
        ], 'aggregate_reduce', query_plan, False))

    aggregate_project = query_plan.add_operator(
        tpch_q14.project_promo_revenue_operator_def('aggregate_project',
                                                    query_plan))

    collate = query_plan.add_operator(
        tpch_q14.collate_operator_def('collate', query_plan))

    # Connect the operators
    map(lambda o, p: o.connect(part_1_project[p]), enumerate(part_scan_1))
    map(lambda (p, o): o.connect(part_bloom_create[p]),
        enumerate(part_1_project)),
    map(
        lambda
        (p, o): map(lambda
                    (bp, bo): bo.connect(o), enumerate(part_bloom_create)),
        enumerate(lineitem_scan_1))
    map(lambda (p, o): o.connect(part_lineitem_join_1_build[p]),
        enumerate(part_1_project))
    map(
        lambda (p, o): map(lambda (bp, bo): o.connect_build_producer(bo),
                           enumerate(part_lineitem_join_1_build)),
        enumerate(part_lineitem_join_1_probe))
    map(lambda (p, o): o.connect(lineitem_1_project[p]),
        enumerate(lineitem_scan_1))
    map(
        lambda (p, o): part_lineitem_join_1_probe[p % part_parts].
        connect_tuple_producer(o), enumerate(lineitem_1_project))
    map(lambda (p, o): o.connect(join_bloom_create[p]),
        enumerate(part_lineitem_join_1_probe))
    map(
        lambda (p, o): map(lambda
                           (bp, bo): o.connect(bo), enumerate(part_scan_2)),
        enumerate(join_bloom_create))
    map(
        lambda
        (p, o): map(lambda
                    (bp, bo): bo.connect(o), enumerate(join_bloom_create)),
        enumerate(lineitem_scan_2))
    map(lambda (p, o): o.connect(part_2_project[p]), enumerate(part_scan_2))
    map(lambda (p, o): o.connect(part_lineitem_join_2_build[p]),
        enumerate(part_2_project))
    map(
        lambda (p, o): map(lambda (bp, bo): o.connect_build_producer(bo),
                           enumerate(part_lineitem_join_2_build)),
        enumerate(part_lineitem_join_2_probe))
    map(lambda (p, o): o.connect(lineitem_2_project[p]),
        enumerate(lineitem_scan_2))
    map(
        lambda (p, o): part_lineitem_join_2_probe[p % part_parts].
        connect_tuple_producer(o), enumerate(lineitem_2_project))
    map(lambda (p, o): o.connect(part_aggregate[p]),
        enumerate(part_lineitem_join_2_probe))
    map(lambda (p, o): o.connect(aggregate_reduce), enumerate(part_aggregate))
    aggregate_reduce.connect(aggregate_project)
    aggregate_project.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print('secure: {}'.format(secure))
    print('use_native: {}'.format(use_native))
    print("lineitem parts: {}".format(lineitem_parts))
    print("part_parts: {}".format(part_parts))
    print("lineitem_sharded: {}".format(lineitem_sharded))
    print("part_sharded: {}".format(part_sharded))
    print('')

    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"),
                           gen_test_id())

    # Start the query
    query_plan.execute()

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    field_names = ['promo_revenue']

    assert len(tuples) == 1 + 1

    assert tuples[0] == field_names

    # NOTE: This result has been verified with the equivalent data and query on PostgreSQL
    if s3filter.util.constants.TPCH_SF == 10:
        assert round(float(tuples[1][0]), 10) == 15.4488836202
    elif s3filter.util.constants.TPCH_SF == 1:
        numpy.testing.assert_almost_equal(float(tuples[1][0]), 15.0901165263)
Пример #7
0
def run(parallel, use_pandas, secure, use_native, buffer_size, lineitem_parts,
        part_parts, lineitem_sharded, part_sharded, other_parts, sf,
        expected_result, format_):
    """

    :return: None
    """

    print('')
    print("TPCH Q14 Baseline Join")
    print("----------------------")

    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Query plan
    # DATE is the first day of a month randomly selected from a random year within [1993 .. 1997].
    date = '1993-01-01'
    min_shipped_date = datetime.strptime(date, '%Y-%m-%d')
    max_shipped_date = datetime.strptime(date, '%Y-%m-%d') + timedelta(days=30)

    lineitem_scan = map(
        lambda p: query_plan.add_operator(
            tpch_q14.sql_scan_lineitem_operator_def(
                lineitem_sharded, p, lineitem_parts, use_pandas, secure,
                use_native, 'lineitem_scan' + '_' + str(
                    p), query_plan, sf, format_)), range(0, lineitem_parts))

    lineitem_project = map(
        lambda p: query_plan.add_operator(
            tpch_q14.
            project_partkey_extendedprice_discount_shipdate_operator_def(
                'lineitem_project' + '_' + str(p), query_plan)),
        range(0, lineitem_parts))

    part_scan = map(
        lambda p: query_plan.add_operator(
            tpch_q14.sql_scan_part_operator_def(
                part_sharded, p, part_parts, use_pandas, secure, use_native,
                'part_scan' + '_' + str(p), query_plan, sf, format_)),
        range(0, part_parts))

    part_project = map(
        lambda p: query_plan.add_operator(
            tpch_q14.project_partkey_brand_type_operator_def(
                'part_project' + '_' + str(p), query_plan)),
        range(0, part_parts))

    lineitem_filter = map(
        lambda p: query_plan.add_operator(
            tpch_q14.filter_shipdate_operator_def(
                min_shipped_date, max_shipped_date, 'lineitem_filter' + '_' +
                str(p), query_plan)), range(0, lineitem_parts))

    lineitem_map = map(
        lambda p: query_plan.add_operator(
            Map('l_partkey', 'lineitem_map' + '_' + str(p), query_plan, False)
        ), range(0, lineitem_parts))

    part_map = map(
        lambda p: query_plan.add_operator(
            Map('p_partkey', 'part_map' + '_' + str(p), query_plan, False)),
        range(0, part_parts))

    # part_filter = map(lambda p:
    #                   query_plan.add_operator(
    #                       tpch_q14.filter_brand12_operator_def('part_filter' + '_' + str(p), query_plan)),
    #                   range(0, part_parts))

    join_build = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild('p_partkey', 'join_build' + '_' + str(p), query_plan,
                          False)), range(0, other_parts))

    join_probe = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'),
                          'join_probe' + '_' + str(p), query_plan, False)),
        range(0, other_parts))

    part_aggregate = map(
        lambda p: query_plan.add_operator(
            tpch_q14.aggregate_promo_revenue_operator_def(
                'part_aggregate' + '_' + str(p), query_plan)),
        range(0, other_parts))

    def aggregate_reduce_fn(df):
        sum1_ = df['_0'].astype(np.float).sum()
        sum2_ = df['_1'].astype(np.float).sum()
        return pd.DataFrame({'_0': [sum1_], '_1': [sum2_]})

    aggregate_reduce = query_plan.add_operator(
        Aggregate([
            AggregateExpression(AggregateExpression.SUM,
                                lambda t: float(t['_0'])),
            AggregateExpression(AggregateExpression.SUM,
                                lambda t: float(t['_1']))
        ], use_pandas, 'aggregate_reduce', query_plan, False,
                  aggregate_reduce_fn))

    aggregate_project = query_plan.add_operator(
        tpch_q14.project_promo_revenue_operator_def('aggregate_project',
                                                    query_plan))

    collate = query_plan.add_operator(
        tpch_q14.collate_operator_def('collate', query_plan))

    # Inline what we can
    map(lambda o: o.set_async(False), lineitem_project)
    map(lambda o: o.set_async(False), part_project)
    map(lambda o: o.set_async(False), lineitem_filter)
    map(lambda o: o.set_async(False), part_map)
    map(lambda o: o.set_async(False), lineitem_map)
    map(lambda o: o.set_async(False), part_aggregate)
    aggregate_project.set_async(False)

    # Connect the operators
    connect_many_to_many(lineitem_scan, lineitem_project)
    connect_many_to_many(lineitem_project, lineitem_filter)
    connect_many_to_many(part_scan, part_project)
    connect_many_to_many(part_project, part_map)
    connect_all_to_all(part_map, join_build)
    connect_many_to_many(join_build, join_probe)
    connect_many_to_many(lineitem_filter, lineitem_map)
    connect_all_to_all(lineitem_map, join_probe)
    connect_many_to_many(join_probe, part_aggregate)
    connect_many_to_one(part_aggregate, aggregate_reduce)
    connect_one_to_one(aggregate_reduce, aggregate_project)
    connect_one_to_one(aggregate_project, collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print('secure: {}'.format(secure))
    print('use_native: {}'.format(use_native))
    print("lineitem parts: {}".format(lineitem_parts))
    print("part_parts: {}".format(part_parts))
    print("lineitem_sharded: {}".format(lineitem_sharded))
    print("part_sharded: {}".format(part_sharded))
    print("other_parts: {}".format(other_parts))
    print("format: {}".format(format_))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"),
                           gen_test_id() + "-" + str(lineitem_parts))

    # Start the query
    query_plan.execute()

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    field_names = ['promo_revenue']

    assert len(tuples) == 1 + 1

    assert tuples[0] == field_names

    # NOTE: This result has been verified with the equivalent data and query on PostgreSQL
    if s3filter.util.constants.TPCH_SF == 10:
        assert round(float(tuples[1][0]), 10) == 15.4488836202
    elif s3filter.util.constants.TPCH_SF == 1:
        numpy.testing.assert_approx_equal(float(tuples[1][0]), expected_result)
Пример #8
0
def run(parallel, buffer_size):
    """Tests a with sharded operators and separate build and probe for join

    :return: None
    """

    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Query plan
    parts = 2

    collate = query_plan.add_operator(Collate('collate', query_plan, False))
    merge = query_plan.add_operator(Merge('merge', query_plan, False))

    hash_join_build_ops = []
    hash_join_probe_ops = []
    for p in range(1, parts + 1):
        r_region_key_lower = math.ceil((5.0 / float(parts)) * (p - 1))
        r_region_key_upper = math.ceil((5.0 / float(parts)) * p)

        region_scan = query_plan.add_operator(
            SQLTableScan('region.csv',
                         'select * '
                         'from S3Object '
                         'where cast(r_regionkey as int) >= {} and cast(r_regionkey as int) < {};'
                         .format(r_region_key_lower, r_region_key_upper),
                         False, 'region_scan' + '_' + str(p),
                         query_plan,
                         False))

        region_project = query_plan.add_operator(
            Project([
                ProjectExpression(lambda t_: t_['_0'], 'r_regionkey'),
                ProjectExpression(lambda t_: t_['_1'], 'r_name')
            ], 'region_project' + '_' + str(p), query_plan, False))

        n_nation_key_lower = math.ceil((25.0 / float(parts)) * (p - 1))
        n_nation_key_upper = math.ceil((25.0 / float(parts)) * p)

        nation_scan = query_plan.add_operator(
            SQLTableScan('nation.csv',
                         'select * from S3Object '
                         'where cast(n_nationkey as int) >= {} and cast(n_nationkey as int) < {};'
                         .format(n_nation_key_lower, n_nation_key_upper),
                         False, 'nation_scan' + '_' + str(p),
                         query_plan,
                         False))

        nation_project = query_plan.add_operator(
            Project([
                ProjectExpression(lambda t_: t_['_0'], 'n_nationkey'),
                ProjectExpression(lambda t_: t_['_1'], 'n_name'),
                ProjectExpression(lambda t_: t_['_2'], 'n_regionkey')
            ], 'nation_project' + '_' + str(p), query_plan, False))

        region_hash_join_build = query_plan.add_operator(
            HashJoinBuild('r_regionkey', 'region_hash_join_build' + '_' + str(p), query_plan, False))
        hash_join_build_ops.append(region_hash_join_build)

        region_nation_join_probe = query_plan.add_operator(
            HashJoinProbe(JoinExpression('r_regionkey', 'n_regionkey'), 'region_nation_join_probe' + '_' + str(p),
                          query_plan, False))
        hash_join_probe_ops.append(region_nation_join_probe)

        region_scan.connect(region_project)
        nation_scan.connect(nation_project)
        region_project.connect(region_hash_join_build)
        region_nation_join_probe.connect_tuple_producer(nation_project)
        region_nation_join_probe.connect(merge)

    for probe_op in hash_join_probe_ops:
        for build_op in hash_join_build_ops:
            probe_op.connect_build_producer(build_op)

    merge.connect(collate)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    field_names = ['r_regionkey', 'r_name', 'n_nationkey', 'n_name', 'n_regionkey']

    assert len(tuples) == 25 + 1

    assert tuples[0] == field_names

    num_rows = 0
    for t in tuples:
        num_rows += 1
        # Assert that the nation_key in table 1 has been joined with the record in table 2 with the same nation_key
        if num_rows > 1:
            lt = IndexedTuple.build(t, field_names)
            assert lt['r_regionkey'] == lt['n_regionkey']
Пример #9
0
def run(parallel, use_pandas, buffer_size, lineitem_parts, part_parts, sharded):
    """
    :return: None
    """

    print('')
    print("TPCH Q19 Baseline Join")
    print("----------------------")

    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Define the operators
    lineitem_scan = map(lambda p:
                        query_plan.add_operator(
                            tpch_q19.sql_scan_lineitem_select_all_where_partkey_op(sharded,
                                                                                   p,
                                                                                   lineitem_parts,
                                                                                   use_pandas,
                                                                                   'lineitem_scan' + '_' + str(p),
                                                                                   query_plan)),
                        range(0, lineitem_parts))

    part_scan = map(lambda p:
                    query_plan.add_operator(
                        tpch_q19.sql_scan_part_select_all_where_partkey_op(sharded,
                                                                           p,
                                                                           part_parts,
                                                                           use_pandas,
                                                                           'part_scan' + '_' + str(p), query_plan)),
                    range(0, part_parts))

    lineitem_project = map(lambda p:
                           query_plan.add_operator(
                               tpch_q19.project_partkey_quantity_extendedprice_discount_shipinstruct_shipmode_op(
                                   'lineitem_project' + '_' + str(p), query_plan)),
                           range(0, lineitem_parts))

    lineitem_map = map(lambda p:
                       query_plan.add_operator(Map('l_partkey', 'lineitem_map' + '_' + str(p), query_plan, True)),
                       range(0, part_parts))

    part_project = map(lambda p:
                       query_plan.add_operator(
                           tpch_q19.project_partkey_brand_size_container_op('part_project' + '_' + str(p),
                                                                            query_plan)),
                       range(0, part_parts))

    part_map = map(lambda p:
                   query_plan.add_operator(Map('p_partkey', 'part_map' + '_' + str(p), query_plan, True)),
                   range(0, part_parts))

    # lineitem_part_join = map(lambda p:
    #                          query_plan.add_operator(tpch_q19.join_op(query_plan)),
    #                          range(0, part_parts))

    lineitem_part_join_build = map(lambda p:
                                   query_plan.add_operator(
                                       HashJoinBuild('p_partkey',
                                                     'lineitem_partjoin_build' + '_' + str(p), query_plan,
                                                     False)),
                                   range(0, part_parts))

    lineitem_part_join_probe = map(lambda p:
                                   query_plan.add_operator(
                                       HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'),
                                                     'lineitem_part_join_probe' + '_' + str(p),
                                                     query_plan, True)),
                                   range(0, part_parts))

    filter_op = map(lambda p:
                    query_plan.add_operator(tpch_q19.filter_def('filter_op' + '_' + str(p), query_plan)),
                    range(0, part_parts))
    aggregate = map(lambda p:
                    query_plan.add_operator(tpch_q19.aggregate_def('aggregate' + '_' + str(p), query_plan)),
                    range(0, part_parts))

    aggregate_reduce = query_plan.add_operator(
        Aggregate(
            [
                AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0']))
            ],
            'aggregate_reduce',
            query_plan,
            False))

    aggregate_project = query_plan.add_operator(
        tpch_q19.aggregate_project_def('aggregate_project', query_plan))
    collate = query_plan.add_operator(tpch_q19.collate_op('collate', query_plan))

    # Connect the operators
    # lineitem_scan.connect(lineitem_project)
    connect_many_to_many(lineitem_scan, lineitem_project)
    connect_all_to_all(lineitem_project, lineitem_map)

    # part_scan.connect(part_project)
    connect_many_to_many(part_scan, part_project)
    connect_many_to_many(part_project, part_map)

    # lineitem_part_join.connect_left_producer(lineitem_project)
    connect_all_to_all(part_map, lineitem_part_join_build)

    # lineitem_part_join.connect_right_producer(part_project)
    connect_many_to_many(lineitem_part_join_build, lineitem_part_join_probe)
    connect_many_to_many(lineitem_map, lineitem_part_join_probe)

    # lineitem_part_join.connect(filter_op)
    connect_many_to_many(lineitem_part_join_probe, filter_op)

    # filter_op.connect(aggregate)
    connect_many_to_many(filter_op, aggregate)

    # aggregate.connect(aggregate_project)
    connect_many_to_one(aggregate, aggregate_reduce)
    connect_one_to_one(aggregate_reduce, aggregate_project)

    # aggregate_project.connect(collate)
    connect_one_to_one(aggregate_project, collate)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    field_names = ['revenue']

    assert len(tuples) == 1 + 1

    assert tuples[0] == field_names

    # NOTE: This result has been verified with the equivalent data and query on PostgreSQL
    numpy.testing.assert_almost_equal(tuples[1], 92403.0667)
Пример #10
0
def run(parallel, use_pandas, buffer_size, lineitem_parts, part_parts):
    """

    :return: None
    """

    print('')
    print("TPCH Q14 Baseline Join")
    print("----------------------")

    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Query plan
    # This date is chosen because it triggers the filter to filter out 1 of the rows in the root data set.
    date = '1996-03-13'
    min_shipped_date = datetime.strptime(date, '%Y-%m-%d')
    max_shipped_date = datetime.strptime(date, '%Y-%m-%d') + timedelta(days=30)

    lineitem_scan = map(
        lambda p: query_plan.add_operator(
            tpch_q14.sql_scan_lineitem_extra_filtered_operator_def(
                lineitem_parts != 1, p, use_pandas, 'lineitem_scan' + '_' +
                str(p), query_plan)), range(0, lineitem_parts))

    lineitem_project = map(
        lambda p: query_plan.add_operator(
            tpch_q14.
            project_partkey_extendedprice_discount_shipdate_operator_def(
                'lineitem_project' + '_' + str(p), query_plan)),
        range(0, lineitem_parts))

    part_scan = map(
        lambda p: query_plan.add_operator(
            tpch_q14.sql_scan_part_operator_def(part_parts != 1, p, part_parts,
                                                use_pandas, 'part_scan' + '_' +
                                                str(p), query_plan)),
        range(0, part_parts))

    part_project = map(
        lambda p: query_plan.add_operator(
            tpch_q14.project_partkey_brand_type_operator_def(
                'part_project' + '_' + str(p), query_plan)),
        range(0, part_parts))

    lineitem_filter = map(
        lambda p: query_plan.add_operator(
            tpch_q14.filter_shipdate_operator_def(
                min_shipped_date, max_shipped_date, 'lineitem_filter' + '_' +
                str(p), query_plan)), range(0, lineitem_parts))

    part_filter = map(
        lambda p: query_plan.add_operator(
            tpch_q14.filter_brand12_operator_def('part_filter' + '_' + str(p),
                                                 query_plan)),
        range(0, part_parts))

    join_build = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild('p_partkey', 'join_build' + '_' + str(p), query_plan,
                          False)), range(0, part_parts))

    join_probe = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'),
                          'join_probe' + '_' + str(p), query_plan, False)),
        range(0, part_parts))

    part_aggregate = map(
        lambda p: query_plan.add_operator(
            tpch_q14.aggregate_promo_revenue_operator_def(
                'part_aggregate' + '_' + str(p), query_plan)),
        range(0, part_parts))

    aggregate_reduce = query_plan.add_operator(
        Aggregate([
            AggregateExpression(AggregateExpression.SUM,
                                lambda t: float(t['_0'])),
            AggregateExpression(AggregateExpression.SUM,
                                lambda t: float(t['_1']))
        ], 'aggregate_reduce', query_plan, False))

    aggregate_project = query_plan.add_operator(
        tpch_q14.project_promo_revenue_operator_def('aggregate_project',
                                                    query_plan))

    collate = query_plan.add_operator(
        tpch_q14.collate_operator_def('collate', query_plan))

    # Connect the operators
    map(lambda (p, o): o.connect(lineitem_project[p]),
        enumerate(lineitem_scan))
    map(lambda (p, o): o.connect(lineitem_filter[p]),
        enumerate(lineitem_project))
    map(lambda (p, o): o.connect(part_project[p]), enumerate(part_scan))
    map(lambda (p, o): o.connect(part_filter[p]), enumerate(part_project))
    map(lambda (p, o): o.connect(join_build[p]), enumerate(part_filter))
    map(
        lambda (p, o): map(lambda (bp, bo): o.connect_build_producer(bo),
                           enumerate(join_build)), enumerate(join_probe))
    map(lambda (p, o): join_probe[p % part_parts].connect_tuple_producer(o),
        enumerate(lineitem_filter))
    map(lambda (p, o): o.connect(part_aggregate[p]), enumerate(join_probe))
    map(lambda (p, o): o.connect(aggregate_reduce), enumerate(part_aggregate))
    aggregate_reduce.connect(aggregate_project)
    aggregate_project.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("lineitem parts: {}".format(lineitem_parts))
    print("part_parts: {}".format(part_parts))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"),
                           gen_test_id())

    # Start the query
    query_plan.execute()

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    field_names = ['promo_revenue']

    assert len(tuples) == 1 + 1

    assert tuples[0] == field_names

    # NOTE: This result has been verified with the equivalent data and query on PostgreSQL
    assert round(float(tuples[1][0]), 10) == 33.4262326420
Пример #11
0
def query_plan(settings):
    # type: (SyntheticBloomJoinSettings) -> QueryPlan
    """

    :return: None
    """

    query_plan = QueryPlan(is_async=settings.parallel,
                           buffer_size=settings.buffer_size)

    def scan_A_fn(df):
        df.columns = settings.table_A_field_names
        return df

    # Define the operators
    scan_A = \
        map(lambda p:
            query_plan.add_operator(
                SQLTableScan(get_file_key(settings.table_A_key, settings.table_A_sharded, p),
                             "select "
                             "  {} "
                             "from "
                             "  S3Object "
                             "where "
                             "  {} "
                             "  {} "
                             .format(','.join(settings.table_A_field_names),
                                     settings.table_A_filter_sql,
                                     get_sql_suffix(settings.table_A_key, settings.table_A_parts, p,
                                                    settings.table_A_sharded)), settings.format_,
                             settings.use_pandas,
                             settings.secure,
                             settings.use_native,
                             'scan_A_{}'.format(p),
                             query_plan,
                             False,
                             fn=scan_A_fn)),
            range(0, settings.table_A_parts))
    """
    field_names_map_A = OrderedDict(
        zip(['_{}'.format(i) for i, name in enumerate(settings.table_A_field_names)], settings.table_A_field_names))

    def project_fn_A(df):
        df.rename(columns=field_names_map_A, inplace=True)
        return df

    project_A = map(lambda p:
                    query_plan.add_operator(Project(
                        [ProjectExpression(k, v) for k, v in field_names_map_A.iteritems()],
                        'project_A_{}'.format(p),
                        query_plan,
                        True,
                        project_fn_A)),
                    range(0, settings.table_A_parts))
    """

    bloom_create_a = map(
        lambda p: query_plan.add_operator(
            BloomCreate(settings.table_A_AB_join_key, 'bloom_create_a_{}'.
                        format(p), query_plan, False)),
        range(0, settings.table_A_parts))

    def scan_B_fn(df):
        df.columns = settings.table_B_field_names
        return df

    scan_B = \
        map(lambda p:
            query_plan.add_operator(
                SQLTableScanBloomUse(get_file_key(settings.table_B_key, settings.table_B_sharded, p),
                                     "select "
                                     "  {} "
                                     "from "
                                     "  S3Object "
                                     "where "
                                     "  {} "
                                     "  {} "
                                     .format(','.join(settings.table_B_field_names),
                                             settings.table_B_filter_sql,
                                             get_sql_suffix(settings.table_B_key, settings.table_B_parts, p,
                                                            settings.table_B_sharded, add_where=False)),
                                     settings.table_B_AB_join_key, settings.format_,
                                     settings.use_pandas,
                                     settings.secure,
                                     settings.use_native,
                                     'scan_B_{}'.format(p),
                                     query_plan,
                                     False,
                                     fn=scan_B_fn)),
            range(0, settings.table_B_parts))
    """
    field_names_map_B = OrderedDict(
        zip(['_{}'.format(i) for i, name in enumerate(settings.table_B_field_names)], settings.table_B_field_names))

    def project_fn_B(df):
        df.rename(columns=field_names_map_B, inplace=True)
        return df

    project_B = map(lambda p:
                    query_plan.add_operator(Project(
                        [ProjectExpression(k, v) for k, v in field_names_map_B.iteritems()],
                        'project_B_{}'.format(p),
                        query_plan,
                        True,
                        project_fn_B)),
                    range(0, settings.table_B_parts))
    """

    def scan_C_fn(df):
        df.columns = settings.table_C_field_names
        return df

    scan_C = \
        map(lambda p:
            query_plan.add_operator(
                SQLTableScanBloomUse(get_file_key(settings.table_C_key, settings.table_C_sharded, p),
                                     "select "
                                     "  {} "
                                     "from "
                                     "  S3Object "
                                     "where "
                                     "  {} "
                                     "  {} "
                                     .format(','.join(settings.table_C_field_names),
                                             settings.table_C_filter_sql,
                                             get_sql_suffix(settings.table_C_key, settings.table_C_parts, p,
                                                            settings.table_C_sharded, add_where=False)),
                                     settings.table_C_BC_join_key, settings.format_,
                                     settings.use_pandas,
                                     settings.secure,
                                     settings.use_native,
                                     'scan_C_{}'.format(p),
                                     query_plan,
                                     False,
                                     fn=scan_C_fn)),
            range(0, settings.table_C_parts))
    """
    field_names_map_C = OrderedDict(
        zip(['_{}'.format(i) for i, name in enumerate(settings.table_C_field_names)], settings.table_C_field_names))

    def project_fn_C(df):
        df.rename(columns=field_names_map_C, inplace=True)
        return df

    project_C = map(lambda p:
                    query_plan.add_operator(Project(
                        [ProjectExpression(k, v) for k, v in field_names_map_C.iteritems()],
                        'project_C_{}'.format(p),
                        query_plan,
                        True,
                        project_fn_C)),
                    range(0, settings.table_C_parts))
    """

    map_A_to_B = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_A_AB_join_key, 'map_A_to_B_{}'.format(p),
                query_plan, False)), range(0, settings.table_A_parts))

    map_B_to_B = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_B_AB_join_key, 'map_B_to_B_{}'.format(p),
                query_plan, False)), range(0, settings.table_B_parts))

    map_B_to_C = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_B_BC_join_key, 'map_B_to_C_{}'.format(p),
                query_plan, False)), range(0, settings.table_B_parts))

    map_C_to_C = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_C_BC_join_key, 'map_C_to_C_{}'.format(p),
                query_plan, False)), range(0, settings.table_C_parts))

    join_build_A_B = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild(settings.table_A_AB_join_key, 'join_build_A_B_{}'.
                          format(p), query_plan, False)),
        range(0, settings.table_B_parts))

    join_probe_A_B = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(
                JoinExpression(settings.table_A_AB_join_key, settings.
                               table_B_AB_join_key), 'join_probe_A_B_{}'
                .format(p), query_plan, False)),
        range(0, settings.table_B_parts))

    bloom_create_ab = map(
        lambda p: query_plan.add_operator(
            BloomCreate(settings.table_B_BC_join_key, 'bloom_create_ab_{}'.
                        format(p), query_plan, False)),
        range(0, settings.table_B_parts))

    join_build_AB_C = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild(settings.table_B_BC_join_key, 'join_build_AB_C_{}'.
                          format(p), query_plan, False)),
        range(0, settings.table_C_parts))

    join_probe_AB_C = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(
                JoinExpression(settings.table_B_BC_join_key, settings.
                               table_C_BC_join_key), 'join_probe_AB_C_{}'
                .format(p), query_plan, False)),
        range(0, settings.table_C_parts))

    def part_aggregate_fn(df):
        sum_ = df[settings.table_C_detail_field_name].astype(float).sum()
        return pd.DataFrame({'_0': [sum_]})

    part_aggregate = map(
        lambda p: query_plan.add_operator(
            Aggregate([
                AggregateExpression(
                    AggregateExpression.SUM, lambda t: float(t[
                        settings.table_C_detail_field_name]))
            ], settings.use_pandas, 'part_aggregate_{}'.format(p), query_plan,
                      False, part_aggregate_fn)),
        range(0, settings.table_C_parts))

    def aggregate_reduce_fn(df):
        sum_ = df['_0'].astype(np.float).sum()
        return pd.DataFrame({'_0': [sum_]})

    aggregate_reduce = query_plan.add_operator(
        Aggregate([
            AggregateExpression(AggregateExpression.SUM,
                                lambda t: float(t['_0']))
        ], settings.use_pandas, 'aggregate_reduce', query_plan, False,
                  aggregate_reduce_fn))

    aggregate_project = query_plan.add_operator(
        Project([ProjectExpression(lambda t: t['_0'], 'total_balance')],
                'aggregate_project', query_plan, False))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    # Connect the operators
    connect_many_to_many(scan_A, map_A_to_B)
    #connect_many_to_many(scan_A, project_A)
    #connect_many_to_many(project_A, map_A_to_B)
    connect_all_to_all(map_A_to_B, join_build_A_B)
    connect_many_to_many(join_build_A_B, join_probe_A_B)

    #connect_many_to_many(project_A, bloom_create_a)
    connect_many_to_many(scan_A, bloom_create_a)
    connect_all_to_all(bloom_create_a, scan_B)
    connect_many_to_many(scan_B, map_B_to_B)
    #connect_many_to_many(scan_B, project_B)
    #connect_many_to_many(project_B, map_B_to_B)
    connect_all_to_all(map_B_to_B, join_probe_A_B)
    connect_many_to_many(join_probe_A_B, bloom_create_ab)
    connect_all_to_all(bloom_create_ab, scan_C)
    connect_many_to_many(join_build_AB_C, join_probe_AB_C)

    connect_many_to_many(join_probe_A_B, map_B_to_C)
    connect_all_to_all(map_B_to_C, join_build_AB_C)

    connect_many_to_many(scan_C, map_C_to_C)
    #connect_many_to_many(scan_C, project_C)
    #connect_many_to_many(project_C, map_C_to_C)
    connect_all_to_all(map_C_to_C, join_probe_AB_C)

    connect_many_to_many(join_probe_AB_C, part_aggregate)

    connect_many_to_one(part_aggregate, aggregate_reduce)
    connect_one_to_one(aggregate_reduce, aggregate_project)
    connect_one_to_one(aggregate_project, collate)

    return query_plan
Пример #12
0
def run(parallel, use_pandas, buffer_size, lineitem_parts, part_parts):
    """
    :return: None
    """

    print('')
    print("TPCH Q17 Bloom Join")
    print("-------------------")

    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Query plan
    part_scan = map(lambda p:
                    query_plan.add_operator(
                        tpch_q17.sql_scan_select_partkey_where_brand_container_op(
                            part_parts != 1,
                            p,
                            part_parts,
                            use_pandas,
                            'part_scan' + '_' + str(p),
                            query_plan)),
                    range(0, part_parts))

    part_project = map(lambda p:
                       query_plan.add_operator(
                           tpch_q17.project_partkey_op(
                               'part_project' + '_' + str(p),
                               query_plan)),
                       range(0, part_parts))

    part_bloom_create_map = map(lambda p:
                                query_plan.add_operator(
                                    Map('p_partkey', 'part_bloom_create_map' + '_' + str(p), query_plan, True)),
                                range(0, part_parts))

    part_lineitem_join_build_map = map(lambda p:
                                       query_plan.add_operator(
                                           Map('p_partkey', 'part_lineitem_join_build_map' + '_' + str(p), query_plan,
                                               True)),
                                       range(0, part_parts))

    part_bloom_create = map(lambda p:
                            query_plan.add_operator(
                                tpch_q17.bloom_create_partkey_op('part_bloom_create' + '_' + str(p), query_plan)),
                            range(0, part_parts))

    lineitem_bloom_use = map(lambda p:
                             query_plan.add_operator(
                                 tpch_q17.bloom_scan_lineitem_select_orderkey_partkey_quantity_extendedprice_where_partkey_bloom_partkey_op(
                                     part_parts != 1,
                                     p,
                                     part_parts,
                                     use_pandas,
                                     'lineitem_bloom_use' + '_' + str(p),
                                     query_plan)),
                             range(0, lineitem_parts))

    lineitem_project = map(lambda p:
                           query_plan.add_operator(
                               tpch_q17.project_orderkey_partkey_quantity_extendedprice_op(
                                   'lineitem_project' + '_' + str(p),
                                   query_plan)),
                           range(0, lineitem_parts))

    part_lineitem_join_probe_map = map(lambda p:
                                       query_plan.add_operator(
                                           Map('l_partkey', 'part_lineitem_join_probe_map' + '_' + str(p), query_plan,
                                               True)),
                                       range(0, lineitem_parts))

    # part_lineitem_join = map(lambda p:
    #                          query_plan.add_operator(
    #                              tpch_q17.join_p_partkey_l_partkey_op('part_lineitem_join' + '_' + str(p), query_plan)),
    #                          range(0, part_parts))

    part_lineitem_join_build = map(lambda p:
                                   query_plan.add_operator(
                                       HashJoinBuild('p_partkey',
                                                     'part_lineitem_join_build' + '_' + str(p), query_plan,
                                                     False)),
                                   range(0, part_parts))

    part_lineitem_join_probe = map(lambda p:
                                   query_plan.add_operator(
                                       HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'),
                                                     'part_lineitem_join_probe' + '_' + str(p),
                                                     query_plan, True)),
                                   range(0, part_parts))

    lineitem_part_avg_group = map(lambda p:
                                  query_plan.add_operator(
                                      tpch_q17.group_partkey_avg_quantity_op('lineitem_part_avg_group' + '_' + str(p),
                                                                             query_plan)),
                                  range(0, part_parts))

    lineitem_part_avg_group_project = map(lambda p:
                                          query_plan.add_operator(
                                              tpch_q17.project_partkey_avg_quantity_op(
                                                  'lineitem_part_avg_group_project' + '_' + str(p), query_plan)),
                                          range(0, part_parts))

    # part_lineitem_join_avg_group_join = map(lambda p:
    #                                         query_plan.add_operator(
    #                                             tpch_q17.join_l_partkey_p_partkey_op(
    #                                                 'part_lineitem_join_avg_group_join' + '_' + str(p), query_plan)),
    #                                         range(0, part_parts))

    part_lineitem_join_avg_group_join_build = map(lambda p:
                                                  query_plan.add_operator(
                                                      HashJoinBuild('l_partkey',
                                                                    'part_lineitem_join_avg_group_join_build' + '_' + str(
                                                                        p), query_plan, False)),
                                                  range(0, part_parts))

    part_lineitem_join_avg_group_join_probe = map(lambda p:
                                                  query_plan.add_operator(
                                                      HashJoinProbe(JoinExpression('l_partkey', 'p_partkey'),
                                                                    'part_lineitem_join_avg_group_join_probe' + '_' + str(
                                                                        p),
                                                                    query_plan, True)),
                                                  range(0, part_parts))

    lineitem_filter = map(lambda p:
                          query_plan.add_operator(
                              tpch_q17.filter_lineitem_quantity_op('lineitem_filter' + '_' + str(p), query_plan)),
                          range(0, part_parts))

    extendedprice_sum_aggregate = map(lambda p:
                                      query_plan.add_operator(
                                          tpch_q17.aggregate_sum_extendedprice_op(
                                              'extendedprice_sum_aggregate' + '_' + str(p),
                                              query_plan)),
                                      range(0, part_parts))

    aggregate_reduce = query_plan.add_operator(
        Aggregate(
            [
                AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0']))
            ],
            'aggregate_reduce',
            query_plan,
            False))

    extendedprice_sum_aggregate_project = query_plan.add_operator(
        tpch_q17.project_avg_yearly_op('extendedprice_sum_aggregate_project', query_plan))

    collate = query_plan.add_operator(tpch_q17.collate_op('collate', query_plan))

    # Connect the operators
    # part_scan.connect(part_project)
    map(lambda (p, o): o.connect(part_project[p]), enumerate(part_scan))
    map(lambda (p, o): o.connect(part_bloom_create_map[p]), enumerate(part_project))
    map(lambda (p, o): o.connect(part_lineitem_join_build_map[p]), enumerate(part_project))

    # part_project.connect(part_bloom_create)
    map(lambda (p1, o1): map(lambda (p2, o2): o1.connect(o2), enumerate(part_bloom_create)),
        enumerate(part_bloom_create_map))

    # part_bloom_create.connect(lineitem_bloom_use)
    map(lambda (p1, o1): map(lambda (p2, o2): o1.connect(o2), enumerate(lineitem_bloom_use)),
        enumerate(part_bloom_create))

    # lineitem_bloom_use.connect(lineitem_project)
    map(lambda (p, o): o.connect(lineitem_project[p]), enumerate(lineitem_bloom_use))

    # part_lineitem_join.connect_left_producer(part_project)
    map(lambda (p1, o1): map(lambda (p2, o2): o1.connect(o2), enumerate(part_lineitem_join_build)),
        enumerate(part_lineitem_join_build_map))
    map(lambda (p, o): part_lineitem_join_probe[p].connect_build_producer(o), enumerate(part_lineitem_join_build))

    # part_lineitem_join.connect_right_producer(lineitem_project)
    map(lambda (p, o): o.connect(part_lineitem_join_probe_map[p]), enumerate(lineitem_project))
    map(lambda (p1, o1): map(lambda (p2, o2): o2.connect_tuple_producer(o1), enumerate(part_lineitem_join_probe)),
        enumerate(part_lineitem_join_probe_map))

    # part_lineitem_join.connect(lineitem_part_avg_group)
    map(lambda (p, o): o.connect(lineitem_part_avg_group[p]), enumerate(part_lineitem_join_probe))

    # lineitem_part_avg_group.connect(lineitem_part_avg_group_project)
    map(lambda (p, o): o.connect(lineitem_part_avg_group_project[p]), enumerate(lineitem_part_avg_group))

    # part_lineitem_join_avg_group_join.connect_left_producer(lineitem_part_avg_group_project)
    map(lambda (p, o): o.connect(part_lineitem_join_avg_group_join_build[p]),
        enumerate(lineitem_part_avg_group_project))

    # part_lineitem_join_avg_group_join.connect_right_producer(part_lineitem_join)
    map(lambda (p, o): part_lineitem_join_avg_group_join_probe[p].connect_build_producer(o),
        enumerate(part_lineitem_join_avg_group_join_build))
    map(lambda (p, o): part_lineitem_join_avg_group_join_probe[p].connect_tuple_producer(o),
        enumerate(part_lineitem_join_probe))

    # part_lineitem_join_avg_group_join.connect(lineitem_filter)
    map(lambda (p, o): o.connect(lineitem_filter[p]), enumerate(part_lineitem_join_avg_group_join_probe))

    # lineitem_filter.connect(extendedprice_sum_aggregate)
    map(lambda (p, o): o.connect(extendedprice_sum_aggregate[p]), enumerate(lineitem_filter))

    # extendedprice_sum_aggregate.connect(extendedprice_sum_aggregate_project)
    map(lambda (p, o): o.connect(aggregate_reduce), enumerate(extendedprice_sum_aggregate))
    aggregate_reduce.connect(extendedprice_sum_aggregate_project)

    # extendedprice_sum_aggregate_project.connect(collate)
    extendedprice_sum_aggregate_project.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("lineitem parts: {}".format(lineitem_parts))
    print("part_parts: {}".format(part_parts))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    field_names = ['avg_yearly']

    assert len(tuples) == 1 + 1

    assert tuples[0] == field_names

    # NOTE: This result has been verified with the equivalent data and query on PostgreSQL
    assert round(float(tuples[1][0]), 10) == 4632.1085714286
Пример #13
0
def run(parallel, buffer_size, parts):
    """

    :return: None
    """

    print('')
    print("TPCH Q14 Partitioned Bloom Join")
    print("-------------------------------")

    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Query plan
    # DATE is the first day of a month randomly selected from a random year within [1993 .. 1997].
    date = '1993-01-01'
    min_shipped_date = datetime.strptime(date, '%Y-%m-%d')
    max_shipped_date = datetime.strptime(date, '%Y-%m-%d') + timedelta(days=30)

    collate = query_plan.add_operator(tpch_q14.collate_operator_def('collate', query_plan))
    sum_aggregate = query_plan.add_operator(
        Aggregate(
            [
                AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0'])),
                AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_1']))
            ],
            'sum_aggregate',
            query_plan,
            False))
    aggregate_project = query_plan.add_operator(
        tpch_q14.project_promo_revenue_operator_def('aggregate_project', query_plan))

    hash_join_build_ops = []
    hash_join_probe_ops = []
    part_bloom_create_ops = []
    line_item_scan_ops = []
    for p in range(0, parts):
        part_scan = query_plan.add_operator(
            tpch_q14.sql_scan_part_partkey_type_part_where_brand12_sharded_operator_def(p, parts,
                                                                                        'part_scan' + '_' + str(p),
                                                                                        query_plan))
        part_scan_project = query_plan.add_operator(
            tpch_q14.project_partkey_type_operator_def('part_scan_project' + '_' + str(p), query_plan))

        part_bloom_create = query_plan.add_operator(
            tpch_q14.bloom_create_p_partkey_operator_def('part_bloom_create' + '_' + str(p), query_plan))
        part_bloom_create_ops.append(part_bloom_create)

        lineitem_scan = query_plan.add_operator(
            tpch_q14.bloom_scan_lineitem_where_shipdate_sharded_operator_def(min_shipped_date, max_shipped_date, p,
                                                                             parts, 'lineitem_scan' + '_' + str(p),
                                                                             query_plan))
        line_item_scan_ops.append(lineitem_scan)

        lineitem_scan_project = query_plan.add_operator(
            tpch_q14.project_partkey_extendedprice_discount_operator_def('lineitem_scan_project' + '_' + str(p),
                                                                         query_plan))

        join_build = query_plan.add_operator(
            HashJoinBuild('p_partkey', 'join_build' + '_' + str(p), query_plan, False))
        hash_join_build_ops.append(join_build)

        join_probe = query_plan.add_operator(
            HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'), 'join_probe' + '_' + str(p),
                          query_plan, False))
        hash_join_probe_ops.append(join_probe)

        aggregate = query_plan.add_operator(
            tpch_q14.aggregate_promo_revenue_operator_def('aggregate' + '_' + str(p), query_plan))

        part_scan.connect(part_scan_project)
        lineitem_scan.connect(lineitem_scan_project)

        part_scan_project.connect(part_bloom_create)
        # part_bloom_create.connect(lineitem_scan)

        part_scan_project.connect(join_build)

        join_probe.connect(aggregate)
        # aggregate.connect(aggregate_project)
        aggregate.connect(sum_aggregate)

        join_probe.connect_tuple_producer(lineitem_scan_project)
        # join_probe.connect(merge)

    for bloom_create_op in part_bloom_create_ops:
        for scan_op in line_item_scan_ops:
            bloom_create_op.connect(scan_op)

    for probe_op in hash_join_probe_ops:
        for build_op in hash_join_build_ops:
            probe_op.connect_build_producer(build_op)

    sum_aggregate.connect(aggregate_project)
    aggregate_project.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print("parts: {}".format(parts))
    print('')

    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(parts))

    # Start the query
    query_plan.execute()

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    field_names = ['promo_revenue']

    assert len(tuples) == 1 + 1

    assert tuples[0] == field_names

    # NOTE: This result has been verified with the equivalent data and query on PostgreSQL
    # assert tuples[1] == [15.090116526324298]
    assert round(float(tuples[1][0]), 10) == 15.0901165263
Пример #14
0
def query_plan(settings):
    # type: (SyntheticSemiJoinSettings) -> QueryPlan
    """

    :return: None
    """

    if settings.use_shared_mem:
        system = WorkerSystem(settings.shared_memory_size)
    else:
        system = None

    query_plan = QueryPlan(system,
                           is_async=settings.parallel,
                           buffer_size=settings.buffer_size,
                           use_shared_mem=settings.use_shared_mem)

    # Define the operators
    scan_a = \
        map(lambda p:
            query_plan.add_operator(
                SQLTableScan(get_file_key(settings.table_A_key, settings.table_A_sharded, p, settings.sf),
                             "select "
                             "  {} "
                             "from "
                             "  S3Object "
                             "where "
                             "  {} "
                             "  {} "
                             .format(settings.table_A_AB_join_key,
                                     settings.table_A_filter_sql,
                                     get_sql_suffix(settings.table_A_key, settings.table_A_parts, p,
                                                    settings.table_A_sharded)), settings.format_,
                             settings.use_pandas,
                             settings.secure,
                             settings.use_native,
                             'scan_a' + '_{}'.format(p),
                             query_plan,
                             False)),
            range(0, settings.table_A_parts))

    field_names_map_a = OrderedDict(
        zip([
            '_{}'.format(i)
            for i, name in enumerate(settings.table_A_field_names)
        ], settings.table_A_field_names))

    def project_fn_a(df):
        df = df.rename(columns=field_names_map_a, copy=False)
        return df

    project_a = map(
        lambda p: query_plan.add_operator(
            Project([
                ProjectExpression(k, v)
                for k, v in field_names_map_a.iteritems()
            ], 'project_a' + '_{}'.format(p), query_plan, False, project_fn_a)
        ), range(0, settings.table_A_parts))

    bloom_create_ab_join_key = map(
        lambda p: query_plan.add_operator(
            BloomCreate(settings.table_A_AB_join_key,
                        'bloom_create_ab_join_key' + '_{}'.format(p),
                        query_plan,
                        False,
                        fp_rate=settings.fp_rate)),
        range(0, settings.table_A_parts))

    scan_b_on_ab_join_key = \
        map(lambda p:
            query_plan.add_operator(
                SQLTableScanBloomUse(get_file_key(settings.table_B_key, settings.table_B_sharded, p, settings.sf),
                                     "select "
                                     "  {},{} "
                                     "from "
                                     "  S3Object "
                                     "where "
                                     "  {} "
                                     "  {} "
                                     .format(settings.table_B_BC_join_key,
                                             settings.table_B_AB_join_key,
                                             settings.table_B_filter_sql,
                                             get_sql_suffix(settings.table_B_key, settings.table_B_parts, p,
                                                            settings.table_B_sharded, add_where=False)), settings.format_,
                                     settings.table_B_AB_join_key,
                                     settings.use_pandas,
                                     settings.secure,
                                     settings.use_native,
                                     'scan_b_on_ab_join_key' + '_{}'.format(p),
                                     query_plan,
                                     False)),
            range(0, settings.table_B_parts))

    if settings.table_C_key is None:

        scan_b_detail_on_b_pk = \
            map(lambda p:
                query_plan.add_operator(
                    SQLTableScanBloomUse(get_file_key(settings.table_B_key, settings.table_B_sharded, p, settings.sf),
                                         "select "
                                         "  {},{} "
                                         "from "
                                         "  S3Object "
                                         "where "
                                         "  {} "
                                         "  {} "
                                         .format(settings.table_B_primary_key,
                                                 settings.table_B_detail_field_name,
                                                 settings.table_B_filter_sql,
                                                 get_sql_suffix(settings.table_B_key, settings.table_B_parts, p,
                                                                settings.table_B_sharded, add_where=False)), settings.format_,
                                         settings.table_B_primary_key,
                                         settings.use_pandas,
                                         settings.secure,
                                         settings.use_native,
                                         'scan_c_detail_on_b_pk' + '_{}'.format(p),
                                         query_plan,
                                         False)),
                range(0, settings.table_B_parts))

        field_names_map_b_detail = OrderedDict([
            ('_0', settings.table_B_primary_key),
            ('_1', settings.table_B_detail_field_name)
        ])

        def project_fn_b_detail(df):
            df.rename(columns=field_names_map_b_detail, inplace=True)
            return df

        project_b_detail = map(
            lambda p: query_plan.add_operator(
                Project([
                    ProjectExpression(k, v)
                    for k, v in field_names_map_b_detail.iteritems()
                ], 'project_b_detail' + '_{}'.format(p), query_plan,
                        False, project_fn_b_detail)),
            range(0, settings.table_B_parts))

        map_b_pk_1 = map(
            lambda p: query_plan.add_operator(
                Map(settings.table_B_primary_key, 'map_b_pk_1' + '_{}'.format(
                    p), query_plan, False)), range(0, settings.table_B_parts))

        map_b_pk_2 = map(
            lambda p: query_plan.add_operator(
                Map(settings.table_B_primary_key, 'map_b_pk_2' + '_{}'.format(
                    p), query_plan, False)), range(0, settings.table_B_parts))

        bloom_create_b_pk = map(
            lambda p: query_plan.add_operator(
                BloomCreate(settings.table_B_primary_key,
                            'bloom_create_b_pk' + '_{}'.format(p),
                            query_plan,
                            False,
                            fp_rate=settings.fp_rate)),
            range(0, settings.table_B_parts))

        join_probe_ab_and_b_on_b_pk = map(
            lambda p: query_plan.add_operator(
                HashJoinProbe(
                    JoinExpression(settings.table_B_primary_key, settings.
                                   table_B_primary_key),
                    'join_probe_ab_and_b_on_b_pk' + '_{}'.format(
                        p), query_plan, False)),
            range(0, settings.table_B_parts))

        join_build_ab_and_b_on_b_pk = map(
            lambda p: query_plan.add_operator(
                HashJoinBuild(settings.table_B_primary_key,
                              'join_build_ab_and_b_on_b_pk' + '_{}'.format(
                                  p), query_plan, False)),
            range(0, settings.table_B_parts))

    else:
        scan_c_on_bc_join_key = \
            map(lambda p:
                query_plan.add_operator(
                    SQLTableScanBloomUse(get_file_key(settings.table_C_key, settings.table_C_sharded, p, settings.sf),
                                         "select "
                                         "  {}, {} "
                                         "from "
                                         "  S3Object "
                                         "where "
                                         "  {} "
                                         "  {} "
                                         .format(settings.table_C_primary_key,
                                                 settings.table_C_BC_join_key,
                                                 settings.table_C_filter_sql,
                                                 get_sql_suffix(settings.table_C_key, settings.table_C_parts, p,
                                                                settings.table_C_sharded, add_where=False)),
                                         settings.table_C_BC_join_key, settings.format_,
                                         settings.use_pandas,
                                         settings.secure,
                                         settings.use_native,
                                         'scan_c_on_bc_join_key' + '_{}'.format(p),
                                         query_plan,
                                         False)),
                range(0, settings.table_C_parts))

        field_names_map_c = OrderedDict(
            zip([
                '_{}'.format(i)
                for i, name in enumerate(settings.table_C_field_names)
            ], settings.table_C_field_names))

        def project_fn_c(df):
            df.rename(columns=field_names_map_c, inplace=True)
            return df

        project_c = map(
            lambda p: query_plan.add_operator(
                Project([
                    ProjectExpression(k, v)
                    for k, v in field_names_map_c.iteritems()
                ], 'project_c' + '_{}'.format(p), query_plan, False,
                        project_fn_c)), range(0, settings.table_C_parts))

        scan_c_detail_on_c_pk = \
            map(lambda p:
                query_plan.add_operator(
                    SQLTableScanBloomUse(get_file_key(settings.table_C_key, settings.table_C_sharded, p, settings.sf),
                                         "select "
                                         "  {},{} "
                                         "from "
                                         "  S3Object "
                                         "where "
                                         "  {} "
                                         "  {} "
                                         .format(settings.table_C_primary_key,
                                                 settings.table_C_detail_field_name,
                                                 settings.table_C_filter_sql,
                                                 get_sql_suffix(settings.table_C_key, settings.table_C_parts, p,
                                                                settings.table_C_sharded, add_where=False)),
                                         settings.table_C_primary_key, settings.format_,
                                         settings.use_pandas,
                                         settings.secure,
                                         settings.use_native,
                                         'scan_c_detail_on_c_pk' + '_{}'.format(p),
                                         query_plan,
                                         False)),
                range(0, settings.table_C_parts))

        field_names_map_c_detail = OrderedDict([
            ('_0', settings.table_C_primary_key),
            ('_1', settings.table_C_detail_field_name)
        ])

        def project_fn_c_detail(df):
            df.rename(columns=field_names_map_c_detail, inplace=True)
            return df

        project_c_detail = map(
            lambda p: query_plan.add_operator(
                Project([
                    ProjectExpression(k, v)
                    for k, v in field_names_map_c_detail.iteritems()
                ], 'project_c_detail' + '_{}'.format(p), query_plan,
                        False, project_fn_c_detail)),
            range(0, settings.table_C_parts))

        map_bc_b_join_key = map(
            lambda p: query_plan.add_operator(
                Map(settings.table_B_BC_join_key, 'map_bc_b_join_key' + '_{}'.
                    format(p), query_plan, False)),
            range(0, settings.table_C_parts))

        map_c_pk_1 = map(
            lambda p: query_plan.add_operator(
                Map(settings.table_C_primary_key, 'map_c_pk_1' + '_{}'.format(
                    p), query_plan, False)), range(0, settings.table_C_parts))

        map_c_pk_2 = map(
            lambda p: query_plan.add_operator(
                Map(settings.table_C_primary_key, 'map_c_pk_2' + '_{}'.format(
                    p), query_plan, False)), range(0, settings.table_C_parts))

        bloom_create_c_pk = map(
            lambda p: query_plan.add_operator(
                BloomCreate(settings.table_C_primary_key,
                            'bloom_create_bc_b_to_c_join_key_{}'.format(p),
                            query_plan,
                            False,
                            fp_rate=settings.fp_rate)),
            range(0, settings.table_C_parts))

        join_build_ab_and_c_on_bc_join_key = map(
            lambda p: query_plan.add_operator(
                HashJoinBuild(
                    settings.table_B_BC_join_key,
                    'join_build_ab_and_c_on_bc_join_key' + '_{}'.format(
                        p), query_plan, False)),
            range(0, settings.table_C_parts))

        join_probe_ab_and_c_on_bc_join_key = map(
            lambda p: query_plan.add_operator(
                HashJoinProbe(
                    JoinExpression(settings.table_B_BC_join_key, settings.
                                   table_C_BC_join_key),
                    'join_probe_ab_and_c_on_bc_join_key' + '_{}'.format(
                        p), query_plan, False)),
            range(0, settings.table_C_parts))

        join_build_abc_and_c_on_c_pk = map(
            lambda p: query_plan.add_operator(
                HashJoinBuild(settings.table_C_primary_key,
                              'join_build_abc_and_c_on_c_pk' + '_{}'.format(
                                  p), query_plan, False)),
            range(0, settings.table_C_parts))

        join_probe_abc_and_c_on_c_pk = map(
            lambda p: query_plan.add_operator(
                HashJoinProbe(
                    JoinExpression(settings.table_C_primary_key, settings.
                                   table_C_primary_key),
                    'join_probe_abc_and_c_on_c_pk' + '_{}'.format(
                        p), query_plan, False)),
            range(0, settings.table_C_parts))

        bloom_create_bc_join_key = map(
            lambda p: query_plan.add_operator(
                BloomCreate(settings.table_B_BC_join_key,
                            'bloom_create_bc_join_key' + '_{}'.format(
                                p), query_plan, False)),
            range(0, settings.table_B_parts))

        map_bc_c_join_key = map(
            lambda p: query_plan.add_operator(
                Map(settings.table_C_BC_join_key, 'map_bc_c_join_key' + '_{}'.
                    format(p), query_plan, False)),
            range(0, settings.table_B_parts))

    field_names_map_b = OrderedDict(
        zip([
            '_{}'.format(i)
            for i, name in enumerate(settings.table_B_field_names)
        ], settings.table_B_field_names))

    def project_fn_b(df):
        df.rename(columns=field_names_map_b, inplace=True)
        return df

    project_b = map(
        lambda p: query_plan.add_operator(
            Project([
                ProjectExpression(k, v)
                for k, v in field_names_map_b.iteritems()
            ], 'project_b' + '_{}'.format(p), query_plan, False, project_fn_b)
        ), range(0, settings.table_B_parts))

    map_ab_a_join_key = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_A_AB_join_key, 'map_ab_a_join_key' + '_{}'
                .format(p), query_plan, False)),
        range(0, settings.table_A_parts))

    map_ab_b_join_key = map(
        lambda p: query_plan.add_operator(
            Map(settings.table_B_AB_join_key, 'map_ab_b_join_key' + '_{}'
                .format(p), query_plan, False)),
        range(0, settings.table_B_parts))

    join_build_a_and_b_on_ab_join_key = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild(
                settings.table_A_AB_join_key,
                'join_build_a_and_b_on_ab_join_key' + '_{}'.format(
                    p), query_plan, False)), range(0, settings.table_B_parts))

    join_probe_a_and_b_on_ab_join_key = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(
                JoinExpression(settings.table_A_AB_join_key, settings.
                               table_B_AB_join_key),
                'join_probe_a_and_b_on_ab_join_key' + '_{}'.format(
                    p), query_plan, False)), range(0, settings.table_B_parts))

    if settings.table_C_key is None:

        def part_aggregate_fn(df):
            sum_ = df[settings.table_B_detail_field_name].astype(
                np.float).sum()
            return pd.DataFrame({'_0': [sum_]})

        part_aggregate = map(
            lambda p: query_plan.add_operator(
                Aggregate([
                    AggregateExpression(
                        AggregateExpression.SUM, lambda t: float(t[
                            settings.table_B_detail_field_name]))
                ], settings.use_pandas, 'part_aggregate_{}'.format(p),
                          query_plan, False, part_aggregate_fn)),
            range(0, settings.table_B_parts))

    else:

        def part_aggregate_fn(df):
            sum_ = df[settings.table_C_detail_field_name].astype(
                np.float).sum()
            return pd.DataFrame({'_0': [sum_]})

        part_aggregate = map(
            lambda p: query_plan.add_operator(
                Aggregate([
                    AggregateExpression(
                        AggregateExpression.SUM, lambda t: float(t[
                            settings.table_C_detail_field_name]))
                ], settings.use_pandas, 'part_aggregate_{}'.format(p),
                          query_plan, False, part_aggregate_fn)),
            range(0, settings.table_C_parts))

    def aggregate_reduce_fn(df):
        sum_ = df['_0'].astype(np.float).sum()
        return pd.DataFrame({'_0': [sum_]})

    aggregate_reduce = query_plan.add_operator(
        Aggregate([
            AggregateExpression(AggregateExpression.SUM,
                                lambda t: float(t['_0']))
        ], settings.use_pandas, 'aggregate_reduce', query_plan, False,
                  aggregate_reduce_fn))

    aggregate_project = query_plan.add_operator(
        Project([ProjectExpression(lambda t: t['_0'], 'total_balance')],
                'aggregate_project', query_plan, False))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    # Inline some of the operators
    map(lambda o: o.set_async(False), project_a)
    map(lambda o: o.set_async(False), project_b)
    map(lambda o: o.set_async(False), map_ab_a_join_key)
    map(lambda o: o.set_async(False), map_ab_b_join_key)
    if settings.table_C_key is None:
        map(lambda o: o.set_async(False), map_b_pk_1)
        map(lambda o: o.set_async(False), map_b_pk_2)
        map(lambda o: o.set_async(False), project_b_detail)
    else:
        map(lambda o: o.set_async(False), map_bc_b_join_key)
        map(lambda o: o.set_async(False), map_bc_c_join_key)
        map(lambda o: o.set_async(False), map_c_pk_1)
        map(lambda o: o.set_async(False), map_c_pk_2)
        map(lambda o: o.set_async(False), project_c)
        map(lambda o: o.set_async(False), project_c_detail)
    aggregate_project.set_async(False)

    # Connect the operators
    connect_many_to_many(scan_a, project_a)

    connect_many_to_many(project_a, map_ab_a_join_key)

    connect_all_to_all(map_ab_a_join_key, join_build_a_and_b_on_ab_join_key)
    connect_all_to_all(project_a, bloom_create_ab_join_key)
    # connect_all_to_all(map_A_to_B, join_build_a_and_b_on_ab_join_key)
    connect_many_to_many(join_build_a_and_b_on_ab_join_key,
                         join_probe_a_and_b_on_ab_join_key)

    # connect_all_to_all(map_bloom_A_to_B, bloom_create_ab_join_key)
    connect_many_to_many(bloom_create_ab_join_key, scan_b_on_ab_join_key)
    connect_many_to_many(scan_b_on_ab_join_key, project_b)
    # connect_many_to_many(project_b, join_probe_a_and_b_on_ab_join_key)
    # connect_all_to_all(map_B_to_B, join_probe_a_and_b_on_ab_join_key)

    connect_many_to_many(project_b, map_ab_b_join_key)
    connect_all_to_all(map_ab_b_join_key, join_probe_a_and_b_on_ab_join_key)

    # connect_many_to_many(join_probe_a_and_b_on_ab_join_key, map_bloom_B_to_B)

    if settings.table_C_key is None:
        # connect_all_to_all(join_probe_a_and_b_on_ab_join_key, part_aggregate)
        connect_many_to_many(scan_b_detail_on_b_pk, project_b_detail)
        connect_many_to_many(project_b_detail, map_b_pk_2)
        connect_many_to_many(bloom_create_b_pk, scan_b_detail_on_b_pk)
        connect_all_to_all(join_probe_a_and_b_on_ab_join_key,
                           bloom_create_b_pk)
        connect_all_to_all(map_b_pk_2, join_probe_ab_and_b_on_b_pk)
        connect_many_to_many(join_probe_ab_and_b_on_b_pk, part_aggregate)
        connect_many_to_many(join_build_ab_and_b_on_b_pk,
                             join_probe_ab_and_b_on_b_pk)
        connect_many_to_many(join_probe_a_and_b_on_ab_join_key, map_b_pk_1)
        connect_all_to_all(map_b_pk_1, join_build_ab_and_b_on_b_pk)

    else:
        connect_all_to_all(join_probe_a_and_b_on_ab_join_key,
                           bloom_create_bc_join_key)
        connect_many_to_many(bloom_create_bc_join_key, scan_c_on_bc_join_key)
        connect_many_to_many(scan_c_on_bc_join_key, project_c)
        # connect_many_to_many(project_c, join_probe_ab_and_c_on_bc_join_key)
        connect_all_to_all(map_bc_c_join_key,
                           join_probe_ab_and_c_on_bc_join_key)
        # connect_many_to_many(join_probe_a_and_b_on_ab_join_key, join_build_ab_and_c_on_bc_join_key)
        connect_many_to_many(join_probe_a_and_b_on_ab_join_key,
                             map_bc_b_join_key)
        connect_all_to_all(map_bc_b_join_key,
                           join_build_ab_and_c_on_bc_join_key)
        connect_all_to_all(join_probe_ab_and_c_on_bc_join_key,
                           bloom_create_c_pk)
        # connect_many_to_many(join_probe_ab_and_c_on_bc_join_key, join_build_abc_and_c_on_c_pk)
        connect_many_to_many(join_probe_ab_and_c_on_bc_join_key, map_c_pk_1)
        connect_all_to_all(map_c_pk_1, join_build_abc_and_c_on_c_pk)
        connect_many_to_many(bloom_create_c_pk, scan_c_detail_on_c_pk)
        # connect_all_to_all(bloom_create_bc_join_key, scan_c_detail_on_c_pk)
        connect_many_to_many(join_build_abc_and_c_on_c_pk,
                             join_probe_abc_and_c_on_c_pk)
        # connect_many_to_many(join_probe_a_and_b_on_ab_join_key, map_B_to_C)
        # connect_all_to_all(join_probe_a_and_b_on_ab_join_key, join_build_abc_and_c_on_c_pk)
        connect_many_to_many(scan_c_detail_on_c_pk, project_c_detail)
        # connect_many_to_many(project_c_detail, map_C_to_C)
        # connect_all_to_all(project_c_detail, join_probe_abc_and_c_on_c_pk)
        connect_many_to_many(project_c_detail, map_c_pk_2)

        connect_many_to_many(project_c, map_bc_c_join_key)
        connect_many_to_many(join_build_ab_and_c_on_bc_join_key,
                             join_probe_ab_and_c_on_bc_join_key)
        connect_all_to_all(map_c_pk_2, join_probe_abc_and_c_on_c_pk)

        connect_many_to_many(join_probe_abc_and_c_on_c_pk, part_aggregate)

    connect_many_to_one(part_aggregate, aggregate_reduce)
    connect_one_to_one(aggregate_reduce, aggregate_project)
    connect_one_to_one(aggregate_project, collate)

    return query_plan
Пример #15
0
def run(parallel, use_pandas, secure, use_native, buffer_size, lineitem_parts,
        part_parts, lineitem_sharded, part_sharded, other_parts, sf, fp_rate,
        expected_result, format_):
    """

    :return: None
    """

    print('')
    print("TPCH Q19 Bloom Join")
    print("-------------------")

    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Define the operators
    lineitem_bloom_use = \
        map(lambda p:
            query_plan.add_operator(
                tpch_q19.bloom_scan_partkey_quantity_extendedprice_discount_shipinstruct_shipmode_where_filtered_op(
                    lineitem_sharded,
                    p,
                    lineitem_parts,
                    use_pandas,
                    secure,
                    use_native,
                    'lineitem_bloom_use' + '_' + str(p), query_plan, sf, format_)),
            range(0, lineitem_parts))

    part_bloom_create = query_plan.add_operator(
        tpch_q19.bloom_create_partkey_op('part_bloom_create', query_plan,
                                         fp_rate))

    part_scan = map(
        lambda p: query_plan.add_operator(
            tpch_q19.
            sql_scan_part_partkey_brand_size_container_where_filtered_op(
                part_sharded, p, part_parts, use_pandas, secure, use_native,
                'part_scan' + '_' + str(p), query_plan, sf, format_)),
        range(0, part_parts))

    lineitem_project = \
        map(lambda p:
            query_plan.add_operator(
                tpch_q19.project_partkey_quantity_extendedprice_discount_shipinstruct_shipmode_filtered_op(
                    'lineitem_project' + '_' + str(p),
                    query_plan)),
            range(0, lineitem_parts))

    lineitem_map = map(
        lambda p: query_plan.add_operator(
            Map('l_partkey', 'lineitem_map' + '_' + str(p), query_plan, False)
        ), range(0, lineitem_parts))

    part_project = map(
        lambda p: query_plan.add_operator(
            tpch_q19.project_partkey_brand_size_container_filtered_op(
                'part_project' + '_' + str(p), query_plan)),
        range(0, part_parts))

    part_map = map(
        lambda p: query_plan.add_operator(
            Map('p_partkey', 'part_map' + '_' + str(p), query_plan, False)),
        range(0, part_parts))

    # part_bloom_create_map = map(lambda p:
    #                query_plan.add_operator(
    #                    Map('p_partkey', 'part_bloom_create_map' + '_' + str(p), query_plan, False)),
    #                range(0, part_parts))

    # lineitem_part_join = map(lambda p:
    #                          query_plan.add_operator(tpch_q19.join_op(query_plan)),
    #                          range(0, part_parts))

    lineitem_part_join_build = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild('p_partkey', 'lineitem_part_join_build' + '_' + str(
                p), query_plan, False)), range(0, other_parts))

    lineitem_part_join_probe = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'),
                          'lineitem_part_join_probe' + '_' + str(
                              p), query_plan, False)), range(0, other_parts))

    filter_op = map(
        lambda p: query_plan.add_operator(
            tpch_q19.filter_def('filter_op' + '_' + str(p), query_plan)),
        range(0, other_parts))
    aggregate = map(
        lambda p: query_plan.add_operator(
            tpch_q19.aggregate_def('aggregate' + '_' + str(p), query_plan,
                                   use_pandas)), range(0, other_parts))

    def aggregate_reduce_fn(df):
        sum1_ = df['_0'].astype(np.float).sum()
        return pd.DataFrame({'_0': [sum1_]})

    aggregate_reduce = query_plan.add_operator(
        Aggregate([
            AggregateExpression(AggregateExpression.SUM,
                                lambda t: float(t['_0']))
        ], use_pandas, 'aggregate_reduce', query_plan, False,
                  aggregate_reduce_fn))

    aggregate_project = query_plan.add_operator(
        tpch_q19.aggregate_project_def('aggregate_project', query_plan))
    collate = query_plan.add_operator(
        tpch_q19.collate_op('collate', query_plan))

    map(lambda o: o.set_async(False), lineitem_project)
    map(lambda o: o.set_async(False), part_project)
    map(lambda o: o.set_async(False), lineitem_map)
    map(lambda o: o.set_async(False), part_map)
    map(lambda o: o.set_async(False), filter_op)
    map(lambda o: o.set_async(False), aggregate)
    aggregate_project.set_async(False)

    # Connect the operators
    # part_scan.connect(part_project)
    connect_many_to_many(part_scan, part_project)
    connect_many_to_many(part_project, part_map)

    # part_project.connect(part_bloom_create)
    connect_many_to_one(part_project, part_bloom_create)

    # part_bloom_create.connect(lineitem_bloom_use)
    connect_one_to_many(part_bloom_create, lineitem_bloom_use)

    # lineitem_bloom_use.connect(lineitem_project)
    connect_many_to_many(lineitem_bloom_use, lineitem_project)
    connect_many_to_many(lineitem_project, lineitem_map)

    # lineitem_part_join.connect_left_producer(lineitem_project)
    connect_all_to_all(part_map, lineitem_part_join_build)

    # lineitem_part_join.connect_right_producer(part_project)
    connect_many_to_many(lineitem_part_join_build, lineitem_part_join_probe)
    connect_all_to_all(lineitem_map, lineitem_part_join_probe)

    # lineitem_part_join.connect(filter_op)
    connect_many_to_many(lineitem_part_join_probe, filter_op)

    # filter_op.connect(aggregate)
    connect_many_to_many(filter_op, aggregate)

    # aggregate.connect(aggregate_project)
    connect_many_to_one(aggregate, aggregate_reduce)
    connect_one_to_one(aggregate_reduce, aggregate_project)

    # aggregate_project.connect(collate)
    connect_one_to_one(aggregate_project, collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print('secure: {}'.format(secure))
    print('use_native: {}'.format(use_native))
    print("lineitem parts: {}".format(lineitem_parts))
    print("part_parts: {}".format(part_parts))
    print("lineitem_sharded: {}".format(lineitem_sharded))
    print("part_sharded: {}".format(part_sharded))
    print("other_parts: {}".format(other_parts))
    print("fp_rate: {}".format(fp_rate))
    print("format: {}".format(format_))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"),
                           gen_test_id())

    # Start the query
    query_plan.execute()

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    field_names = ['revenue']

    assert len(tuples) == 1 + 1

    assert tuples[0] == field_names

    # NOTE: This result has been verified with the equivalent data and query on PostgreSQL
    numpy.testing.assert_approx_equal(float(tuples[1][0]), expected_result)
Пример #16
0
def run(parallel, use_pandas, secure, use_native, buffer_size, lineitem_parts,
        part_parts, lineitem_sharded, part_sharded, other_parts, sf, fp_rate,
        expected_result, format_):
    """
    :return: None
    """

    print('')
    print("TPCH Q17 Bloom Join")
    print("-------------------")

    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Query plan
    part_scan = map(
        lambda p: query_plan.add_operator(
            tpch_q17.sql_scan_select_partkey_where_brand_container_op(
                part_sharded, p, part_parts, use_pandas, secure, use_native,
                'part_scan' + '_' + str(p), query_plan, sf, format_)),
        range(0, part_parts))

    part_project = map(
        lambda p: query_plan.add_operator(
            tpch_q17.project_partkey_op('part_project' + '_' + str(p),
                                        query_plan)), range(0, part_parts))

    part_lineitem_join_build_map = map(
        lambda p: query_plan.add_operator(
            Map('p_partkey', 'part_lineitem_join_build_map' + '_' + str(p),
                query_plan, False)), range(0, part_parts))

    part_bloom_create = query_plan.add_operator(
        tpch_q17.bloom_create_partkey_op(fp_rate, 'part_bloom_create',
                                         query_plan))

    lineitem_bloom_use = \
        map(lambda p:
            query_plan.add_operator(
                tpch_q17.bloom_scan_lineitem_select_orderkey_partkey_quantity_extendedprice_bloom_partkey_op(
                    lineitem_sharded,
                    p,
                    part_parts,
                    use_pandas,
                    secure,
                    use_native,
                    'lineitem_bloom_use' + '_' + str(p),
                    query_plan,
                    sf, format_)),
            range(0, lineitem_parts))

    lineitem_project = map(
        lambda p: query_plan.add_operator(
            tpch_q17.
            project_lineitem_filtered_orderkey_partkey_quantity_extendedprice_op(
                'lineitem_project' + '_' + str(p), query_plan)),
        range(0, lineitem_parts))

    part_lineitem_join_probe_map = map(
        lambda p: query_plan.add_operator(
            Map('l_partkey', 'part_lineitem_join_probe_map' + '_' + str(p),
                query_plan, False)), range(0, lineitem_parts))

    part_lineitem_join_build = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild('p_partkey', 'part_lineitem_join_build' + '_' + str(
                p), query_plan, False)), range(0, other_parts))

    part_lineitem_join_probe = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'),
                          'part_lineitem_join_probe' + '_' + str(
                              p), query_plan, False)), range(0, other_parts))

    lineitem_part_avg_group = map(
        lambda p: query_plan.add_operator(
            tpch_q17.group_partkey_avg_quantity_op(
                'lineitem_part_avg_group' + '_' + str(p), query_plan)),
        range(0, other_parts))

    lineitem_part_avg_group_project = map(
        lambda p: query_plan.add_operator(
            tpch_q17.project_partkey_avg_quantity_op(
                'lineitem_part_avg_group_project' + '_' + str(p), query_plan)),
        range(0, other_parts))

    part_lineitem_join_avg_group_join_build = \
        map(lambda p:
            query_plan.add_operator(
                HashJoinBuild('l_partkey',
                              'part_lineitem_join_avg_group_join_build' + '_' + str(p),
                              query_plan,
                              False)),
            range(0, other_parts))

    part_lineitem_join_avg_group_join_probe = \
        map(lambda p:
            query_plan.add_operator(
                HashJoinProbe(JoinExpression('l_partkey', 'l_partkey'),
                              'part_lineitem_join_avg_group_join_probe' + '_' + str(p),
                              query_plan,
                              False)),
            range(0, other_parts))

    lineitem_filter = map(
        lambda p: query_plan.add_operator(
            tpch_q17.filter_lineitem_quantity_op(
                'lineitem_filter' + '_' + str(p), query_plan)),
        range(0, other_parts))

    extendedprice_sum_aggregate = map(
        lambda p: query_plan.add_operator(
            tpch_q17.aggregate_sum_extendedprice_op(
                use_pandas, 'extendedprice_sum_aggregate' + '_' + str(p),
                query_plan)), range(0, other_parts))

    def aggregate_reduce_fn(df):
        sum1_ = df['_0'].astype(np.float).sum()
        return pd.DataFrame({'_0': [sum1_]})

    aggregate_reduce = query_plan.add_operator(
        Aggregate([
            AggregateExpression(AggregateExpression.SUM,
                                lambda t: float(t['_0']))
        ], use_pandas, 'aggregate_reduce', query_plan, False,
                  aggregate_reduce_fn))

    extendedprice_sum_aggregate_project = query_plan.add_operator(
        tpch_q17.project_avg_yearly_op('extendedprice_sum_aggregate_project',
                                       query_plan))

    collate = query_plan.add_operator(
        tpch_q17.collate_op('collate', query_plan))

    # Inline what we can
    map(lambda o: o.set_async(False), lineitem_project)
    map(lambda o: o.set_async(False), part_project)
    map(lambda o: o.set_async(False), lineitem_filter)
    map(lambda o: o.set_async(False), part_lineitem_join_probe_map)
    map(lambda o: o.set_async(False), part_lineitem_join_build_map)
    map(lambda o: o.set_async(False), lineitem_part_avg_group)
    map(lambda o: o.set_async(False), lineitem_part_avg_group_project)
    map(lambda o: o.set_async(False), extendedprice_sum_aggregate)
    extendedprice_sum_aggregate_project.set_async(False)

    # Connect the operators
    # part_scan.connect(part_project)
    map(lambda (p, o): o.connect(part_project[p]), enumerate(part_scan))
    # map(lambda (p, o): o.connect(part_bloom_create_map[p]), enumerate(part_project))
    map(lambda (p, o): o.connect(part_lineitem_join_build_map[p]),
        enumerate(part_project))

    connect_many_to_one(part_project, part_bloom_create)
    connect_one_to_many(part_bloom_create, lineitem_bloom_use)

    # part_project.connect(part_bloom_create)
    # map(lambda (p1, o1): map(lambda (p2, o2): o1.connect(o2), enumerate(part_bloom_create)),
    #     enumerate(part_bloom_create_map))

    # part_bloom_create.connect(lineitem_bloom_use)
    # map(lambda (p1, o1): map(lambda (p2, o2): o1.connect(o2), enumerate(lineitem_bloom_use)),
    #     enumerate(part_bloom_create))

    # lineitem_bloom_use.connect(lineitem_project)
    map(lambda (p, o): o.connect(lineitem_project[p]),
        enumerate(lineitem_bloom_use))

    # part_lineitem_join.connect_left_producer(part_project)
    map(
        lambda (p1, o1): map(lambda (p2, o2): o1.connect(o2),
                             enumerate(part_lineitem_join_build)),
        enumerate(part_lineitem_join_build_map))
    map(lambda (p, o): part_lineitem_join_probe[p].connect_build_producer(o),
        enumerate(part_lineitem_join_build))

    # part_lineitem_join.connect_right_producer(lineitem_project)
    # map(lambda (p, o): o.connect(part_lineitem_join_probe_map[p]), enumerate(lineitem_project))
    connect_many_to_many(lineitem_project, part_lineitem_join_probe_map)

    map(
        lambda (p1, o1): map(lambda (p2, o2): o2.connect_tuple_producer(o1),
                             enumerate(part_lineitem_join_probe)),
        enumerate(part_lineitem_join_probe_map))

    # part_lineitem_join.connect(lineitem_part_avg_group)
    map(lambda (p, o): o.connect(lineitem_part_avg_group[p]),
        enumerate(part_lineitem_join_probe))

    # lineitem_part_avg_group.connect(lineitem_part_avg_group_project)
    map(lambda (p, o): o.connect(lineitem_part_avg_group_project[p]),
        enumerate(lineitem_part_avg_group))

    # part_lineitem_join_avg_group_join.connect_left_producer(lineitem_part_avg_group_project)
    map(lambda (p, o): o.connect(part_lineitem_join_avg_group_join_build[p]),
        enumerate(lineitem_part_avg_group_project))

    # part_lineitem_join_avg_group_join.connect_right_producer(part_lineitem_join)
    map(
        lambda (p, o): part_lineitem_join_avg_group_join_probe[p].
        connect_build_producer(o),
        enumerate(part_lineitem_join_avg_group_join_build))
    map(
        lambda (p, o): part_lineitem_join_avg_group_join_probe[p].
        connect_tuple_producer(o), enumerate(part_lineitem_join_probe))

    # part_lineitem_join_avg_group_join.connect(lineitem_filter)
    map(lambda (p, o): o.connect(lineitem_filter[p]),
        enumerate(part_lineitem_join_avg_group_join_probe))

    # lineitem_filter.connect(extendedprice_sum_aggregate)
    map(lambda (p, o): o.connect(extendedprice_sum_aggregate[p]),
        enumerate(lineitem_filter))

    # extendedprice_sum_aggregate.connect(extendedprice_sum_aggregate_project)
    map(lambda (p, o): o.connect(aggregate_reduce),
        enumerate(extendedprice_sum_aggregate))
    aggregate_reduce.connect(extendedprice_sum_aggregate_project)

    # extendedprice_sum_aggregate_project.connect(collate)
    extendedprice_sum_aggregate_project.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print('secure: {}'.format(secure))
    print('use_native: {}'.format(use_native))
    print("lineitem parts: {}".format(lineitem_parts))
    print("part_parts: {}".format(part_parts))
    print("lineitem_sharded: {}".format(lineitem_sharded))
    print("part_sharded: {}".format(part_sharded))
    print("other_parts: {}".format(other_parts))
    print("fp_rate: {}".format(fp_rate))
    print("format: {}".format(format_))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"),
                           gen_test_id())

    # Start the query
    query_plan.execute()

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    field_names = ['avg_yearly']

    assert len(tuples) == 1 + 1

    assert tuples[0] == field_names

    # NOTE: This result has been verified with the equivalent data and query on PostgreSQL
    if s3filter.util.constants.TPCH_SF == 10:
        assert round(
            float(tuples[1][0]), 10
        ) == 372414.2899999995  # TODO: This isn't correct but haven't checked tpch17 on 10 sf yet
    elif s3filter.util.constants.TPCH_SF == 1:
        numpy.testing.assert_approx_equal(float(tuples[1][0]), expected_result)
Пример #17
0
def test_join_baseline_pandas():
    """Tests a join

    :return: None
    """

    query_plan = QueryPlan(is_async=True, buffer_size=0)

    # Query plan
    supplier_scan = query_plan.add_operator(
        SQLTableScan('region.csv', 'select * from S3Object;', True, False, False, 'supplier_scan', query_plan, True))

    def supplier_project_fn(df):
        df = df.filter(['_0'], axis='columns')
        df = df.rename(columns={'_0': 'r_regionkey'})
        return df

    supplier_project = query_plan.add_operator(
        Project([ProjectExpression(lambda t_: t_['_0'], 'r_regionkey')], 'supplier_project', query_plan, True, supplier_project_fn))

    nation_scan = query_plan.add_operator(
        SQLTableScan('nation.csv', 'select * from S3Object;', True, False, False, 'nation_scan', query_plan, True))

    def nation_project_fn(df):
        df = df.filter(['_2'], axis='columns')
        df = df.rename(columns={'_2': 'n_regionkey'})
        return df

    nation_project = query_plan.add_operator(
        Project([ProjectExpression(lambda t_: t_['_2'], 'n_regionkey')], 'nation_project', query_plan, True, nation_project_fn))

    supplier_nation_join_build = query_plan.add_operator(
        HashJoinBuild('n_regionkey', 'supplier_nation_join_build', query_plan, True))

    supplier_nation_join_probe = query_plan.add_operator(
        HashJoinProbe(JoinExpression('n_regionkey', 'r_regionkey'), 'supplier_nation_join_probe', query_plan, True))

    collate = query_plan.add_operator(Collate('collate', query_plan, True))

    supplier_scan.connect(supplier_project)
    nation_scan.connect(nation_project)
    nation_project.connect(supplier_nation_join_build)
    supplier_nation_join_probe.connect_build_producer(supplier_nation_join_build)
    supplier_nation_join_probe.connect_tuple_producer(supplier_project)
    supplier_nation_join_probe.connect(collate)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    field_names = ['n_regionkey', 'r_regionkey']

    assert len(tuples) == 25 + 1

    assert tuples[0] == field_names

    num_rows = 0
    for t in tuples:
        num_rows += 1
        # Assert that the nation_key in table 1 has been joined with the record in table 2 with the same nation_key
        if num_rows > 1:
            lt = IndexedTuple.build(t, field_names)
            assert lt['n_regionkey'] == lt['r_regionkey']