Example 1
def lineitem_filter_def(name, query_plan):
    def pd_expr(df):
        # astype(int): np.int was only an alias for the builtin int and has
        # been removed from modern NumPy.
        return (((df['l_quantity'].astype(int) >= 3) &
                 (df['l_quantity'].astype(int) <= 3 + 10) &
                 (df['l_shipmode'].isin(['AIR', 'AIR REG'])) &
                 (df['l_shipinstruct'] == 'DELIVER IN PERSON')) |
                ((df['l_quantity'].astype(int) >= 16) &
                 (df['l_quantity'].astype(int) <= 16 + 10) &
                 (df['l_shipmode'].isin(['AIR', 'AIR REG'])) &
                 (df['l_shipinstruct'] == 'DELIVER IN PERSON')) |
                ((df['l_quantity'].astype(int) >= 24) &
                 (df['l_quantity'].astype(int) <= 24 + 10) &
                 (df['l_shipmode'].isin(['AIR', 'AIR REG'])) &
                 (df['l_shipinstruct'] == 'DELIVER IN PERSON')))

    return Filter(
        PredicateExpression(
            lambda t_:
            (3 <= int(t_['l_quantity']) <= 3 + 10 and
             t_['l_shipmode'] in ['AIR', 'AIR REG'] and
             t_['l_shipinstruct'] == 'DELIVER IN PERSON') or
            (16 <= int(t_['l_quantity']) <= 16 + 10 and
             t_['l_shipmode'] in ['AIR', 'AIR REG'] and
             t_['l_shipinstruct'] == 'DELIVER IN PERSON') or
            (24 <= int(t_['l_quantity']) <= 24 + 10 and
             t_['l_shipmode'] in ['AIR', 'AIR REG'] and
             t_['l_shipinstruct'] == 'DELIVER IN PERSON'),
            pd_expr),
        name, query_plan, False)
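
This factory supplies the predicate twice: once as a per-tuple lambda and once as a vectorized pandas expression, so the Filter operator can evaluate it in either row-at-a-time or pandas mode. A minimal, self-contained sketch of that equivalence on invented rows (pandas only; Filter and PredicateExpression belong to the surrounding codebase and are not needed here):

import pandas as pd

sample = pd.DataFrame({
    'l_quantity': ['4', '50'],
    'l_shipmode': ['AIR', 'RAIL'],
    'l_shipinstruct': ['DELIVER IN PERSON', 'NONE'],
})

# Row-at-a-time form of the first disjunct above.
def row_pred(t_):
    return (3 <= int(t_['l_quantity']) <= 3 + 10 and
            t_['l_shipmode'] in ['AIR', 'AIR REG'] and
            t_['l_shipinstruct'] == 'DELIVER IN PERSON')

# Vectorized form of the same disjunct.
vectorized = ((sample['l_quantity'].astype(int) >= 3) &
              (sample['l_quantity'].astype(int) <= 3 + 10) &
              sample['l_shipmode'].isin(['AIR', 'AIR REG']) &
              (sample['l_shipinstruct'] == 'DELIVER IN PERSON'))

# The two evaluation modes must agree tuple-for-tuple.
assert list(vectorized) == [row_pred(r) for _, r in sample.iterrows()]  # [True, False]
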
Example 2
def part_filter_def(name, query_plan):
    def pd_expr(df):
        return (((df['p_brand'] == 'Brand#11') &
                 (df['p_container'].isin(['SM CASE', 'SM BOX', 'SM PACK', 'SM PKG'])) &
                 (df['p_size'].astype(int) >= 1) &
                 (df['p_size'].astype(int) <= 5)) |
                ((df['p_brand'] == 'Brand#44') &
                 (df['p_container'].isin(['MED BAG', 'MED BOX', 'MED PACK', 'MED PKG'])) &
                 (df['p_size'].astype(int) >= 1) &
                 (df['p_size'].astype(int) <= 10)) |
                ((df['p_brand'] == 'Brand#53') &
                 (df['p_container'].isin(['LG CASE', 'LG BOX', 'LG PACK', 'LG PKG'])) &
                 (df['p_size'].astype(int) >= 1) &
                 (df['p_size'].astype(int) <= 15)))

    return Filter(
        PredicateExpression(
            lambda t_:
            (t_['p_brand'] == 'Brand#11' and
             t_['p_container'] in ['SM CASE', 'SM BOX', 'SM PACK', 'SM PKG'] and
             1 <= int(t_['p_size']) <= 5) or
            (t_['p_brand'] == 'Brand#44' and
             t_['p_container'] in ['MED BAG', 'MED BOX', 'MED PACK', 'MED PKG'] and
             1 <= int(t_['p_size']) <= 10) or
            (t_['p_brand'] == 'Brand#53' and
             t_['p_container'] in ['LG CASE', 'LG BOX', 'LG PACK', 'LG PKG'] and
             1 <= int(t_['p_size']) <= 15),
            pd_expr),
        name, query_plan, False)
Example 3
def filter_def(name, query_plan):
    def pd_expr(df):
        return (((df['p_brand'] == 'Brand#11') &
                 (df['p_container'].isin(['SM CASE', 'SM BOX', 'SM PACK', 'SM PKG'])) &
                 (df['l_quantity'].astype(int) >= 3) &
                 (df['l_quantity'].astype(int) <= 3 + 10) &
                 (df['p_size'].astype(int) >= 1) &
                 (df['p_size'].astype(int) <= 5) &
                 (df['l_shipmode'].isin(['AIR', 'AIR REG'])) &
                 (df['l_shipinstruct'] == 'DELIVER IN PERSON')) |
                ((df['p_brand'] == 'Brand#44') &
                 (df['p_container'].isin(['MED BAG', 'MED BOX', 'MED PACK', 'MED PKG'])) &
                 (df['l_quantity'].astype(int) >= 16) &
                 (df['l_quantity'].astype(int) <= 16 + 10) &
                 (df['p_size'].astype(int) >= 1) &
                 (df['p_size'].astype(int) <= 10) &
                 (df['l_shipmode'].isin(['AIR', 'AIR REG'])) &
                 (df['l_shipinstruct'] == 'DELIVER IN PERSON')) |
                ((df['p_brand'] == 'Brand#53') &
                 (df['p_container'].isin(['LG CASE', 'LG BOX', 'LG PACK', 'LG PKG'])) &
                 (df['l_quantity'].astype(int) >= 24) &
                 (df['l_quantity'].astype(int) <= 24 + 10) &
                 (df['p_size'].astype(int) >= 1) &
                 (df['p_size'].astype(int) <= 15) &
                 (df['l_shipmode'].isin(['AIR', 'AIR REG'])) &
                 (df['l_shipinstruct'] == 'DELIVER IN PERSON')))

    return Filter(
        PredicateExpression(
            lambda t_:
            (t_['p_brand'] == 'Brand#11' and
             t_['p_container'] in ['SM CASE', 'SM BOX', 'SM PACK', 'SM PKG'] and
             3 <= int(t_['l_quantity']) <= 3 + 10 and
             1 <= int(t_['p_size']) <= 5 and
             t_['l_shipmode'] in ['AIR', 'AIR REG'] and
             t_['l_shipinstruct'] == 'DELIVER IN PERSON') or
            (t_['p_brand'] == 'Brand#44' and
             t_['p_container'] in ['MED BAG', 'MED BOX', 'MED PACK', 'MED PKG'] and
             16 <= int(t_['l_quantity']) <= 16 + 10 and
             1 <= int(t_['p_size']) <= 10 and
             t_['l_shipmode'] in ['AIR', 'AIR REG'] and
             t_['l_shipinstruct'] == 'DELIVER IN PERSON') or
            (t_['p_brand'] == 'Brand#53' and
             t_['p_container'] in ['LG CASE', 'LG BOX', 'LG PACK', 'LG PKG'] and
             24 <= int(t_['l_quantity']) <= 24 + 10 and
             1 <= int(t_['p_size']) <= 15 and
             t_['l_shipmode'] in ['AIR', 'AIR REG'] and
             t_['l_shipinstruct'] == 'DELIVER IN PERSON'),
            pd_expr),
        name,
        query_plan,
        False)
Example 4
def filter_brand_container_op(name, query_plan):
    def pd_expr(df):
        return (df['p_brand'] == 'Brand#41') & (df['p_container'] == 'SM PACK')

    return Filter(
        PredicateExpression(
            lambda t_: t_['p_brand'] == 'Brand#41' and t_['p_container'] == 'SM PACK',
            pd_expr),
        name, query_plan, False)
Example 5
def filter_brand12_operator_def(name, query_plan):
    # type: (str, QueryPlan) -> Filter

    def pd_expr(df):
        return df['p_brand'] == 'Brand#12'

    return Filter(
        PredicateExpression(lambda t_: t_['p_brand'] == 'Brand#12', pd_expr),
        name, query_plan, False)
Example 6
def filter_shipdate_operator_def(max_shipped_date, name, query_plan):
    def pd_expr(df):
        return pd.to_datetime(df['l_shipdate'], cache=True) <= max_shipped_date

    return Filter(
        PredicateExpression(lambda t_:
                            (cast(t_['l_shipdate'], timestamp) <= cast(max_shipped_date, timestamp)),
                            pd_expr),
        name, query_plan,
        False)
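
On the pandas side, the comparison works because pandas coerces the right-hand bound to a Timestamp when the left side is datetime-typed. A quick standalone illustration (the cast/timestamp helpers used in the lambda belong to the surrounding codebase):

import pandas as pd

s = pd.Series(['1996-02-12', '1996-03-13'])
mask = pd.to_datetime(s) <= '1996-03-01'  # the string bound is coerced to a Timestamp
print(list(mask))  # [True, False]
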
Example 7
def filter_shipdate_operator_def(min_shipped_date, max_shipped_date, name,
                                 query_plan):
    def pd_expr(df):
        # df['_10'] = pd.to_datetime(df['_10'])
        return (pd.to_datetime(df['l_shipdate']) >= min_shipped_date) & (
            pd.to_datetime(df['l_shipdate']) < max_shipped_date)

    return Filter(
        PredicateExpression(
            lambda t_:
            (cast(t_['l_shipdate'], timestamp) >= cast(min_shipped_date, timestamp)) and
            (cast(t_['l_shipdate'], timestamp) < cast(max_shipped_date, timestamp)),
            pd_expr),
        name, query_plan,
        False)
Example 8
def filter_lineitem_quantity_op(name, query_plan):
    """with filter_join_2 as (select * from part_lineitem_join_avg_group_join where l_quantity < avg_l_quantity_computed00)

    :param query_plan:
    :param name:
    :return:
    """
    def pd_expr(df):
        # astype(float): np.float is likewise removed from modern NumPy.
        return df['l_quantity'].astype(float) < df['avg_l_quantity_computed00']

    return Filter(
        PredicateExpression(
            lambda t_: float(t_['l_quantity']) < t_['avg_l_quantity_computed00'],
            pd_expr),
        name,
        query_plan,
        False,
    )
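
In isolation, the predicate keeps only the rows whose quantity falls below the average carried in the joined-in column. A pandas-only illustration with invented data (column names as above):

import pandas as pd

df = pd.DataFrame({'l_quantity': ['10', '40'],
                   'avg_l_quantity_computed00': [25.0, 25.0]})
mask = df['l_quantity'].astype(float) < df['avg_l_quantity_computed00']
print(df[mask])  # only the row with l_quantity == '10' survives
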
Example 9
def test_filter_empty():
    """Executes a filter where no records are returned. We tst this as it's somewhat peculiar with s3 select, in so much
    as s3 does not return column names when selecting data, meaning, unlike a traditional DBMS, no field names tuple
    should be present in the results.

    :return: None
    """

    query_plan = QueryPlan()

    # Query plan
    ts = query_plan.add_operator(
        SQLTableScan('lineitem.csv', 'select * from S3Object limit 0;', False,
                     'ts', query_plan, False))

    f = query_plan.add_operator(
        Filter(
            PredicateExpression(lambda t_: cast(t_['_10'], timestamp) >= cast(
                '1996-03-01', timestamp)), 'f', query_plan, False))

    c = query_plan.add_operator(Collate('c', query_plan, False))

    ts.connect(f)
    f.connect(c)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"),
                           gen_test_id())

    # Start the query
    query_plan.execute()

    # Assert the results
    # num_rows = 0
    # for t in c.tuples():
    #     num_rows += 1
    #     print("{}:{}".format(num_rows, t))

    assert len(c.tuples()) == 0

    # Write the metrics
    query_plan.print_metrics()
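
The expected length is 0 rather than 1 precisely because the field-names tuple only appears when data flows through; compare test_filter_baseline below, where index 0 holds the field names and the expected length is rows + 1. A pure-Python sketch of that convention (collate here is a hypothetical stand-in, not the framework's Collate operator):

def collate(field_names, data_rows):
    # The header tuple is emitted only when at least one data tuple arrives.
    return ([field_names] + data_rows) if data_rows else []

assert len(collate(['_0', '_1'], [])) == 0             # empty result: no header either
assert len(collate(['_0', '_1'], [['1', 'x']])) == 2   # rows + 1 for the header
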
Example 10
def test_filter_baseline():
    """

    :return:
    """

    query_plan = QueryPlan(buffer_size=64, is_async=True, use_shared_mem=False)

    # Query plan
    '''
    ts = query_plan.add_operator(
        SQLTableScan('lineitem.csv', 'select * from S3Object limit 3;' , False, 'ts', query_plan, False))
    '''
    # Note: using a 'use_native=True' argument will result in a None object being returned
    ts = query_plan.add_operator(
        SQLTableScan('random_strings_2.csv', 'select * from S3Object limit 3;',
                     Format.CSV, True, False, False, 'ts', query_plan, False))
    f = query_plan.add_operator(
        Filter(
            PredicateExpression(lambda t_: cast(t_['_10'], timestamp) >= cast(
                '1996-03-01', timestamp)), 'f', query_plan, False))

    c = query_plan.add_operator(Collate('c', query_plan, False))

    ts.connect(f)
    f.connect(c)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"),
                           gen_test_id())

    # Start the query
    query_plan.execute()

    # Assert the results
    # num_rows = 0
    # for t in c.tuples():
    #     num_rows += 1
    #     print("{}:{}".format(num_rows, t))
    print(len(c.tuples()))
    assert 2 + 1 == len(c.tuples())

    field_names = [
        '_0', '_1', '_2', '_3', '_4', '_5', '_6', '_7', '_8', '_9', '_10',
        '_11', '_12', '_13', '_14', '_15'
    ]

    assert c.tuples()[0] == field_names

    assert c.tuples()[1] == [
        '1', '155190', '7706', '1', '17', '21168.23', '0.04', '0.02', 'N', 'O',
        '1996-03-13', '1996-02-12', '1996-03-22', 'DELIVER IN PERSON', 'TRUCK',
        'egular courts above the'
    ]

    assert c.tuples()[2] == [
        '1', '67310', '7311', '2', '36', '45983.16', '0.09', '0.06', 'N', 'O',
        '1996-04-12', '1996-02-28', '1996-04-20', 'TAKE BACK RETURN', 'MAIL',
        'ly final dependencies: slyly bold '
    ]

    # Write the metrics
    query_plan.print_metrics()
Example 11
def test_pandas_filter_baseline():
    """

    :return:
    """

    query_plan = QueryPlan(buffer_size=8192)

    # Query plan
    ts = query_plan.add_operator(
        SQLTableScan('lineitem.csv', 'select * from S3Object limit 3;', True,
                     'ts', query_plan, False))

    # f = query_plan.add_operator(
    #     Filter(PredicateExpression(lambda t_: cast(t_['_10'], timestamp) >= cast('1996-03-01', timestamp)),
    #            'f', query_plan,
    #            False))

    def pd_expr(df):
        # df['_10'] = pd.to_datetime(df['_10'])
        return pd.to_datetime(df['_10']) >= '1996-03-01'

    f = query_plan.add_operator(
        Filter(PredicateExpression(None, pd_expr), 'f', query_plan, True))

    c = query_plan.add_operator(Collate('c', query_plan, False))

    ts.connect(f)
    f.connect(c)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"),
                           gen_test_id())

    # Start the query
    query_plan.execute()

    # Assert the results
    # num_rows = 0
    # for t in c.tuples():
    #     num_rows += 1
    #     print("{}:{}".format(num_rows, t))

    assert 2 + 1 == len(c.tuples())

    field_names = [
        '_0', '_1', '_2', '_3', '_4', '_5', '_6', '_7', '_8', '_9', '_10',
        '_11', '_12', '_13', '_14', '_15'
    ]

    assert c.tuples()[0] == field_names

    assert c.tuples()[1] == [
        '1', '155190', '7706', '1', '17', '21168.23', '0.04', '0.02', 'N', 'O',
        '1996-03-13', '1996-02-12', '1996-03-22', 'DELIVER IN PERSON', 'TRUCK',
        'egular courts above the'
    ]

    assert c.tuples()[2] == [
        '1', '67310', '7311', '2', '36', '45983.16', '0.09', '0.06', 'N', 'O',
        '1996-04-12', '1996-02-28', '1996-04-20', 'TAKE BACK RETURN', 'MAIL',
        'ly final dependencies: slyly bold '
    ]

    # Write the metrics
    query_plan.print_metrics()
Example 12
def query_plan(settings):
    # type: (SyntheticBaselineJoinSettings) -> QueryPlan
    """

    :type settings:
    :return: None
    """

    if settings.use_shared_mem:
        system = WorkerSystem(settings.shared_memory_size)
    else:
        system = None

    query_plan = QueryPlan(system, is_async=settings.parallel, buffer_size=settings.buffer_size,
                           use_shared_mem=settings.use_shared_mem)

    # Define the operators
    scan_A = \
        map(lambda p:
            query_plan.add_operator(
                SQLTableScan(get_file_key(settings.table_A_key, settings.table_A_sharded, p, settings.sf),
                             "select "
                             "  * "
                             "from "
                             "  S3Object "
                             "{}"
                             .format(
                                 get_sql_suffix(settings.table_A_key, settings.table_A_parts, p,
                                                settings.table_A_sharded, add_where=True)), settings.format_,
                             settings.use_pandas,
                             settings.secure,
                             settings.use_native,
                             'scan_A_{}'.format(p),
                             query_plan,
                             False)),
            range(0, settings.table_A_parts))

    field_names_map_A = OrderedDict(
        zip(['_{}'.format(i) for i, name in enumerate(settings.table_A_field_names)], settings.table_A_field_names))

    def project_fn_A(df):
        df = df.rename(columns=field_names_map_A, copy=False)
        return df

    project_A = map(lambda p:
                    query_plan.add_operator(Project(
                        [ProjectExpression(k, v) for k, v in field_names_map_A.iteritems()],
                        'project_A_{}'.format(p),
                        query_plan,
                        False,
                        project_fn_A)),
                    range(0, settings.table_A_parts))

    if settings.table_A_filter_fn is not None:
        filter_A = map(lambda p:
                       query_plan.add_operator(Filter(
                           PredicateExpression(None, pd_expr=settings.table_A_filter_fn), 'filter_A_{}'.format(p), query_plan,
                           False)),
                       range(0, settings.table_A_parts))

    scan_B = \
        map(lambda p:
            query_plan.add_operator(
                SQLTableScan(get_file_key(settings.table_B_key, settings.table_B_sharded, p, settings.sf),
                             "select "
                             "  * "
                             "from "
                             "  S3Object "
                             "{}"
                             .format(
                                 get_sql_suffix(settings.table_B_key, settings.table_B_parts, p,
                                                settings.table_B_sharded, add_where=True)), settings.format_,
                             settings.use_pandas,
                             settings.secure,
                             settings.use_native,
                             'scan_B_{}'.format(p),
                             query_plan,
                             False)),
            range(0, settings.table_B_parts))

    field_names_map_B = OrderedDict(
        zip(['_{}'.format(i) for i, name in enumerate(settings.table_B_field_names)], settings.table_B_field_names))

    def project_fn_B(df):
        df.rename(columns=field_names_map_B, inplace=True)
        return df

    project_B = map(lambda p:
                    query_plan.add_operator(Project(
                        [ProjectExpression(k, v) for k, v in field_names_map_B.iteritems()],
                        'project_B_{}'.format(p),
                        query_plan,
                        False,
                        project_fn_B)),
                    range(0, settings.table_B_parts))

    if settings.table_B_filter_fn is not None:
        filter_b = map(lambda p:
                       query_plan.add_operator(Filter(
                           PredicateExpression(None, pd_expr=settings.table_B_filter_fn), 'filter_b' + '_{}'.format(p), query_plan,
                           False)),
                       range(0, settings.table_B_parts))

    if settings.table_C_key is not None:
        scan_C = \
            map(lambda p:
                query_plan.add_operator(
                    SQLTableScan(get_file_key(settings.table_C_key, settings.table_C_sharded, p, settings.sf),
                                 "select "
                                 "  * "
                                 "from "
                                 "  S3Object "
                                 "{}"
                                 .format(
                                     get_sql_suffix(settings.table_C_key, settings.table_C_parts, p,
                                                    settings.table_C_sharded, add_where=True)), settings.format_,
                                 settings.use_pandas,
                                 settings.secure,
                                 settings.use_native,
                                 'scan_C_{}'.format(p),
                                 query_plan,
                                 False)),
                range(0, settings.table_C_parts))

        field_names_map_C = OrderedDict(
            zip(['_{}'.format(i) for i, name in enumerate(settings.table_C_field_names)], settings.table_C_field_names))

        def project_fn_C(df):
            df = df.rename(columns=field_names_map_C, copy=False)
            return df

        project_C = map(lambda p:
                        query_plan.add_operator(Project(
                            [ProjectExpression(k, v) for k, v in field_names_map_C.iteritems()],
                            'project_C_{}'.format(p),
                            query_plan,
                            False,
                            project_fn_C)),
                        range(0, settings.table_C_parts))

        filter_c = map(lambda p:
                       query_plan.add_operator(Filter(
                           PredicateExpression(None, pd_expr=settings.table_C_filter_fn), 'filter_c' + '_{}'.format(p), query_plan,
                           False)),
                       range(0, settings.table_C_parts))

        map_B_to_C = map(lambda p:
                         query_plan.add_operator(
                             Map(settings.table_B_BC_join_key, 'map_B_to_C_{}'.format(p), query_plan, False)),
                         range(0, settings.table_B_parts))

        map_C_to_C = map(lambda p:
                         query_plan.add_operator(
                             Map(settings.table_C_BC_join_key, 'map_C_to_C_{}'.format(p), query_plan, False)),
                         range(0, settings.table_C_parts))

        join_build_AB_C = map(lambda p:
                              query_plan.add_operator(
                                  HashJoinBuild(settings.table_B_BC_join_key, 'join_build_AB_C_{}'.format(p),
                                                query_plan,
                                                False)),
                              range(0, settings.table_C_parts))

        join_probe_AB_C = map(lambda p:
                              query_plan.add_operator(
                                  HashJoinProbe(
                                      JoinExpression(settings.table_B_BC_join_key, settings.table_C_BC_join_key),
                                      'join_probe_AB_C_{}'.format(p),
                                      query_plan, False)),
                              range(0, settings.table_C_parts))

    map_A_to_B = map(lambda p:
                     query_plan.add_operator(
                         Map(settings.table_A_AB_join_key, 'map_A_to_B_{}'.format(p), query_plan, False)),
                     range(0, settings.table_A_parts))

    map_B_to_B = map(lambda p:
                     query_plan.add_operator(
                         Map(settings.table_B_AB_join_key, 'map_B_to_B_{}'.format(p), query_plan, False)),
                     range(0, settings.table_B_parts))

    join_build_A_B = map(lambda p:
                         query_plan.add_operator(
                             HashJoinBuild(settings.table_A_AB_join_key, 'join_build_A_B_{}'.format(p), query_plan,
                                           False)),
                         range(0, settings.other_parts))

    join_probe_A_B = map(lambda p:
                         query_plan.add_operator(
                             HashJoinProbe(JoinExpression(settings.table_A_AB_join_key, settings.table_B_AB_join_key),
                                           'join_probe_A_B_{}'.format(p),
                                           query_plan, False)),
                         range(0, settings.other_parts))

    if settings.table_C_key is None:

        def part_aggregate_fn(df):
            sum_ = df[settings.table_B_detail_field_name].astype(float).sum()
            return pd.DataFrame({'_0': [sum_]})

        part_aggregate = map(lambda p:
                             query_plan.add_operator(Aggregate(
                                 [
                                     AggregateExpression(AggregateExpression.SUM,
                                                         lambda t: float(t[settings.table_B_detail_field_name]))
                                 ],
                                 settings.use_pandas,
                                 'part_aggregate_{}'.format(p), query_plan, False, part_aggregate_fn)),
                             range(0, settings.other_parts))

    else:
        def part_aggregate_fn(df):
            sum_ = df[settings.table_C_detail_field_name].astype(float).sum()
            return pd.DataFrame({'_0': [sum_]})

        part_aggregate = map(lambda p:
                             query_plan.add_operator(Aggregate(
                                 [
                                     AggregateExpression(AggregateExpression.SUM,
                                                         lambda t: float(t[settings.table_C_detail_field_name]))
                                 ],
                                 settings.use_pandas,
                                 'part_aggregate_{}'.format(p), query_plan, False, part_aggregate_fn)),
                             range(0, settings.table_C_parts))

    def aggregate_reduce_fn(df):
        sum_ = df['_0'].astype(float).sum()
        return pd.DataFrame({'_0': [sum_]})

    aggregate_reduce = query_plan.add_operator(Aggregate(
        [
            AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0']))
        ],
        settings.use_pandas,
        'aggregate_reduce', query_plan, False, aggregate_reduce_fn))

    aggregate_project = query_plan.add_operator(Project(
        [
            ProjectExpression(lambda t: t['_0'], 'total_balance')
        ],
        'aggregate_project', query_plan,
        False))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    # Inline some of the operators
    map(lambda o: o.set_async(False), project_A)
    if settings.table_A_filter_fn is not None:
        map(lambda o: o.set_async(False), filter_A)
    map(lambda o: o.set_async(False), project_B)
    if settings.table_B_filter_fn is not None:
        map(lambda o: o.set_async(False), filter_b)
    map(lambda o: o.set_async(False), map_A_to_B)
    map(lambda o: o.set_async(False), map_B_to_B)
    if settings.table_C_key is not None:
        map(lambda o: o.set_async(False), map_B_to_C)
        map(lambda o: o.set_async(False), map_C_to_C)
        map(lambda o: o.set_async(False), project_C)
        map(lambda o: o.set_async(False), filter_c)
    map(lambda o: o.set_async(False), part_aggregate)
    aggregate_project.set_async(False)

    # Connect the operators
    connect_many_to_many(scan_A, project_A)
    if settings.table_A_filter_fn is not None:
        connect_many_to_many(project_A, filter_A)
        connect_many_to_many(filter_A, map_A_to_B)
    else:
        connect_many_to_many(project_A, map_A_to_B)
    connect_all_to_all(map_A_to_B, join_build_A_B)
    connect_many_to_many(join_build_A_B, join_probe_A_B)

    connect_many_to_many(scan_B, project_B)
    if settings.table_B_filter_fn is not None:
        connect_many_to_many(project_B, filter_b)
        connect_many_to_many(filter_b, map_B_to_B)
    else:
        connect_many_to_many(project_B, map_B_to_B)

    connect_all_to_all(map_B_to_B, join_probe_A_B)

    if settings.table_C_key is None:
        connect_many_to_many(join_probe_A_B, part_aggregate)
    else:
        connect_many_to_many(join_probe_A_B, map_B_to_C)
        connect_all_to_all(map_B_to_C, join_build_AB_C)
        connect_many_to_many(join_build_AB_C, join_probe_AB_C)
        connect_many_to_many(scan_C, project_C)
        connect_many_to_many(project_C, filter_c)
        connect_many_to_many(filter_c, map_C_to_C)
        connect_all_to_all(map_C_to_C, join_probe_AB_C)
        connect_many_to_many(join_probe_AB_C, part_aggregate)

    connect_many_to_one(part_aggregate, aggregate_reduce)
    connect_one_to_one(aggregate_reduce, aggregate_project)
    connect_one_to_one(aggregate_project, collate)

    return query_plan
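
The connect_* helpers encode the exchange topology between operator lists. A plausible sketch of their semantics, assuming index-wise pairing for many-to-many and a full cross-product for all-to-all (the real implementations live in the surrounding codebase):

def connect_many_to_many(producers, consumers):
    # Pair equally sized operator lists index-wise: producer i feeds consumer i.
    for src, dst in zip(producers, consumers):
        src.connect(dst)

def connect_all_to_all(producers, consumers):
    # Every producer feeds every consumer, e.g. a hash-partitioned shuffle.
    for src in producers:
        for dst in consumers:
            src.connect(dst)
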
Example 13
def run(parallel,
        use_pandas,
        secure,
        use_native,
        buffer_size,
        format_,
        customer_parts,
        order_parts,
        lineitem_parts,
        customer_sharded,
        order_sharded,
        lineitem_sharded,
        other_parts,
        sf,
        expected_result,
        customer_filter_sql=None,
        order_filter_sql=None,
        lineitem_filter_sql=None):
    """

    :return: None
    """

    print('')
    print("TPCH Q3 Baseline Join")
    print("---------------------")

    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    customer_scan = map(
        lambda p: query_plan.add_operator(
            SQLTableScan(
                get_file_key('customer', customer_sharded, p, sf, format_
                             ), "select "
                " * "
                "from "
                "  S3Object "
                "  {} "
                "  {} ".format(
                    ' where ' + customer_filter_sql
                    if customer_filter_sql is not None else '',
                    get_sql_suffix('customer',
                                   customer_parts,
                                   p,
                                   customer_sharded,
                                   add_where=customer_filter_sql is None)
                ), format_, use_pandas, secure, use_native, 'customer_scan' +
                '_{}'.format(p), query_plan, False)), range(0, customer_parts))

    def customer_project_fn(df):
        df = df.filter(items=['_0', '_6'], axis=1)

        df.rename(columns={
            '_0': 'c_custkey',
            '_6': 'c_mktsegment'
        },
                  inplace=True)

        return df

    customer_project = map(
        lambda p: query_plan.add_operator(
            Project([], 'customer_project' + '_{}'.format(p), query_plan,
                    False, customer_project_fn)), range(0, customer_parts))

    def customer_filter_fn(df):
        return df['c_mktsegment'] == 'BUILDING'

    customer_filter = map(
        lambda p: query_plan.add_operator(
            Filter(PredicateExpression(pd_expr=customer_filter_fn),
                   'customer_filter' + '_{}'.format(p), query_plan, False)),
        range(0, customer_parts))

    customer_map = map(
        lambda p: query_plan.add_operator(
            Map('c_custkey', 'customer_map' + '_' + str(p), query_plan, False)
        ), range(0, customer_parts))

    order_scan = map(
        lambda p: query_plan.add_operator(
            SQLTableScan(
                get_file_key('orders', order_sharded, p, sf, format_
                             ), "select "
                "  * "
                "from "
                "  S3Object "
                "  {} "
                "  {} ".format(
                    ' where ' + order_filter_sql
                    if order_filter_sql is not None else '',
                    get_sql_suffix('orders',
                                   order_parts,
                                   p,
                                   order_sharded,
                                   add_where=order_filter_sql is None)
                ), format_, use_pandas, secure, use_native, 'order_scan' +
                '_{}'.format(p), query_plan, False)), range(0, order_parts))

    def order_project_fn(df):
        df = df.filter(items=['_0', '_1', '_4', '_7'], axis=1)

        df.rename(columns={
            '_1': 'o_custkey',
            '_0': 'o_orderkey',
            '_4': 'o_orderdate',
            '_7': 'o_shippriority'
        },
                  inplace=True)

        return df

    order_project = map(
        lambda p: query_plan.add_operator(
            Project([], 'order_project' + '_{}'.format(p), query_plan, False,
                    order_project_fn)), range(0, customer_parts))

    def order_filter_fn(df):
        return pd.to_datetime(df['o_orderdate']) < pd.Timestamp(
            datetime.strptime('1995-03-01', '%Y-%m-%d').date())

    order_filter = map(
        lambda p: query_plan.add_operator(
            Filter(PredicateExpression(pd_expr=order_filter_fn),
                   'order_filter' + '_{}'.format(p), query_plan, False)),
        range(0, order_parts))

    order_map_1 = map(
        lambda p: query_plan.add_operator(
            Map('o_custkey', 'order_map_1' + '_' + str(p), query_plan, False)),
        range(0, order_parts))

    customer_order_join_build = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild('c_custkey', 'customer_order_join_build' + '_' + str(
                p), query_plan, False)), range(0, other_parts))

    customer_order_join_probe = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(JoinExpression('c_custkey', 'o_custkey'),
                          'customer_order_join_probe' + '_' + str(
                              p), query_plan, False)), range(0, other_parts))

    lineitem_scan = map(
        lambda p: query_plan.add_operator(
            SQLTableScan(
                get_file_key('lineitem', lineitem_sharded, p, sf, format_
                             ), "select "
                "  * "
                "from "
                "  S3Object "
                "  {} "
                "  {} ".format(
                    ' where ' + lineitem_filter_sql
                    if lineitem_filter_sql is not None else "",
                    get_sql_suffix('lineitem',
                                   lineitem_parts,
                                   p,
                                   lineitem_sharded,
                                   add_where=lineitem_filter_sql is None)
                ), format_, use_pandas, secure, use_native, 'lineitem_scan' +
                '_{}'.format(p), query_plan, False)), range(0, lineitem_parts))

    def lineitem_project_fn(df):
        df = df.filter(items=['_0', '_5', '_6', '_10'], axis=1)

        df.rename(columns={
            '_0': 'l_orderkey',
            '_5': 'l_extendedprice',
            '_6': 'l_discount',
            '_10': 'l_shipdate'
        },
                  inplace=True)

        return df

    lineitem_project = map(
        lambda p: query_plan.add_operator(
            Project([], 'lineitem_project' + '_{}'.format(p), query_plan,
                    False, lineitem_project_fn)), range(0, lineitem_parts))

    def lineitem_filter_fn(df):
        return pd.to_datetime(df['l_shipdate']) > pd.Timestamp(
            datetime.strptime('1995-03-01', '%Y-%m-%d').date())

    lineitem_filter = map(
        lambda p: query_plan.add_operator(
            Filter(PredicateExpression(pd_expr=lineitem_filter_fn),
                   'lineitem_filter' + '_{}'.format(p), query_plan, False)),
        range(0, lineitem_parts))

    lineitem_map = map(
        lambda p: query_plan.add_operator(
            Map('l_orderkey', 'lineitem_map' + '_' + str(p), query_plan, False)
        ), range(0, lineitem_parts))

    order_map_2 = map(
        lambda p: query_plan.add_operator(
            Map('o_orderkey', 'order_map_2' + '_' + str(p), query_plan, False)
        ), range(0, other_parts))

    customer_order_lineitem_join_build = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild('o_orderkey', 'customer_order_lineitem_join_build' +
                          '_' + str(p), query_plan, False)),
        range(0, other_parts))

    customer_order_lineitem_join_probe = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(JoinExpression('o_orderkey', 'l_orderkey'),
                          'customer_order_lineitem_join_probe' + '_' + str(
                              p), query_plan, False)), range(0, other_parts))

    def groupby_fn(df):
        df['l_extendedprice'] = df['l_extendedprice'].astype(float)
        df['l_discount'] = df['l_discount'].astype(float)
        df['revenue'] = df['l_extendedprice'] * (1 - df['l_discount'])
        grouped = df.groupby(['l_orderkey', 'o_orderdate', 'o_shippriority'])
        agg_df = grouped['revenue'].sum()
        return agg_df.reset_index()

    group = map(
        lambda p: query_plan.add_operator(
            Group(
                ['l_orderkey', 'o_orderdate', 'o_shippriority'],  # l_partkey
                [
                    AggregateExpression(
                        AggregateExpression.SUM, lambda t_: float(t_[
                            'l_extendedprice'] * (1 - t_['l_discount'])))
                ],
                'group' + '_{}'.format(p),
                query_plan,
                False,
                groupby_fn)),
        range(0, other_parts))

    def group_reduce_fn(df):
        grouped = df.groupby(['l_orderkey', 'o_orderdate', 'o_shippriority'])
        agg_df = grouped['revenue'].sum()
        return agg_df.reset_index()

    group_reduce = query_plan.add_operator(
        Group(
            ['l_orderkey', 'o_orderdate', 'o_shippriority'],  # l_partkey
            [
                AggregateExpression(
                    AggregateExpression.SUM,
                    lambda t_: float(t_['l_extendedprice'] *
                                     (1 - t_['l_discount'])))
            ],
            'group_reduce',
            query_plan,
            False,
            group_reduce_fn))

    top = query_plan.add_operator(
        Top(10, [
            SortExpression('revenue', float, 'DESC'),
            SortExpression('o_orderdate', date, 'ASC')
        ], use_pandas, 'top', query_plan, False))

    collate = query_plan.add_operator(
        tpch_q19.collate_op('collate', query_plan))

    # Inline what we can
    map(lambda o: o.set_async(False), lineitem_project)
    map(lambda o: o.set_async(False), customer_project)
    map(lambda o: o.set_async(False), order_project)
    map(lambda o: o.set_async(False), lineitem_filter)
    map(lambda o: o.set_async(False), customer_filter)
    map(lambda o: o.set_async(False), order_filter)
    map(lambda o: o.set_async(False), lineitem_map)
    map(lambda o: o.set_async(False), customer_map)
    map(lambda o: o.set_async(False), order_map_1)
    map(lambda o: o.set_async(False), order_map_2)

    # Connect the operators
    connect_many_to_many(customer_scan, customer_project)
    connect_many_to_many(customer_project, customer_filter)
    connect_many_to_many(customer_filter, customer_map)
    connect_many_to_many(order_scan, order_project)
    connect_many_to_many(order_project, order_filter)
    connect_many_to_many(order_filter, order_map_1)

    connect_all_to_all(customer_map, customer_order_join_build)
    connect_many_to_many(customer_order_join_build, customer_order_join_probe)
    connect_all_to_all(order_map_1, customer_order_join_probe)

    # connect_many_to_one(customer_order_join_probe, collate)

    connect_many_to_many(lineitem_scan, lineitem_project)
    connect_many_to_many(lineitem_project, lineitem_filter)
    connect_many_to_many(lineitem_filter, lineitem_map)

    connect_many_to_many(customer_order_join_probe, order_map_2)
    connect_all_to_all(order_map_2, customer_order_lineitem_join_build)
    connect_many_to_many(customer_order_lineitem_join_build,
                         customer_order_lineitem_join_probe)
    connect_all_to_all(lineitem_map, customer_order_lineitem_join_probe)

    # connect_many_to_one(customer_order_lineitem_join_probe, collate)

    connect_many_to_many(customer_order_lineitem_join_probe, group)
    # connect_many_to_one(group, collate)

    connect_many_to_one(group, group_reduce)
    connect_one_to_one(group_reduce, top)
    connect_one_to_one(top, collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print('secure: {}'.format(secure))
    print('use_native: {}'.format(use_native))
    print("customer_parts: {}".format(customer_parts))
    print("order_parts: {}".format(order_parts))
    print("lineitem_parts: {}".format(lineitem_parts))
    print("customer_sharded: {}".format(customer_sharded))
    print("order_sharded: {}".format(order_sharded))
    print("lineitem_sharded: {}".format(lineitem_sharded))
    print("other_parts: {}".format(other_parts))
    print("format: {}".format(format_))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"),
                           gen_test_id())

    # Start the query
    query_plan.execute()

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    field_names = ['l_orderkey', 'o_orderdate', 'o_shippriority', 'revenue']

    assert len(tuples) == 10 + 1

    assert tuples[0] == field_names

    # NOTE: This result has been verified with the equivalent data and query on PostgreSQL
    test_util.assert_tuples(expected_result, tuples)
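
The aggregate computed by groupby_fn and group_reduce_fn is TPC-H Q3's revenue, sum(l_extendedprice * (1 - l_discount)) per (l_orderkey, o_orderdate, o_shippriority) group. The same computation on a toy frame (pandas only, invented data):

import pandas as pd

df = pd.DataFrame({
    'l_orderkey': ['1', '1', '2'],
    'o_orderdate': ['1995-03-05', '1995-03-05', '1995-03-06'],
    'o_shippriority': ['0', '0', '0'],
    'l_extendedprice': [100.0, 200.0, 50.0],
    'l_discount': [0.1, 0.0, 0.5],
})
df['revenue'] = df['l_extendedprice'] * (1 - df['l_discount'])
out = (df.groupby(['l_orderkey', 'o_orderdate', 'o_shippriority'])['revenue']
       .sum().reset_index())
print(out)  # revenue: 290.0 for order 1, 25.0 for order 2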