Пример #1
0
def test_filter_1():
    """

    :return:
    """

    query_plan = QueryPlan(buffer_size=64, is_async=True, use_shared_mem=False)

    # Query plan

    ts = query_plan.add_operator(
        SQLTableScan(
            'tpch-sf1/lineitem_sharded/lineitem.csv.0',
            'select * from S3Object where cast(l_extendedprice as float) >= 0 and cast(l_extendedprice  as float)<= 910;',
            Format.CSV, True, False, False, 'ts', query_plan, False))

    # using a 'use_native=True' argument will result in a None object being returned
    '''
    ts = query_plan.add_operator(
        SQLTableScan('tpch-sf1/lineitem_sharded/lineitem.csv.0', 'select l_partkey  from S3Object limit 5;',Format.CSV , True, False,False, 'ts', query_plan, False))
    #random_strings_2.csv
    '''
    c = query_plan.add_operator(Collate('c', query_plan, False))

    ts.connect(c)
    #f.connect(c)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"),
                           gen_test_id())

    # Start the query
    query_plan.execute()

    #assert 2 + 1 == len(c.tuples())
    print("Tuples:")
    print(c.tuples())
    # Write the metrics
    query_plan.print_metrics()
    print(ROOT_DIR)
Пример #2
0
def test_filter_arg(object_, sql_query):
    # Let's forget about the local filter for now. The pd._expr field of the PredicateExpression class is not well documented and it is needed for the Filter class (on line 102).
    """

    :return:
    """

    query_plan = QueryPlan(buffer_size=64, is_async=True, use_shared_mem=False)

    # Query plan
    '''
    ts = query_plan.add_operator(
        SQLTableScan('lineitem.csv', 'select * from S3Object limit 3;' , False, 'ts', query_plan, False))
    '''
    # using a 'use_native=True' argument will result in a None object being returned
    ts = query_plan.add_operator(
        SQLTableScan(object_, sql_query, Format.CSV, True, False, False, 'ts',
                     query_plan, False))

    c = query_plan.add_operator(Collate('c', query_plan, False))

    ts.connect(c)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"),
                           gen_test_id())

    # Start the query
    query_plan.execute()

    #assert 2 + 1 == len(c.tuples())
    print("Tuples:")
    print(c.tuples())
    # Write the metrics
    query_plan.print_metrics()
    print(ROOT_DIR)
Пример #3
0
def run(parallel, use_pandas, buffer_size, lineitem_parts, part_parts):
    """
    :return: None
    """

    print('')
    print("TPCH Q17 Bloom Join")
    print("-------------------")

    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Query plan
    part_scan = map(lambda p:
                    query_plan.add_operator(
                        tpch_q17.sql_scan_select_partkey_where_brand_container_op(
                            part_parts != 1,
                            p,
                            part_parts,
                            use_pandas,
                            'part_scan' + '_' + str(p),
                            query_plan)),
                    range(0, part_parts))

    part_project = map(lambda p:
                       query_plan.add_operator(
                           tpch_q17.project_partkey_op(
                               'part_project' + '_' + str(p),
                               query_plan)),
                       range(0, part_parts))

    part_bloom_create_map = map(lambda p:
                                query_plan.add_operator(
                                    Map('p_partkey', 'part_bloom_create_map' + '_' + str(p), query_plan, True)),
                                range(0, part_parts))

    part_lineitem_join_build_map = map(lambda p:
                                       query_plan.add_operator(
                                           Map('p_partkey', 'part_lineitem_join_build_map' + '_' + str(p), query_plan,
                                               True)),
                                       range(0, part_parts))

    part_bloom_create = map(lambda p:
                            query_plan.add_operator(
                                tpch_q17.bloom_create_partkey_op('part_bloom_create' + '_' + str(p), query_plan)),
                            range(0, part_parts))

    lineitem_bloom_use = map(lambda p:
                             query_plan.add_operator(
                                 tpch_q17.bloom_scan_lineitem_select_orderkey_partkey_quantity_extendedprice_where_partkey_bloom_partkey_op(
                                     part_parts != 1,
                                     p,
                                     part_parts,
                                     use_pandas,
                                     'lineitem_bloom_use' + '_' + str(p),
                                     query_plan)),
                             range(0, lineitem_parts))

    lineitem_project = map(lambda p:
                           query_plan.add_operator(
                               tpch_q17.project_orderkey_partkey_quantity_extendedprice_op(
                                   'lineitem_project' + '_' + str(p),
                                   query_plan)),
                           range(0, lineitem_parts))

    part_lineitem_join_probe_map = map(lambda p:
                                       query_plan.add_operator(
                                           Map('l_partkey', 'part_lineitem_join_probe_map' + '_' + str(p), query_plan,
                                               True)),
                                       range(0, lineitem_parts))

    # part_lineitem_join = map(lambda p:
    #                          query_plan.add_operator(
    #                              tpch_q17.join_p_partkey_l_partkey_op('part_lineitem_join' + '_' + str(p), query_plan)),
    #                          range(0, part_parts))

    part_lineitem_join_build = map(lambda p:
                                   query_plan.add_operator(
                                       HashJoinBuild('p_partkey',
                                                     'part_lineitem_join_build' + '_' + str(p), query_plan,
                                                     False)),
                                   range(0, part_parts))

    part_lineitem_join_probe = map(lambda p:
                                   query_plan.add_operator(
                                       HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'),
                                                     'part_lineitem_join_probe' + '_' + str(p),
                                                     query_plan, True)),
                                   range(0, part_parts))

    lineitem_part_avg_group = map(lambda p:
                                  query_plan.add_operator(
                                      tpch_q17.group_partkey_avg_quantity_op('lineitem_part_avg_group' + '_' + str(p),
                                                                             query_plan)),
                                  range(0, part_parts))

    lineitem_part_avg_group_project = map(lambda p:
                                          query_plan.add_operator(
                                              tpch_q17.project_partkey_avg_quantity_op(
                                                  'lineitem_part_avg_group_project' + '_' + str(p), query_plan)),
                                          range(0, part_parts))

    # part_lineitem_join_avg_group_join = map(lambda p:
    #                                         query_plan.add_operator(
    #                                             tpch_q17.join_l_partkey_p_partkey_op(
    #                                                 'part_lineitem_join_avg_group_join' + '_' + str(p), query_plan)),
    #                                         range(0, part_parts))

    part_lineitem_join_avg_group_join_build = map(lambda p:
                                                  query_plan.add_operator(
                                                      HashJoinBuild('l_partkey',
                                                                    'part_lineitem_join_avg_group_join_build' + '_' + str(
                                                                        p), query_plan, False)),
                                                  range(0, part_parts))

    part_lineitem_join_avg_group_join_probe = map(lambda p:
                                                  query_plan.add_operator(
                                                      HashJoinProbe(JoinExpression('l_partkey', 'p_partkey'),
                                                                    'part_lineitem_join_avg_group_join_probe' + '_' + str(
                                                                        p),
                                                                    query_plan, True)),
                                                  range(0, part_parts))

    lineitem_filter = map(lambda p:
                          query_plan.add_operator(
                              tpch_q17.filter_lineitem_quantity_op('lineitem_filter' + '_' + str(p), query_plan)),
                          range(0, part_parts))

    extendedprice_sum_aggregate = map(lambda p:
                                      query_plan.add_operator(
                                          tpch_q17.aggregate_sum_extendedprice_op(
                                              'extendedprice_sum_aggregate' + '_' + str(p),
                                              query_plan)),
                                      range(0, part_parts))

    aggregate_reduce = query_plan.add_operator(
        Aggregate(
            [
                AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0']))
            ],
            'aggregate_reduce',
            query_plan,
            False))

    extendedprice_sum_aggregate_project = query_plan.add_operator(
        tpch_q17.project_avg_yearly_op('extendedprice_sum_aggregate_project', query_plan))

    collate = query_plan.add_operator(tpch_q17.collate_op('collate', query_plan))

    # Connect the operators
    # part_scan.connect(part_project)
    map(lambda (p, o): o.connect(part_project[p]), enumerate(part_scan))
    map(lambda (p, o): o.connect(part_bloom_create_map[p]), enumerate(part_project))
    map(lambda (p, o): o.connect(part_lineitem_join_build_map[p]), enumerate(part_project))

    # part_project.connect(part_bloom_create)
    map(lambda (p1, o1): map(lambda (p2, o2): o1.connect(o2), enumerate(part_bloom_create)),
        enumerate(part_bloom_create_map))

    # part_bloom_create.connect(lineitem_bloom_use)
    map(lambda (p1, o1): map(lambda (p2, o2): o1.connect(o2), enumerate(lineitem_bloom_use)),
        enumerate(part_bloom_create))

    # lineitem_bloom_use.connect(lineitem_project)
    map(lambda (p, o): o.connect(lineitem_project[p]), enumerate(lineitem_bloom_use))

    # part_lineitem_join.connect_left_producer(part_project)
    map(lambda (p1, o1): map(lambda (p2, o2): o1.connect(o2), enumerate(part_lineitem_join_build)),
        enumerate(part_lineitem_join_build_map))
    map(lambda (p, o): part_lineitem_join_probe[p].connect_build_producer(o), enumerate(part_lineitem_join_build))

    # part_lineitem_join.connect_right_producer(lineitem_project)
    map(lambda (p, o): o.connect(part_lineitem_join_probe_map[p]), enumerate(lineitem_project))
    map(lambda (p1, o1): map(lambda (p2, o2): o2.connect_tuple_producer(o1), enumerate(part_lineitem_join_probe)),
        enumerate(part_lineitem_join_probe_map))

    # part_lineitem_join.connect(lineitem_part_avg_group)
    map(lambda (p, o): o.connect(lineitem_part_avg_group[p]), enumerate(part_lineitem_join_probe))

    # lineitem_part_avg_group.connect(lineitem_part_avg_group_project)
    map(lambda (p, o): o.connect(lineitem_part_avg_group_project[p]), enumerate(lineitem_part_avg_group))

    # part_lineitem_join_avg_group_join.connect_left_producer(lineitem_part_avg_group_project)
    map(lambda (p, o): o.connect(part_lineitem_join_avg_group_join_build[p]),
        enumerate(lineitem_part_avg_group_project))

    # part_lineitem_join_avg_group_join.connect_right_producer(part_lineitem_join)
    map(lambda (p, o): part_lineitem_join_avg_group_join_probe[p].connect_build_producer(o),
        enumerate(part_lineitem_join_avg_group_join_build))
    map(lambda (p, o): part_lineitem_join_avg_group_join_probe[p].connect_tuple_producer(o),
        enumerate(part_lineitem_join_probe))

    # part_lineitem_join_avg_group_join.connect(lineitem_filter)
    map(lambda (p, o): o.connect(lineitem_filter[p]), enumerate(part_lineitem_join_avg_group_join_probe))

    # lineitem_filter.connect(extendedprice_sum_aggregate)
    map(lambda (p, o): o.connect(extendedprice_sum_aggregate[p]), enumerate(lineitem_filter))

    # extendedprice_sum_aggregate.connect(extendedprice_sum_aggregate_project)
    map(lambda (p, o): o.connect(aggregate_reduce), enumerate(extendedprice_sum_aggregate))
    aggregate_reduce.connect(extendedprice_sum_aggregate_project)

    # extendedprice_sum_aggregate_project.connect(collate)
    extendedprice_sum_aggregate_project.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("lineitem parts: {}".format(lineitem_parts))
    print("part_parts: {}".format(part_parts))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    field_names = ['avg_yearly']

    assert len(tuples) == 1 + 1

    assert tuples[0] == field_names

    # NOTE: This result has been verified with the equivalent data and query on PostgreSQL
    assert round(float(tuples[1][0]), 10) == 4632.1085714286
Пример #4
0
def run(parallel, use_pandas, buffer_size, table_parts, perc, path, nthreads=16, format_=Format.CSV):
    secure = False
    use_native = False
    print('')
    print("Indexing Benchmark")
    print("------------------")

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Scan Index Files
    upper = perc * 100
    index_scan = map(lambda p:
                     query_plan.add_operator(
                         SQLTableScan('{}/index/index_f0_{}.csv'.format(path, p),
                                      "select first_byte, last_byte "
                                      " from S3Object "
                                      " where cast(name as float) < {};".format(upper), format_,
                                      use_pandas, secure, use_native,
                                      'index_scan_{}'.format(p), query_plan,
                                      False)),
                     range(0, table_parts))

    # Range accesses 
    range_access = map(lambda p:
                       query_plan.add_operator(
                           TableRangeAccess('{}/data_{}.csv'.format(path, p),
                                            use_pandas, secure, use_native,
                                            'range_access_{}'.format(p), query_plan,
                                            False)),
                       range(0, table_parts))

    map(lambda o: o.set_nthreads(nthreads), range_access)

    collate = query_plan.add_operator(
        Collate('collate', query_plan, False))

    map(lambda (p, o): o.connect(range_access[p]), enumerate(index_scan))
    map(lambda (p, o): o.connect(collate), enumerate(range_access))

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("table parts: {}".format(table_parts))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(table_parts))

    # Start the query
    query_plan.execute()
    print('Done')
    tuples = collate.tuples()

    # collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()
Пример #5
0
def run(buffer_size):
    """The baseline tst uses hash joins with no projection and no filtering pushed down to s3.

    :return: None
    """

    print('')
    print("TPCH Q14 Baseline Join")
    print("----------------------")

    query_plan = QueryPlan(is_async=False, buffer_size=buffer_size)

    # Query plan
    date = '1993-01-01'
    min_shipped_date = datetime.strptime(date, '%Y-%m-%d')
    max_shipped_date = datetime.strptime(date, '%Y-%m-%d') + timedelta(days=30)

    lineitem_scan = query_plan.add_operator(tpch_q14.sql_scan_lineitem_operator_def('lineitem_scan', query_plan))
    lineitem_project = query_plan.add_operator(
        tpch_q14.project_partkey_extendedprice_discount_shipdate_operator_def('lineitem_project', query_plan))
    part_scan = query_plan.add_operator(tpch_q14.sql_scan_part_operator_def('part_scan', query_plan))
    part_project = query_plan.add_operator(tpch_q14.project_partkey_brand_type_operator_def('part_project', query_plan))
    lineitem_filter = query_plan.add_operator(
        tpch_q14.filter_shipdate_operator_def(min_shipped_date, max_shipped_date, 'lineitem_filter', query_plan))
    part_filter = query_plan.add_operator(tpch_q14.filter_brand12_operator_def('part_filter', query_plan))
    join = query_plan.add_operator(tpch_q14.join_lineitem_part_operator_def('join', query_plan))
    aggregate = query_plan.add_operator(tpch_q14.aggregate_promo_revenue_operator_def('aggregate', query_plan))
    aggregate_project = query_plan.add_operator(
        tpch_q14.project_promo_revenue_operator_def('aggregate_project', query_plan))
    collate = query_plan.add_operator(tpch_q14.collate_operator_def('collate', query_plan))

    lineitem_scan.connect(lineitem_project)
    lineitem_project.connect(lineitem_filter)
    join.connect_left_producer(lineitem_filter)
    part_scan.connect(part_project)
    part_project.connect(part_filter)
    join.connect_right_producer(part_filter)
    join.connect(aggregate)
    aggregate.connect(aggregate_project)
    aggregate_project.connect(collate)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    # Assert the results
    # num_rows = 0
    # for t in collate.tuples():
    #     num_rows += 1
    #     print("{}:{}".format(num_rows, t))

    collate.print_tuples()

    # Write the metrics
    query_plan.print_metrics()

    field_names = ['promo_revenue']

    assert len(collate.tuples()) == 1 + 1

    assert collate.tuples()[0] == field_names

    # NOTE: This result has been verified with the equivalent data and query on PostgreSQL
    assert collate.tuples()[1] == [15.090116526324298]
Пример #6
0
def run(parallel,
        use_pandas,
        buffer_size,
        table_parts,
        lower,
        upper,
        sf,
        format_=Format.CSV):
    secure = False
    use_native = False
    print('')
    print("Indexing Benchmark")
    print("------------------")

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # SQL scan the file

    scan = map(
        lambda p: query_plan.add_operator(
            SQLTableScan(
                get_file_key('lineitem', True, p, sf=sf, format_=format_
                             ), "select * from S3Object "
                "where cast(l_extendedprice as float) >= {} and cast(l_extendedprice as float) <= {};"
                .format(lower, upper), format_, use_pandas, secure,
                use_native, 'scan_{}'.format(p), query_plan, False)),
        range(0, table_parts))
    '''
    scan = map(lambda p:
               query_plan.add_operator(
                   SQLTableScan(get_file_key('lineitem', True, p, sf=sf, format_=format_),
                                "select * from S3Object "
                                "where l_extendedprice  >= {} and l_extendedprice  <= {};".format(
                                    lower, upper), format_,
                                use_pandas, secure, use_native,
                                'scan_{}'.format(p), query_plan,
                                False)),
               range(0, table_parts))
    '''

    # project
    def fn(df):
        df.columns = [
            'l_orderkey', 'l_partkey', 'l_suppkey', 'l_linenumber',
            'l_quantity', 'l_extendedprice', 'l_discount', 'l_tax',
            'l_returnflag', 'l_linestatus', 'l_shipdate', 'l_commitdate',
            'l_receiptdate', 'l_shipinstruct', 'l_shipmode', 'l_comment'
        ]
        df[['l_extendedprice']] = df[['l_extendedprice']].astype(np.float)
        return df

    project = map(
        lambda p: query_plan.add_operator(
            Project([], 'project_{}'.format(p), query_plan, False, fn)),
        range(0, table_parts))

    # aggregation
    def agg_fun(df):
        return pd.DataFrame({'count': [len(df)]})

    aggregate = query_plan.add_operator(
        Aggregate([], True, 'agg', query_plan, False, agg_fun))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    map(lambda (p, o): o.connect(project[p]), enumerate(scan))
    map(lambda (p, o): o.connect(aggregate), enumerate(project))
    aggregate.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("table parts: {}".format(table_parts))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"),
                           gen_test_id() + "-" + str(table_parts))

    # Start the query
    query_plan.execute()
    print('Done')
    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()
Пример #7
0
def run(parallel,
        use_pandas,
        buffer_size,
        table_parts,
        lower,
        upper,
        sf,
        nthreads=16,
        format_=Format.CSV):
    secure = False
    use_native = False
    print('')
    print("Indexing Benchmark")
    print("------------------")

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)
    assert sf == 1 or sf == 10

    # Scan Index Files
    index_scan = map(
        lambda p: query_plan.add_operator(
            SQLTableScan(
                'tpch-sf{}/lineitem_sharded/index/index_l_extendedprice.csv.{}'
                .format(sf, p
                        if sf == 1 else p + 1), "select first_byte, last_byte "
                " from S3Object "
                " where col_values  >= {} and col_values  <= {};".format(
                    lower, upper), format_, use_pandas, secure, use_native,
                'index_scan_{}'.format(p), query_plan, False)),
        range(0, table_parts))

    # Range accesses
    range_access = map(
        lambda p: query_plan.add_operator(
            TableRangeAccess(
                get_file_key('lineitem', True, p, sf=sf, format_=Format.CSV),
                use_pandas, secure, use_native, 'range_access_{}'.format(
                    p), query_plan, False)), range(0, table_parts))

    map(lambda o: o.set_nthreads(nthreads), range_access)

    # Aggregation
    def agg_fun(df):
        return pd.DataFrame({'count': [len(df)]})

    aggregate = query_plan.add_operator(
        Aggregate([], True, 'agg', query_plan, False, agg_fun))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    map(lambda (p, o): o.connect(range_access[p]), enumerate(index_scan))
    map(lambda (p, o): o.connect(aggregate), enumerate(range_access))
    aggregate.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("table parts: {}".format(table_parts))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"),
                           gen_test_id() + "-" + str(table_parts))

    # Start the query
    query_plan.execute()
    print('Done')
    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()
Пример #8
0
def run(parallel, use_pandas, buffer_size, lineitem_parts, part_parts):
    """

    :return: None
    """

    print('')
    print("TPCH Q14 Semi Join")
    print("------------------")

    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Query plan
    date = '1993-01-01'
    min_shipped_date = datetime.strptime(date, '%Y-%m-%d')
    max_shipped_date = datetime.strptime(date, '%Y-%m-%d') + timedelta(days=30)

    part_scan_1 = query_plan.add_operator(
        tpch_q14.sql_scan_part_partkey_where_brand12_operator_def(
            False, 0, 1, use_pandas, 'part_table_scan_1', query_plan))
    part_scan_1_project = query_plan.add_operator(
        tpch_q14.project_p_partkey_operator_def('part_scan_1_project',
                                                query_plan))
    part_bloom_create = query_plan.add_operator(
        tpch_q14.bloom_create_p_partkey_operator_def('part_bloom_create',
                                                     query_plan))
    lineitem_scan_1 = query_plan.add_operator(
        tpch_q14.bloom_scan_lineitem_partkey_where_shipdate_operator_def(
            min_shipped_date, max_shipped_date, False, 0, use_pandas,
            'lineitem_scan_1', query_plan))
    lineitem_scan_1_project = query_plan.add_operator(
        tpch_q14.project_l_partkey_operator_def('lineitem_scan_1_project',
                                                query_plan))
    part_lineitem_join_1 = query_plan.add_operator(
        tpch_q14.join_part_lineitem_operator_def('part_lineitem_join_1',
                                                 query_plan))
    join_bloom_create = query_plan.add_operator(
        tpch_q14.bloom_create_l_partkey_operator_def('join_bloom_create',
                                                     query_plan))
    part_scan_2 = query_plan.add_operator(
        tpch_q14.bloom_scan_part_partkey_type_brand12_operator_def(
            False, 0, 1, use_pandas, 'part_table_scan_2', query_plan))
    part_scan_2_project = query_plan.add_operator(
        tpch_q14.project_partkey_type_operator_def('part_scan_2_project',
                                                   query_plan))
    lineitem_scan_2 = query_plan.add_operator(
        tpch_q14.bloom_scan_lineitem_where_shipdate_operator_def(
            min_shipped_date, max_shipped_date, False, 0, use_pandas,
            'lineitem_scan_2', query_plan))
    lineitem_scan_2_project = query_plan.add_operator(
        tpch_q14.project_partkey_extendedprice_discount_operator_def(
            'lineitem_scan_2_project', query_plan))
    part_lineitem_join_2 = query_plan.add_operator(
        tpch_q14.join_part_lineitem_operator_def('part_lineitem_join_2',
                                                 query_plan))
    aggregate = query_plan.add_operator(
        tpch_q14.aggregate_promo_revenue_operator_def('aggregate', query_plan))
    aggregate_project = query_plan.add_operator(
        tpch_q14.project_promo_revenue_operator_def('aggregate_project',
                                                    query_plan))
    collate = query_plan.add_operator(
        tpch_q14.collate_operator_def('collate', query_plan))

    part_scan_1.connect(part_scan_1_project)
    part_scan_1_project.connect(part_bloom_create)
    part_bloom_create.connect(lineitem_scan_1)
    part_lineitem_join_1.connect_left_producer(part_scan_1_project)
    lineitem_scan_1.connect(lineitem_scan_1_project)
    part_lineitem_join_1.connect_right_producer(lineitem_scan_1_project)
    part_lineitem_join_1.connect(join_bloom_create)
    join_bloom_create.connect(part_scan_2)
    join_bloom_create.connect(lineitem_scan_2)
    part_scan_2.connect(part_scan_2_project)
    part_lineitem_join_2.connect_left_producer(part_scan_2_project)
    lineitem_scan_2.connect(lineitem_scan_2_project)
    part_lineitem_join_2.connect_right_producer(lineitem_scan_2_project)
    part_lineitem_join_2.connect(aggregate)
    aggregate.connect(aggregate_project)
    aggregate_project.connect(collate)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"),
                           gen_test_id())

    # Start the query
    query_plan.execute()

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    field_names = ['promo_revenue']

    assert len(tuples) == 1 + 1

    assert tuples[0] == field_names

    # NOTE: This result has been verified with the equivalent data and query on PostgreSQL
    assert tuples[1] == [15.090116526324298]
Пример #9
0
def run(sort_field, k, parallel, use_pandas, sort_order, buffer_size, table_parts, path, format_):
    """
    Executes the baseline topk query by scanning a table and keeping track of the max/min records in a heap
    :return:
    """

    secure = False
    use_native = False
    print('')
    print("Top K Benchmark, Filtered. Sort Field: {}, Order: {}".format(sort_field, sort_order))
    print("----------------------")

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)
    
    # Scan
    scan = map(lambda p: 
               query_plan.add_operator(
                    SQLTableScan("{}/topk_data_{}.csv".format(path, p),
                        "select * from S3Object;", format_, use_pandas, secure, use_native,
                        'scan_{}'.format(p), query_plan,
                        False)),
               range(0, table_parts))
  
    # Project
    def project_fn(df):
        df.columns = ['F0', 'F1', 'F2']
        df[ [sort_field] ] = df[ [sort_field] ].astype(np.float)
        return df
   
    project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)] 
    
    project = map(lambda p: 
                  query_plan.add_operator( 
                      Project(project_exprs, 'project_{}'.format(p), query_plan, False, project_fn)),
                  range(0, table_parts))

    # TopK
    sort_expr = SortExpression(sort_field, 'float', sort_order)
    topk = map(lambda p: 
               query_plan.add_operator(
                    Top(k, sort_expr, use_pandas, 'topk_{}'.format(p), query_plan, False)),
               range(0, table_parts))

    # TopK reduce
    topk_reduce = query_plan.add_operator(
                    Top(k, sort_expr, use_pandas, 'topk_reduce', query_plan, False)) 

    collate = query_plan.add_operator(
        Collate('collate', query_plan, False))

    #profile_path = '../benchmark-output/topk/'
    #scan[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_scan_0" + ".prof"))
    #project[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_project_0" + ".prof"))
    #groupby[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_groupby_0" + ".prof"))
    #groupby_reduce.set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_groupby_reduce" + ".prof"))
    #collate.set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_collate" + ".prof"))

    map(lambda (p, o): o.connect(project[p]), enumerate(scan))
    map(lambda (p, o): o.connect(topk[p]), enumerate(project))
    map(lambda (p, o): o.connect(topk_reduce), enumerate(topk))
    topk_reduce.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("table parts: {}".format(table_parts))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(table_parts))

    # Start the query
    query_plan.execute()
    print('Done')
    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()
Пример #10
0
def run(parallel, use_pandas, buffer_size, lineitem_parts, part_parts):
    """

    :return: None
    """

    print('')
    print("TPCH Q14 Baseline Join")
    print("----------------------")

    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Query plan
    # This date is chosen because it triggers the filter to filter out 1 of the rows in the root data set.
    date = '1996-03-13'
    min_shipped_date = datetime.strptime(date, '%Y-%m-%d')
    max_shipped_date = datetime.strptime(date, '%Y-%m-%d') + timedelta(days=30)

    lineitem_scan = map(
        lambda p: query_plan.add_operator(
            tpch_q14.sql_scan_lineitem_extra_filtered_operator_def(
                lineitem_parts != 1, p, use_pandas, 'lineitem_scan' + '_' +
                str(p), query_plan)), range(0, lineitem_parts))

    lineitem_project = map(
        lambda p: query_plan.add_operator(
            tpch_q14.
            project_partkey_extendedprice_discount_shipdate_operator_def(
                'lineitem_project' + '_' + str(p), query_plan)),
        range(0, lineitem_parts))

    part_scan = map(
        lambda p: query_plan.add_operator(
            tpch_q14.sql_scan_part_operator_def(part_parts != 1, p, part_parts,
                                                use_pandas, 'part_scan' + '_' +
                                                str(p), query_plan)),
        range(0, part_parts))

    part_project = map(
        lambda p: query_plan.add_operator(
            tpch_q14.project_partkey_brand_type_operator_def(
                'part_project' + '_' + str(p), query_plan)),
        range(0, part_parts))

    lineitem_filter = map(
        lambda p: query_plan.add_operator(
            tpch_q14.filter_shipdate_operator_def(
                min_shipped_date, max_shipped_date, 'lineitem_filter' + '_' +
                str(p), query_plan)), range(0, lineitem_parts))

    part_filter = map(
        lambda p: query_plan.add_operator(
            tpch_q14.filter_brand12_operator_def('part_filter' + '_' + str(p),
                                                 query_plan)),
        range(0, part_parts))

    join_build = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild('p_partkey', 'join_build' + '_' + str(p), query_plan,
                          False)), range(0, part_parts))

    join_probe = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'),
                          'join_probe' + '_' + str(p), query_plan, False)),
        range(0, part_parts))

    part_aggregate = map(
        lambda p: query_plan.add_operator(
            tpch_q14.aggregate_promo_revenue_operator_def(
                'part_aggregate' + '_' + str(p), query_plan)),
        range(0, part_parts))

    aggregate_reduce = query_plan.add_operator(
        Aggregate([
            AggregateExpression(AggregateExpression.SUM,
                                lambda t: float(t['_0'])),
            AggregateExpression(AggregateExpression.SUM,
                                lambda t: float(t['_1']))
        ], 'aggregate_reduce', query_plan, False))

    aggregate_project = query_plan.add_operator(
        tpch_q14.project_promo_revenue_operator_def('aggregate_project',
                                                    query_plan))

    collate = query_plan.add_operator(
        tpch_q14.collate_operator_def('collate', query_plan))

    # Connect the operators
    map(lambda (p, o): o.connect(lineitem_project[p]),
        enumerate(lineitem_scan))
    map(lambda (p, o): o.connect(lineitem_filter[p]),
        enumerate(lineitem_project))
    map(lambda (p, o): o.connect(part_project[p]), enumerate(part_scan))
    map(lambda (p, o): o.connect(part_filter[p]), enumerate(part_project))
    map(lambda (p, o): o.connect(join_build[p]), enumerate(part_filter))
    map(
        lambda (p, o): map(lambda (bp, bo): o.connect_build_producer(bo),
                           enumerate(join_build)), enumerate(join_probe))
    map(lambda (p, o): join_probe[p % part_parts].connect_tuple_producer(o),
        enumerate(lineitem_filter))
    map(lambda (p, o): o.connect(part_aggregate[p]), enumerate(join_probe))
    map(lambda (p, o): o.connect(aggregate_reduce), enumerate(part_aggregate))
    aggregate_reduce.connect(aggregate_project)
    aggregate_project.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("lineitem parts: {}".format(lineitem_parts))
    print("part_parts: {}".format(part_parts))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"),
                           gen_test_id())

    # Start the query
    query_plan.execute()

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    field_names = ['promo_revenue']

    assert len(tuples) == 1 + 1

    assert tuples[0] == field_names

    # NOTE: This result has been verified with the equivalent data and query on PostgreSQL
    assert round(float(tuples[1][0]), 10) == 33.4262326420
Пример #11
0
def run(group_fields, agg_fields, parallel, use_pandas, buffer_size,
        table_parts, files, format_):
    """
    
    :return: None
    """

    secure = False
    use_native = False
    print('')
    print("Groupby Benchmark, Baseline. Group Fields: {} Aggregate Fields: {}".
          format(group_fields, agg_fields))
    print("----------------------")

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Scan
    scan = map(
        lambda p: query_plan.add_operator(
            SQLTableScan(
                files.format(p), "select {} from S3Object;".format(','.join(
                    group_fields + agg_fields)), format_, use_pandas, secure,
                use_native, 'scan_{}'.format(p), query_plan, False)),
        range(0, table_parts))

    # Project
    def project_fn(df):
        df.columns = group_fields + agg_fields
        return df

    project_exprs = [ProjectExpression(lambda t_: t_['_{}'.format(n)], v) for n, v in enumerate(group_fields)] \
                  + [ProjectExpression(lambda t_: t_['_{}'.format(n + len(group_fields))], v) for n, v in enumerate(agg_fields)]

    project = map(
        lambda p: query_plan.add_operator(
            Project(project_exprs, 'project_{}'.format(p), query_plan, False,
                    project_fn)), range(0, table_parts))

    # Groupby
    def groupby_fn(df):
        df[agg_fields] = df[agg_fields].astype(np.float)
        grouped = df.groupby(group_fields)
        agg_df = pd.DataFrame(
            {f: grouped[f].sum()
             for n, f in enumerate(agg_fields)})
        return agg_df.reset_index()

    groupby = map(
        lambda p: query_plan.add_operator(
            Group(group_fields, [], 'groupby_{}'.format(p), query_plan, False,
                  groupby_fn)), range(0, table_parts))

    groupby_reduce = query_plan.add_operator(
        Group(group_fields, [], 'groupby_reduce', query_plan, False,
              groupby_fn))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    map(lambda (p, o): o.connect(project[p]), enumerate(scan))
    map(lambda (p, o): o.connect(groupby[p]), enumerate(project))
    map(lambda (p, o): o.connect(groupby_reduce), enumerate(groupby))
    groupby_reduce.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("table parts: {}".format(table_parts))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"),
                           gen_test_id() + "-" + str(table_parts))

    # Start the query
    query_plan.execute()
    print('Done')
    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()
Пример #12
0
def run(parallel, use_pandas, buffer_size, lineitem_parts, part_parts, sharded):
    """
    :return: None
    """

    print('')
    print("TPCH Q19 Baseline Join")
    print("----------------------")

    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Define the operators
    lineitem_scan = map(lambda p:
                        query_plan.add_operator(
                            tpch_q19.sql_scan_lineitem_select_all_where_partkey_op(sharded,
                                                                                   p,
                                                                                   lineitem_parts,
                                                                                   use_pandas,
                                                                                   'lineitem_scan' + '_' + str(p),
                                                                                   query_plan)),
                        range(0, lineitem_parts))

    part_scan = map(lambda p:
                    query_plan.add_operator(
                        tpch_q19.sql_scan_part_select_all_where_partkey_op(sharded,
                                                                           p,
                                                                           part_parts,
                                                                           use_pandas,
                                                                           'part_scan' + '_' + str(p), query_plan)),
                    range(0, part_parts))

    lineitem_project = map(lambda p:
                           query_plan.add_operator(
                               tpch_q19.project_partkey_quantity_extendedprice_discount_shipinstruct_shipmode_op(
                                   'lineitem_project' + '_' + str(p), query_plan)),
                           range(0, lineitem_parts))

    lineitem_map = map(lambda p:
                       query_plan.add_operator(Map('l_partkey', 'lineitem_map' + '_' + str(p), query_plan, True)),
                       range(0, part_parts))

    part_project = map(lambda p:
                       query_plan.add_operator(
                           tpch_q19.project_partkey_brand_size_container_op('part_project' + '_' + str(p),
                                                                            query_plan)),
                       range(0, part_parts))

    part_map = map(lambda p:
                   query_plan.add_operator(Map('p_partkey', 'part_map' + '_' + str(p), query_plan, True)),
                   range(0, part_parts))

    # lineitem_part_join = map(lambda p:
    #                          query_plan.add_operator(tpch_q19.join_op(query_plan)),
    #                          range(0, part_parts))

    lineitem_part_join_build = map(lambda p:
                                   query_plan.add_operator(
                                       HashJoinBuild('p_partkey',
                                                     'lineitem_partjoin_build' + '_' + str(p), query_plan,
                                                     False)),
                                   range(0, part_parts))

    lineitem_part_join_probe = map(lambda p:
                                   query_plan.add_operator(
                                       HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'),
                                                     'lineitem_part_join_probe' + '_' + str(p),
                                                     query_plan, True)),
                                   range(0, part_parts))

    filter_op = map(lambda p:
                    query_plan.add_operator(tpch_q19.filter_def('filter_op' + '_' + str(p), query_plan)),
                    range(0, part_parts))
    aggregate = map(lambda p:
                    query_plan.add_operator(tpch_q19.aggregate_def('aggregate' + '_' + str(p), query_plan)),
                    range(0, part_parts))

    aggregate_reduce = query_plan.add_operator(
        Aggregate(
            [
                AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0']))
            ],
            'aggregate_reduce',
            query_plan,
            False))

    aggregate_project = query_plan.add_operator(
        tpch_q19.aggregate_project_def('aggregate_project', query_plan))
    collate = query_plan.add_operator(tpch_q19.collate_op('collate', query_plan))

    # Connect the operators
    # lineitem_scan.connect(lineitem_project)
    connect_many_to_many(lineitem_scan, lineitem_project)
    connect_all_to_all(lineitem_project, lineitem_map)

    # part_scan.connect(part_project)
    connect_many_to_many(part_scan, part_project)
    connect_many_to_many(part_project, part_map)

    # lineitem_part_join.connect_left_producer(lineitem_project)
    connect_all_to_all(part_map, lineitem_part_join_build)

    # lineitem_part_join.connect_right_producer(part_project)
    connect_many_to_many(lineitem_part_join_build, lineitem_part_join_probe)
    connect_many_to_many(lineitem_map, lineitem_part_join_probe)

    # lineitem_part_join.connect(filter_op)
    connect_many_to_many(lineitem_part_join_probe, filter_op)

    # filter_op.connect(aggregate)
    connect_many_to_many(filter_op, aggregate)

    # aggregate.connect(aggregate_project)
    connect_many_to_one(aggregate, aggregate_reduce)
    connect_one_to_one(aggregate_reduce, aggregate_project)

    # aggregate_project.connect(collate)
    connect_one_to_one(aggregate_project, collate)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    field_names = ['revenue']

    assert len(tuples) == 1 + 1

    assert tuples[0] == field_names

    # NOTE: This result has been verified with the equivalent data and query on PostgreSQL
    numpy.testing.assert_almost_equal(tuples[1], 92403.0667)
Пример #13
0
def test_operators():

    system = WorkerSystem()

    query_plan = QueryPlan(system, is_async=True, buffer_size=0)

    # Query plan
    ts = query_plan.add_operator(
        SQLTableScan('nation.csv', 'select * from S3Object '
                     'limit 3;', True, False, False, 'scan', query_plan, True))

    p = query_plan.add_operator(
        Project([
            ProjectExpression(lambda t_: t_['_0'], 'n_nationkey'),
            ProjectExpression(lambda t_: t_['_1'], 'n_name'),
            ProjectExpression(lambda t_: t_['_2'], 'n_regionkey'),
            ProjectExpression(lambda t_: t_['_3'], 'n_comment')
        ], 'project', query_plan, True))

    c = query_plan.add_operator(Collate('collate', query_plan, True))

    ts.connect(p)
    p.connect(c)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"),
                           gen_test_id())

    # Start the query
    query_plan.execute()

    tuples = c.tuples()

    c.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    # Assert the results
    # num_rows = 0
    # for t in c.tuples():
    #     num_rows += 1
    #     print("{}:{}".format(num_rows, t))

    field_names = ['n_nationkey', 'n_name', 'n_regionkey', 'n_comment']

    assert len(tuples) == 3 + 1

    assert tuples[0] == field_names

    assert tuples[1] == [
        '0', 'ALGERIA', '0',
        ' haggle. carefully final deposits detect slyly agai'
    ]
    assert tuples[2] == [
        '1', 'ARGENTINA', '1',
        'al foxes promise slyly according to the regular accounts. bold requests alon'
    ]
    assert tuples[3] == [
        '2', 'BRAZIL', '1',
        'y alongside of the pending deposits. carefully special packages '
        'are about the ironic forges. slyly special '
    ]
Пример #14
0
def run(parallel, use_pandas, buffer_size, table_parts, lower, upper, sf):
    secure = False
    use_native = False
    print('')
    print("Indexing Benchmark")
    print("------------------")

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    def fn(df):
        # df.columns = ['l_orderkey', 'l_partkey', 'l_suppkey', 'l_linenumber', 'l_quantity', 'l_extendedprice', 'l_discount', 'l_tax', 'l_returnflag', 'l_linestatus', 'l_shipdate', 'l_commitdate', 'l_receiptdate', 'l_shipinstruct', 'l_shipmode', 'l_comment']
        # df[ ['l_extendedprice'] ] = df[ ['l_extendedprice'] ].astype(np.float)
        # criterion = (df['l_extendedprice'] >= lower) & (
        #        df['l_extendedprice'] <= upper)
        df[['_5']] = df[['_5']].astype(np.float)
        criterion = (df['_5'] >= lower) & (df['_5'] <= upper)
        return df[criterion]

    # scan the file
    scan_filter = map(
        lambda p: query_plan.add_operator(
            TableScan(get_file_key('lineitem', True, p, sf=sf),
                      use_pandas,
                      secure,
                      use_native,
                      'scan_{}'.format(p),
                      query_plan,
                      False,
                      fn=fn)), range(0, table_parts))

    # aggregation
    def agg_fun(df):
        return pd.DataFrame({'count': [len(df)]})

    aggregate = query_plan.add_operator(
        Aggregate([], True, 'agg', query_plan, False, agg_fun))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    map(lambda (p, o): o.connect(aggregate), enumerate(scan_filter))
    aggregate.connect(collate)

    # scan_filter[0].set_profiled(True, os.path.join(ROOT_DIR, "../benchmark-output/", gen_test_id() + "_scan_0" + ".prof"))
    # aggregate.set_profiled(True, os.path.join(ROOT_DIR, "../benchmark-output/", gen_test_id() + "_aggregate" + ".prof"))
    # collate.set_profiled(True, os.path.join(ROOT_DIR, "../benchmark-output/", gen_test_id() + "_collate" + ".prof"))

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("table parts: {}".format(table_parts))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"),
                           gen_test_id() + "-" + str(table_parts))

    # Start the query
    query_plan.execute()
    print('Done')
    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()
Пример #15
0
def run(parallel, use_pandas, secure, use_native, buffer_size, lineitem_parts, part_parts, lineitem_sharded,
        part_sharded, other_parts, sf, expected_result, format_):
    """The baseline tst uses nested loop joins with no projection and no filtering pushed down to s3.

    This works by:

    1. Scanning part and filtering on brand and container
    2. It then scans lineitem
    3. It then joins the two tables (essentially filtering out lineitems that dont include parts that we filtered out in
       step 1)
    4. It then computes the average of l_quantity from the joined table and groups the results by partkey
    5. It then joins these computed averages with the joined table in step 3
    6. It then filters out any rows where l_quantity is less than the computed average

    TODO: There are few ways this can be done, the above is just one.

    :return: None
    """

    print('')
    print("TPCH Q17 Baseline Join")
    print("----------------------")

    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Query plan
    part_scan = map(lambda p:
                    query_plan.add_operator(
                        tpch_q17.sql_scan_part_select_all_op(
                            part_sharded,
                            p,
                            part_parts,
                            use_pandas,
                            secure,
                            use_native,
                            'part_scan' + '_' + str(p),
                            query_plan,
                            sf, format_)),
                    range(0, part_parts))

    lineitem_scan = map(lambda p:
                        query_plan.add_operator(
                            tpch_q17.sql_scan_lineitem_select_all_op(
                                lineitem_sharded,
                                p,
                                lineitem_parts,
                                use_pandas,
                                secure,
                                use_native,
                                'lineitem_scan' + '_' + str(p),
                                query_plan,
                                sf, format_)),
                        range(0, lineitem_parts))

    part_project = map(lambda p:
                       query_plan.add_operator(
                           tpch_q17.project_partkey_brand_container_op(
                               'part_project' + '_' + str(p),
                               query_plan)),
                       range(0, part_parts))

    part_filter = map(lambda p:
                      query_plan.add_operator(tpch_q17.filter_brand_container_op(
                          'part_filter' + '_' + str(p),
                          query_plan)),
                      range(0, part_parts))

    part_map = map(lambda p:
                   query_plan.add_operator(Map('p_partkey', 'part_map' + '_' + str(p), query_plan, False)),
                   range(0, part_parts))

    lineitem_project = map(lambda p:
                           query_plan.add_operator(
                               tpch_q17.project_lineitem_orderkey_partkey_quantity_extendedprice_op(
                                   'lineitem_project' + '_' + str(p), query_plan)),
                           range(0, lineitem_parts))

    lineitem_map = map(lambda p:
                       query_plan.add_operator(Map('l_partkey', 'lineitem_map' + '_' + str(p), query_plan, False)),
                       range(0, lineitem_parts))

    part_lineitem_join_build = map(lambda p:
                                   query_plan.add_operator(
                                       HashJoinBuild('p_partkey',
                                                     'part_lineitem_join_build' + '_' + str(p), query_plan,
                                                     False)),
                                   range(0, other_parts))

    part_lineitem_join_probe = map(lambda p:
                                   query_plan.add_operator(
                                       HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'),
                                                     'part_lineitem_join_probe' + '_' + str(p),
                                                     query_plan, False)),
                                   range(0, other_parts))

    lineitem_part_avg_group = map(lambda p:
                                  query_plan.add_operator(
                                      tpch_q17.group_partkey_avg_quantity_5_op('lineitem_part_avg_group' + '_' + str(p),
                                                                               query_plan)),
                                  range(0, other_parts))

    lineitem_part_avg_group_project = map(lambda p:
                                          query_plan.add_operator(
                                              tpch_q17.project_partkey_avg_quantity_op(
                                                  'lineitem_part_avg_group_project' + '_' + str(p), query_plan)),
                                          range(0, other_parts))

    part_lineitem_join_avg_group_join_build = \
        map(lambda p:
            query_plan.add_operator(
                HashJoinBuild('l_partkey',
                              'part_lineitem_join_avg_group_join_build' + '_' + str(p),
                              query_plan,
                              False)),
            range(0, other_parts))

    part_lineitem_join_avg_group_join_probe = \
        map(lambda p:
            query_plan.add_operator(
                HashJoinProbe(JoinExpression('l_partkey', 'l_partkey'),
                              'part_lineitem_join_avg_group_join_probe' + '_' + str(p),
                              query_plan,
                              False)),
            range(0, other_parts))

    lineitem_filter = map(lambda p:
                          query_plan.add_operator(
                              tpch_q17.filter_lineitem_quantity_op('lineitem_filter' + '_' + str(p), query_plan)),
                          range(0, other_parts))

    extendedprice_sum_aggregate = map(lambda p:
                                      query_plan.add_operator(
                                          tpch_q17.aggregate_sum_extendedprice_op(
                                              use_pandas,
                                              'extendedprice_sum_aggregate' + '_' + str(p),
                                              query_plan)),
                                      range(0, other_parts))

    def aggregate_reduce_fn(df):
        sum1_ = df['_0'].astype(np.float).sum()
        return pd.DataFrame({'_0': [sum1_]})

    aggregate_reduce = query_plan.add_operator(
        Aggregate(
            [
                AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0']))
            ],
            use_pandas,
            'aggregate_reduce',
            query_plan,
            False, aggregate_reduce_fn))

    extendedprice_sum_aggregate_project = query_plan.add_operator(
        tpch_q17.project_avg_yearly_op('extendedprice_sum_aggregate_project', query_plan))

    collate = query_plan.add_operator(tpch_q17.collate_op('collate', query_plan))

    # Inline what we can
    map(lambda o: o.set_async(False), lineitem_project)
    map(lambda o: o.set_async(False), part_project)
    map(lambda o: o.set_async(False), lineitem_filter)
    map(lambda o: o.set_async(False), part_filter)
    map(lambda o: o.set_async(False), lineitem_map)
    map(lambda o: o.set_async(False), part_map)
    map(lambda o: o.set_async(False), lineitem_part_avg_group)
    map(lambda o: o.set_async(False), lineitem_part_avg_group_project)
    map(lambda o: o.set_async(False), extendedprice_sum_aggregate)
    extendedprice_sum_aggregate_project.set_async(False)

    # Connect the operators
    # part_scan.connect(part_project)
    map(lambda (p, o): o.connect(part_project[p]), enumerate(part_scan))
    map(lambda (p, o): o.connect(part_filter[p]), enumerate(part_project))
    map(lambda (p, o): o.connect(part_map[p]), enumerate(part_filter))

    # lineitem_scan.connect(lineitem_project)
    map(lambda (p, o): o.connect(lineitem_project[p]), enumerate(lineitem_scan))
    map(lambda (p, o): o.connect(lineitem_map[p]), enumerate(lineitem_project))

    # part_lineitem_join.connect_left_producer(part_project)
    # part_lineitem_join.connect_right_producer(lineitem_project)
    # part_lineitem_join.connect(lineitem_part_avg_group)
    map(lambda (p1, o1): map(lambda (p2, o2): o1.connect(o2), enumerate(part_lineitem_join_build)), enumerate(part_map))
    map(lambda (p, o): part_lineitem_join_probe[p].connect_build_producer(o), enumerate(part_lineitem_join_build))
    map(lambda (p1, o1): map(lambda (p2, o2): o2.connect_tuple_producer(o1), enumerate(part_lineitem_join_probe)),
        enumerate(lineitem_map))
    map(lambda (p, o): o.connect(lineitem_part_avg_group[p]), enumerate(part_lineitem_join_probe))

    # map(lambda (p, o): o.map(Mapper('_1', 1, part_lineitem_join_probe)), enumerate(lineitem_scan))

    # lineitem_part_avg_group.connect(lineitem_part_avg_group_project)
    map(lambda (p, o): o.connect(lineitem_part_avg_group_project[p]), enumerate(lineitem_part_avg_group))

    # part_lineitem_join_avg_group_join.connect_left_producer(lineitem_part_avg_group_project)
    # part_lineitem_join_avg_group_join.connect_right_producer(part_lineitem_join)
    # part_lineitem_join_avg_group_join.connect(lineitem_filter)

    map(lambda (p, o): o.connect(part_lineitem_join_avg_group_join_build[p]),
        enumerate(lineitem_part_avg_group_project))

    # map(lambda (p, o): map(lambda (bp, bo): o.connect_build_producer(bo),
    #                        enumerate(part_lineitem_join_avg_group_join_build)),
    #     enumerate(part_lineitem_join_avg_group_join_probe))
    map(lambda (p, o): part_lineitem_join_avg_group_join_probe[p].connect_build_producer(o),
        enumerate(part_lineitem_join_avg_group_join_build))
    map(lambda (p, o): part_lineitem_join_avg_group_join_probe[p % part_parts].connect_tuple_producer(o),
        enumerate(part_lineitem_join_probe))
    map(lambda (p, o): o.connect(lineitem_filter[p]), enumerate(part_lineitem_join_avg_group_join_probe))

    # lineitem_filter.connect(extendedprice_sum_aggregate)
    map(lambda (p, o): o.connect(extendedprice_sum_aggregate[p]), enumerate(lineitem_filter))
    # extendedprice_sum_aggregate.connect(extendedprice_sum_aggregate_project)
    # extendedprice_sum_aggregate_project.connect(collate)
    map(lambda (p, o): o.connect(aggregate_reduce), enumerate(extendedprice_sum_aggregate))
    aggregate_reduce.connect(extendedprice_sum_aggregate_project)
    extendedprice_sum_aggregate_project.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print('secure: {}'.format(secure))
    print('use_native: {}'.format(use_native))
    print("lineitem parts: {}".format(lineitem_parts))
    print("part_parts: {}".format(part_parts))
    print("lineitem_sharded: {}".format(lineitem_sharded))
    print("part_sharded: {}".format(part_sharded))
    print("other_parts: {}".format(other_parts))
    print("format: {}".format(format_))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    field_names = ['avg_yearly']

    assert len(tuples) == 1 + 1

    assert tuples[0] == field_names

    # NOTE: This result has been verified with the equivalent data and query on PostgreSQL
    if s3filter.util.constants.TPCH_SF == 10:
        assert round(float(tuples[1][0]),
                     10) == 372414.2899999995  # TODO: This isn't correct but haven't checked tpch17 on 10 sf yet
    elif s3filter.util.constants.TPCH_SF == 1:
        numpy.testing.assert_approx_equal(float(tuples[1][0]), expected_result)
Пример #16
0
def run(group_fields, agg_fields, parallel, use_pandas, buffer_size,
        table_parts, files, format_):
    """
    
    :return: None
    """

    secure = False
    use_native = False
    print('')
    print("Groupby Benchmark, Baseline. Group Fields: {} Aggregate Fields: {}".
          format(group_fields, agg_fields))
    print("----------------------")

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    ##########################
    ## Phase 1. Find out group names
    ##########################
    # Scan
    scan_phase1 = map(
        lambda p: query_plan.add_operator(
            SQLTableScan(
                files.format(p), "select {} from S3Object;".format(','.join(
                    group_fields)), format_, use_pandas, secure, use_native,
                'scan_phase1_{}'.format(p), query_plan, False)),
        range(0, table_parts))

    # Project
    def project_fn(df):
        df.columns = group_fields
        return df

    project_exprs = [
        ProjectExpression(lambda t_: t_['_{}'.format(n)], v)
        for n, v in enumerate(group_fields)
    ]

    project = map(
        lambda p: query_plan.add_operator(
            Project(project_exprs, 'project_{}'.format(p), query_plan, False,
                    project_fn)), range(0, table_parts))

    # Groupby
    def groupby_fn(df):
        return df.drop_duplicates()

    groupby = map(
        lambda p: query_plan.add_operator(
            Group(group_fields, [], 'groupby_{}'.format(p), query_plan, False,
                  groupby_fn)), range(0, table_parts))

    groupby_reduce = query_plan.add_operator(
        Group(group_fields, [], 'groupby_reduce', query_plan, False,
              groupby_fn))

    # GroupbyFilterBuild
    agg_exprs = [('SUM', 'CAST({} AS float)'.format(agg_field))
                 for agg_field in agg_fields]

    groupby_filter_build = query_plan.add_operator(
        GroupbyFilterBuild(group_fields, agg_fields, agg_exprs,
                           'groupby_filter_build', query_plan, False))

    ##########################
    ## Phase 2. Perform aggregation at S3.
    ##########################
    # Scan
    scan_phase2 = map(
        lambda p: query_plan.add_operator(
            SQLTableScan(
                'groupby_benchmark/shards-10GB/groupby_data_{}.csv'.format(
                    p), "", format_, use_pandas, secure, use_native,
                'scan_phase2_{}'.format(p), query_plan, False)),
        range(0, table_parts))

    groupby_decoder = map(
        lambda p: query_plan.add_operator(
            GroupbyDecoder(agg_fields, 'groupby_decoder_{}'.format(p),
                           query_plan, False)), range(0, table_parts))

    def groupby_fn_phase2(df):
        #print df
        df[agg_fields] = df[agg_fields].astype(np.float)
        grouped = df.groupby(group_fields)
        agg_df = pd.DataFrame(
            {f: grouped[f].sum()
             for n, f in enumerate(agg_fields)})
        return agg_df.reset_index()

    groupby_reduce_phase2 = query_plan.add_operator(
        Group(group_fields, [], 'groupby_reduce_phase2', query_plan, False,
              groupby_fn_phase2))

    #scan_phase1[0].set_profiled(True, os.path.join(ROOT_DIR, "../benchmark-output/", gen_test_id() + "_scan_phase1_0" + ".prof"))
    #scan_phase2[0].set_profiled(True, os.path.join(ROOT_DIR, "../benchmark-output/", gen_test_id() + "_scan_phase2_0" + ".prof"))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    # phase 1
    map(lambda (p, o): o.connect(project[p]), enumerate(scan_phase1))
    map(lambda (p, o): o.connect(groupby[p]), enumerate(project))
    map(lambda (p, o): o.connect(groupby_reduce), enumerate(groupby))
    groupby_reduce.connect(groupby_filter_build)

    # phase 2
    map(lambda (p, o): groupby_filter_build.connect(o, 0),
        enumerate(scan_phase2))
    map(lambda (p, o): groupby_filter_build.connect(o, 1),
        enumerate(groupby_decoder))
    map(lambda (p, o): o.connect(groupby_decoder[p]), enumerate(scan_phase2))
    map(lambda (p, o): o.connect(groupby_reduce_phase2),
        enumerate(groupby_decoder))
    # map(lambda (p, o): groupby_reduce.connect(o), enumerate(groupby_decoder))

    groupby_reduce_phase2.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("table parts: {}".format(table_parts))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"),
                           gen_test_id() + "-" + str(table_parts))

    # Start the query
    query_plan.execute()
    print('Done')
    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()
Пример #17
0
def run(sf, parallel, use_pandas, secure, use_native, buffer_sizje,
        lineitem_parts, sharded, format_):
    """

    :return: None
    """

    print('')
    print("TPCH Q1 Baseline Group By")
    print("----------------------")

    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Query plan
    lineitem_scan = map(
        lambda p: query_plan.add_operator(
            tpch_q1.sql_scan_lineitem_operator_def(
                sharded, p, sf, use_pandas, secure, use_native,
                'lineitem_scan' + '_' + str(p), query_plan, format_)),
        range(0, lineitem_parts))
    '''
    lineitem_project = map(lambda p:
                           query_plan.add_operator(
                               tpch_q1.project_lineitem_operator_def(
                                   'lineitem_project' + '_' + str(p),
                                   query_plan)),
                           range(0, lineitem_parts))

    date = '1998-12-01'
    max_shipped_date = datetime.strptime(date, '%Y-%m-%d') - timedelta(days=117)

    lineitem_filter = map(lambda p:
                          query_plan.add_operator(
                              tpch_q1.filter_shipdate_operator_def(
                                  max_shipped_date,
                                  'lineitem_filter' + '_' + str(p),
                                  query_plan)),
                          range(0, lineitem_parts))

    profile_file_name = os.path.join(ROOT_DIR, "../tests-output/" + gen_test_id() + ".prof")

    groupby = map(lambda p:
                  query_plan.add_operator(
                      tpch_q1.groupby_returnflag_linestatus_operator_def(
                          'groupby' + '_' + str(p),
                          query_plan)),
                  range(0, lineitem_parts))

    # groupby[0].set_profiled(True, profile_file_name)

    groupby_reduce = query_plan.add_operator(
        tpch_q1.groupby_reduce_returnflag_linestatus_operator_def(
            'groupby_reduce',
            query_plan, use_pandas))
    # lineitem_scan[0].set_profiled(True,
    #                               os.path.join(ROOT_DIR, "../benchmark-output/", gen_test_id() + "_scan_0" + ".prof"))
    # lineitem_project[0].set_profiled(True, os.path.join(ROOT_DIR, "../benchmark-output/",
    #                                                     gen_test_id() + "_project_0" + ".prof"))
    # groupby[0].set_profiled(True,
    #                         os.path.join(ROOT_DIR, "../benchmark-output/", gen_test_id() + "_groupby_0" + ".prof"))
    # groupby_reduce.set_profiled(True, os.path.join(ROOT_DIR, "../benchmark-output/",
    #                                                gen_test_id() + "_groupby_reduce" + ".prof"))

    collate = query_plan.add_operator(
        Collate('collate', query_plan, False))

    map(lambda o: o.set_async(False), lineitem_project)
    map(lambda o: o.set_async(False), lineitem_filter)
    # map(lambda o: o.set_async(False), groupby)

    map(lambda (p, o): o.connect(lineitem_project[p]), enumerate(lineitem_scan))
    map(lambda (p, o): o.connect(lineitem_filter[p]), enumerate(lineitem_project))
    map(lambda (p, o): o.connect(groupby[p]), enumerate(lineitem_filter))
    map(lambda (p, o): o.connect(groupby_reduce), enumerate(groupby))
    groupby_reduce.connect(collate)
    '''
    groupby = map(
        lambda p: query_plan.add_operator(
            tpch_q1.groupby_returnflag_linestatus_operator_def(
                'groupby' + '_' + str(p), query_plan)),
        range(0, lineitem_parts))
    collate = query_plan.add_operator(Collate('collate', query_plan, False))
    map(lambda (o): o.connect(collate), lineitem_scan)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("lineitem parts: {}".format(lineitem_parts))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"),
                           gen_test_id() + "-" + str(lineitem_parts))

    # Start the query
    query_plan.execute()
    print('Done')
    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()
Пример #18
0
def run(parallel, use_pandas, secure, use_native, buffer_size, lineitem_parts,
        part_parts, lineitem_sharded, part_sharded, other_parts, sf,
        expected_result, format_):
    """

    :return: None
    """

    print('')
    print("TPCH Q14 Baseline Join")
    print("----------------------")

    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Query plan
    # DATE is the first day of a month randomly selected from a random year within [1993 .. 1997].
    date = '1993-01-01'
    min_shipped_date = datetime.strptime(date, '%Y-%m-%d')
    max_shipped_date = datetime.strptime(date, '%Y-%m-%d') + timedelta(days=30)

    lineitem_scan = map(
        lambda p: query_plan.add_operator(
            tpch_q14.sql_scan_lineitem_operator_def(
                lineitem_sharded, p, lineitem_parts, use_pandas, secure,
                use_native, 'lineitem_scan' + '_' + str(
                    p), query_plan, sf, format_)), range(0, lineitem_parts))

    lineitem_project = map(
        lambda p: query_plan.add_operator(
            tpch_q14.
            project_partkey_extendedprice_discount_shipdate_operator_def(
                'lineitem_project' + '_' + str(p), query_plan)),
        range(0, lineitem_parts))

    part_scan = map(
        lambda p: query_plan.add_operator(
            tpch_q14.sql_scan_part_operator_def(
                part_sharded, p, part_parts, use_pandas, secure, use_native,
                'part_scan' + '_' + str(p), query_plan, sf, format_)),
        range(0, part_parts))

    part_project = map(
        lambda p: query_plan.add_operator(
            tpch_q14.project_partkey_brand_type_operator_def(
                'part_project' + '_' + str(p), query_plan)),
        range(0, part_parts))

    lineitem_filter = map(
        lambda p: query_plan.add_operator(
            tpch_q14.filter_shipdate_operator_def(
                min_shipped_date, max_shipped_date, 'lineitem_filter' + '_' +
                str(p), query_plan)), range(0, lineitem_parts))

    lineitem_map = map(
        lambda p: query_plan.add_operator(
            Map('l_partkey', 'lineitem_map' + '_' + str(p), query_plan, False)
        ), range(0, lineitem_parts))

    part_map = map(
        lambda p: query_plan.add_operator(
            Map('p_partkey', 'part_map' + '_' + str(p), query_plan, False)),
        range(0, part_parts))

    # part_filter = map(lambda p:
    #                   query_plan.add_operator(
    #                       tpch_q14.filter_brand12_operator_def('part_filter' + '_' + str(p), query_plan)),
    #                   range(0, part_parts))

    join_build = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild('p_partkey', 'join_build' + '_' + str(p), query_plan,
                          False)), range(0, other_parts))

    join_probe = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'),
                          'join_probe' + '_' + str(p), query_plan, False)),
        range(0, other_parts))

    part_aggregate = map(
        lambda p: query_plan.add_operator(
            tpch_q14.aggregate_promo_revenue_operator_def(
                'part_aggregate' + '_' + str(p), query_plan)),
        range(0, other_parts))

    def aggregate_reduce_fn(df):
        sum1_ = df['_0'].astype(np.float).sum()
        sum2_ = df['_1'].astype(np.float).sum()
        return pd.DataFrame({'_0': [sum1_], '_1': [sum2_]})

    aggregate_reduce = query_plan.add_operator(
        Aggregate([
            AggregateExpression(AggregateExpression.SUM,
                                lambda t: float(t['_0'])),
            AggregateExpression(AggregateExpression.SUM,
                                lambda t: float(t['_1']))
        ], use_pandas, 'aggregate_reduce', query_plan, False,
                  aggregate_reduce_fn))

    aggregate_project = query_plan.add_operator(
        tpch_q14.project_promo_revenue_operator_def('aggregate_project',
                                                    query_plan))

    collate = query_plan.add_operator(
        tpch_q14.collate_operator_def('collate', query_plan))

    # Inline what we can
    map(lambda o: o.set_async(False), lineitem_project)
    map(lambda o: o.set_async(False), part_project)
    map(lambda o: o.set_async(False), lineitem_filter)
    map(lambda o: o.set_async(False), part_map)
    map(lambda o: o.set_async(False), lineitem_map)
    map(lambda o: o.set_async(False), part_aggregate)
    aggregate_project.set_async(False)

    # Connect the operators
    connect_many_to_many(lineitem_scan, lineitem_project)
    connect_many_to_many(lineitem_project, lineitem_filter)
    connect_many_to_many(part_scan, part_project)
    connect_many_to_many(part_project, part_map)
    connect_all_to_all(part_map, join_build)
    connect_many_to_many(join_build, join_probe)
    connect_many_to_many(lineitem_filter, lineitem_map)
    connect_all_to_all(lineitem_map, join_probe)
    connect_many_to_many(join_probe, part_aggregate)
    connect_many_to_one(part_aggregate, aggregate_reduce)
    connect_one_to_one(aggregate_reduce, aggregate_project)
    connect_one_to_one(aggregate_project, collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print('secure: {}'.format(secure))
    print('use_native: {}'.format(use_native))
    print("lineitem parts: {}".format(lineitem_parts))
    print("part_parts: {}".format(part_parts))
    print("lineitem_sharded: {}".format(lineitem_sharded))
    print("part_sharded: {}".format(part_sharded))
    print("other_parts: {}".format(other_parts))
    print("format: {}".format(format_))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"),
                           gen_test_id() + "-" + str(lineitem_parts))

    # Start the query
    query_plan.execute()

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    field_names = ['promo_revenue']

    assert len(tuples) == 1 + 1

    assert tuples[0] == field_names

    # NOTE: This result has been verified with the equivalent data and query on PostgreSQL
    if s3filter.util.constants.TPCH_SF == 10:
        assert round(float(tuples[1][0]), 10) == 15.4488836202
    elif s3filter.util.constants.TPCH_SF == 1:
        numpy.testing.assert_approx_equal(float(tuples[1][0]), expected_result)
Пример #19
0
def run(parallel, use_pandas, secure, use_native, buffer_size, format_, customer_parts, order_parts, lineitem_parts,
        customer_sharded,
        order_sharded, lineitem_sharded, other_parts, fp_rate, sf, expected_result, customer_filter_sql=None,
        order_filter_sql=None, lineitem_filter_sql=None):
    """

    :return: None
    """

    print('')
    print("TPCH Q3 Bloom Join")
    print("------------------")

    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    customer_scan = map(lambda p:
                        query_plan.add_operator(
                            SQLTableScan(get_file_key('customer', customer_sharded, p, sf, format_),
                                         "select "
                                         "  c_custkey "
                                         "from "
                                         "  S3Object "
                                         "where "
                                         "  c_mktsegment = 'BUILDING' "
                                         "  {} "
                                         "  {} "
                                         .format(
                                             ' and ' + customer_filter_sql if customer_filter_sql is not None else '',
                                             get_sql_suffix('customer', customer_parts, p, customer_sharded,
                                                            add_where=False)),
                                         format_,
                                         use_pandas, secure, use_native,
                                         'customer_scan' + '_{}'.format(p),
                                         query_plan,
                                         False)),
                        range(0, customer_parts))

    def customer_project_fn(df):
        df = df.filter(items=['_0'], axis=1)

        df.rename(columns={'_0': 'c_custkey'},
                  inplace=True)

        return df

    customer_project = map(lambda p:
                           query_plan.add_operator(
                               Project([],
                                       'customer_project' + '_{}'.format(p),
                                       query_plan,
                                       False, customer_project_fn)),
                           range(0, customer_parts))

    customer_bloom_create = query_plan.add_operator(
        BloomCreate('c_custkey', 'customer_bloom_create', query_plan, False,
                    fp_rate))

    customer_map = map(lambda p:
                       query_plan.add_operator(Map('c_custkey', 'customer_map' + '_' + str(p), query_plan, False)),
                       range(0, customer_parts))

    order_scan = map(lambda p:
                     query_plan.add_operator(
                         SQLTableScanBloomUse(get_file_key('orders', order_sharded, p, sf, format_),
                                              "select "
                                              "  o_custkey, o_orderkey, o_orderdate, o_shippriority "
                                              "from "
                                              "  S3Object "
                                              "where "
                                              "  cast(o_orderdate as timestamp) < cast('1995-03-01' as timestamp) "
                                              "  {} "
                                              "  {} "
                                              .format(
                                                  ' and ' + order_filter_sql if order_filter_sql is not None else '',
                                                  get_sql_suffix('orders', order_parts, p, order_sharded,
                                                                 add_where=False)),
                                              'o_custkey',
                                              format_,
                                              use_pandas, secure, use_native,
                                              'order_scan' + '_{}'.format(p),
                                              query_plan,
                                              False)),
                     range(0, order_parts))

    def order_project_fn(df):
        df = df.filter(items=['_0', '_1', '_2', '_3'], axis=1)

        df.rename(columns={'_0': 'o_custkey', '_1': 'o_orderkey', '_2': 'o_orderdate', '_3': 'o_shippriority'},
                  inplace=True)

        return df

    order_project = map(lambda p:
                        query_plan.add_operator(
                            Project([],
                                    'order_project' + '_{}'.format(p),
                                    query_plan,
                                    False, order_project_fn)),
                        range(0, customer_parts))

    order_map_1 = map(lambda p:
                      query_plan.add_operator(Map('o_custkey', 'order_map_1' + '_' + str(p), query_plan, False)),
                      range(0, order_parts))

    customer_order_join_build = map(lambda p:
                                    query_plan.add_operator(
                                        HashJoinBuild('c_custkey',
                                                      'customer_order_join_build' + '_' + str(p), query_plan,
                                                      False)),
                                    range(0, other_parts))

    customer_order_join_probe = map(lambda p:
                                    query_plan.add_operator(
                                        HashJoinProbe(JoinExpression('c_custkey', 'o_custkey'),
                                                      'customer_order_join_probe' + '_' + str(p),
                                                      query_plan, False)),
                                    range(0, other_parts))

    order_bloom_create = query_plan.add_operator(
        BloomCreate('o_orderkey', 'order_bloom_create', query_plan, False,
                    fp_rate))

    lineitem_scan = map(lambda p:
                        query_plan.add_operator(
                            SQLTableScanBloomUse(get_file_key('lineitem', lineitem_sharded, p, sf, format_),
                                                 "select "
                                                 "  l_orderkey, l_extendedprice, l_discount "
                                                 "from "
                                                 "  S3Object "
                                                 "where "
                                                 "  cast(l_shipdate as timestamp) > cast('1995-03-01' as timestamp) "
                                                 "  {} "
                                                 "  {} "
                                                 .format(
                                                     ' and ' + lineitem_filter_sql if lineitem_filter_sql is not None else '',
                                                     get_sql_suffix('lineitem', lineitem_parts, p, lineitem_sharded,
                                                                    add_where=False)),
                                                 'l_orderkey', format_,
                                                 use_pandas, secure, use_native,
                                                 'lineitem_scan' + '_{}'.format(p),
                                                 query_plan,
                                                 False)),
                        range(0, lineitem_parts))

    def lineitem_project_fn(df):
        df = df.filter(items=['_0', '_1', '_2'], axis=1)

        df.rename(columns={'_0': 'l_orderkey', '_1': 'l_extendedprice', '_2': 'l_discount'},
                  inplace=True)

        return df

    lineitem_project = map(lambda p:
                           query_plan.add_operator(
                               Project([],
                                       'lineitem_project' + '_{}'.format(p),
                                       query_plan,
                                       False, lineitem_project_fn)),
                           range(0, lineitem_parts))

    lineitem_map = map(lambda p:
                       query_plan.add_operator(Map('l_orderkey', 'lineitem_map' + '_' + str(p), query_plan, False)),
                       range(0, lineitem_parts))

    order_map_2 = map(lambda p:
                      query_plan.add_operator(Map('o_orderkey', 'order_map_2' + '_' + str(p), query_plan, False)),
                      range(0, other_parts))

    customer_order_lineitem_join_build = map(lambda p:
                                             query_plan.add_operator(
                                                 HashJoinBuild('o_orderkey',
                                                               'customer_order_lineitem_join_build' + '_' + str(p),
                                                               query_plan,
                                                               False)),
                                             range(0, other_parts))

    customer_order_lineitem_join_probe = map(lambda p:
                                             query_plan.add_operator(
                                                 HashJoinProbe(JoinExpression('o_orderkey', 'l_orderkey'),
                                                               'customer_order_lineitem_join_probe' + '_' + str(p),
                                                               query_plan, False)),
                                             range(0, other_parts))

    def groupby_fn(df):
        df['l_extendedprice'] = df['l_extendedprice'].astype(np.float)
        df['l_discount'] = df['l_discount'].astype(np.float)
        df['revenue'] = df['l_extendedprice'] * (1 - df['l_discount'])
        grouped = df.groupby(['l_orderkey', 'o_orderdate', 'o_shippriority'])
        agg_df = grouped['revenue'].sum()
        return agg_df.reset_index()

    group = map(lambda p:
                query_plan.add_operator(
                    Group(
                        ['l_orderkey', 'o_orderdate', 'o_shippriority'],  # l_partkey
                        [
                            AggregateExpression(AggregateExpression.SUM,
                                                lambda t_: float(t_['l_extendedprice'] * (1 - t_['l_discount'])))
                        ],
                        'group' + '_{}'.format(p), query_plan,
                        False, groupby_fn)),
                range(0, other_parts))

    def group_reduce_fn(df):
        grouped = df.groupby(['l_orderkey', 'o_orderdate', 'o_shippriority'])
        agg_df = grouped['revenue'].sum()
        return agg_df.reset_index()

    group_reduce = query_plan.add_operator(
        Group(
            ['l_orderkey', 'o_orderdate', 'o_shippriority'],  # l_partkey
            [
                AggregateExpression(AggregateExpression.SUM,
                                    lambda t_: float(t_['l_extendedprice'] * (1 - t_['l_discount'])))
            ],
            'group_reduce', query_plan,
            False, group_reduce_fn))

    top = query_plan.add_operator(
        Top(10, [SortExpression('revenue', float, 'DESC'), SortExpression('o_orderdate', date, 'ASC')], use_pandas,
            'top', query_plan,
            False))

    collate = query_plan.add_operator(tpch_q19.collate_op('collate', query_plan))

    # Inline what we can
    map(lambda o: o.set_async(False), lineitem_project)
    map(lambda o: o.set_async(False), customer_project)
    map(lambda o: o.set_async(False), order_project)
    map(lambda o: o.set_async(False), lineitem_map)
    map(lambda o: o.set_async(False), customer_map)
    map(lambda o: o.set_async(False), order_map_1)
    map(lambda o: o.set_async(False), order_map_2)

    # Connect the operators
    connect_many_to_many(customer_scan, customer_project)
    connect_many_to_many(customer_project, customer_map)

    connect_many_to_one(customer_project, customer_bloom_create)
    connect_one_to_many(customer_bloom_create, order_scan)

    connect_many_to_many(order_scan, order_project)
    connect_many_to_many(order_project, order_map_1)

    connect_all_to_all(customer_map, customer_order_join_build)
    connect_many_to_many(customer_order_join_build, customer_order_join_probe)
    connect_all_to_all(order_map_1, customer_order_join_probe)

    # connect_many_to_one(customer_order_join_probe, collate)

    connect_many_to_one(order_project, order_bloom_create)
    connect_one_to_many(order_bloom_create, lineitem_scan)

    connect_many_to_many(lineitem_scan, lineitem_project)
    connect_many_to_many(lineitem_project, lineitem_map)

    connect_many_to_many(customer_order_join_probe, order_map_2)
    connect_all_to_all(order_map_2, customer_order_lineitem_join_build)
    connect_many_to_many(customer_order_lineitem_join_build, customer_order_lineitem_join_probe)
    connect_all_to_all(lineitem_map, customer_order_lineitem_join_probe)

    # connect_many_to_one(customer_order_lineitem_join_probe, collate)

    connect_many_to_many(customer_order_lineitem_join_probe, group)
    # connect_many_to_one(group, collate)

    connect_many_to_one(group, group_reduce)
    connect_one_to_one(group_reduce, top)
    connect_one_to_one(top, collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print('secure: {}'.format(secure))
    print('use_native: {}'.format(use_native))
    print("customer_parts: {}".format(customer_parts))
    print("order_parts: {}".format(order_parts))
    print("lineitem_parts: {}".format(lineitem_parts))
    print("customer_sharded: {}".format(customer_sharded))
    print("order_sharded: {}".format(order_sharded))
    print("lineitem_sharded: {}".format(lineitem_sharded))
    print("other_parts: {}".format(other_parts))
    print("fp_rate: {}".format(fp_rate))
    print("format: {}".format(format_))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    field_names = ['l_orderkey', 'o_orderdate', 'o_shippriority', 'revenue']

    assert len(tuples) == 10 + 1

    assert tuples[0] == field_names

    # NOTE: This result has been verified with the equivalent data and query on PostgreSQL
    test_util.assert_tuples(expected_result, tuples)
Пример #20
0
def run(parallel, use_pandas, secure, use_native, buffer_size, lineitem_parts,
        part_parts, lineitem_sharded, part_sharded, other_parts, sf, fp_rate,
        expected_result, format_):
    """

    :return: None
    """

    print('')
    print("TPCH Q19 Bloom Join")
    print("-------------------")

    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Define the operators
    lineitem_bloom_use = \
        map(lambda p:
            query_plan.add_operator(
                tpch_q19.bloom_scan_partkey_quantity_extendedprice_discount_shipinstruct_shipmode_where_filtered_op(
                    lineitem_sharded,
                    p,
                    lineitem_parts,
                    use_pandas,
                    secure,
                    use_native,
                    'lineitem_bloom_use' + '_' + str(p), query_plan, sf, format_)),
            range(0, lineitem_parts))

    part_bloom_create = query_plan.add_operator(
        tpch_q19.bloom_create_partkey_op('part_bloom_create', query_plan,
                                         fp_rate))

    part_scan = map(
        lambda p: query_plan.add_operator(
            tpch_q19.
            sql_scan_part_partkey_brand_size_container_where_filtered_op(
                part_sharded, p, part_parts, use_pandas, secure, use_native,
                'part_scan' + '_' + str(p), query_plan, sf, format_)),
        range(0, part_parts))

    lineitem_project = \
        map(lambda p:
            query_plan.add_operator(
                tpch_q19.project_partkey_quantity_extendedprice_discount_shipinstruct_shipmode_filtered_op(
                    'lineitem_project' + '_' + str(p),
                    query_plan)),
            range(0, lineitem_parts))

    lineitem_map = map(
        lambda p: query_plan.add_operator(
            Map('l_partkey', 'lineitem_map' + '_' + str(p), query_plan, False)
        ), range(0, lineitem_parts))

    part_project = map(
        lambda p: query_plan.add_operator(
            tpch_q19.project_partkey_brand_size_container_filtered_op(
                'part_project' + '_' + str(p), query_plan)),
        range(0, part_parts))

    part_map = map(
        lambda p: query_plan.add_operator(
            Map('p_partkey', 'part_map' + '_' + str(p), query_plan, False)),
        range(0, part_parts))

    # part_bloom_create_map = map(lambda p:
    #                query_plan.add_operator(
    #                    Map('p_partkey', 'part_bloom_create_map' + '_' + str(p), query_plan, False)),
    #                range(0, part_parts))

    # lineitem_part_join = map(lambda p:
    #                          query_plan.add_operator(tpch_q19.join_op(query_plan)),
    #                          range(0, part_parts))

    lineitem_part_join_build = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild('p_partkey', 'lineitem_part_join_build' + '_' + str(
                p), query_plan, False)), range(0, other_parts))

    lineitem_part_join_probe = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'),
                          'lineitem_part_join_probe' + '_' + str(
                              p), query_plan, False)), range(0, other_parts))

    filter_op = map(
        lambda p: query_plan.add_operator(
            tpch_q19.filter_def('filter_op' + '_' + str(p), query_plan)),
        range(0, other_parts))
    aggregate = map(
        lambda p: query_plan.add_operator(
            tpch_q19.aggregate_def('aggregate' + '_' + str(p), query_plan,
                                   use_pandas)), range(0, other_parts))

    def aggregate_reduce_fn(df):
        sum1_ = df['_0'].astype(np.float).sum()
        return pd.DataFrame({'_0': [sum1_]})

    aggregate_reduce = query_plan.add_operator(
        Aggregate([
            AggregateExpression(AggregateExpression.SUM,
                                lambda t: float(t['_0']))
        ], use_pandas, 'aggregate_reduce', query_plan, False,
                  aggregate_reduce_fn))

    aggregate_project = query_plan.add_operator(
        tpch_q19.aggregate_project_def('aggregate_project', query_plan))
    collate = query_plan.add_operator(
        tpch_q19.collate_op('collate', query_plan))

    map(lambda o: o.set_async(False), lineitem_project)
    map(lambda o: o.set_async(False), part_project)
    map(lambda o: o.set_async(False), lineitem_map)
    map(lambda o: o.set_async(False), part_map)
    map(lambda o: o.set_async(False), filter_op)
    map(lambda o: o.set_async(False), aggregate)
    aggregate_project.set_async(False)

    # Connect the operators
    # part_scan.connect(part_project)
    connect_many_to_many(part_scan, part_project)
    connect_many_to_many(part_project, part_map)

    # part_project.connect(part_bloom_create)
    connect_many_to_one(part_project, part_bloom_create)

    # part_bloom_create.connect(lineitem_bloom_use)
    connect_one_to_many(part_bloom_create, lineitem_bloom_use)

    # lineitem_bloom_use.connect(lineitem_project)
    connect_many_to_many(lineitem_bloom_use, lineitem_project)
    connect_many_to_many(lineitem_project, lineitem_map)

    # lineitem_part_join.connect_left_producer(lineitem_project)
    connect_all_to_all(part_map, lineitem_part_join_build)

    # lineitem_part_join.connect_right_producer(part_project)
    connect_many_to_many(lineitem_part_join_build, lineitem_part_join_probe)
    connect_all_to_all(lineitem_map, lineitem_part_join_probe)

    # lineitem_part_join.connect(filter_op)
    connect_many_to_many(lineitem_part_join_probe, filter_op)

    # filter_op.connect(aggregate)
    connect_many_to_many(filter_op, aggregate)

    # aggregate.connect(aggregate_project)
    connect_many_to_one(aggregate, aggregate_reduce)
    connect_one_to_one(aggregate_reduce, aggregate_project)

    # aggregate_project.connect(collate)
    connect_one_to_one(aggregate_project, collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print('secure: {}'.format(secure))
    print('use_native: {}'.format(use_native))
    print("lineitem parts: {}".format(lineitem_parts))
    print("part_parts: {}".format(part_parts))
    print("lineitem_sharded: {}".format(lineitem_sharded))
    print("part_sharded: {}".format(part_sharded))
    print("other_parts: {}".format(other_parts))
    print("fp_rate: {}".format(fp_rate))
    print("format: {}".format(format_))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"),
                           gen_test_id())

    # Start the query
    query_plan.execute()

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    field_names = ['revenue']

    assert len(tuples) == 1 + 1

    assert tuples[0] == field_names

    # NOTE: This result has been verified with the equivalent data and query on PostgreSQL
    numpy.testing.assert_approx_equal(float(tuples[1][0]), expected_result)
Пример #21
0
def test_pandas_filter_baseline():
    """

    :return:
    """

    query_plan = QueryPlan(buffer_size=8192)

    # Query plan
    ts = query_plan.add_operator(
        SQLTableScan('lineitem.csv', 'select * from S3Object limit 3;', True,
                     'ts', query_plan, False))

    # f = query_plan.add_operator(
    #     Filter(PredicateExpression(lambda t_: cast(t_['_10'], timestamp) >= cast('1996-03-01', timestamp)),
    #            'f', query_plan,
    #            False))

    def pd_expr(df):
        # df['_10'] = pd.to_datetime(df['_10'])
        return pd.to_datetime(df['_10']) >= '1996-03-01'

    f = query_plan.add_operator(
        Filter(PredicateExpression(None, pd_expr), 'f', query_plan, True))

    c = query_plan.add_operator(Collate('c', query_plan, False))

    ts.connect(f)
    f.connect(c)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"),
                           gen_test_id())

    # Start the query
    query_plan.execute()

    # Assert the results
    # num_rows = 0
    # for t in c.tuples():
    #     num_rows += 1
    #     print("{}:{}".format(num_rows, t))

    assert 2 + 1 == len(c.tuples())

    field_names = [
        '_0', '_1', '_2', '_3', '_4', '_5', '_6', '_7', '_8', '_9', '_10',
        '_11', '_12', '_13', '_14', '_15'
    ]

    assert c.tuples()[0] == field_names

    assert c.tuples()[1] == [
        '1', '155190', '7706', '1', '17', '21168.23', '0.04', '0.02', 'N', 'O',
        '1996-03-13', '1996-02-12', '1996-03-22', 'DELIVER IN PERSON', 'TRUCK',
        'egular courts above the'
    ]

    assert c.tuples()[2] == [
        '1', '67310', '7311', '2', '36', '45983.16', '0.09', '0.06', 'N', 'O',
        '1996-04-12', '1996-02-28', '1996-04-20', 'TAKE BACK RETURN', 'MAIL',
        'ly final dependencies: slyly bold '
    ]

    # Write the metrics
    query_plan.print_metrics()
Пример #22
0
def run(group_fields, agg_fields, parallel, use_pandas, buffer_size,
        table_parts, files):
    """
    
    :return: None
    """

    secure = False
    use_native = False
    print('')
    print("Groupby Benchmark, Baseline. Group Fields: {} Aggregate Fields: {}".
          format(group_fields, agg_fields))
    print("----------------------")

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    def fn(df):
        df.columns = [
            'G0',
            'G1',
            'G2',
            'G3',
            'G4',
            'G5',
            'G6',
            'G7',
            'G8',
            'G9',
            'F0',
            'F1',
            'F2',
            'F3',
            'F4',
            'F5',
            'F6',
            'F7',
            'F8',
            'F9',
        ]
        df = df.filter(items=group_fields + agg_fields, axis=1)
        return df

    # Scan
    scan = map(
        lambda p: query_plan.add_operator(
            TableScan(files.format(p),
                      use_pandas,
                      secure,
                      use_native,
                      'scan_{}'.format(p),
                      query_plan,
                      False,
                      fn=fn)), range(0, table_parts))

    # Groupby
    def groupby_fn(df):

        df[agg_fields] = df[agg_fields].astype(np.float)
        grouped = df.groupby(group_fields)
        agg_df = pd.DataFrame(
            {f: grouped[f].sum()
             for n, f in enumerate(agg_fields)})
        return agg_df.reset_index()

    groupby = map(
        lambda p: query_plan.add_operator(
            Group(group_fields, [], 'groupby_{}'.format(p), query_plan, False,
                  groupby_fn)), range(0, table_parts))

    # inlined
    map(lambda p: p.set_async(False), groupby)

    groupby_reduce = query_plan.add_operator(
        Group(group_fields, [], 'groupby_reduce', query_plan, False,
              groupby_fn))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    #profile_path = '../benchmark-output/groupby/'
    #scan[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_scan_0" + ".prof"))
    #project[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_project_0" + ".prof"))
    #groupby[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_groupby_0" + ".prof"))
    #groupby_reduce.set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_groupby_reduce" + ".prof"))
    #collate.set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_collate" + ".prof"))
    connect_many_to_many(scan, groupby)
    connect_many_to_one(groupby, groupby_reduce)
    connect_one_to_one(groupby_reduce, collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("table parts: {}".format(table_parts))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"),
                           gen_test_id() + "-" + str(table_parts))

    # Start the query
    query_plan.execute()
    print('Done')
    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()
Пример #23
0
def test_filter_baseline():
    """

    :return:
    """

    query_plan = QueryPlan(buffer_size=64, is_async=True, use_shared_mem=False)

    # Query plan
    '''
    ts = query_plan.add_operator(
        SQLTableScan('lineitem.csv', 'select * from S3Object limit 3;' , False, 'ts', query_plan, False))
    '''
    # using a 'use_native=True' argument will result in a None object being returned
    ts = query_plan.add_operator(
        SQLTableScan('random_strings_2.csv', 'select * from S3Object limit 3;',
                     Format.CSV, True, False, False, 'ts', query_plan, False))
    f = query_plan.add_operator(
        Filter(
            PredicateExpression(lambda t_: cast(t_['_10'], timestamp) >= cast(
                '1996-03-01', timestamp)), 'f', query_plan, False))

    c = query_plan.add_operator(Collate('c', query_plan, False))

    ts.connect(f)
    f.connect(c)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"),
                           gen_test_id())

    # Start the query
    query_plan.execute()

    # Assert the results
    # num_rows = 0
    # for t in c.tuples():
    #     num_rows += 1
    #     print("{}:{}".format(num_rows, t))
    print(len(c.tuples()))
    assert 2 + 1 == len(c.tuples())

    field_names = [
        '_0', '_1', '_2', '_3', '_4', '_5', '_6', '_7', '_8', '_9', '_10',
        '_11', '_12', '_13', '_14', '_15'
    ]

    assert c.tuples()[0] == field_names

    assert c.tuples()[1] == [
        '1', '155190', '7706', '1', '17', '21168.23', '0.04', '0.02', 'N', 'O',
        '1996-03-13', '1996-02-12', '1996-03-22', 'DELIVER IN PERSON', 'TRUCK',
        'egular courts above the'
    ]

    assert c.tuples()[2] == [
        '1', '67310', '7311', '2', '36', '45983.16', '0.09', '0.06', 'N', 'O',
        '1996-04-12', '1996-02-28', '1996-04-20', 'TAKE BACK RETURN', 'MAIL',
        'ly final dependencies: slyly bold '
    ]

    # Write the metrics
    query_plan.print_metrics()
Пример #24
0
def run(parallel,
        use_pandas,
        buffer_size,
        table_first_part,
        table_parts,
        queried_columns,
        select_columns,
        lower,
        upper,
        path,
        format_=Format.CSV):
    secure = False
    use_native = False
    print('')
    print("CSV Filter+Project Benchmark")
    print("------------------")

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # SQL scan the file
    scan = map(
        lambda p: query_plan.add_operator(
            SQLTableScan(
                "{}/lineitem.tbl.{}".format(path, p
                                            ), "select {} from S3Object "
                "where cast(l_orderkey as int) >= {} and cast(l_orderkey as int) <= {};"
                .format(select_columns, lower, upper), format_, use_pandas,
                secure, use_native, 'scan_{}'.format(p), query_plan, False)),
        range(table_first_part, table_first_part + table_parts))

    # project
    def fn(df):
        df.columns = queried_columns
        df[['l_orderkey']] = df[['l_orderkey']].astype(np.int64)
        return df

    project = map(
        lambda p: query_plan.add_operator(
            Project([], 'project_{}'.format(p), query_plan, False, fn)),
        range(table_first_part, table_first_part + table_parts))

    # aggregation
    def agg_fun(df):
        return pd.DataFrame({'count': [len(df)]})

    aggregate = query_plan.add_operator(
        Aggregate([], True, 'agg', query_plan, False, agg_fun))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    map(lambda (p, o): o.connect(project[p]), enumerate(scan))
    map(lambda (p, o): o.connect(aggregate), enumerate(project))
    aggregate.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("table parts: {}".format(table_parts))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"),
                           gen_test_id() + "-" + str(table_parts))

    # Start the query
    query_plan.execute()
    print('Done')
    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()
Пример #25
0
def run(parallel, buffer_size):
    """

    :return: None
    """

    print('')
    print("TPCH Q14 Filtered Join")
    print("----------------------")

    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Query plan
    # DATE is the first day of a month randomly selected from a random year within [1993 .. 1997].
    date = '1993-01-01'
    min_shipped_date = datetime.strptime(date, '%Y-%m-%d')
    max_shipped_date = datetime.strptime(date, '%Y-%m-%d') + timedelta(days=30)

    lineitem_scan = query_plan.add_operator(
        tpch_q14.
        sql_scan_lineitem_partkey_extendedprice_discount_where_shipdate_operator_def(
            min_shipped_date, max_shipped_date, 'lineitem_scan', query_plan))
    lineitem_project = query_plan.add_operator(
        tpch_q14.project_partkey_extendedprice_discount_operator_def(
            'lineitem_project', query_plan))
    part_scan = query_plan.add_operator(
        tpch_q14.sql_scan_part_partkey_type_part_where_brand12_operator_def(
            'part_scan', query_plan))
    part_project = query_plan.add_operator(
        tpch_q14.project_partkey_type_operator_def('part_project', query_plan))
    join = query_plan.add_operator(
        tpch_q14.join_lineitem_part_operator_def('join', query_plan))
    aggregate = query_plan.add_operator(
        tpch_q14.aggregate_promo_revenue_operator_def('aggregate', query_plan))
    aggregate_project = query_plan.add_operator(
        tpch_q14.project_promo_revenue_operator_def('aggregate_project',
                                                    query_plan))
    collate = query_plan.add_operator(
        tpch_q14.collate_operator_def('collate', query_plan))

    lineitem_scan.connect(lineitem_project)
    part_scan.connect(part_project)
    join.connect_left_producer(lineitem_project)
    join.connect_right_producer(part_project)
    join.connect(aggregate)
    aggregate.connect(aggregate_project)
    aggregate_project.connect(collate)

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"),
                           gen_test_id())

    # Start the query
    query_plan.execute()

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    field_names = ['promo_revenue']

    assert len(tuples) == 1 + 1

    assert tuples[0] == field_names

    # NOTE: This result has been verified with the equivalent data and query on PostgreSQL
    assert tuples[1] == [15.090116526324298]
Пример #26
0
def run(parallel, use_pandas, buffer_size, lineitem_parts, part_parts):
    """The baseline tst uses nested loop joins with no projection and no filtering pushed down to s3.

    This works by:

    1. Scanning part and filtering on brand and container
    2. It then scans lineitem
    3. It then joins the two tables (essentially filtering out lineitems that dont include parts that we filtered out in
       step 1)
    4. It then computes the average of l_quantity from the joined table and groups the results by partkey
    5. It then joins these computed averages with the joined table in step 3
    6. It then filters out any rows where l_quantity is less than the computed average

    TODO: There are few ways this can be done, the above is just one.

    :return: None
    """

    print('')
    print("TPCH Q17 Baseline Join")
    print("----------------------")

    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Query plan
    part_scan = map(
        lambda p: query_plan.add_operator(
            tpch_q17.sql_scan_part_select_all_where_brand_and_container_op(
                part_parts != 1, p, part_parts, use_pandas, 'part_scan' + '_' +
                str(p), query_plan)), range(0, part_parts))

    lineitem_scan = map(
        lambda p: query_plan.add_operator(
            tpch_q17.sql_scan_lineitem_select_all_where_partkey_op(
                lineitem_parts != 1, p, lineitem_parts, use_pandas,
                'lineitem_scan' + '_' + str(p), query_plan)),
        range(0, lineitem_parts))

    part_project = map(
        lambda p: query_plan.add_operator(
            tpch_q17.project_partkey_brand_container_op(
                'part_project' + '_' + str(p), query_plan)),
        range(0, part_parts))

    part_filter = map(
        lambda p: query_plan.add_operator(
            tpch_q17.filter_brand_container_op('part_filter' + '_' + str(p),
                                               query_plan)),
        range(0, part_parts))

    part_map = map(
        lambda p: query_plan.add_operator(
            Map('p_partkey', 'part_map' + '_' + str(p), query_plan, True)),
        range(0, part_parts))

    lineitem_map = map(
        lambda p: query_plan.add_operator(
            Map('l_partkey', 'lineitem_map' + '_' + str(p), query_plan, True)),
        range(0, lineitem_parts))

    lineitem_project = map(
        lambda p: query_plan.add_operator(
            tpch_q17.
            project_lineitem_orderkey_partkey_quantity_extendedprice_op(
                'lineitem_project' + '_' + str(p), query_plan)),
        range(0, lineitem_parts))

    # part_lineitem_join = map(lambda p:
    #                          query_plan.add_operator(
    #                              tpch_q17.join_p_partkey_l_partkey_op('part_lineitem_join', query_plan)),
    #                          range(0, lineitem_parts))

    part_lineitem_join_build = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild('p_partkey', 'part_lineitem_join_build' + '_' + str(
                p), query_plan, False)), range(0, part_parts))

    part_lineitem_join_probe = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'),
                          'part_lineitem_join_probe' + '_' + str(
                              p), query_plan, True)), range(0, part_parts))

    lineitem_part_avg_group = map(
        lambda p: query_plan.add_operator(
            tpch_q17.group_partkey_avg_quantity_5_op(
                'lineitem_part_avg_group' + '_' + str(p), query_plan)),
        range(0, part_parts))

    lineitem_part_avg_group_project = map(
        lambda p: query_plan.add_operator(
            tpch_q17.project_partkey_avg_quantity_op(
                'lineitem_part_avg_group_project' + '_' + str(p), query_plan)),
        range(0, part_parts))

    # part_lineitem_join_avg_group_join = map(lambda p:
    #                                         query_plan.add_operator(
    #                                             tpch_q17.join_l_partkey_p_partkey_op(
    #                                                 'part_lineitem_join_avg_group_join', query_plan)),
    #                                         range(0, lineitem_parts))

    part_lineitem_join_avg_group_join_build = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild(
                'l_partkey', 'part_lineitem_join_avg_group_join_build' + '_' +
                str(p), query_plan, False)), range(0, part_parts))

    part_lineitem_join_avg_group_join_probe = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(
                JoinExpression('l_partkey', 'p_partkey'),
                'part_lineitem_join_avg_group_join_probe' + '_' + str(
                    p), query_plan, True)), range(0, part_parts))

    lineitem_filter = map(
        lambda p: query_plan.add_operator(
            tpch_q17.filter_lineitem_quantity_op(
                'lineitem_filter' + '_' + str(p), query_plan)),
        range(0, part_parts))

    extendedprice_sum_aggregate = map(
        lambda p: query_plan.add_operator(
            tpch_q17.aggregate_sum_extendedprice_op(
                'extendedprice_sum_aggregate' + '_' + str(p), query_plan)),
        range(0, part_parts))

    aggregate_reduce = query_plan.add_operator(
        Aggregate([
            AggregateExpression(AggregateExpression.SUM,
                                lambda t: float(t['_0']))
        ], 'aggregate_reduce', query_plan, False))

    extendedprice_sum_aggregate_project = query_plan.add_operator(
        tpch_q17.project_avg_yearly_op('extendedprice_sum_aggregate_project',
                                       query_plan))

    collate = query_plan.add_operator(
        tpch_q17.collate_op('collate', query_plan))

    # Connect the operators
    # part_scan.connect(part_project)
    map(lambda (p, o): o.connect(part_project[p]), enumerate(part_scan))
    map(lambda (p, o): o.connect(part_filter[p]), enumerate(part_project))
    map(lambda (p, o): o.connect(part_map[p]), enumerate(part_filter))

    # lineitem_scan.connect(lineitem_project)
    map(lambda (p, o): o.connect(lineitem_project[p]),
        enumerate(lineitem_scan))
    map(lambda (p, o): o.connect(lineitem_map[p]), enumerate(lineitem_project))

    # part_lineitem_join.connect_left_producer(part_project)
    # part_lineitem_join.connect_right_producer(lineitem_project)
    # part_lineitem_join.connect(lineitem_part_avg_group)
    map(
        lambda (p1, o1): map(lambda (p2, o2): o1.connect(o2),
                             enumerate(part_lineitem_join_build)),
        enumerate(part_map))
    map(lambda (p, o): part_lineitem_join_probe[p].connect_build_producer(o),
        enumerate(part_lineitem_join_build))
    map(
        lambda (p1, o1): map(lambda (p2, o2): o2.connect_tuple_producer(o1),
                             enumerate(part_lineitem_join_probe)),
        enumerate(lineitem_map))
    map(lambda (p, o): o.connect(lineitem_part_avg_group[p]),
        enumerate(part_lineitem_join_probe))

    # map(lambda (p, o): o.map(Mapper('_1', 1, part_lineitem_join_probe)), enumerate(lineitem_scan))

    # lineitem_part_avg_group.connect(lineitem_part_avg_group_project)
    map(lambda (p, o): o.connect(lineitem_part_avg_group_project[p]),
        enumerate(lineitem_part_avg_group))

    # part_lineitem_join_avg_group_join.connect_left_producer(lineitem_part_avg_group_project)
    # part_lineitem_join_avg_group_join.connect_right_producer(part_lineitem_join)
    # part_lineitem_join_avg_group_join.connect(lineitem_filter)

    map(lambda (p, o): o.connect(part_lineitem_join_avg_group_join_build[p]),
        enumerate(lineitem_part_avg_group_project))

    # map(lambda (p, o): map(lambda (bp, bo): o.connect_build_producer(bo),
    #                        enumerate(part_lineitem_join_avg_group_join_build)),
    #     enumerate(part_lineitem_join_avg_group_join_probe))
    map(
        lambda (p, o): part_lineitem_join_avg_group_join_probe[p].
        connect_build_producer(o),
        enumerate(part_lineitem_join_avg_group_join_build))
    map(
        lambda (p, o): part_lineitem_join_avg_group_join_probe[p % part_parts].
        connect_tuple_producer(o), enumerate(part_lineitem_join_probe))
    map(lambda (p, o): o.connect(lineitem_filter[p]),
        enumerate(part_lineitem_join_avg_group_join_probe))

    # lineitem_filter.connect(extendedprice_sum_aggregate)
    map(lambda (p, o): o.connect(extendedprice_sum_aggregate[p]),
        enumerate(lineitem_filter))
    # extendedprice_sum_aggregate.connect(extendedprice_sum_aggregate_project)
    # extendedprice_sum_aggregate_project.connect(collate)
    map(lambda (p, o): o.connect(aggregate_reduce),
        enumerate(extendedprice_sum_aggregate))
    aggregate_reduce.connect(extendedprice_sum_aggregate_project)
    extendedprice_sum_aggregate_project.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("lineitem parts: {}".format(lineitem_parts))
    print("part_parts: {}".format(part_parts))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"),
                           gen_test_id())

    # Start the query
    query_plan.execute()

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    field_names = ['avg_yearly']

    assert len(tuples) == 1 + 1

    assert tuples[0] == field_names

    # NOTE: This result has been verified with the equivalent data and query on PostgreSQL
    assert round(float(tuples[1][0]), 10) == 4632.1085714286
Пример #27
0
def run(parallel, use_pandas, secure, use_native, buffer_size, lineitem_parts,
        part_parts, lineitem_sharded, part_sharded, other_parts, sf, fp_rate,
        expected_result, format_):
    """
    :return: None
    """

    print('')
    print("TPCH Q17 Bloom Join")
    print("-------------------")

    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Query plan
    part_scan = map(
        lambda p: query_plan.add_operator(
            tpch_q17.sql_scan_select_partkey_where_brand_container_op(
                part_sharded, p, part_parts, use_pandas, secure, use_native,
                'part_scan' + '_' + str(p), query_plan, sf, format_)),
        range(0, part_parts))

    part_project = map(
        lambda p: query_plan.add_operator(
            tpch_q17.project_partkey_op('part_project' + '_' + str(p),
                                        query_plan)), range(0, part_parts))

    part_lineitem_join_build_map = map(
        lambda p: query_plan.add_operator(
            Map('p_partkey', 'part_lineitem_join_build_map' + '_' + str(p),
                query_plan, False)), range(0, part_parts))

    part_bloom_create = query_plan.add_operator(
        tpch_q17.bloom_create_partkey_op(fp_rate, 'part_bloom_create',
                                         query_plan))

    lineitem_bloom_use = \
        map(lambda p:
            query_plan.add_operator(
                tpch_q17.bloom_scan_lineitem_select_orderkey_partkey_quantity_extendedprice_bloom_partkey_op(
                    lineitem_sharded,
                    p,
                    part_parts,
                    use_pandas,
                    secure,
                    use_native,
                    'lineitem_bloom_use' + '_' + str(p),
                    query_plan,
                    sf, format_)),
            range(0, lineitem_parts))

    lineitem_project = map(
        lambda p: query_plan.add_operator(
            tpch_q17.
            project_lineitem_filtered_orderkey_partkey_quantity_extendedprice_op(
                'lineitem_project' + '_' + str(p), query_plan)),
        range(0, lineitem_parts))

    part_lineitem_join_probe_map = map(
        lambda p: query_plan.add_operator(
            Map('l_partkey', 'part_lineitem_join_probe_map' + '_' + str(p),
                query_plan, False)), range(0, lineitem_parts))

    part_lineitem_join_build = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild('p_partkey', 'part_lineitem_join_build' + '_' + str(
                p), query_plan, False)), range(0, other_parts))

    part_lineitem_join_probe = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'),
                          'part_lineitem_join_probe' + '_' + str(
                              p), query_plan, False)), range(0, other_parts))

    lineitem_part_avg_group = map(
        lambda p: query_plan.add_operator(
            tpch_q17.group_partkey_avg_quantity_op(
                'lineitem_part_avg_group' + '_' + str(p), query_plan)),
        range(0, other_parts))

    lineitem_part_avg_group_project = map(
        lambda p: query_plan.add_operator(
            tpch_q17.project_partkey_avg_quantity_op(
                'lineitem_part_avg_group_project' + '_' + str(p), query_plan)),
        range(0, other_parts))

    part_lineitem_join_avg_group_join_build = \
        map(lambda p:
            query_plan.add_operator(
                HashJoinBuild('l_partkey',
                              'part_lineitem_join_avg_group_join_build' + '_' + str(p),
                              query_plan,
                              False)),
            range(0, other_parts))

    part_lineitem_join_avg_group_join_probe = \
        map(lambda p:
            query_plan.add_operator(
                HashJoinProbe(JoinExpression('l_partkey', 'l_partkey'),
                              'part_lineitem_join_avg_group_join_probe' + '_' + str(p),
                              query_plan,
                              False)),
            range(0, other_parts))

    lineitem_filter = map(
        lambda p: query_plan.add_operator(
            tpch_q17.filter_lineitem_quantity_op(
                'lineitem_filter' + '_' + str(p), query_plan)),
        range(0, other_parts))

    extendedprice_sum_aggregate = map(
        lambda p: query_plan.add_operator(
            tpch_q17.aggregate_sum_extendedprice_op(
                use_pandas, 'extendedprice_sum_aggregate' + '_' + str(p),
                query_plan)), range(0, other_parts))

    def aggregate_reduce_fn(df):
        sum1_ = df['_0'].astype(np.float).sum()
        return pd.DataFrame({'_0': [sum1_]})

    aggregate_reduce = query_plan.add_operator(
        Aggregate([
            AggregateExpression(AggregateExpression.SUM,
                                lambda t: float(t['_0']))
        ], use_pandas, 'aggregate_reduce', query_plan, False,
                  aggregate_reduce_fn))

    extendedprice_sum_aggregate_project = query_plan.add_operator(
        tpch_q17.project_avg_yearly_op('extendedprice_sum_aggregate_project',
                                       query_plan))

    collate = query_plan.add_operator(
        tpch_q17.collate_op('collate', query_plan))

    # Inline what we can
    map(lambda o: o.set_async(False), lineitem_project)
    map(lambda o: o.set_async(False), part_project)
    map(lambda o: o.set_async(False), lineitem_filter)
    map(lambda o: o.set_async(False), part_lineitem_join_probe_map)
    map(lambda o: o.set_async(False), part_lineitem_join_build_map)
    map(lambda o: o.set_async(False), lineitem_part_avg_group)
    map(lambda o: o.set_async(False), lineitem_part_avg_group_project)
    map(lambda o: o.set_async(False), extendedprice_sum_aggregate)
    extendedprice_sum_aggregate_project.set_async(False)

    # Connect the operators
    # part_scan.connect(part_project)
    map(lambda (p, o): o.connect(part_project[p]), enumerate(part_scan))
    # map(lambda (p, o): o.connect(part_bloom_create_map[p]), enumerate(part_project))
    map(lambda (p, o): o.connect(part_lineitem_join_build_map[p]),
        enumerate(part_project))

    connect_many_to_one(part_project, part_bloom_create)
    connect_one_to_many(part_bloom_create, lineitem_bloom_use)

    # part_project.connect(part_bloom_create)
    # map(lambda (p1, o1): map(lambda (p2, o2): o1.connect(o2), enumerate(part_bloom_create)),
    #     enumerate(part_bloom_create_map))

    # part_bloom_create.connect(lineitem_bloom_use)
    # map(lambda (p1, o1): map(lambda (p2, o2): o1.connect(o2), enumerate(lineitem_bloom_use)),
    #     enumerate(part_bloom_create))

    # lineitem_bloom_use.connect(lineitem_project)
    map(lambda (p, o): o.connect(lineitem_project[p]),
        enumerate(lineitem_bloom_use))

    # part_lineitem_join.connect_left_producer(part_project)
    map(
        lambda (p1, o1): map(lambda (p2, o2): o1.connect(o2),
                             enumerate(part_lineitem_join_build)),
        enumerate(part_lineitem_join_build_map))
    map(lambda (p, o): part_lineitem_join_probe[p].connect_build_producer(o),
        enumerate(part_lineitem_join_build))

    # part_lineitem_join.connect_right_producer(lineitem_project)
    # map(lambda (p, o): o.connect(part_lineitem_join_probe_map[p]), enumerate(lineitem_project))
    connect_many_to_many(lineitem_project, part_lineitem_join_probe_map)

    map(
        lambda (p1, o1): map(lambda (p2, o2): o2.connect_tuple_producer(o1),
                             enumerate(part_lineitem_join_probe)),
        enumerate(part_lineitem_join_probe_map))

    # part_lineitem_join.connect(lineitem_part_avg_group)
    map(lambda (p, o): o.connect(lineitem_part_avg_group[p]),
        enumerate(part_lineitem_join_probe))

    # lineitem_part_avg_group.connect(lineitem_part_avg_group_project)
    map(lambda (p, o): o.connect(lineitem_part_avg_group_project[p]),
        enumerate(lineitem_part_avg_group))

    # part_lineitem_join_avg_group_join.connect_left_producer(lineitem_part_avg_group_project)
    map(lambda (p, o): o.connect(part_lineitem_join_avg_group_join_build[p]),
        enumerate(lineitem_part_avg_group_project))

    # part_lineitem_join_avg_group_join.connect_right_producer(part_lineitem_join)
    map(
        lambda (p, o): part_lineitem_join_avg_group_join_probe[p].
        connect_build_producer(o),
        enumerate(part_lineitem_join_avg_group_join_build))
    map(
        lambda (p, o): part_lineitem_join_avg_group_join_probe[p].
        connect_tuple_producer(o), enumerate(part_lineitem_join_probe))

    # part_lineitem_join_avg_group_join.connect(lineitem_filter)
    map(lambda (p, o): o.connect(lineitem_filter[p]),
        enumerate(part_lineitem_join_avg_group_join_probe))

    # lineitem_filter.connect(extendedprice_sum_aggregate)
    map(lambda (p, o): o.connect(extendedprice_sum_aggregate[p]),
        enumerate(lineitem_filter))

    # extendedprice_sum_aggregate.connect(extendedprice_sum_aggregate_project)
    map(lambda (p, o): o.connect(aggregate_reduce),
        enumerate(extendedprice_sum_aggregate))
    aggregate_reduce.connect(extendedprice_sum_aggregate_project)

    # extendedprice_sum_aggregate_project.connect(collate)
    extendedprice_sum_aggregate_project.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print('secure: {}'.format(secure))
    print('use_native: {}'.format(use_native))
    print("lineitem parts: {}".format(lineitem_parts))
    print("part_parts: {}".format(part_parts))
    print("lineitem_sharded: {}".format(lineitem_sharded))
    print("part_sharded: {}".format(part_sharded))
    print("other_parts: {}".format(other_parts))
    print("fp_rate: {}".format(fp_rate))
    print("format: {}".format(format_))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"),
                           gen_test_id())

    # Start the query
    query_plan.execute()

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    field_names = ['avg_yearly']

    assert len(tuples) == 1 + 1

    assert tuples[0] == field_names

    # NOTE: This result has been verified with the equivalent data and query on PostgreSQL
    if s3filter.util.constants.TPCH_SF == 10:
        assert round(
            float(tuples[1][0]), 10
        ) == 372414.2899999995  # TODO: This isn't correct but haven't checked tpch17 on 10 sf yet
    elif s3filter.util.constants.TPCH_SF == 1:
        numpy.testing.assert_approx_equal(float(tuples[1][0]), expected_result)
Пример #28
0
def run(parallel, use_pandas, secure, use_native, buffer_size, lineitem_parts,
        part_parts, lineitem_sharded, part_sharded):
    """

    :return: None
    """

    print('')
    print("TPCH Q14 Semi Join")
    print("------------------")

    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Query plan
    # DATE is the first day of a month randomly selected from a random year within [1993 .. 1997].
    date = '1993-01-01'
    min_shipped_date = datetime.strptime(date, '%Y-%m-%d')
    max_shipped_date = datetime.strptime(date, '%Y-%m-%d') + timedelta(days=30)

    part_scan_1 = map(
        lambda p: query_plan.add_operator(
            tpch_q14.sql_scan_part_partkey_where_brand12_operator_def(
                part_sharded, p, part_parts, use_pandas, secure, use_native,
                'part_table_scan_1' + '_' + str(p), query_plan)),
        range(0, part_parts))
    part_1_project = map(
        lambda p: query_plan.add_operator(
            tpch_q14.project_p_partkey_operator_def(
                'part_1_project' + '_' + str(p), query_plan)),
        range(0, part_parts))
    part_bloom_create = map(
        lambda p: query_plan.add_operator(
            tpch_q14.bloom_create_p_partkey_operator_def(
                'part_bloom_create' + '_' + str(p), query_plan)),
        range(0, part_parts))
    lineitem_scan_1 = map(
        lambda p: query_plan.add_operator(
            tpch_q14.bloom_scan_lineitem_partkey_where_shipdate_operator_def(
                min_shipped_date, max_shipped_date, lineitem_sharded, p,
                use_pandas, secure, use_native, 'lineitem_scan_1' + '_' + str(
                    p), query_plan)), range(0, lineitem_parts))
    lineitem_1_project = map(
        lambda p: query_plan.add_operator(
            tpch_q14.project_l_partkey_operator_def(
                'lineitem_1_project' + '_' + str(p), query_plan)),
        range(0, lineitem_parts))
    part_lineitem_join_1_build = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild('p_partkey', 'part_lineitem_join_1_build' + '_' +
                          str(p), query_plan, False)), range(0, part_parts))
    part_lineitem_join_1_probe = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'),
                          'part_lineitem_join_1_probe' + '_' + str(
                              p), query_plan, False)), range(0, part_parts))
    join_bloom_create = map(
        lambda p: query_plan.add_operator(
            tpch_q14.bloom_create_l_partkey_operator_def(
                'join_bloom_create' + '_' + str(p), query_plan)),
        range(0, part_parts))
    part_scan_2 = map(
        lambda p: query_plan.add_operator(
            tpch_q14.bloom_scan_part_partkey_type_brand12_operator_def(
                part_sharded, p, part_parts, use_pandas, secure, use_native,
                'part_table_scan_2' + '_' + str(p), query_plan)),
        range(0, part_parts))
    part_2_project = map(
        lambda p: query_plan.add_operator(
            tpch_q14.project_partkey_type_operator_def(
                'part_2_project' + '_' + str(p), query_plan)),
        range(0, part_parts))
    lineitem_scan_2 = map(
        lambda p: query_plan.add_operator(
            tpch_q14.bloom_scan_lineitem_where_shipdate_operator_def(
                min_shipped_date, max_shipped_date, lineitem_sharded, p,
                use_pandas, secure, use_native, 'lineitem_scan_2' + '_' + str(
                    p), query_plan)), range(0, lineitem_parts))
    lineitem_2_project = map(
        lambda p: query_plan.add_operator(
            tpch_q14.project_partkey_extendedprice_discount_operator_def(
                'lineitem_2_project' + '_' + str(p), query_plan)),
        range(0, lineitem_parts))

    part_lineitem_join_2_build = map(
        lambda p: query_plan.add_operator(
            HashJoinBuild('p_partkey', 'part_lineitem_join_2_build' + '_' +
                          str(p), query_plan, False)), range(0, part_parts))
    part_lineitem_join_2_probe = map(
        lambda p: query_plan.add_operator(
            HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'),
                          'part_lineitem_join_2_probe' + '_' + str(
                              p), query_plan, False)), range(0, part_parts))

    part_aggregate = map(
        lambda p: query_plan.add_operator(
            tpch_q14.aggregate_promo_revenue_operator_def(
                'part_aggregate' + '_' + str(p), query_plan)),
        range(0, part_parts))

    aggregate_reduce = query_plan.add_operator(
        Aggregate([
            AggregateExpression(AggregateExpression.SUM,
                                lambda t: float(t['_0'])),
            AggregateExpression(AggregateExpression.SUM,
                                lambda t: float(t['_1']))
        ], 'aggregate_reduce', query_plan, False))

    aggregate_project = query_plan.add_operator(
        tpch_q14.project_promo_revenue_operator_def('aggregate_project',
                                                    query_plan))

    collate = query_plan.add_operator(
        tpch_q14.collate_operator_def('collate', query_plan))

    # Connect the operators
    map(lambda o, p: o.connect(part_1_project[p]), enumerate(part_scan_1))
    map(lambda (p, o): o.connect(part_bloom_create[p]),
        enumerate(part_1_project)),
    map(
        lambda
        (p, o): map(lambda
                    (bp, bo): bo.connect(o), enumerate(part_bloom_create)),
        enumerate(lineitem_scan_1))
    map(lambda (p, o): o.connect(part_lineitem_join_1_build[p]),
        enumerate(part_1_project))
    map(
        lambda (p, o): map(lambda (bp, bo): o.connect_build_producer(bo),
                           enumerate(part_lineitem_join_1_build)),
        enumerate(part_lineitem_join_1_probe))
    map(lambda (p, o): o.connect(lineitem_1_project[p]),
        enumerate(lineitem_scan_1))
    map(
        lambda (p, o): part_lineitem_join_1_probe[p % part_parts].
        connect_tuple_producer(o), enumerate(lineitem_1_project))
    map(lambda (p, o): o.connect(join_bloom_create[p]),
        enumerate(part_lineitem_join_1_probe))
    map(
        lambda (p, o): map(lambda
                           (bp, bo): o.connect(bo), enumerate(part_scan_2)),
        enumerate(join_bloom_create))
    map(
        lambda
        (p, o): map(lambda
                    (bp, bo): bo.connect(o), enumerate(join_bloom_create)),
        enumerate(lineitem_scan_2))
    map(lambda (p, o): o.connect(part_2_project[p]), enumerate(part_scan_2))
    map(lambda (p, o): o.connect(part_lineitem_join_2_build[p]),
        enumerate(part_2_project))
    map(
        lambda (p, o): map(lambda (bp, bo): o.connect_build_producer(bo),
                           enumerate(part_lineitem_join_2_build)),
        enumerate(part_lineitem_join_2_probe))
    map(lambda (p, o): o.connect(lineitem_2_project[p]),
        enumerate(lineitem_scan_2))
    map(
        lambda (p, o): part_lineitem_join_2_probe[p % part_parts].
        connect_tuple_producer(o), enumerate(lineitem_2_project))
    map(lambda (p, o): o.connect(part_aggregate[p]),
        enumerate(part_lineitem_join_2_probe))
    map(lambda (p, o): o.connect(aggregate_reduce), enumerate(part_aggregate))
    aggregate_reduce.connect(aggregate_project)
    aggregate_project.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print('secure: {}'.format(secure))
    print('use_native: {}'.format(use_native))
    print("lineitem parts: {}".format(lineitem_parts))
    print("part_parts: {}".format(part_parts))
    print("lineitem_sharded: {}".format(lineitem_sharded))
    print("part_sharded: {}".format(part_sharded))
    print('')

    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"),
                           gen_test_id())

    # Start the query
    query_plan.execute()

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    field_names = ['promo_revenue']

    assert len(tuples) == 1 + 1

    assert tuples[0] == field_names

    # NOTE: This result has been verified with the equivalent data and query on PostgreSQL
    if s3filter.util.constants.TPCH_SF == 10:
        assert round(float(tuples[1][0]), 10) == 15.4488836202
    elif s3filter.util.constants.TPCH_SF == 1:
        numpy.testing.assert_almost_equal(float(tuples[1][0]), 15.0901165263)
Пример #29
0
def run(sort_field, k, parallel, use_pandas, sort_order, buffer_size,
        table_first_part, table_parts, queried_columns, select_columns, path,
        format_):
    """
    Executes the baseline topk query by scanning a table and keeping track of the max/min records in a heap
    :return:
    """

    secure = False
    use_native = False
    print('')
    print("Top K Benchmark, ColumnScan. Sort Field: {}, Order: {}".format(
        sort_field, sort_order))
    print("----------------------")

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Sampling
    sample_scan = map(
        lambda p: query_plan.add_operator(
            #SQLTableScan("{}/lineitem.snappy.parquet.{}".format(path, p),
            SQLTableScan(
                "{}/lineitem.typed.1RowGroup.parquet.{}".format(
                    path, p), 'select {} from S3Object;'.format(
                        sort_field), format_, use_pandas, secure, use_native,
                'column_scan_{}'.format(p), query_plan, False)),
        range(table_first_part, table_first_part + table_parts))

    # Sampling project
    def project_fn1(df):
        df.columns = [sort_field]
        df[[sort_field]] = df[[sort_field]].astype(np.float)
        return df

    project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)]

    sample_project = map(
        lambda p: query_plan.add_operator(
            Project(project_exprs, 'sample_project_{}'.format(p), query_plan,
                    False, project_fn1)),
        range(table_first_part, table_first_part + table_parts))

    # TopK samples
    sort_expr = SortExpression(sort_field, float, sort_order)

    sample_topk = query_plan.add_operator(
        Top(k, sort_expr, use_pandas, 'sample_topk', query_plan, False))

    # Generate SQL command for second scan
    sql_gen = query_plan.add_operator(
        TopKFilterBuild(
            sort_order,
            'float',
            'select {} from S3object '.format(select_columns),
            #' CAST({} as float) '.format(sort_field), 'sql_gen', query_plan, False ))
            ' {} '.format(sort_field),
            'sql_gen',
            query_plan,
            False))

    # Scan
    scan = map(
        lambda p: query_plan.add_operator(
            #SQLTableScan("{}/lineitem.snappy.parquet.{}".format(path, p),
            SQLTableScan(
                "{}/lineitem.typed.1RowGroup.parquet.{}".format(path, p), "",
                format_, use_pandas, secure, use_native, 'scan_{}'.format(
                    p), query_plan, False)),
        range(table_first_part, table_first_part + table_parts))

    # Project
    def project_fn2(df):
        df.columns = queried_columns
        df[[sort_field]] = df[[sort_field]].astype(np.float)
        return df

    project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)]

    project = map(
        lambda p: query_plan.add_operator(
            Project(project_exprs, 'project_{}'.format(p), query_plan, False,
                    project_fn2)),
        range(table_first_part, table_first_part + table_parts))

    # TopK
    topk = map(
        lambda p: query_plan.add_operator(
            Top(k, sort_expr, use_pandas, 'topk_{}'.format(p),
                query_plan, False)),
        range(table_first_part, table_first_part + table_parts))

    # TopK reduce
    topk_reduce = query_plan.add_operator(
        Top(k, sort_expr, use_pandas, 'topk_reduce', query_plan, False))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    #profile_path = '../benchmark-output/groupby/'
    #scan[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_scan_0" + ".prof"))
    #project[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_project_0" + ".prof"))
    #groupby[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_groupby_0" + ".prof"))
    #groupby_reduce.set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_groupby_reduce" + ".prof"))
    #collate.set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_collate" + ".prof"))

    map(lambda (p, o): o.connect(sample_project[p]), enumerate(sample_scan))
    map(lambda (p, o): o.connect(sample_topk), enumerate(sample_project))
    sample_topk.connect(sql_gen)

    map(lambda (p, o): sql_gen.connect(o), enumerate(scan))
    map(lambda (p, o): o.connect(project[p]), enumerate(scan))
    map(lambda (p, o): o.connect(topk[p]), enumerate(project))
    map(lambda (p, o): o.connect(topk_reduce), enumerate(topk))
    topk_reduce.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("table parts: {}".format(table_parts))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"),
                           gen_test_id() + "-" + str(table_parts))

    # Start the query
    query_plan.execute()
    print('Done')
    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()
Пример #30
0
def run(parallel, buffer_size, parts):
    """

    :return: None
    """

    print('')
    print("TPCH Q14 Partitioned Bloom Join")
    print("-------------------------------")

    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Query plan
    # DATE is the first day of a month randomly selected from a random year within [1993 .. 1997].
    date = '1993-01-01'
    min_shipped_date = datetime.strptime(date, '%Y-%m-%d')
    max_shipped_date = datetime.strptime(date, '%Y-%m-%d') + timedelta(days=30)

    collate = query_plan.add_operator(tpch_q14.collate_operator_def('collate', query_plan))
    sum_aggregate = query_plan.add_operator(
        Aggregate(
            [
                AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_0'])),
                AggregateExpression(AggregateExpression.SUM, lambda t: float(t['_1']))
            ],
            'sum_aggregate',
            query_plan,
            False))
    aggregate_project = query_plan.add_operator(
        tpch_q14.project_promo_revenue_operator_def('aggregate_project', query_plan))

    hash_join_build_ops = []
    hash_join_probe_ops = []
    part_bloom_create_ops = []
    line_item_scan_ops = []
    for p in range(0, parts):
        part_scan = query_plan.add_operator(
            tpch_q14.sql_scan_part_partkey_type_part_where_brand12_sharded_operator_def(p, parts,
                                                                                        'part_scan' + '_' + str(p),
                                                                                        query_plan))
        part_scan_project = query_plan.add_operator(
            tpch_q14.project_partkey_type_operator_def('part_scan_project' + '_' + str(p), query_plan))

        part_bloom_create = query_plan.add_operator(
            tpch_q14.bloom_create_p_partkey_operator_def('part_bloom_create' + '_' + str(p), query_plan))
        part_bloom_create_ops.append(part_bloom_create)

        lineitem_scan = query_plan.add_operator(
            tpch_q14.bloom_scan_lineitem_where_shipdate_sharded_operator_def(min_shipped_date, max_shipped_date, p,
                                                                             parts, 'lineitem_scan' + '_' + str(p),
                                                                             query_plan))
        line_item_scan_ops.append(lineitem_scan)

        lineitem_scan_project = query_plan.add_operator(
            tpch_q14.project_partkey_extendedprice_discount_operator_def('lineitem_scan_project' + '_' + str(p),
                                                                         query_plan))

        join_build = query_plan.add_operator(
            HashJoinBuild('p_partkey', 'join_build' + '_' + str(p), query_plan, False))
        hash_join_build_ops.append(join_build)

        join_probe = query_plan.add_operator(
            HashJoinProbe(JoinExpression('p_partkey', 'l_partkey'), 'join_probe' + '_' + str(p),
                          query_plan, False))
        hash_join_probe_ops.append(join_probe)

        aggregate = query_plan.add_operator(
            tpch_q14.aggregate_promo_revenue_operator_def('aggregate' + '_' + str(p), query_plan))

        part_scan.connect(part_scan_project)
        lineitem_scan.connect(lineitem_scan_project)

        part_scan_project.connect(part_bloom_create)
        # part_bloom_create.connect(lineitem_scan)

        part_scan_project.connect(join_build)

        join_probe.connect(aggregate)
        # aggregate.connect(aggregate_project)
        aggregate.connect(sum_aggregate)

        join_probe.connect_tuple_producer(lineitem_scan_project)
        # join_probe.connect(merge)

    for bloom_create_op in part_bloom_create_ops:
        for scan_op in line_item_scan_ops:
            bloom_create_op.connect(scan_op)

    for probe_op in hash_join_probe_ops:
        for build_op in hash_join_build_ops:
            probe_op.connect_build_producer(build_op)

    sum_aggregate.connect(aggregate_project)
    aggregate_project.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print("parts: {}".format(parts))
    print('')

    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(parts))

    # Start the query
    query_plan.execute()

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    field_names = ['promo_revenue']

    assert len(tuples) == 1 + 1

    assert tuples[0] == field_names

    # NOTE: This result has been verified with the equivalent data and query on PostgreSQL
    # assert tuples[1] == [15.090116526324298]
    assert round(float(tuples[1][0]), 10) == 15.0901165263