Example #1
def run_baseline_topk(stats, sort_field_index, sort_field, k, parallel,
                      use_pandas, sort_order, buffer_size, table_parts_start,
                      table_parts_end, tbl_s3key, format_, shards_path):

    secure = False
    use_native = False
    print('')
    print("Top K Benchmark, Baseline. Sort Field: {}, Order: {}, k: {}".format(
        sort_field, sort_order, k))
    print("----------------------")

    stats += ['baseline', shards_path, sort_field, sort_order, k, 0, 0]

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Scan
    scan = map(
        lambda p: query_plan.add_operator(
            SQLTableScan("{}.{}".format(shards_path, p
                                        ), "select * from S3Object;", format_,
                         use_pandas, secure, use_native, 'scan_{}'.format(
                             p), query_plan, False)),
        range(table_parts_start, table_parts_end + 1))

    # Project
    def project_fn(df):
        df.columns = [
            sort_field if x == sort_field_index else x for x in df.columns
        ]
        df[[sort_field]] = df[[sort_field]].astype(float)
        return df

    project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)]

    project = map(
        lambda p: query_plan.add_operator(
            Project(project_exprs, 'project_{}'.format(p), query_plan,
                    False, project_fn)),
        range(table_parts_start, table_parts_end + 1))

    # TopK
    sort_expr = [SortExpression(sort_field, float, sort_order)]
    topk = map(
        lambda p: query_plan.add_operator(
            Top(k, sort_expr, use_pandas, 'topk_{}'.format(p), query_plan,
                False)), range(table_parts_start, table_parts_end + 1))

    # TopK reduce
    topk_reduce = query_plan.add_operator(
        Top(k, sort_expr, use_pandas, 'topk_reduce', query_plan, False))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    map(lambda (p, o): o.connect(project[p]), enumerate(scan))
    map(lambda (p, o): o.connect(topk[p]), enumerate(project))
    map(lambda (p, o): o.connect(topk_reduce), enumerate(topk))
    topk_reduce.connect(collate)

    # Start the query
    query_plan.execute()
    print('Done')

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    query_time = query_plan.total_elapsed_time
    cost, bytes_scanned, bytes_returned, rows = query_plan.cost()
    computation_cost = query_plan.computation_cost()
    data_cost = query_plan.data_cost()[0]

    stats += [
        0, 0, 0, query_time, rows, bytes_scanned, bytes_returned, data_cost,
        computation_cost, cost
    ]
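
The per-partition Top operators followed by the single topk_reduce implement a two-level heap merge: each partition keeps only its k best rows, and the reduce stage keeps the k best of those winners. A minimal standalone sketch of the same idea, assuming nothing beyond the standard library (the partition data and sort key below are made up):

import heapq

def local_topk(rows, k, key):
    # each partition retains only its k largest rows
    return heapq.nlargest(k, rows, key=key)

partitions = [[(1, 5.0), (2, 9.5)], [(3, 7.2), (4, 0.3)]]  # (row_id, sort_field)
k = 2
local = [local_topk(part, k, key=lambda r: r[1]) for part in partitions]
# the reduce step merges the per-partition winners and keeps k overall
merged = [r for winners in local for r in winners]
print(heapq.nlargest(k, merged, key=lambda r: r[1]))  # [(2, 9.5), (3, 7.2)]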
Example #2
def run(sort_field, k, parallel, use_pandas, sort_order, buffer_size,
        table_first_part, table_parts, queried_columns, select_columns, path,
        format_):
    """
    Executes the top-k query with a column scan: first scans only the sort field to derive a filtering threshold, then rescans the table to retrieve only the records beyond it
    :return:
    """

    secure = False
    use_native = False
    print('')
    print("Top K Benchmark, ColumnScan. Sort Field: {}, Order: {}".format(
        sort_field, sort_order))
    print("----------------------")

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Sampling
    sample_scan = map(
        lambda p: query_plan.add_operator(
            #SQLTableScan("{}/lineitem.snappy.parquet.{}".format(path, p),
            SQLTableScan(
                "{}/lineitem.typed.1RowGroup.parquet.{}".format(
                    path, p), 'select {} from S3Object;'.format(
                        sort_field), format_, use_pandas, secure, use_native,
                'column_scan_{}'.format(p), query_plan, False)),
        range(table_first_part, table_first_part + table_parts))

    # Sampling project
    def project_fn1(df):
        df.columns = [sort_field]
        df[[sort_field]] = df[[sort_field]].astype(float)
        return df

    project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)]

    sample_project = map(
        lambda p: query_plan.add_operator(
            Project(project_exprs, 'sample_project_{}'.format(p), query_plan,
                    False, project_fn1)),
        range(table_first_part, table_first_part + table_parts))

    # TopK samples
    sort_expr = SortExpression(sort_field, float, sort_order)

    sample_topk = query_plan.add_operator(
        Top(k, sort_expr, use_pandas, 'sample_topk', query_plan, False))

    # Generate SQL command for second scan
    sql_gen = query_plan.add_operator(
        TopKFilterBuild(
            sort_order,
            'float',
            'select {} from S3object '.format(select_columns),
            #' CAST({} as float) '.format(sort_field), 'sql_gen', query_plan, False ))
            ' {} '.format(sort_field),
            'sql_gen',
            query_plan,
            False))

    # Scan
    scan = map(
        lambda p: query_plan.add_operator(
            #SQLTableScan("{}/lineitem.snappy.parquet.{}".format(path, p),
            SQLTableScan(
                "{}/lineitem.typed.1RowGroup.parquet.{}".format(path, p), "",
                format_, use_pandas, secure, use_native, 'scan_{}'.format(
                    p), query_plan, False)),
        range(table_first_part, table_first_part + table_parts))

    # Project
    def project_fn2(df):
        df.columns = queried_columns
        df[[sort_field]] = df[[sort_field]].astype(float)
        return df

    project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)]

    project = map(
        lambda p: query_plan.add_operator(
            Project(project_exprs, 'project_{}'.format(p), query_plan, False,
                    project_fn2)),
        range(table_first_part, table_first_part + table_parts))

    # TopK
    topk = map(
        lambda p: query_plan.add_operator(
            Top(k, sort_expr, use_pandas, 'topk_{}'.format(p),
                query_plan, False)),
        range(table_first_part, table_first_part + table_parts))

    # TopK reduce
    topk_reduce = query_plan.add_operator(
        Top(k, sort_expr, use_pandas, 'topk_reduce', query_plan, False))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    #profile_path = '../benchmark-output/groupby/'
    #scan[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_scan_0" + ".prof"))
    #project[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_project_0" + ".prof"))
    #groupby[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_groupby_0" + ".prof"))
    #groupby_reduce.set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_groupby_reduce" + ".prof"))
    #collate.set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_collate" + ".prof"))

    map(lambda (p, o): o.connect(sample_project[p]), enumerate(sample_scan))
    map(lambda (p, o): o.connect(sample_topk), enumerate(sample_project))
    sample_topk.connect(sql_gen)

    map(lambda (p, o): sql_gen.connect(o), enumerate(scan))
    map(lambda (p, o): o.connect(project[p]), enumerate(scan))
    map(lambda (p, o): o.connect(topk[p]), enumerate(project))
    map(lambda (p, o): o.connect(topk_reduce), enumerate(topk))
    topk_reduce.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("table parts: {}".format(table_parts))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"),
                           gen_test_id() + "-" + str(table_parts))

    # Start the query
    query_plan.execute()
    print('Done')
    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()
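
TopKFilterBuild turns the sampled top-k into a predicate for the second scan: for a DESC ordering, the k-th best sampled value can only underestimate the table's true k-th value, so keeping rows at or above it is guaranteed to retain the real top-k. A hedged sketch of that threshold logic (the helper below is illustrative, not the operator's actual implementation):

import heapq

def build_filter_sql(sample_values, k, sort_field, sort_order, base_sql):
    if sort_order == 'DESC':
        threshold = heapq.nlargest(k, sample_values)[-1]   # k-th largest sample
        op = '>='
    else:
        threshold = heapq.nsmallest(k, sample_values)[-1]  # k-th smallest sample
        op = '<='
    return threshold, "{} where cast({} as float) {} {};".format(
        base_sql, sort_field, op, threshold)

threshold, sql = build_filter_sql([3.2, 9.1, 0.4, 7.7], 2, 'l_extendedprice',
                                  'DESC', 'select * from S3Object')
print(sql)  # select * from S3Object where cast(l_extendedprice as float) >= 7.7;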
Example #3
def run_local_indexed_sampling(stats,
                               sort_field_index,
                               sort_field,
                               k,
                               sample_size,
                               batch_size,
                               parallel,
                               use_pandas,
                               sort_order,
                               buffer_size,
                               table_parts_start,
                               table_parts_end,
                               tbl_s3key,
                               shards_path,
                               format_,
                               sampling_only=True):
    """
    Executes the randomly sampled top-k query: first builds a random sample, then extracts the filtering threshold,
    and finally scans the table to retrieve only the records beyond the threshold
    :return:
    """

    secure = False
    use_native = False
    n_threads = multiprocessing.cpu_count()

    print('')
    print("Top K Benchmark, Sampling. Sort Field: {}, Order: {}".format(
        sort_field, sort_order))
    print("----------------------")

    stats += [
        'sampling_{}_{}'.format('indexed', 'non-filtered'), shards_path,
        sort_field, sort_order, k, sample_size, batch_size
    ]

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Sampling
    tbl_smpler = query_plan.add_operator(
        TableRandomSampleGenerator(tbl_s3key, sample_size, batch_size,
                                   "table_sampler", query_plan, False))
    sample_scan = map(
        lambda p: query_plan.add_operator(
            TableRangeAccess(tbl_s3key, use_pandas, secure, use_native,
                             "sample_scan_{}".format(p), query_plan, False)),
        range(table_parts_start, table_parts_end + 1))
    map(lambda op: op.set_nthreads(n_threads), sample_scan)

    # sample_scan = query_plan.add_operator(
    #     TableRangeAccess(tbl_s3key, use_pandas, secure, use_native, "sample_scan_{}".format(p),
    #                      query_plan, False))
    # sample_scan.set_nthreads(n_threads)

    # Sampling project
    def project_fn(df):
        df.columns = [
            sort_field if x == sort_field_index else x for x in df.columns
        ]
        df = df[[sort_field]].astype(float, errors='ignore')
        return df

    project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)]

    sample_project = map(
        lambda p: query_plan.add_operator(
            Project(project_exprs, 'sample_project_{}'.format(p), query_plan,
                    False, project_fn)),
        range(table_parts_start, table_parts_end + 1))
    # sample_project = query_plan.add_operator(
    #                             Project(project_exprs, 'sample_project_{}'.format(p), query_plan, False, project_fn))

    # TopK samples
    sort_expr = [SortExpression(sort_field, float, sort_order)]
    sample_topk = map(
        lambda p: query_plan.add_operator(
            Top(k, sort_expr, use_pandas, 'sample_topk_{}'.format(p),
                query_plan, False)),
        range(table_parts_start, table_parts_end + 1))
    # sample_topk = query_plan.add_operator(
    #                 Top(k, sort_expr, use_pandas, 'sample_topk_{}'.format(p), query_plan, False))

    sample_topk_reduce = query_plan.add_operator(
        Top(k, sort_expr, use_pandas, 'sample_topk_reduce', query_plan, False))

    # Generate SQL command for second scan
    sql_gen = query_plan.add_operator(
        TopKFilterBuild(sort_order, 'float', 'select * from S3object ',
                        ' CAST({} as float) '.format(sort_field), 'sql_gen',
                        query_plan, False))
    if not sampling_only:
        # Scan
        scan = map(
            lambda p: query_plan.add_operator(
                SQLTableScan("{}.{}".format(shards_path, p), "", format_,
                             use_pandas, secure, use_native, 'scan_{}'.format(
                                 p), query_plan, False)),
            range(table_parts_start, table_parts_end + 1))

        # Project
        def project_fn(df):
            df.columns = [
                sort_field if x == sort_field_index else x for x in df.columns
            ]
            df[[sort_field]] = df[[sort_field]].astype(float)
            return df

        project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)]

        project = map(
            lambda p: query_plan.add_operator(
                Project(project_exprs, 'project_{}'.format(p), query_plan,
                        False, project_fn)),
            range(table_parts_start, table_parts_end + 1))

        # TopK
        topk = map(
            lambda p: query_plan.add_operator(
                Top(k, sort_expr, use_pandas, 'topk_{}'.format(p), query_plan,
                    False)), range(table_parts_start, table_parts_end + 1))

        # TopK reduce
        topk_reduce = query_plan.add_operator(
            Top(k, sort_expr, use_pandas, 'topk_reduce', query_plan, False))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    map(lambda o: tbl_smpler.connect(o), sample_scan)
    map(lambda (p, o): o.connect(sample_project[p]), enumerate(sample_scan))
    map(lambda (p, o): o.connect(sample_topk[p]), enumerate(sample_project))
    map(lambda o: o.connect(sample_topk_reduce), sample_topk)
    sample_topk_reduce.connect(sql_gen)

    if not sampling_only:
        map(lambda (p, o): sql_gen.connect(o), enumerate(scan))
        map(lambda (p, o): o.connect(project[p]), enumerate(scan))
        map(lambda (p, o): o.connect(topk[p]), enumerate(project))
        map(lambda (p, o): o.connect(topk_reduce), enumerate(topk))
        topk_reduce.connect(collate)
    else:
        sql_gen.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("table parts: {}".format(table_parts_end - table_parts_start))
    print('')

    # Write the plan graph
    # query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(table_parts))

    # Start the query
    query_plan.execute()
    print('Done')
    # tuples = collate.tuples()

    # collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    sampling_time = query_plan.total_elapsed_time
    cost, bytes_scanned, bytes_returned, rows = query_plan.cost()
    computation_cost = query_plan.computation_cost()
    data_cost = query_plan.data_cost()[0]

    stats += [
        sql_gen.threshold, sampling_time, 0, sampling_time, rows,
        bytes_scanned, bytes_returned, data_cost, computation_cost, cost
    ]
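
End to end, the plan above is the classic two-phase top-k: sample, derive a threshold, then rescan only the records past it. A self-contained sketch of the pattern, with plain Python lists standing in for the S3 scans:

import heapq
import random

def two_phase_topk(table, k, sample_size):
    sample = random.sample(table, min(sample_size, len(table)))
    # phase 1: the k-th largest sampled value is a safe lower bound
    threshold = heapq.nlargest(min(k, len(sample)), sample)[-1]
    # phase 2: only values at or above the threshold can be in the top-k
    candidates = [v for v in table if v >= threshold]
    return heapq.nlargest(k, candidates)

random.seed(0)
table = [random.uniform(0, 100) for _ in range(10000)]
print(two_phase_topk(table, k=5, sample_size=200))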
Example #4
def run_head_table_sampling(stats,
                            sort_field_index,
                            sort_field,
                            k,
                            sample_size,
                            parallel,
                            use_pandas,
                            sort_order,
                            buffer_size,
                            table_parts_start,
                            table_parts_end,
                            tbl_s3key,
                            shards_path,
                            format_,
                            sampling_only=True):

    secure = False
    use_native = False
    print('')
    print(
        "Top K Benchmark, Head Table Sampling. Sort Field: {}, Order: {}, k: {}, Sample Size:{}"
        .format(sort_field, sort_order, k, sample_size))
    print("----------------------")

    stats += [
        'sampling_{}_{}'.format('head_table', 'non-filtered'), shards_path,
        sort_field, sort_order, k, sample_size, 1
    ]

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Sampling
    table_parts = table_parts_end - table_parts_start + 1
    per_part_samples = int(sample_size / table_parts)
    table_name = os.path.basename(tbl_s3key)
    sample_scan = map(
        lambda p: query_plan.add_operator(
            SQLTableScan(
                "{}.{}".format(shards_path, p),
                'select {} from S3Object limit {};'.format(
                    sort_field, per_part_samples), format_, use_pandas, secure,
                use_native, 'sample_scan_{}'.format(p), query_plan, False)),
        range(table_parts_start, table_parts_end + 1))

    # Sampling project
    def project_fn(df):
        df.columns = [sort_field]
        df[[sort_field]] = df[[sort_field]].astype(float)
        return df

    project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)]

    sample_project = map(
        lambda p: query_plan.add_operator(
            Project(project_exprs, 'sample_project_{}'.format(p), query_plan,
                    False, project_fn)),
        range(table_parts_start, table_parts_end + 1))

    # TopK samples
    sort_expr = [SortExpression(sort_field, float, sort_order)]
    sample_topk = map(
        lambda p: query_plan.add_operator(
            Top(k, sort_expr, use_pandas, 'sample_topk_{}'.format(p),
                query_plan, False)),
        range(table_parts_start, table_parts_end + 1))

    sample_topk_reduce = query_plan.add_operator(
        Top(k, sort_expr, use_pandas, 'sample_topk_reduce', query_plan, False))

    # Generate SQL command for second scan
    sql_gen = query_plan.add_operator(
        TopKFilterBuild(sort_order, 'float', 'select * from S3object ',
                        ' CAST({} as float) '.format(sort_field), 'sql_gen',
                        query_plan, False))

    if not sampling_only:
        # Scan
        scan = map(
            lambda p: query_plan.add_operator(
                SQLTableScan("{}.{}".format(shards_path, p), "", format_,
                             use_pandas, secure, use_native, 'scan_{}'.format(
                                 p), query_plan, False)),
            range(table_parts_start, table_parts_end + 1))

        # Project
        def project_fn(df):
            df.columns = [
                sort_field if x == sort_field_index else x for x in df.columns
            ]
            df[[sort_field]] = df[[sort_field]].astype(float)
            return df

        project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)]

        project = map(
            lambda p: query_plan.add_operator(
                Project(project_exprs, 'project_{}'.format(p), query_plan,
                        False, project_fn)),
            range(table_parts_start, table_parts_end + 1))

        # TopK
        topk = map(
            lambda p: query_plan.add_operator(
                Top(k, sort_expr, use_pandas, 'topk_{}'.format(p), query_plan,
                    False)), range(table_parts_start, table_parts_end + 1))

        # TopK reduce
        topk_reduce = query_plan.add_operator(
            Top(k, sort_expr, use_pandas, 'topk_reduce', query_plan, False))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    map(lambda (p, o): o.connect(sample_project[p]), enumerate(sample_scan))
    map(lambda (p, o): o.connect(sample_topk[p]), enumerate(sample_project))
    map(lambda op: op.connect(sample_topk_reduce), sample_topk)
    sample_topk_reduce.connect(sql_gen)

    if not sampling_only:
        map(lambda (p, o): sql_gen.connect(o), enumerate(scan))
        map(lambda (p, o): o.connect(project[p]), enumerate(scan))
        map(lambda (p, o): o.connect(topk[p]), enumerate(project))
        map(lambda (p, o): o.connect(topk_reduce), enumerate(topk))
        topk_reduce.connect(collate)
    else:
        sql_gen.connect(collate)

    # Start the query
    query_plan.execute()
    print('Done')

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    sampling_time = query_plan.total_elapsed_time
    cost, bytes_scanned, bytes_returned, rows = query_plan.cost()
    computation_cost = query_plan.computation_cost()
    data_cost = query_plan.data_cost()[0]

    stats += [
        sql_gen.threshold, sampling_time, 0, sampling_time, rows,
        bytes_scanned, bytes_returned, data_cost, computation_cost, cost
    ]
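
Head-table sampling takes the first rows of each shard, so the sampling phase is just a LIMIT query per part. A small sketch of how the per-shard statements are derived, following the "{shards_path}.{part}" naming used above (the shard prefix in the example call is illustrative):

def head_sample_queries(shards_path, parts_start, parts_end, sort_field, sample_size):
    table_parts = parts_end - parts_start + 1
    per_part_samples = sample_size // table_parts  # split the budget evenly
    return [("{}.{}".format(shards_path, p),
             "select {} from S3Object limit {};".format(sort_field, per_part_samples))
            for p in range(parts_start, parts_end + 1)]

for key, sql in head_sample_queries('tpch-sf1/lineitem_sharded', 0, 3,
                                    'l_extendedprice', 1000):
    print("{} -> {}".format(key, sql))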
Example #5
def run_memory_indexed_sampling(stats,
                                sort_field_index,
                                sort_field,
                                k,
                                sample_size,
                                batch_size,
                                parallel,
                                use_pandas,
                                sort_order,
                                buffer_size,
                                table_parts_start,
                                table_parts_end,
                                tbl_s3key,
                                shards_path,
                                format_,
                                sampling_only=True):
    """
    Executes the randomly sampled top-k query: first builds a random sample, then extracts the filtering threshold,
    and finally scans the table to retrieve only the records beyond the threshold
    :return:
    """

    secure = False
    use_native = False
    n_threads = multiprocessing.cpu_count()

    print('')
    print(
        "Top K Benchmark, Memory Indexed Sampling. Sort Field: {}, Order: {}, K: {}, Sample Size: {}, Batch Size: {}"
        .format(sort_field, sort_order, k, sample_size, batch_size))
    print("----------------------")

    stats += [
        'sampling_{}_{}'.format('memory_indexed', 'non-filtered'), shards_path,
        sort_field, sort_order, k, sample_size, batch_size
    ]

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    # Sampling
    tbl_smpler = query_plan.add_operator(
        TableRandomSampleGenerator(tbl_s3key, sample_size, batch_size,
                                   "table_sampler", query_plan, False))

    sample_scanners = map(
        lambda p: query_plan.add_operator(
            TableRangeAccess(tbl_s3key, use_pandas, secure, use_native,
                             "sample_scan_{}".format(p), query_plan, False)),
        range(table_parts_start, table_parts_end + 1))
    map(lambda op: op.set_nthreads(n_threads), sample_scanners)

    # sample_scan = query_plan.add_operator(
    #     TableRangeAccess(tbl_s3key, use_pandas, secure, use_native, "sample_scan_{}".format(p),
    #                      query_plan, False))
    # sample_scan.set_nthreads(n_threads)

    # Sampling project
    def project_fn(df):
        df.columns = [
            sort_field if x == sort_field_index else x for x in df.columns
        ]
        df = df[[sort_field]].astype(float, errors='ignore')
        return df

    project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)]

    sample_project = map(
        lambda p: query_plan.add_operator(
            Project(project_exprs, 'sample_project_{}'.format(p), query_plan,
                    False, project_fn)),
        range(table_parts_start, table_parts_end + 1))
    # sample_project = query_plan.add_operator(
    #                             Project(project_exprs, 'sample_project_{}'.format(p), query_plan, False, project_fn))

    # TopK samples
    sort_expr = [SortExpression(sort_field, float, sort_order)]
    sample_topk = map(
        lambda p: query_plan.add_operator(
            Top(k, sort_expr, use_pandas, 'sample_topk_{}'.format(p),
                query_plan, False)),
        range(table_parts_start, table_parts_end + 1))
    # sample_topk = query_plan.add_operator(
    #                 Top(k, sort_expr, use_pandas, 'sample_topk_{}'.format(p), query_plan, False))

    sample_topk_reduce = query_plan.add_operator(
        Top(k, sort_expr, use_pandas, 'sample_topk_reduce', query_plan, False))

    # Generate SQL command for second scan
    sql_gen = query_plan.add_operator(
        TopKFilterBuild(sort_order, 'float', 'select * from S3object ',
                        ' CAST({} as float) '.format(sort_field),
                        'sample_sql_gen', query_plan, False))

    if not sampling_only:
        # Scan
        scan = map(
            lambda p: query_plan.add_operator(
                SQLTableScan("{}.{}".format(shards_path, p), "", format_,
                             use_pandas, secure, use_native, 'scan_{}'.format(
                                 p), query_plan, False)),
            range(table_parts_start, table_parts_end + 1))

        # Project
        def project_fn(df):
            df.columns = [
                sort_field if x == sort_field_index else x for x in df.columns
            ]
            df[[sort_field]] = df[[sort_field]].astype(float)
            return df

        project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)]

        project = map(
            lambda p: query_plan.add_operator(
                Project(project_exprs, 'project_{}'.format(p), query_plan,
                        False, project_fn)),
            range(table_parts_start, table_parts_end + 1))

        # TopK
        topk = map(
            lambda p: query_plan.add_operator(
                Top(k, sort_expr, use_pandas, 'topk_{}'.format(p), query_plan,
                    False)), range(table_parts_start, table_parts_end + 1))

        # TopK reduce
        topk_reduce = query_plan.add_operator(
            Top(k, sort_expr, use_pandas, 'topk_reduce', query_plan, False))

    collate = query_plan.add_operator(Collate('collate', query_plan, False))

    map(lambda op: tbl_smpler.connect(op), sample_scanners)
    map(lambda (p, o): o.connect(sample_project[p]),
        enumerate(sample_scanners))
    map(lambda (p, o): o.connect(sample_topk[p]), enumerate(sample_project))
    map(lambda o: o.connect(sample_topk_reduce), sample_topk)
    sample_topk_reduce.connect(sql_gen)

    if not sampling_only:
        map(lambda (p, o): sql_gen.connect(o), enumerate(scan))
        map(lambda (p, o): o.connect(project[p]), enumerate(scan))
        map(lambda (p, o): o.connect(topk[p]), enumerate(project))
        map(lambda (p, o): o.connect(topk_reduce), enumerate(topk))
        topk_reduce.connect(collate)
    else:
        sql_gen.connect(collate)

    # Start the query
    query_plan.execute()
    print('Done')
    # tuples = collate.tuples()

    # collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    # query_plan.stop()

    sampling_threshold = query_plan.retrieve_sampling_threshold()
    sampling_runtime = query_plan.get_phase_runtime('sampl')
    sampling_num_http_requests, sampling_requests_cost = query_plan.requests_cost(
        'sampl')
    sampling_returned_bytes, sampling_returned_rows, sampling_transfer_cost = query_plan.data_transfer_cost(
        phase_keyword='sampl')
    sampling_scanned_bytes, sampling_scan_cost = query_plan.data_scanning_cost(
        'sampl')

    total_runtime = query_plan.total_elapsed_time
    total_http_requests, total_requests_cost = query_plan.requests_cost()
    total_returned_bytes, total_returned_rows, total_transfer_cost = query_plan.data_transfer_cost(
    )
    total_scanned_bytes, total_scan_cost = query_plan.data_scanning_cost()

    total_data_cost = query_plan.data_cost()[0]
    total_computation_cost = query_plan.computation_cost()
    total_cost = query_plan.cost()[0]

    stats += [
        sampling_threshold, sampling_runtime, total_runtime - sampling_runtime,
        total_runtime, sampling_returned_rows,
        sampling_scanned_bytes * BYTE_TO_MB,
        sampling_returned_bytes * BYTE_TO_MB, sampling_num_http_requests,
        sampling_requests_cost, sampling_transfer_cost, sampling_scan_cost,
        total_returned_rows, total_scanned_bytes * BYTE_TO_MB,
        total_returned_bytes * BYTE_TO_MB, total_http_requests,
        total_requests_cost, total_transfer_cost, total_scan_cost,
        total_data_cost, total_computation_cost, total_cost
    ]
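
The stats row splits every metric into a sampling-phase share and a total, attributing the remainder of the runtime to the main query. A hedged sketch of that bookkeeping; BYTE_TO_MB is assumed here to be 1.0 / 2 ** 20, which matches its use as a multiplier above:

BYTE_TO_MB = 1.0 / 2 ** 20  # assumed definition; only the name appears in the source

def split_phase_stats(total_runtime, sampling_runtime,
                      total_scanned_bytes, sampling_scanned_bytes):
    return {
        'sampling_runtime': sampling_runtime,
        'query_runtime': total_runtime - sampling_runtime,  # remainder of the run
        'total_runtime': total_runtime,
        'sampling_scanned_mb': sampling_scanned_bytes * BYTE_TO_MB,
        'total_scanned_mb': total_scanned_bytes * BYTE_TO_MB,
    }

print(split_phase_stats(12.5, 3.1, 750 * 2 ** 20, 40 * 2 ** 20))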
Example #6
def run(parallel, use_pandas, secure, use_native, buffer_size, format_, customer_parts, order_parts, lineitem_parts,
        customer_sharded,
        order_sharded, lineitem_sharded, other_parts, fp_rate, sf, expected_result, customer_filter_sql=None,
        order_filter_sql=None, lineitem_filter_sql=None):
    """

    :return: None
    """

    print('')
    print("TPCH Q3 Bloom Join")
    print("------------------")

    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)

    customer_scan = map(lambda p:
                        query_plan.add_operator(
                            SQLTableScan(get_file_key('customer', customer_sharded, p, sf, format_),
                                         "select "
                                         "  c_custkey "
                                         "from "
                                         "  S3Object "
                                         "where "
                                         "  c_mktsegment = 'BUILDING' "
                                         "  {} "
                                         "  {} "
                                         .format(
                                             ' and ' + customer_filter_sql if customer_filter_sql is not None else '',
                                             get_sql_suffix('customer', customer_parts, p, customer_sharded,
                                                            add_where=False)),
                                         format_,
                                         use_pandas, secure, use_native,
                                         'customer_scan' + '_{}'.format(p),
                                         query_plan,
                                         False)),
                        range(0, customer_parts))

    def customer_project_fn(df):
        df = df.filter(items=['_0'], axis=1)

        df.rename(columns={'_0': 'c_custkey'},
                  inplace=True)

        return df

    customer_project = map(lambda p:
                           query_plan.add_operator(
                               Project([],
                                       'customer_project' + '_{}'.format(p),
                                       query_plan,
                                       False, customer_project_fn)),
                           range(0, customer_parts))

    customer_bloom_create = query_plan.add_operator(
        BloomCreate('c_custkey', 'customer_bloom_create', query_plan, False,
                    fp_rate))

    customer_map = map(lambda p:
                       query_plan.add_operator(Map('c_custkey', 'customer_map' + '_' + str(p), query_plan, False)),
                       range(0, customer_parts))

    order_scan = map(lambda p:
                     query_plan.add_operator(
                         SQLTableScanBloomUse(get_file_key('orders', order_sharded, p, sf, format_),
                                              "select "
                                              "  o_custkey, o_orderkey, o_orderdate, o_shippriority "
                                              "from "
                                              "  S3Object "
                                              "where "
                                              "  cast(o_orderdate as timestamp) < cast('1995-03-01' as timestamp) "
                                              "  {} "
                                              "  {} "
                                              .format(
                                                  ' and ' + order_filter_sql if order_filter_sql is not None else '',
                                                  get_sql_suffix('orders', order_parts, p, order_sharded,
                                                                 add_where=False)),
                                              'o_custkey',
                                              format_,
                                              use_pandas, secure, use_native,
                                              'order_scan' + '_{}'.format(p),
                                              query_plan,
                                              False)),
                     range(0, order_parts))

    def order_project_fn(df):
        df = df.filter(items=['_0', '_1', '_2', '_3'], axis=1)

        df.rename(columns={'_0': 'o_custkey', '_1': 'o_orderkey', '_2': 'o_orderdate', '_3': 'o_shippriority'},
                  inplace=True)

        return df

    order_project = map(lambda p:
                        query_plan.add_operator(
                            Project([],
                                    'order_project' + '_{}'.format(p),
                                    query_plan,
                                    False, order_project_fn)),
                        range(0, order_parts))

    order_map_1 = map(lambda p:
                      query_plan.add_operator(Map('o_custkey', 'order_map_1' + '_' + str(p), query_plan, False)),
                      range(0, order_parts))

    customer_order_join_build = map(lambda p:
                                    query_plan.add_operator(
                                        HashJoinBuild('c_custkey',
                                                      'customer_order_join_build' + '_' + str(p), query_plan,
                                                      False)),
                                    range(0, other_parts))

    customer_order_join_probe = map(lambda p:
                                    query_plan.add_operator(
                                        HashJoinProbe(JoinExpression('c_custkey', 'o_custkey'),
                                                      'customer_order_join_probe' + '_' + str(p),
                                                      query_plan, False)),
                                    range(0, other_parts))

    order_bloom_create = query_plan.add_operator(
        BloomCreate('o_orderkey', 'order_bloom_create', query_plan, False,
                    fp_rate))

    lineitem_scan = map(lambda p:
                        query_plan.add_operator(
                            SQLTableScanBloomUse(get_file_key('lineitem', lineitem_sharded, p, sf, format_),
                                                 "select "
                                                 "  l_orderkey, l_extendedprice, l_discount "
                                                 "from "
                                                 "  S3Object "
                                                 "where "
                                                 "  cast(l_shipdate as timestamp) > cast('1995-03-01' as timestamp) "
                                                 "  {} "
                                                 "  {} "
                                                 .format(
                                                     ' and ' + lineitem_filter_sql if lineitem_filter_sql is not None else '',
                                                     get_sql_suffix('lineitem', lineitem_parts, p, lineitem_sharded,
                                                                    add_where=False)),
                                                 'l_orderkey', format_,
                                                 use_pandas, secure, use_native,
                                                 'lineitem_scan' + '_{}'.format(p),
                                                 query_plan,
                                                 False)),
                        range(0, lineitem_parts))

    def lineitem_project_fn(df):
        df = df.filter(items=['_0', '_1', '_2'], axis=1)

        df.rename(columns={'_0': 'l_orderkey', '_1': 'l_extendedprice', '_2': 'l_discount'},
                  inplace=True)

        return df

    lineitem_project = map(lambda p:
                           query_plan.add_operator(
                               Project([],
                                       'lineitem_project' + '_{}'.format(p),
                                       query_plan,
                                       False, lineitem_project_fn)),
                           range(0, lineitem_parts))

    lineitem_map = map(lambda p:
                       query_plan.add_operator(Map('l_orderkey', 'lineitem_map' + '_' + str(p), query_plan, False)),
                       range(0, lineitem_parts))

    order_map_2 = map(lambda p:
                      query_plan.add_operator(Map('o_orderkey', 'order_map_2' + '_' + str(p), query_plan, False)),
                      range(0, other_parts))

    customer_order_lineitem_join_build = map(lambda p:
                                             query_plan.add_operator(
                                                 HashJoinBuild('o_orderkey',
                                                               'customer_order_lineitem_join_build' + '_' + str(p),
                                                               query_plan,
                                                               False)),
                                             range(0, other_parts))

    customer_order_lineitem_join_probe = map(lambda p:
                                             query_plan.add_operator(
                                                 HashJoinProbe(JoinExpression('o_orderkey', 'l_orderkey'),
                                                               'customer_order_lineitem_join_probe' + '_' + str(p),
                                                               query_plan, False)),
                                             range(0, other_parts))

    def groupby_fn(df):
        df['l_extendedprice'] = df['l_extendedprice'].astype(float)
        df['l_discount'] = df['l_discount'].astype(float)
        df['revenue'] = df['l_extendedprice'] * (1 - df['l_discount'])
        grouped = df.groupby(['l_orderkey', 'o_orderdate', 'o_shippriority'])
        agg_df = grouped['revenue'].sum()
        return agg_df.reset_index()

    group = map(lambda p:
                query_plan.add_operator(
                    Group(
                        ['l_orderkey', 'o_orderdate', 'o_shippriority'],
                        [
                            AggregateExpression(AggregateExpression.SUM,
                                                lambda t_: float(t_['l_extendedprice'] * (1 - t_['l_discount'])))
                        ],
                        'group' + '_{}'.format(p), query_plan,
                        False, groupby_fn)),
                range(0, other_parts))

    def group_reduce_fn(df):
        grouped = df.groupby(['l_orderkey', 'o_orderdate', 'o_shippriority'])
        agg_df = grouped['revenue'].sum()
        return agg_df.reset_index()

    group_reduce = query_plan.add_operator(
        Group(
            ['l_orderkey', 'o_orderdate', 'o_shippriority'],
            [
                AggregateExpression(AggregateExpression.SUM,
                                    lambda t_: float(t_['l_extendedprice'] * (1 - t_['l_discount'])))
            ],
            'group_reduce', query_plan,
            False, group_reduce_fn))

    top = query_plan.add_operator(
        Top(10, [SortExpression('revenue', float, 'DESC'), SortExpression('o_orderdate', date, 'ASC')], use_pandas,
            'top', query_plan,
            False))

    collate = query_plan.add_operator(tpch_q19.collate_op('collate', query_plan))

    # Inline what we can
    map(lambda o: o.set_async(False), lineitem_project)
    map(lambda o: o.set_async(False), customer_project)
    map(lambda o: o.set_async(False), order_project)
    map(lambda o: o.set_async(False), lineitem_map)
    map(lambda o: o.set_async(False), customer_map)
    map(lambda o: o.set_async(False), order_map_1)
    map(lambda o: o.set_async(False), order_map_2)

    # Connect the operators
    connect_many_to_many(customer_scan, customer_project)
    connect_many_to_many(customer_project, customer_map)

    connect_many_to_one(customer_project, customer_bloom_create)
    connect_one_to_many(customer_bloom_create, order_scan)

    connect_many_to_many(order_scan, order_project)
    connect_many_to_many(order_project, order_map_1)

    connect_all_to_all(customer_map, customer_order_join_build)
    connect_many_to_many(customer_order_join_build, customer_order_join_probe)
    connect_all_to_all(order_map_1, customer_order_join_probe)

    # connect_many_to_one(customer_order_join_probe, collate)

    connect_many_to_one(order_project, order_bloom_create)
    connect_one_to_many(order_bloom_create, lineitem_scan)

    connect_many_to_many(lineitem_scan, lineitem_project)
    connect_many_to_many(lineitem_project, lineitem_map)

    connect_many_to_many(customer_order_join_probe, order_map_2)
    connect_all_to_all(order_map_2, customer_order_lineitem_join_build)
    connect_many_to_many(customer_order_lineitem_join_build, customer_order_lineitem_join_probe)
    connect_all_to_all(lineitem_map, customer_order_lineitem_join_probe)

    # connect_many_to_one(customer_order_lineitem_join_probe, collate)

    connect_many_to_many(customer_order_lineitem_join_probe, group)
    # connect_many_to_one(group, collate)

    connect_many_to_one(group, group_reduce)
    connect_one_to_one(group_reduce, top)
    connect_one_to_one(top, collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print('secure: {}'.format(secure))
    print('use_native: {}'.format(use_native))
    print("customer_parts: {}".format(customer_parts))
    print("order_parts: {}".format(order_parts))
    print("lineitem_parts: {}".format(lineitem_parts))
    print("customer_sharded: {}".format(customer_sharded))
    print("order_sharded: {}".format(order_sharded))
    print("lineitem_sharded: {}".format(lineitem_sharded))
    print("other_parts: {}".format(other_parts))
    print("fp_rate: {}".format(fp_rate))
    print("format: {}".format(format_))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()

    field_names = ['l_orderkey', 'o_orderdate', 'o_shippriority', 'revenue']

    assert len(tuples) == 10 + 1

    assert tuples[0] == field_names

    # NOTE: This result has been verified with the equivalent data and query on PostgreSQL
    test_util.assert_tuples(expected_result, tuples)
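
The plan prunes the orders and lineitem scans with Bloom filters built on the join keys: a bloom-aware scan only returns rows whose key might be in the filter, trading a tunable false-positive rate (fp_rate) for far less data transfer. A minimal Bloom filter sketch of the idea, using the standard sizing formulas; this is not the BloomCreate implementation:

import hashlib
import math

class BloomFilter(object):
    def __init__(self, n, fp_rate):
        # m bits and h hash functions for n keys at the target false-positive rate
        self.m = max(1, int(math.ceil(-n * math.log(fp_rate) / math.log(2) ** 2)))
        self.h = max(1, int(round(self.m / float(n) * math.log(2))))
        self.bits = 0  # a Python integer doubles as the bit array

    def _positions(self, key):
        for i in range(self.h):
            digest = hashlib.md5("{}:{}".format(i, key).encode()).hexdigest()
            yield int(digest, 16) % self.m

    def add(self, key):
        for pos in self._positions(key):
            self.bits |= 1 << pos

    def might_contain(self, key):
        return all(self.bits & (1 << pos) for pos in self._positions(key))

bloom = BloomFilter(n=1000, fp_rate=0.01)
for custkey in [3, 14, 159]:  # keys that survived the customer predicate
    bloom.add(custkey)
print(bloom.might_contain(14), bloom.might_contain(999))  # True, (almost surely) False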
Example #7
def topk_baseline(stats,
                  k,
                  sort_index='_5',
                  col_type=float,
                  col_name='l_extendedprice',
                  sort_order='DESC',
                  use_pandas=True,
                  filtered=False,
                  table_name='tpch-sf1/lineitem.csv',
                  shards_prefix='tpch-sf1/lineitem_sharded',
                  shards_start=0,
                  shards_end=31):
    """
    Executes the baseline topk query by scanning a table and keeping track of the max/min records in a heap
    :return:
    """

    limit = k
    num_rows = 0
    parallel_shards = True
    processes = multiprocessing.cpu_count()

    query_stats = [
        'baseline' if filtered is False else 'filtered', shards_prefix,
        col_name, sort_order, limit
    ]

    sql = 'select * from S3Object;'

    if filtered:
        sql = '''select l_extendedprice
                  from S3Object;'''
        sort_index = '_0'

    print("\n\nBaseline TopK with order {} on field {}\n".format(
        sort_order, col_name))

    query_plan = QueryPlan(is_async=True)

    # Query plan
    # ts = query_plan.add_operator(
    #     SQLTableScan('lineitem.csv', 'select * from S3Object limit {};'.format(limit), 'table_scan', query_plan, False))
    sort_exp = SortExpression(sort_index, col_type, sort_order)
    top_op = query_plan.add_operator(
        Top(limit, sort_exp, use_pandas, 'topk', query_plan, False))
    for process in range(processes):
        proc_parts = [
            x for x in range(shards_start, shards_end + 1)
            if x % processes == process
        ]
        pc = query_plan.add_operator(
            SQLShardedTableScan(table_name, sql, use_pandas, True, False,
                                "topk_table_scan_parts_{}".format(proc_parts),
                                proc_parts, shards_prefix, parallel_shards,
                                query_plan, False))
        # pc.set_profiled(True, "topk_table_scan_parts_{}.txt".format(proc_parts))
        pc_top = query_plan.add_operator(
            Top(limit, sort_exp, use_pandas,
                'topk_parts_{}'.format(proc_parts), query_plan, False))
        pc.connect(pc_top)
        pc_top.connect(top_op)

    c = query_plan.add_operator(Collate('collate', query_plan, False))

    top_op.connect(c)

    # Write the plan graph
    # query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    # Assert the results
    for t in c.tuples():
        num_rows += 1
        # print("{}:{}".format(num_rows, t))

    # assert num_rows == limit + 1

    cost, bytes_scanned, bytes_returned, rows = query_plan.cost()
    computation_cost = query_plan.computation_cost()
    data_cost = query_plan.data_cost()[0]

    query_stats += [
        0, 0, query_plan.total_elapsed_time, query_plan.total_elapsed_time, 0,
        rows, bytes_scanned, bytes_returned, data_cost, computation_cost, cost,
        num_rows == limit + 1
    ]
    stats.append(query_stats)

    # Write the metrics
    query_plan.print_metrics()
    query_plan.stop()
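
Shards are spread across scanner processes round-robin: process p takes every shard index congruent to p modulo the process count, which is exactly the x % processes == process test above. A quick sketch of that assignment for the default 32 shards and a hypothetical 4 processes:

def assign_shards(shards_start, shards_end, processes):
    return [[s for s in range(shards_start, shards_end + 1) if s % processes == p]
            for p in range(processes)]

for p, parts in enumerate(assign_shards(0, 31, 4)):
    print("process {}: shards {}".format(p, parts))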
Example #8
def run(sort_field, k, parallel, use_pandas, sort_order, buffer_size, table_parts, path, format_):
    """
    Executes the baseline topk query by scanning a table and keeping track of the max/min records in a heap
    :return:
    """

    secure = False
    use_native = False
    print('')
    print("Top K Benchmark, Filtered. Sort Field: {}, Order: {}".format(sort_field, sort_order))
    print("----------------------")

    # Query plan
    query_plan = QueryPlan(is_async=parallel, buffer_size=buffer_size)
    
    # Scan
    scan = map(lambda p: 
               query_plan.add_operator(
                    SQLTableScan("{}/topk_data_{}.csv".format(path, p),
                        "select * from S3Object;", format_, use_pandas, secure, use_native,
                        'scan_{}'.format(p), query_plan,
                        False)),
               range(0, table_parts))
  
    # Project
    def project_fn(df):
        df.columns = ['F0', 'F1', 'F2']
        df[[sort_field]] = df[[sort_field]].astype(float)
        return df
   
    project_exprs = [ProjectExpression(lambda t_: t_['_0'], sort_field)] 
    
    project = map(lambda p: 
                  query_plan.add_operator( 
                      Project(project_exprs, 'project_{}'.format(p), query_plan, False, project_fn)),
                  range(0, table_parts))

    # TopK
    sort_expr = SortExpression(sort_field, float, sort_order)
    topk = map(lambda p: 
               query_plan.add_operator(
                    Top(k, sort_expr, use_pandas, 'topk_{}'.format(p), query_plan, False)),
               range(0, table_parts))

    # TopK reduce
    topk_reduce = query_plan.add_operator(
                    Top(k, sort_expr, use_pandas, 'topk_reduce', query_plan, False)) 

    collate = query_plan.add_operator(
        Collate('collate', query_plan, False))

    #profile_path = '../benchmark-output/topk/'
    #scan[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_scan_0" + ".prof"))
    #project[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_project_0" + ".prof"))
    #groupby[0].set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_groupby_0" + ".prof"))
    #groupby_reduce.set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_groupby_reduce" + ".prof"))
    #collate.set_profiled(True, os.path.join(ROOT_DIR, profile_path, gen_test_id() + "_collate" + ".prof"))

    map(lambda (p, o): o.connect(project[p]), enumerate(scan))
    map(lambda (p, o): o.connect(topk[p]), enumerate(project))
    map(lambda (p, o): o.connect(topk_reduce), enumerate(topk))
    topk_reduce.connect(collate)

    # Plan settings
    print('')
    print("Settings")
    print("--------")
    print('')
    print('use_pandas: {}'.format(use_pandas))
    print("table parts: {}".format(table_parts))
    print('')

    # Write the plan graph
    query_plan.write_graph(os.path.join(ROOT_DIR, "../benchmark-output"), gen_test_id() + "-" + str(table_parts))

    # Start the query
    query_plan.execute()
    print('Done')
    tuples = collate.tuples()

    collate.print_tuples(tuples)

    # Write the metrics
    query_plan.print_metrics()

    # Shut everything down
    query_plan.stop()
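
project_fn renames the raw S3 Select columns and casts the sort field so the pandas-based Top operator compares it numerically rather than lexically. A standalone demonstration of the same transform on a made-up frame:

import pandas as pd

def project_fn(df, sort_field):
    df.columns = ['F0', 'F1', 'F2']  # raw S3 Select columns arrive as _0, _1, _2
    df[[sort_field]] = df[[sort_field]].astype(float)  # numeric, not string, ordering
    return df

df = pd.DataFrame({'_0': ['a', 'b'], '_1': ['9.5', '10.1'], '_2': ['x', 'y']})
print(project_fn(df, 'F1').dtypes)  # F1 becomes float64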
Example #9
def __test_topk_baseline(sort_index='_5',
                         col_type=float,
                         col_name='l_quantity',
                         sort_order='DESC',
                         use_pandas=True,
                         filtered=False):
    """
    Executes the baseline topk query by scanning a table and keeping track of the max/min records in a heap
    :return:
    """

    limit = 50
    num_rows = 0
    shards = 32
    parallel_shards = True
    shards_prefix = "sf1000-lineitem"
    processes = multiprocessing.cpu_count()

    sql = 'select * from S3Object;'

    if filtered:
        sql = '''select l_orderkey, l_partkey, l_suppkey, l_linenumber, l_quantity, l_extendedprice, l_comment
                  from S3Object;'''

    print("\n\nBaseline TopK with order {} on field {}\n".format(
        sort_order, col_name))

    query_plan = QueryPlan(is_async=True)

    # Query plan
    # ts = query_plan.add_operator(
    #     SQLTableScan('lineitem.csv', 'select * from S3Object limit {};'.format(limit), 'table_scan', query_plan, False))
    sort_exp = SortExpression(sort_index, col_type, sort_order)
    top_op = query_plan.add_operator(
        Top(limit, sort_exp, use_pandas, 'topk', query_plan, False))
    for process in range(processes):
        proc_parts = [x for x in range(shards) if x % processes == process]
        pc = query_plan.add_operator(
            SQLShardedTableScan("lineitem.csv", sql, use_pandas, True,
                                "topk_table_scan_parts_{}".format(proc_parts),
                                proc_parts, shards_prefix, parallel_shards,
                                query_plan, False))
        # pc.set_profiled(True, "topk_table_scan_parts_{}.txt".format(proc_parts))
        pc_top = query_plan.add_operator(
            Top(limit, sort_exp, use_pandas,
                'topk_parts_{}'.format(proc_parts), query_plan, False))
        pc.connect(pc_top)
        pc_top.connect(top_op)

    c = query_plan.add_operator(Collate('collate', query_plan, False))

    top_op.connect(c)

    # Write the plan graph
    # query_plan.write_graph(os.path.join(ROOT_DIR, "../tests-output"), gen_test_id())

    # Start the query
    query_plan.execute()

    # Assert the results
    for t in c.tuples():
        num_rows += 1
        print("{}:{}".format(num_rows, t))

    assert num_rows == limit + 1

    # Write the metrics
    query_plan.print_metrics()
    query_plan.stop()
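
The assertion expects limit + 1 tuples because Collate emits the field names as its first tuple (compare the tuples[0] == field_names check in Example #6); only the remaining tuples are data rows. A tiny sketch of that convention:

tuples = [['_0', '_5'], [1, 42.0], [2, 41.5]]  # header tuple + limit data tuples
limit = 2
header, rows = tuples[0], tuples[1:]
assert len(tuples) == limit + 1
assert len(rows) == limit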