Example #1
def create_hive_partition_data(input, file_format, table_name, partitions, output, num_files):
	if not os.path.exists(output):
		os.makedirs(output)

	bc = BlazingContext(dask_client=None)
	if file_format == 'psv':
		dtypes = get_dtypes(table_name)
		col_names = get_column_names(table_name)
		bc.create_table(table_name, input, file_format='csv', delimiter="|", dtype=dtypes, names=col_names)
	else:
		bc.create_table(table_name, input)

	columns = bc.describe_table(table_name)
	data_partition_array_dict = []
	for partition in partitions:
		if partition in columns:
			result = bc.sql(f'select distinct({partition}) from {table_name}')

			if type(result) is dask_cudf.core.DataFrame:
				result = result.compute()

			valuesPartition = result.to_pandas().to_dict()
			finalValues = list(set(valuesPartition[partition].values()) & set(partitions[partition]))
			dictOfvalues = {i: finalValues[i] for i in range(0, len(finalValues))}
			valuesPartition[partition] = dictOfvalues
			data_partition_array_dict.append(valuesPartition)
		else:
			print('Column "' + partition + '" does not exist')

	_save_partition_files(bc, table_name, data_partition_array_dict, output, file_format, num_files)
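
A hypothetical invocation sketch for the function above (all paths, the table name, and the partition values are illustrative assumptions, not taken from the original source); `partitions` is a dict mapping each partition column to the values to keep:

create_hive_partition_data(
	input='/data/tpch/lineitem/*.parquet',               # hypothetical input location
	file_format='parquet',
	table_name='lineitem',
	partitions={'l_shipmode': ['AIR', 'MAIL', 'SHIP']},  # hypothetical partition values
	output='/data/tpch/lineitem_partitioned',            # hypothetical output directory
	num_files=1,
)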
Example #2
def testing_load_hive_table(table_name, location, partitions, partitions_schema):
	bc = BlazingContext()

	bc.create_table(table_name, location,
					file_format='parquet',
					hive_table_name=table_name,
					partitions=partitions,
					partitions_schema=partitions_schema)
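
A hypothetical call sketch (table name, location, and partition values are made up for illustration; the shapes assume BlazingSQL's Hive-style partition arguments, with `partitions` as a dict of column → values and `partitions_schema` as (column, dtype) pairs):

partitions = {'t_year': [2016, 2017], 't_company_id': [0, 1]}          # assumed values
partitions_schema = [('t_year', 'int32'), ('t_company_id', 'int32')]   # assumed dtypes
testing_load_hive_table('ptransactions', '/data/hive/ptransactions/',  # hypothetical location
                        partitions, partitions_schema)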
Example #3
def testing_load_hive_table(table_name, file_format, location, partitions, partitions_schema):
	bc2 = BlazingContext(dask_client=None)

	if file_format == 'psv':
		dtypes = get_dtypes(table_name)
		col_names = get_column_names(table_name)
		bc2.create_table(table_name, location,
						file_format='csv',
						hive_table_name=table_name,
						partitions=partitions,
						partitions_schema=partitions_schema,
						delimiter="|",
						dtype=dtypes,
						names=col_names)
	else:
		bc2.create_table(table_name, location, file_format=file_format)
Example #4
def create_hive_partition_data(input, table_name, partitions, output, num_files_per_parquet):
	if not os.path.exists(output):
		os.makedirs(output)

	bc = BlazingContext()
	bc.create_table(table_name, input)

	columns = bc.describe_table(table_name)
	data_partition_array_dict = []
	for partition in partitions:
		if partition in columns:
			values = bc.sql(f'select distinct({partition}) from {table_name}')
			data_partition_array_dict.append(values.to_pandas().to_dict())
		else:
			print('Column "' + partition + '" does not exist')

	_save_partition_files(bc, table_name, data_partition_array_dict, output, num_files_per_parquet)
Example #5
import logging

from dask.distributed import Client
from dask_cuda import LocalCUDACluster
from blazingsql import BlazingContext

log = logging.getLogger(__name__)  # assumption: module-level logger used by the helper


class BlazingSQLHelper:
    def __init__(self):
        cluster = LocalCUDACluster()
        client = Client(cluster)
        self._bc = BlazingContext(dask_client=client, network_interface='lo')

    """This function runs blazingSQL query. 
    
    :param config: Query related tables configuration.
    :type config: dict
    :return: Query results.
    :rtype: cudf.DataFrame
    """

    def run_query(self, config):
        for table in config["tables"]:
            table_name = table["table_name"]
            file_path = table["input_path"]
            kwargs = table.copy()
            del kwargs["table_name"]
            del kwargs["input_path"]
            self._bc.create_table(table_name, file_path, **kwargs)
        sql = config["sql"]
        log.debug("Executing query: %s" % (sql))
        result = self._bc.sql(sql)
        result = result.compute()
        return result

    """This function drops blazingSQL tables.
    :param table_names: List of table names to drop.
    :type table_names: List
    """

    def drop_table(self, table_names):
        for table_name in table_names:
            log.debug("Drop table: %s" % (table_name))
            self._bc.drop_table(table_name)
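
A minimal usage sketch for the helper above (the file path, table name, and query are hypothetical assumptions; `run_query` forwards any extra per-table keys as `create_table` kwargs):

helper = BlazingSQLHelper()
config = {
    "tables": [
        {"table_name": "events", "input_path": "/data/events/*.parquet"}  # hypothetical
    ],
    "sql": "SELECT COUNT(*) AS n FROM events",
}
df = helper.run_query(config)   # cudf.DataFrame (result of .compute())
helper.drop_table(["events"])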
Example #6
class BlazingSQLHelper:
    def __init__(self, pool=False):
        # Setting pool=True allocates half the GPU memory.
        self._bc = BlazingContext(pool=pool)

    """This function runs blazingSQL query. 
    
    :param config: Query related tables configuration.
    :type config: dict
    :return: Query results.
    :rtype: cudf.DataFrame
    """

    def run_query(self, config):
        for table in config["tables"]:
            table_name = table["table_name"]
            file_path = table["input_path"]
            kwargs = table.copy()
            del kwargs["table_name"]
            del kwargs["input_path"]
            self._bc.create_table(table_name, file_path, **kwargs)
        sql = config["sql"]
        log.debug("Executing query: %s" % (sql))
        result = self._bc.sql(sql)
        self.has_data = False
        return result

    """This function drops blazingSQL tables.
    :param table_names: List of table names to drop.
    :type table_names: List
    """

    def drop_table(self, table_names):
        for table_name in table_names:
            log.debug("Drop table: %s" % (table_name))
            self._bc.drop_table(table_name)
Example #7
def main(drill, dir_data_file, nRals):

    table1 = "customer"
    table2 = "orders"
    table3 = "lineitem"

    #Read Data TPCH------------------------------------------------------------------------------------------------------------
    customer_gdf = cs.read_data(table1, dir_data_file)
    orders_gdf = cs.read_data(table2, dir_data_file)
    lineitem_gdf = cs.read_data(table3, dir_data_file)

    #Create Tables ------------------------------------------------------------------------------------------------------------
    bc = BlazingContext()

    bc.create_table('customer', customer_gdf)
    bc.create_table('orders', orders_gdf)
    bc.create_table('lineitem', lineitem_gdf)

    #Run Query -----------------------------------------------------------------------------
    worder = 1  # Parameter to indicate whether the result sets need to be ordered before comparing them
    use_percentage = False
    acceptable_difference = 0.01
    queryType = 'Performance Test'
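    # NOTE (assumption, not in the original snippet): `fileSchemaType` is referenced by the
    # run_query_performance calls below but never defined here; in the BlazingSQL E2E test
    # harness it is usually a DataType value, e.g. `fileSchemaType = DataType.PARQUET`.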

    print('==============================')
    print(queryType)
    print('==============================')

    queryId = 'TEST_01'
    query = "select min(l_orderkey) from lineitem"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_02'
    query = "select sum(o_custkey), avg(o_totalprice), min(o_custkey), max(o_orderkey), count(o_orderkey) from orders"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_03'
    query = "select max(l_partkey) from lineitem group by l_orderkey"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_04'
    query = "select max(l_partkey), min(l_orderkey), sum(l_orderkey), avg(l_orderkey) from lineitem group by l_orderkey"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_05'
    query = "select max(l_partkey) from lineitem group by l_orderkey, l_suppkey, l_linenumber"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_06'
    query = """select max(l_partkey), min(l_orderkey), sum(l_orderkey), avg(l_orderkey) from lineitem group by l_orderkey, 
    l_suppkey, l_linenumber"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_07'
    query = "select l_orderkey from lineitem group by l_orderkey"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_08'
    query = "select l_orderkey, l_extendedprice, l_shipdate from lineitem group by l_orderkey, l_extendedprice, l_shipdate, l_linestatus"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_09'
    query = """select o.o_orderkey, o.o_totalprice, l.l_partkey, l.l_returnflag from orders as o
            inner join lineitem as l on o.o_orderkey = l.l_orderkey"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_10'
    query = """select o.o_orderkey, o.o_totalprice, l.l_partkey, l.l_returnflag from lineitem as l 
            inner join orders as o on o.o_orderkey = l.l_orderkey
            inner join customer as c on c.c_custkey = o.o_custkey"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_11'
    query = """select o.o_orderkey, o.o_totalprice, l.l_partkey, l.l_returnflag from lineitem as l 
    inner join orders as o on o.o_orderkey = l.l_orderkey
    inner join customer as c on c.c_nationkey = o.o_custkey and l.l_linenumber = c.c_custkey"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_12'
    query = """select o.o_totalprice, l.l_partkey from orders as o
        left outer join lineitem as l on o.o_custkey = l.l_linenumber and l.l_suppkey = o.o_orderkey"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_13'
    query = """select c.c_name, o.o_custkey, l.l_linenumber from customer as c
        left outer join orders as o on c.c_custkey = o.o_custkey
        left outer join lineitem as l on l.l_orderkey = o.o_orderkey"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_14'
    query = """select o.o_orderkey, o.o_totalprice, l.l_partkey, l.l_linestatus from orders as o
        full outer join lineitem as l on l.l_orderkey = o.o_orderkey"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_15'
    query = """select c.c_name, o.o_custkey, l.l_linenumber from customer as c
        full outer join orders as o on c.c_custkey = o.o_custkey
        full outer join lineitem as l on l.l_orderkey = o.o_orderkey"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_16'
    query = """select l_returnflag, l_shipdate, l_linestatus from lineitem where l_orderkey < 10000
        union all
        select l_returnflag, l_shipdate, l_linestatus from lineitem where l_partkey > 100"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_17'
    query = """select o_orderdate as d1, o_orderpriority as s1, o_orderstatus as s2, o_orderkey as l1 from orders where o_orderkey < 10000
        union all
        select o_orderdate as d1, o_orderpriority as s1, o_orderstatus as s2, o_orderkey as l1 from orders where o_custkey < 100000
        union all
        select o_orderdate as d1, o_orderpriority as s1, o_orderstatus as s2, o_orderkey as l1 from orders where o_orderstatus = 'O'
        union all
        select o_orderdate as d1, o_orderpriority as s1, o_orderstatus as s2, o_orderkey as l1 from orders where o_totalprice < 350"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_18'
    query = """select o_orderkey, o_custkey, o_orderstatus, o_totalprice, o_orderpriority from orders 
    where o_custkey<>4318470"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_19'
    query = "select l_orderkey, l_partkey, l_suppkey, l_returnflag from lineitem where l_returnflag<>'g packages.'"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_20'
    query = "select o_orderkey, o_custkey, o_orderstatus, o_totalprice, o_orderpriority from orders where o_custkey=88910"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_21'
    query = "select l_orderkey, l_partkey, l_suppkey, l_returnflag from lineitem where l_returnflag='N'"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_22'
    query = "select o_orderkey, o_custkey, o_orderstatus, o_totalprice from orders where o_orderkey=1000"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_23'
    query = "select l_orderkey, l_partkey, l_suppkey, l_comment from lineitem where l_comment='dolites wake'"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_24'
    query = """select c_custkey, c_nationkey, c_acctbal from customer 
    where c_custkey < 15000 and c_nationkey = 5 or c_custkey = c_nationkey * c_nationkey or c_nationkey >= 10 
    or c_acctbal <= 500"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    #     queryId = 'TEST_25'
    #     query = "select o_orderkey, o_orderstatus, o_totalprice, o_comment from orders where o_orderstatus = 'O'"
    #     runTest.run_query_performance(bc, drill, query, queryId, queryType, worder, '', acceptable_difference, use_percentage, fileSchemaType)

    queryId = 'TEST_26'
    query = "select ((l_orderkey + l_partkey)/2)* l_quantity - l_suppkey*(l_discount - 1000.999) from lineitem"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_27'
    query = """select (l_orderkey + l_partkey*l_quantity)/2, (l_orderkey + l_quantity - l_partkey)*(l_orderkey + 3), 
    l_orderkey/l_partkey - l_quantity, l_quantity*l_linenumber/l_partkey*l_tax from lineitem"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_28'
    query = """select EXTRACT(YEAR FROM l_receiptdate) - EXTRACT(YEAR FROM l_shipdate) as years_late, 
        EXTRACT(MONTH FROM l_receiptdate) - EXTRACT(MONTH FROM l_shipdate) as months_late, 
        EXTRACT(DAY FROM l_receiptdate) - EXTRACT(DAY FROM l_shipdate) as days_late 
        from lineitem"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_29'
    query = """select l_quantity, sin(l_quantity), cos(l_quantity), asin(l_quantity), acos(l_quantity), ln(l_quantity), 
    tan(l_quantity), atan(l_quantity), floor(l_quantity), ceil(l_quantity) from lineitem"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_30'
    query = "select o_totalprice, o_custkey + o_custkey from orders order by o_totalprice"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_31'
    query = "select o_orderkey, o_custkey, o_orderstatus from orders order by o_comment"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_32'
    query = "select o_orderkey, o_custkey, o_totalprice, o_orderstatus from orders order by o_orderkey, o_custkey, o_totalprice"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_33'
    query = """select o_orderkey, o_custkey, o_totalprice, o_orderstatus from orders order by o_orderstatus, 
    o_shippriority, o_comment"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_34'
    query = "select c_custkey + c_nationkey, c_acctbal from customer order by 1 desc, 2"
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_35'
    query = """select custOrders.avgPrice, custOrders.numOrders from customer
    inner join
    (select o_custkey as o_custkey, avg(o_totalprice) as avgPrice, count(o_totalprice) as numOrders from orders
    where o_custkey <= 100 group by o_custkey) as custOrders
    on custOrders.o_custkey = customer.c_custkey
    where customer.c_nationkey <= 5"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)

    queryId = 'TEST_36'
    query = """with ordersTemp as ( 
            select min(o_orderkey) as priorityKey, o_custkey from orders group by o_custkey 
        ), ordersjoin as(
            select orders.o_custkey from orders inner join ordersTemp on ordersTemp.priorityKey = orders.o_orderkey
        ) 
        select customer.c_custkey, customer.c_nationkey from customer inner join ordersjoin on ordersjoin.o_custkey =  customer.c_custkey"""
    runTest.run_query_performance(bc, drill, query, queryId, queryType, worder,
                                  '', acceptable_difference, use_percentage,
                                  fileSchemaType)
Example #8
import subprocess

import cudf
from blazingsql import BlazingContext
from pyhive import hive  # assumption: the `hive.connect` call below is PyHive's API

# client = Client('127.0.0.1:8786')
# client.restart()
# bc = BlazingContext(dask_client=client, network_interface="lo")

bc = BlazingContext()

authority = 'localhost:54310'
hdfs_host = 'localhost'
hdfs_port = 54310
hdfs_driver = 'libhdfs'
result, error_msg, fs = bc.hdfs(authority,
                                host=hdfs_host,
                                port=hdfs_port,
                                user='******',
                                driver=hdfs_driver)

cursor = hive.connect('localhost').cursor()

table = bc.create_table('ptransactions', cursor, file_format='parquet')
for i in range(11):
    query = "SELECT * FROM ptransactions where t_year=2017 and t_company_id={t_company_id} LIMIT 10".format(
        t_company_id=i)
    ddf = bc.sql(query)
    print(query)

    if isinstance(ddf, cudf.DataFrame):
        print(ddf)
    else:
        print(ddf.compute())
Example #9
import time
import pprint
from blazingsql import BlazingContext
from dask.distributed import Client
client = Client('127.0.0.1:8786')
client.restart()
bc = BlazingContext(dask_client=client, network_interface="lo")

# bc = BlazingContext()

dir_data_fs = '/home/aocsa/tpch/100MB2Part/'
nfiles = 4

# bc.create_table('customer', [dir_data_fs + '/customer_0_0.parquet', dir_data_fs + '/customer_1_0.parquet', dir_data_fs + '/customer_2_0.parquet'])

bc.create_table('customer', dir_data_fs + '/customer_*.parquet')

# "BindableTableScan(table=[[main, customer]],
# filters=[[OR(AND(<($0, 15000), =($1, 5)), =($0, *($1, $1)), >=($1, 10), <=($2, 500))]],
# projects=[[0, 3, 5]], aliases=[[c_custkey, c_nationkey, c_acctbal]])"
# query = """select c_custkey, c_nationkey, c_acctbal
#             from
#               customer
#             where
#               c_custkey > 2990 and c_custkey < 3010
#             """

query = "select sum(c_custkey)/count(c_custkey), min(c_custkey) from customer limit 5"

# [b'c_custkey', b'c_name', b'c_address', b'c_nationkey', b'c_phone', b'c_acctbal', b'c_mktsegment', b'c_comment']
lp = bc.explain(query)
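
A possible follow-up in the same session (sketch, not from the original source): `bc.explain` returns the logical plan as a string, which can be inspected before actually running the query.

print(lp)                  # relational algebra plan as text
result = bc.sql(query)     # dask_cudf.DataFrame, since a dask_client is attached
print(result.compute())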
Example #10
from blazingsql import BlazingContext
bc = BlazingContext()

company_types = ['int64', 'str', 'date', 'str', 'str', 'str']
bc.create_table('companies', '/home/nomis/20GB_CSV/Companies.csv', dtype=company_types)
address_types = ['int64', 'str', 'str', 'str', 'str', 'str', 'str', 'str', 'int64']
bc.create_table('addresses', '/home/nomis/20GB_CSV/Addresses.csv', dtype=address_types)
department_types = ['int64', 'str', 'int64']
bc.create_table('departments', '/home/nomis/20GB_CSV/Departments.csv', dtype=department_types)
category_types = ['int64', 'str', 'int64']
bc.create_table('categories', '/home/nomis/20GB_CSV/Categories.csv', dtype=category_types)
product_types = ['int64', 'str', 'str', 'str', 'str', 'date', 'float64', 'int64']
bc.create_table('products', '/home/nomis/20GB_CSV/Products.csv', dtype=product_types)
review_types = ['int64', 'str', 'str', 'date', 'int64']
bc.create_table('reviews', '/home/nomis/20GB_CSV/Reviews.csv', dtype=review_types)

query = '''
SELECT a.City
FROM Addresses a
    JOIN Companies c ON a.CompanyId = c.Id
    JOIN Departments d ON c.Id = d.CompanyId
    JOIN Categories k ON d.Id = k.DepartmentId
    JOIN Products p ON k.Id = p.CategoryId
    JOIN Reviews r ON p.Id = r.ProductId
WHERE YEAR(r.PublishDate) >= '2021'
GROUP BY a.City
    '''

af = bc.sql(query)
Example #11
from blazingsql import BlazingContext
from dask.distributed import Client
# client = Client('127.0.0.1:8786')
# client.restart()
# bc = BlazingContext(dask_client=client, network_interface="lo")

bc = BlazingContext()

dir_data_fs = '/home/aocsa/tpch/DataSet5Part100MB'
nfiles = 4

# bc.create_table('customer', [dir_data_fs + '/customer_0_0.parquet', dir_data_fs + '/customer_1_0.parquet', dir_data_fs + '/customer_2_0.parquet'])

bc.create_table('customer', [
    dir_data_fs + '/customer_0_0.parquet', dir_data_fs +
    '/customer_1_0.parquet', dir_data_fs + '/customer_2_0.parquet',
    dir_data_fs + '/customer_3_0.parquet',
    dir_data_fs + '/customer_4_0.parquet'
])

# "BindableTableScan(table=[[main, customer]],
# filters=[[OR(AND(<($0, 15000), =($1, 5)), =($0, *($1, $1)), >=($1, 10), <=($2, 500))]],
# projects=[[0, 3, 5]], aliases=[[c_custkey, c_nationkey, c_acctbal]])"
# query = """select c_custkey, c_nationkey, c_acctbal
#             from
#               customer
#             where
#               c_custkey > 2990 and c_custkey < 3010
#             """

query = """select c_custkey, c_nationkey, c_acctbal
            from 
Example #12
from dask.distributed import Client
from blazingsql import BlazingContext
from dask_cuda import LocalCUDACluster

# initialize BlazingContext with a Dask client over the local GPUs to distribute query execution
bc = BlazingContext(dask_client=Client(LocalCUDACluster()),
                    network_interface='lo')

# register public AWS S3 bucket
bc.s3('blazingsql-colab', bucket_name='blazingsql-colab')

# create a table from that S3 bucket
col_names = [
    'key', 'fare', 'pickup_x', 'pickup_y', 'dropoff_x', 'dropoff_y',
    'passenger_count'
]
bc.create_table('taxi',
                's3://blazingsql-colab/taxi_data/taxi_00.csv',
                names=col_names)

# query the table & write results locally as parquet
bc.sql('SELECT * FROM taxi').to_parquet('../../data/yellow_cab')
Example #13
import time
import pprint
from blazingsql import BlazingContext
from dask.distributed import Client

from collections import OrderedDict

client = Client('127.0.0.1:8786')
bc = BlazingContext(dask_client=client, network_interface="lo")

bc.create_table('customer',
                '/home/aocsa/tpch/100MB2Part/tpch/customer_0_0.parquet')
bc.create_table('orders', '/home/aocsa/tpch/100MB2Part/tpch/orders_*.parquet')

algebra = """LogicalProject(c_custkey=[$0], c_nationkey=[$3])
  LogicalFilter(condition=[<($0, 10)])
    LogicalTableScan(table=[[main, customer]])
"""

plan = """
{
		'expr': 'LogicalProject(c_custkey=[$0], c_nationkey=[$3])',
		'children': [
			{
				'expr': 'LogicalFilter(condition=[<($0, 10)])',
				'children': [
					{
						'expr': 'LogicalTableScan(table=[[main, customer]])',
						'children': []
					}
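
A hedged follow-up sketch: some BlazingSQL releases accept a pre-generated algebra string through the `algebra` keyword of `bc.sql`; treat the keyword (and the query text) below as assumptions if your version differs.

# result = bc.sql("select c_custkey, c_nationkey from customer where c_custkey < 10",
#                 algebra=algebra)
# print(result.compute())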
Example #14
import pprint
from blazingsql import BlazingContext
from dask.distributed import Client
# client = Client('127.0.0.1:8786')
# client.restart()
# bc = BlazingContext(dask_client=client, network_interface="lo")

bc = BlazingContext()

dir_data_fs = '/home/aocsa/tpch/'
nfiles = 4

# bc.create_table('customer', [dir_data_fs + '/customer_0_0.parquet', dir_data_fs + '/customer_1_0.parquet', dir_data_fs + '/customer_2_0.parquet'])
path_parquet = dir_data_fs + 'simple.parquet'
print(path_parquet)
bc.create_table('simple', path_parquet)

# "BindableTableScan(table=[[main, customer]],
# filters=[[OR(AND(<($0, 15000), =($1, 5)), =($0, *($1, $1)), >=($1, 10), <=($2, 500))]],
# projects=[[0, 3, 5]], aliases=[[c_custkey, c_nationkey, c_acctbal]])"
# query = """select c_custkey, c_nationkey, c_acctbal
#             from
#               customer
#             where
#               c_custkey > 2990 and c_custkey < 3010
#             """

query = """select *
            from 
              simple
              where int64_field < 300000 or  int64_field > 900000 
Example #15
import pprint
from blazingsql import BlazingContext
from dask.distributed import Client

client = Client('127.0.0.1:8786')
client.restart()
bc = BlazingContext(dask_client=client, network_interface="lo")

# bc = BlazingContext()

dir_data_fs = '/home/aocsa/tpch/100MB2Part/tpch'
nfiles = 4

# bc.create_table('customer', [dir_data_fs + '/customer_0_0.parquet', dir_data_fs + '/customer_1_0.parquet', dir_data_fs + '/customer_2_0.parquet'])

bc.create_table('part', dir_data_fs + '/part_*.parquet')

# "BindableTableScan(table=[[main, customer]],
# filters=[[OR(AND(<($0, 15000), =($1, 5)), =($0, *($1, $1)), >=($1, 10), <=($2, 500))]],
# projects=[[0, 3, 5]], aliases=[[c_custkey, c_nationkey, c_acctbal]])"
# query = """select c_custkey, c_nationkey, c_acctbal
#             from
#               customer
#             where
#               c_custkey > 2990 and c_custkey < 3010
#             """

query = "select count(p_partkey), sum(p_partkey), max(p_partkey), min(p_partkey) from part where p_partkey < 100"

# [b'c_custkey', b'c_name', b'c_address', b'c_nationkey', b'c_phone', b'c_acctbal', b'c_mktsegment', b'c_comment']
lp = bc.explain(query)