Пример #1
0
def create_hive_partition_data(input, file_format, table_name, partitions, output, num_files):
	if not os.path.exists(output):
		os.makedirs(output)

	bc = BlazingContext(dask_client=None)
	if file_format == 'psv':
		dtypes = get_dtypes(table_name)
		col_names = get_column_names(table_name)
		bc.create_table(table_name, input, file_format='csv', delimiter="|", dtype=dtypes,names=col_names)
	else:
		bc.create_table(table_name, input)

	columns = bc.describe_table(table_name)
	data_partition_array_dict = []
	for partition in partitions:
		if partition in columns:
			result = bc.sql(f'select distinct({partition}) from {table_name}')

			if type(result) is dask_cudf.core.DataFrame:
				result = result.compute()

			valuesPartition = result.to_pandas().to_dict()
			finalValues = list(set(valuesPartition[partition].values()) & set(partitions[partition]))
			dictOfvalues = {i: finalValues[i] for i in range(0, len(finalValues))}
			valuesPartition[partition] = dictOfvalues
			data_partition_array_dict.append(valuesPartition)
		else:
			print('Column "' + partition + '" not exist')

	_save_partition_files(bc, table_name, data_partition_array_dict, output, file_format, num_files)
Пример #2
0
def create_hive_partition_data(input, table_name, partitions, output, num_files_per_parquet):
	if not os.path.exists(output):
		os.makedirs(output)

	bc = BlazingContext()
	bc.create_table(table_name, input)

	columns = bc.describe_table(table_name)
	data_partition_array_dict = []
	for partition in partitions:
		if partition in columns:
			values = bc.sql(f'select distinct({partition}) from {table_name}')
			data_partition_array_dict.append(values.to_pandas().to_dict())
		else:
			print('Column "' + partition + '" not exist')

	_save_partition_files(bc, table_name, data_partition_array_dict, output, num_files_per_parquet)
Пример #3
0
class BlazingSQLHelper:
    def __init__(self):
        cluster = LocalCUDACluster()
        client = Client(cluster)
        self._bc = BlazingContext(dask_client=client, network_interface='lo')

    """This function runs blazingSQL query. 
    
    :param config: Query related tables configuration.
    :type config: dict
    :return: Query results.
    :rtype: cudf.DataFrame
    """

    def run_query(self, config):
        for table in config["tables"]:
            table_name = table["table_name"]
            file_path = table["input_path"]
            kwargs = table.copy()
            del kwargs["table_name"]
            del kwargs["input_path"]
            self._bc.create_table(table_name, file_path, **kwargs)
        sql = config["sql"]
        log.debug("Executing query: %s" % (sql))
        result = self._bc.sql(sql)
        result = result.compute()
        return result

    """This function drops blazingSQL tables.
    :param table_names: List of table names to drop.
    :type table_names: List
    """

    def drop_table(self, table_names):
        for table_name in table_names:
            log.debug("Drop table: %s" % (table_name))
            self._bc.drop_table(table_name)
Пример #4
0
class BlazingSQLHelper:
    def __init__(self, pool=False):
        # Setting pool=True allocates half the GPU memory.
        self._bc = BlazingContext(pool=pool)

    """This function runs blazingSQL query. 
    
    :param config: Query related tables configuration.
    :type config: dict
    :return: Query results.
    :rtype: cudf.DataFrame
    """

    def run_query(self, config):
        for table in config["tables"]:
            table_name = table["table_name"]
            file_path = table["input_path"]
            kwargs = table.copy()
            del kwargs["table_name"]
            del kwargs["input_path"]
            self._bc.create_table(table_name, file_path, **kwargs)
        sql = config["sql"]
        log.debug("Executing query: %s" % (sql))
        result = self._bc.sql(sql)
        self.has_data = False
        return result

    """This function drops blazingSQL tables.
    :param table_names: List of table names to drop.
    :type table_names: List
    """

    def drop_table(self, table_names):
        for table_name in table_names:
            log.debug("Drop table: %s" % (table_name))
            self._bc.drop_table(table_name)
Пример #5
0
import subprocess

# client = Client('127.0.0.1:8786')
# client.restart()
# bc = BlazingContext(dask_client=client, network_interface="lo")

bc = BlazingContext()

authority = 'localhost:54310'
hdfs_host = 'localhost'
hdfs_port = 54310
hdfs_driver = 'libhdfs'
result, error_msg, fs = bc.hdfs(authority,
                                host=hdfs_host,
                                port=hdfs_port,
                                user='******',
                                driver=hdfs_driver)

cursor = hive.connect('localhost').cursor()

table = bc.create_table('ptransactions', cursor, file_format='parquet')
for i in range(11):
    query = "SELECT * FROM ptransactions where t_year=2017 and t_company_id={t_company_id} LIMIT 10".format(
        t_company_id=i)
    ddf = bc.sql(query)
    print(query)

    if isinstance(ddf, cudf.DataFrame):
        print(ddf)
    else:
        print(ddf.compute())
Пример #6
0
from dask.distributed import Client
from blazingsql import BlazingContext
from dask_cuda import LocalCUDACluster

# initalize BlazingContext with the Dask Client of local GPUs to distribute query execution
bc = BlazingContext(dask_client=Client(LocalCUDACluster()),
                    network_interface='lo')

# register public AWS S3 bucket
bc.s3('blazingsql-colab', bucket_name='blazingsql-colab')

# create a table from that S3 bucket
col_names = [
    'key', 'fare', 'pickup_x', 'pickup_y', 'dropoff_x', 'dropoff_y',
    'passenger_count'
]
bc.create_table('taxi',
                's3://blazingsql-colab/taxi_data/taxi_00.csv',
                names=col_names)

# query the table & write results locally as parquet
bc.sql('SELECT * FROM taxi').to_parquet(f'../../data/yellow_cab')