import os
from pathlib import Path
from typing import IO, AnyStr, List, Optional, Union

import dask.dataframe as dd
from dask.distributed import Client, LocalCluster

# MLRun types used in the signature (import paths may differ across mlrun versions)
from mlrun.execution import MLClientCtx
from mlrun.datastore import DataItem


def parquet_to_dask(context: MLClientCtx,
                    parquet_url: Union[DataItem, str, Path, IO[AnyStr]],
                    inc_cols: Optional[List[str]] = None,
                    index_cols: Optional[List[str]] = None,
                    shards: int = 4,
                    threads_per: int = 4,
                    processes: bool = False,
                    memory_limit: str = '2GB',
                    persist: bool = True,
                    dask_key: str = 'my_dask_dataframe',
                    target_path: str = '') -> None:
    """Load a parquet dataset into a dask cluster.

    If no cluster is found, a new local cluster is started and the data is
    persisted to it. It should not be necessary to create a new cluster when
    the function is run as a 'dask' job.

    :param context:      the function context
    :param parquet_url:  url of the parquet file or partitioned dataset as either
                         artifact DataItem, string, or path object (see pandas read_csv)
    :param inc_cols:     include only these columns (very fast)
    :param index_cols:   list of index column names (can be a long-running process)
    :param shards:       number of workers to launch
    :param threads_per:  number of threads per worker
    :param processes:    use processes instead of threads for the local cluster workers
    :param memory_limit: memory limit per worker
    :param persist:      persist the dataframe on the cluster
    :param dask_key:     name under which the dataframe is published on the cluster
    :param target_path:  folder in which the scheduler file is written
    """
    if hasattr(context, 'dask_client'):
        context.logger.info('found cluster...')
        dask_client = context.dask_client
    else:
        context.logger.info('starting new cluster...')
        cluster = LocalCluster(n_workers=shards,
                               threads_per_worker=threads_per,
                               processes=processes,
                               memory_limit=memory_limit)
        dask_client = Client(cluster)
    context.logger.info(dask_client)

    df = dd.read_parquet(parquet_url)

    if persist and context:
        df = dask_client.persist(df)
        # publish under the name given by dask_key (not the literal string 'dask_key')
        dask_client.publish_dataset(**{dask_key: df})
        context.dask_client = dask_client

        # share the scheduler
        filepath = os.path.join(target_path, 'scheduler.json')
        dask_client.write_scheduler_file(filepath)
        context.log_artifact('scheduler', target_path=filepath)

    print(df.head())
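# A minimal sketch of how a downstream consumer might pick up the dataframe
# published by parquet_to_dask, assuming the scheduler file logged as the
# 'scheduler' artifact is reachable and the dataset was published under the
# default dask_key; the file path below is illustrative, not part of the original.

from dask.distributed import Client

# connect to the same scheduler via the scheduler file written by parquet_to_dask
consumer = Client(scheduler_file='/shared/scheduler.json')

# retrieve the dataframe published under the default dask_key and inspect it
df = consumer.get_dataset('my_dask_dataframe')
print(df.head())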
import os
import numpy as np
import pandas as pd
import cudf
import dask_cudf
from dask.distributed import Client

np.random.seed(12)

# build a small cudf dataframe with three columns
# (dict constructor used; the older list-of-tuples form is no longer supported)
df = cudf.DataFrame({'a': list(range(20)),
                     'b': list(reversed(range(20))),
                     'c': list(range(20))})
print(df)

# split it into a two-partition dask_cudf dataframe and persist it on the cluster
ddf = dask_cudf.from_cudf(df, npartitions=2)
print(ddf.persist())

# connect to the running scheduler and publish the dataframe
# so other clients can retrieve it by name
client = Client("127.0.0.1:8487")
client.publish_dataset(shared_dataset1=ddf)
print(client.list_datasets())
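# In a separate process, the published dask_cudf dataframe can be picked up by
# name from the same scheduler; a hedged sketch, assuming the scheduler at
# 127.0.0.1:8487 is still running and its GPU workers can execute the aggregation.

from dask.distributed import Client

# second client attached to the same scheduler
consumer = Client("127.0.0.1:8487")

# fetch the dataframe published as 'shared_dataset1' and run a small aggregation
shared = consumer.get_dataset('shared_dataset1')
print(shared['b'].sum().compute())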
import dask.dataframe as dd
from dask.distributed import Client

# connect to the running scheduler (address assumed to be the one used above)
client = Client("127.0.0.1:8487")

# df = dd.read_csv('s3://dask-data/nyc-taxi/2015/*.csv',
#                  parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'],
#                  storage_options={'anon': True})

print('read file')

# https://www.gov.uk/guidance/about-the-price-paid-data#download-options
column_names = [
    'id', 'price', 'transfer_date', 'postcode', 'property_type', 'old_new',
    'duration', 'primary_address_obj', 'secondary_address_obj', 'street',
    'locality', 'city_town', 'district', 'county', 'ppd_cat', 'record_stat'
]

### don't set an index for saving parquet format
# df = dd.read_csv('./propdata/pp-complete.csv', header=None, names=column_names).set_index('id')
df = dd.read_csv('./propdata/pp-complete.csv', header=None, names=column_names)
# print(df.head())

print('persist dataframe')
df = client.persist(df)

# print('publish dataframe')
# publish_dataset returns None, so don't rebind df to its result
client.publish_dataset(prop_paid=df)

# print('run calc')
# x = client.submit(lambda a: a.shape, df).result()
# print(df)
# print(x)
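# To round out the flow, the persisted dataframe can be written back out as a
# partitioned parquet dataset of the kind parquet_to_dask above can later load
# and publish; a hedged sketch, where the output path and the pyarrow engine
# are assumptions and not part of the original script.

# write the persisted dataframe as a partitioned parquet dataset
df.to_parquet('./propdata/pp-complete.parquet', engine='pyarrow')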