import os
from pathlib import Path
from typing import IO, AnyStr, List, Optional, Union

import dask.dataframe as dd
from dask.distributed import Client, LocalCluster
from mlrun.datastore import DataItem
from mlrun.execution import MLClientCtx


def parquet_to_dask(
    context: MLClientCtx,
    parquet_url: Union[DataItem, str, Path, IO[AnyStr]],
    inc_cols: Optional[List[str]] = None,
    index_cols: Optional[List[str]] = None,
    shards: int = 4,
    threads_per: int = 4,
    processes: bool = False,
    memory_limit: str = '2GB',
    persist: bool = True,
    dask_key: str = 'my_dask_dataframe',
    target_path: str = '',
) -> None:
    """Load a parquet dataset into a dask cluster.

    If no cluster is found, a new local one is started and the data is
    persisted to it. It should not be necessary to create a new cluster
    when the function is run as a 'dask' job.

    :param context:      the function context
    :param parquet_url:  url of the parquet file or partitioned dataset as either
                         artifact DataItem, string, or path object (see pandas read_csv)
    :param inc_cols:     include only these columns (very fast)
    :param index_cols:   list of index column names (can be a long-running process)
    :param shards:       number of workers to launch
    :param threads_per:  number of threads per worker
    :param processes:    use processes instead of threads for the workers
    :param memory_limit: memory limit per worker (e.g. '2GB')
    :param persist:      (True) persist the dataframe on the cluster
                         (through the `client.persist` op)
    :param dask_key:     name under which the dataframe is published on the cluster
    :param target_path:  target directory for the scheduler-file artifact
    """
    # reuse a cluster already attached to the context, otherwise start a local one
    if hasattr(context, 'dask_client'):
        context.logger.info('found cluster...')
        dask_client = context.dask_client
    else:
        context.logger.info('starting new cluster...')
        cluster = LocalCluster(
            n_workers=shards,
            threads_per_worker=threads_per,
            processes=processes,
            memory_limit=memory_limit,
        )
        dask_client = Client(cluster)

    context.logger.info(dask_client)

    df = dd.read_parquet(parquet_url)

    if persist and context:
        df = dask_client.persist(df)
        # publish the dataframe on the cluster under the given key
        dask_client.publish_dataset(df, name=dask_key)
        context.dask_client = dask_client

        # share the scheduler
        filepath = os.path.join(target_path, 'scheduler.json')
        dask_client.write_scheduler_file(filepath)
        context.log_artifact('scheduler', target_path=filepath)

    print(df.head())
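# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal local invocation of parquet_to_dask, assuming a parquet file at
# 'data.parquet' and an MLRun context obtained via mlrun.get_or_create_ctx
# (both the path and this invocation style are assumptions for the example):
#
#     from mlrun import get_or_create_ctx
#
#     ctx = get_or_create_ctx('parquet-to-dask-demo')
#     parquet_to_dask(ctx, 'data.parquet', shards=2, threads_per=2,
#                     dask_key='demo_df', target_path='/tmp')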
def load_dask(
    context: MLClientCtx,
    src_data: DataItem,
    dask_key: str = "dask_key",
    inc_cols: Optional[List[str]] = None,
    index_cols: Optional[List[str]] = None,
    dask_persist: bool = True,
    refresh_data: bool = True,
    scheduler_key: str = "scheduler",
) -> None:
    """Load a dataset into an existing dask cluster.

    Dask jobs define the dask client parameters at the job level; this method
    raises an error if no client is detected.

    :param context:       the function context
    :param src_data:      url of the data file or partitioned dataset as either
                          artifact DataItem, string, or path object (similar to
                          pandas read_csv)
    :param dask_key:      destination key of data on dask cluster and artifact store
    :param inc_cols:      include only these columns (very fast)
    :param index_cols:    list of index column names (can be a long-running process)
    :param dask_persist:  (True) should the data be persisted (through the
                          `client.persist` op)
    :param refresh_data:  (True) if the dask_key already exists in the dask cluster,
                          set to True to replace the existing cluster data, otherwise
                          an Exception is raised
    :param scheduler_key: (scheduler) the dask scheduler configuration, json also
                          logged as an artifact
    """
    if hasattr(context, "dask_client"):
        dask_client = context.dask_client
    else:
        raise Exception("a dask client was not found in the execution context")

    df = src_data.as_df(df_module=dd)

    if dask_persist:
        df = dask_client.persist(df)
        if dask_client.datasets and dask_key in dask_client.datasets:
            if not refresh_data:
                raise Exception(
                    f"dataset {dask_key} already exists on the cluster, "
                    "set refresh_data=True to replace it"
                )
            dask_client.unpublish_dataset(dask_key)
        dask_client.publish_dataset(df, name=dask_key)

    if context:
        context.dask_client = dask_client

    # share the scheduler, whether data is persisted or not
    dask_client.write_scheduler_file(scheduler_key + ".json")

    # we don't use log_dataset here until it can take into account
    # dask origin and apply dask describe.
    context.log_artifact(scheduler_key, local_path=scheduler_key + ".json")
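# --- Usage sketch (illustrative, not part of the original module) ---
# load_dask expects a dask client on the context, as provided when it runs as
# an MLRun "dask" job. For a quick local check, a client can be attached to
# the context manually; mlrun.get_dataitem and the "data.parquet" path below
# are assumptions made for this example:
#
#     import mlrun
#     from dask.distributed import Client, LocalCluster
#
#     ctx = mlrun.get_or_create_ctx("load-dask-demo")
#     ctx.dask_client = Client(LocalCluster(n_workers=2))
#     load_dask(ctx, mlrun.get_dataitem("data.parquet"), dask_key="demo_df")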