Example #1
# imports assumed from the original mlrun source (not shown in the snippet)
import os
from pathlib import Path
from typing import IO, AnyStr, List, Optional, Union

import dask.dataframe as dd
from dask.distributed import Client, LocalCluster
from mlrun.datastore import DataItem
from mlrun.execution import MLClientCtx

def parquet_to_dask(context: MLClientCtx,
                    parquet_url: Union[DataItem, str, Path, IO[AnyStr]],
                    inc_cols: Optional[List[str]] = None,
                    index_cols: Optional[List[str]] = None,
                    shards: int = 4,
                    threads_per: int = 4,
                    processes: bool = False,
                    memory_limit: str = '2GB',
                    persist: bool = True,
                    dask_key: str = 'my_dask_dataframe',
                    target_path: str = '') -> None:
    """Load parquet dataset into dask cluster
    
    If no cluster is found, a new local cluster is started and the data is
    persisted to it. It should not be necessary to create a new cluster when
    the function is run as a 'dask' job.
    
    :param context:         the function context
    :param parquet_url:     url of the parquet file or partitioned dataset as either
                            artifact DataItem, string, or path object (see pandas read_csv)
    :param inc_cols:        include only these columns (very fast)
    :param index_cols:      list of index column names (can be a long-running process)
    :param shards:          number of workers to launch
    :param threads_per:     number of threads per worker
    :param processes:       use processes (True) instead of threads for the workers
    :param memory_limit:    memory limit per worker
    :param persist:         (True) persist the dataframe on the cluster and publish it under `dask_key`
    :param dask_key:        name under which the dataframe is published on the dask cluster
    :param target_path:     directory in which to write the shared scheduler file
    """
    if hasattr(context, 'dask_client'):
        context.logger.info('found cluster...')
        dask_client = context.dask_client
    else:
        context.logger.info('starting new cluster...')
        cluster = LocalCluster(n_workers=shards,
                               threads_per_worker=threads_per,
                               processes=processes,
                               memory_limit=memory_limit)
        dask_client = Client(cluster)

    context.logger.info(dask_client)

    df = dd.read_parquet(parquet_url)

    if persist and context:
        df = dask_client.persist(df)
        # publish under dask_key so other clients can retrieve it with get_dataset()
        dask_client.publish_dataset(df, name=dask_key)
        context.dask_client = dask_client

        # share the scheduler
        filepath = os.path.join(target_path, 'scheduler.json')
        dask_client.write_scheduler_file(filepath)
        context.log_artifact('scheduler', target_path=filepath)

        print(df.head())
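
A minimal usage sketch (not part of the original function): once parquet_to_dask
has published the dataframe, another process can attach to the same cluster
through the scheduler file it wrote and pull the dataset by its key. The file
name and key below assume the defaults used above (empty target_path and
dask_key='my_dask_dataframe').

from dask.distributed import Client

# connect to the running cluster via the scheduler file logged as an artifact
client = Client(scheduler_file='scheduler.json')

# retrieve the dataframe published under the default dask_key
df = client.get_dataset('my_dask_dataframe')
print(df.head())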
Example #2
# imports assumed from the original mlrun source (not shown in the snippet)
from typing import List, Optional

import dask.dataframe as dd
from mlrun.datastore import DataItem
from mlrun.execution import MLClientCtx

def load_dask(
        context: MLClientCtx,
        src_data: DataItem,
        dask_key: str = "dask_key",
        inc_cols: Optional[List[str]] = None,
        index_cols: Optional[List[str]] = None,
        dask_persist: bool = True,
        refresh_data: bool = True,
        scheduler_key: str = "scheduler"
) -> None:
    """Load dataset into an existing dask cluster

    Dask jobs define the dask client parameters at the job level; this method raises an error if no client is detected.

    :param context:         the function context
    :param src_data:        url of the data file or partitioned dataset as either
                            artifact DataItem, string, or path object (similar to
                            pandas read_csv)
    :param dask_key:        destination key of data on dask cluster and artifact store
    :param inc_cols:        include only these columns (very fast)
    :param index_cols:      list of index column names (can be a long-running process)
    :param dask_persist:    (True) should the data be persisted (through the `client.persist` op)
    :param refresh_data:    (True) if the dask_key already exists on the dask cluster, replace the
                            existing cluster data.  Set to False to raise an Exception instead.
    :param scheduler_key:   (scheduler) name under which the dask scheduler configuration is written
                            to a json file and logged as an artifact
    """
    if hasattr(context, "dask_client"):
        dask_client = context.dask_client
    else:
        raise Exception("a dask client was not found in the execution context")

    df = src_data.as_df(df_module=dd)

    if dask_persist:
        df = dask_client.persist(df)
        if dask_client.datasets and dask_key in dask_client.datasets:
            if not refresh_data:
                raise Exception(f"dataset {dask_key} is already published on the cluster, "
                                "set refresh_data=True to replace it")
            dask_client.unpublish_dataset(dask_key)
        dask_client.publish_dataset(df, name=dask_key)

    if context:
        context.dask_client = dask_client

    # share the scheduler, whether data is persisted or not
    dask_client.write_scheduler_file(scheduler_key + ".json")

    # log_dataset is not used here until it can take the dask origin
    # into account and apply dask's describe
    context.log_artifact(scheduler_key, local_path=scheduler_key + ".json")
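
A hedged sketch of how load_dask might be invoked as an mlrun 'dask' job,
assuming the mlrun SDK's code_to_function/run interface; the source file name
and data URL below are hypothetical placeholders:

import mlrun

# wrap the file containing load_dask as a dask-kind mlrun function, so the
# dask client is created at the job level and exposed on the context
fn = mlrun.code_to_function(name='load-dask',
                            filename='load_dask.py',   # hypothetical source file
                            kind='dask')

# run the handler against a stored dataset and publish it under dask_key
fn.run(handler='load_dask',
       inputs={'src_data': 'store://my-project/my-parquet-data'},  # hypothetical artifact URL
       params={'dask_key': 'my_dask_dataframe'})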