def _finish_dataset(client, ddf, output_path, fs, output_format, cpu):
    """Finalize a written dataset: collect per-worker metadata and write it out.

    Parameters
    ----------
    client : distributed.Client or None
        Dask client. When present, metadata collection runs on every worker
        via ``client.run``; otherwise it runs locally in this process.
    ddf : dask.dataframe.DataFrame or None
        The (already-computed) collection backing the write; cancelled and
        dropped here since the data itself has been persisted by the workers.
    output_path : str or path-like
        Destination root of the dataset; coerced to ``str`` before use.
    fs : filesystem object
        NOTE(review): this parameter is overwritten by the return of
        ``_writer_cls_factory`` below and never read — presumably kept for
        call-site compatibility; confirm before removing.
    output_format : str
        Format tag passed to ``_writer_cls_factory`` to pick the writer class.
    cpu : bool
        CPU/GPU backend flag forwarded to ``_writer_cls_factory``.
    """
    # Finish data writing
    if client:
        # Release the distributed graph first so workers can flush/close
        # their writers before we ask them for finalization metadata.
        client.cancel(ddf)
        ddf = None
        # Run _worker_finish on every worker; each returns a
        # (general_metadata, special_metadata) pair.
        out = client.run(_worker_finish, output_path)

        general_md = []
        special_md = []
        for (gen, spec) in out.values():
            general_md.append(gen)
            if spec:
                special_md.append(spec)

        # Merge the per-worker results into single metadata objects.
        # ChainMap gives first-seen-wins semantics for duplicate keys.
        general_md = _merge_general_metadata(general_md)
        special_md = dict(collections.ChainMap(*special_md))
    else:
        # No client: everything ran in this process, so finish locally.
        ddf = None
        general_md, special_md = _worker_finish(output_path)

    # Write metadata on client
    if not isinstance(output_path, str):
        output_path = str(output_path)

    # Note: this rebinds `fs` — the writer class decides the filesystem.
    wc, fs = _writer_cls_factory(output_format, output_path, cpu)
    wc.write_general_metadata(general_md, fs, output_path)
    wc.write_special_metadata(special_md, fs, output_path)

    # Clean writer caches
    if client:
        client.run(clean_worker_cache, "writer")
    else:
        clean_worker_cache("writer")
def build_and_process_graph( self, dataset, end_phase=None, output_path=None, record_stats=True, shuffle=None, output_format=None, out_files_per_proc=None, apply_ops=True, num_io_threads=0, dtypes=None, ): """Build Dask-task graph for workflow. Full graph is only executed if `output_format` is specified. """ # Check shuffle argument shuffle = _check_shuffle_arg(shuffle) # Reorder tasks for two-phase workflows # TODO: Generalize this type of optimization self.reorder_tasks() end = end_phase if end_phase else len(self.phases) if output_format not in ("parquet", "hugectr", None): raise ValueError( f"Output format {output_format} not yet supported with Dask.") # Clear worker caches to be "safe" if self.client: self.client.run(clean_worker_cache) else: clean_worker_cache() self.set_ddf(dataset, shuffle=(shuffle is not None)) if apply_ops: self._base_phase = 0 # Set _base_phase for idx, _ in enumerate(self.phases[:end]): self.exec_phase(idx, record_stats=record_stats, update_ddf=(idx == (end - 1))) self._base_phase = 0 # Re-Set _base_phase if dtypes: ddf = self.get_ddf() _meta = _set_dtypes(ddf._meta, dtypes) self.set_ddf(ddf.map_partitions(_set_dtypes, dtypes, meta=_meta)) if output_format: output_path = output_path or "./" output_path = str(output_path) self.ddf_to_dataset( output_path, output_format=output_format, shuffle=shuffle, out_files_per_proc=out_files_per_proc, num_threads=num_io_threads, )
def build_and_process_graph(
    self,
    dataset,
    end_phase=None,
    output_path=None,
    record_stats=True,
    shuffle=None,
    output_format=None,
    out_files_per_proc=None,
    apply_ops=True,
    num_io_threads=0,
):
    """Build the Dask task graph for this workflow.

    The full graph is only executed when `output_format` is specified;
    otherwise the graph is constructed (and phases optionally applied)
    without writing any output.
    """
    last_phase = end_phase if end_phase else len(self.phases)

    if output_format not in ("parquet", None):
        raise ValueError("Output format not yet supported with Dask.")

    # Two-phase workflows benefit from a reordered task list.
    # TODO: Generalize this type of optimization
    self.reorder_tasks(last_phase)

    # Start from clean worker caches to be "safe".
    if not self.client:
        clean_worker_cache()
    else:
        self.client.run(clean_worker_cache)

    self.set_ddf(dataset)

    if apply_ops:
        for phase_idx, _phase in enumerate(self.phases[:last_phase]):
            self.exec_phase(phase_idx, record_stats=record_stats)

    if output_format:
        self.ddf_to_dataset(
            str(output_path or "./"),
            shuffle=shuffle,
            out_files_per_proc=out_files_per_proc,
            num_threads=num_io_threads,
        )
def _clear_worker_cache(self):
    """Drop worker caches — broadcast to all workers when a client exists,
    otherwise clear the local (single-process) cache directly."""
    # Clear worker caches to be "safe"
    if not self.client:
        clean_worker_cache()
    else:
        self.client.run(clean_worker_cache)