Example #1
import collections


def _finish_dataset(client, ddf, output_path, fs, output_format, cpu):
    # Finish data writing
    if client:
        client.cancel(ddf)
        ddf = None
        # Run _worker_finish on every worker; returns {worker_address: result}
        out = client.run(_worker_finish, output_path)

        general_md = []
        special_md = []
        for (gen, spec) in out.values():
            general_md.append(gen)
            if spec:
                special_md.append(spec)

        general_md = _merge_general_metadata(general_md)
        special_md = dict(collections.ChainMap(*special_md))
    else:
        ddf = None
        general_md, special_md = _worker_finish(output_path)

    # Write metadata on client
    if not isinstance(output_path, str):
        output_path = str(output_path)

    wc, fs = _writer_cls_factory(output_format, output_path, cpu)
    wc.write_general_metadata(general_md, fs, output_path)
    wc.write_special_metadata(special_md, fs, output_path)

    # Clean writer caches
    if client:
        client.run(clean_worker_cache, "writer")
    else:
        clean_worker_cache("writer")
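The client branch above fans `_worker_finish` out to every worker with `client.run`, which returns a dict keyed by worker address, then merges the per-worker metadata; `dict(collections.ChainMap(*special_md))` flattens the per-worker dicts into one. A minimal self-contained sketch of that pattern, with `_collect_metadata` as a hypothetical stand-in for `_worker_finish`:

import collections

from distributed import Client, LocalCluster


def _collect_metadata():
    # Hypothetical stand-in for _worker_finish: each worker returns a
    # (general, special) metadata pair for the files it wrote.
    return {"num_rows": 100}, {"part-0.parquet": {"num_rows": 100}}


if __name__ == "__main__":
    client = Client(LocalCluster(n_workers=2, threads_per_worker=1))
    out = client.run(_collect_metadata)  # {worker_address: (gen, spec)}
    general_md = [gen for gen, _ in out.values()]
    special_md = dict(collections.ChainMap(*(spec for _, spec in out.values() if spec)))
    print(general_md, special_md)
    client.close()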
Example #2
    def build_and_process_graph(
        self,
        dataset,
        end_phase=None,
        output_path=None,
        record_stats=True,
        shuffle=None,
        output_format=None,
        out_files_per_proc=None,
        apply_ops=True,
        num_io_threads=0,
        dtypes=None,
    ):
        """Build Dask-task graph for workflow.

        Full graph is only executed if `output_format` is specified.
        """
        # Check shuffle argument
        shuffle = _check_shuffle_arg(shuffle)

        # Reorder tasks for two-phase workflows
        # TODO: Generalize this type of optimization
        self.reorder_tasks()

        end = end_phase if end_phase else len(self.phases)

        if output_format not in ("parquet", "hugectr", None):
            raise ValueError(
                f"Output format {output_format} not yet supported with Dask.")

        # Clear worker caches to be "safe"
        if self.client:
            self.client.run(clean_worker_cache)
        else:
            clean_worker_cache()

        self.set_ddf(dataset, shuffle=(shuffle is not None))
        if apply_ops:
            self._base_phase = 0  # Set _base_phase
            for idx, _ in enumerate(self.phases[:end]):
                self.exec_phase(idx,
                                record_stats=record_stats,
                                update_ddf=(idx == (end - 1)))
            self._base_phase = 0  # Reset _base_phase

        if dtypes:
            ddf = self.get_ddf()
            _meta = _set_dtypes(ddf._meta, dtypes)
            self.set_ddf(ddf.map_partitions(_set_dtypes, dtypes, meta=_meta))

        if output_format:
            output_path = output_path or "./"
            output_path = str(output_path)
            self.ddf_to_dataset(
                output_path,
                output_format=output_format,
                shuffle=shuffle,
                out_files_per_proc=out_files_per_proc,
                num_threads=num_io_threads,
            )
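The `dtypes` branch casts every partition with `map_partitions`, first building the cast empty `_meta` frame so Dask knows the output schema up front. A small sketch of that idiom, with `_set_dtypes` as a hypothetical stand-in for the helper used above:

import dask.dataframe as dd
import pandas as pd


def _set_dtypes(df, dtypes):
    # Hypothetical stand-in for the _set_dtypes helper: cast the listed columns
    return df.astype(dtypes)


ddf = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3], "b": [0.5, 1.5, 2.5]}), npartitions=2)
dtypes = {"a": "float32"}

# Cast the empty meta frame first so map_partitions knows the output schema
_meta = _set_dtypes(ddf._meta, dtypes)
ddf = ddf.map_partitions(_set_dtypes, dtypes, meta=_meta)
print(ddf.dtypes)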
Example #3
    def build_and_process_graph(
        self,
        dataset,
        end_phase=None,
        output_path=None,
        record_stats=True,
        shuffle=None,
        output_format=None,
        out_files_per_proc=None,
        apply_ops=True,
        num_io_threads=0,
    ):
        """ Build Dask-task graph for workflow.

            Full graph is only executed if `output_format` is specified.
        """
        end = end_phase if end_phase else len(self.phases)

        if output_format not in ("parquet", None):
            raise ValueError(f"Output format {output_format} not yet supported with Dask.")

        # Reorder tasks for two-phase workflows
        # TODO: Generalize this type of optimization
        self.reorder_tasks(end)

        # Clear worker caches to be "safe"
        if self.client:
            self.client.run(clean_worker_cache)
        else:
            clean_worker_cache()

        self.set_ddf(dataset)
        if apply_ops:
            for idx, _ in enumerate(self.phases[:end]):
                self.exec_phase(idx, record_stats=record_stats)
        if output_format:
            output_path = output_path or "./"
            output_path = str(output_path)
            self.ddf_to_dataset(
                output_path,
                shuffle=shuffle,
                out_files_per_proc=out_files_per_proc,
                num_threads=num_io_threads,
            )
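The `end_phase` argument lets callers stop the workflow after a prefix of its phases: `end = end_phase if end_phase else len(self.phases)` together with the `self.phases[:end]` slice runs everything when `end_phase` is None. A toy sketch of that slicing logic (phase names are hypothetical):

phases = ["compute_moments", "compute_categories", "apply_transforms"]


def run_phases(end_phase=None):
    # None (or 0) runs every phase; a positive integer runs only a prefix
    end = end_phase if end_phase else len(phases)
    for idx, name in enumerate(phases[:end]):
        print(f"executing phase {idx}: {name}")


run_phases()             # all three phases
run_phases(end_phase=2)  # only the first two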
Example #4
    def _clear_worker_cache(self):
        # Clear worker caches to be "safe"
        if self.client:
            self.client.run(clean_worker_cache)
        else:
            clean_worker_cache()
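`Client.run` executes a plain function once on every worker process, outside the task graph, so the same cache-clearing helper serves both modes. A self-contained sketch of that dispatch, with a hypothetical module-level cache standing in for the writer caches:

from distributed import Client, LocalCluster

_CACHE = {}


def clean_worker_cache():
    # Hypothetical stand-in: drop whatever this process has cached
    _CACHE.clear()


def clear_worker_cache(client=None):
    # Same client-or-local dispatch as _clear_worker_cache above
    if client:
        client.run(clean_worker_cache)  # runs once on every worker
    else:
        clean_worker_cache()            # single-process fallback


if __name__ == "__main__":
    clear_worker_cache()                # local mode
    cluster = LocalCluster(n_workers=2, threads_per_worker=1)
    client = Client(cluster)
    clear_worker_cache(client)          # distributed mode
    client.close()
    cluster.close()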