Example #1
def __init__(self, horovod_kwargs=None, data_format=PARQUET, **kwargs):
    super().__init__(data_format=data_format, **kwargs)
    self._df_engine = DaskEngine()
    self._horovod_kwargs = horovod_kwargs or {}
    self._tensorflow_kwargs = {}
    if data_format != PARQUET:
        raise ValueError(
            f'Data format {data_format} is not supported when using the Ray backend. '
            f'Try setting to `parquet`.'
        )
Example #2
def __init__(self, data_format=PARQUET, **kwargs):
    super().__init__(data_format=data_format, **kwargs)
    self._df_engine = DaskEngine()
    if data_format != PARQUET:
        raise ValueError(
            f'Data format {data_format} is not supported when using the Dask backend. '
            f'Try setting to `parquet`.')
Example #3
class RayBackend(RemoteTrainingMixin, Backend):
    def __init__(self, horovod_kwargs=None, data_format=PARQUET, **kwargs):
        super().__init__(data_format=data_format, **kwargs)
        self._df_engine = DaskEngine()
        self._horovod_kwargs = horovod_kwargs or {}
        self._tensorflow_kwargs = {}
        if data_format != PARQUET:
            raise ValueError(
                f'Data format {data_format} is not supported when using the Ray backend. '
                f'Try setting to `parquet`.'
            )

    def initialize(self):
        try:
            ray.init('auto', ignore_reinit_error=True)
        except ConnectionError:
            logger.info('Initializing new Ray cluster...')
            ray.init(ignore_reinit_error=True)

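        # Run Dask task graphs on Ray workers (the Dask-on-Ray scheduler)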
        dask.config.set(scheduler=ray_dask_get)
        self._df_engine.set_parallelism(**get_dask_kwargs())

    def initialize_tensorflow(self, **kwargs):
        # Make sure we don't claim any GPU resources on the head node
        initialize_tensorflow(gpus=-1)
        self._tensorflow_kwargs = kwargs

    def create_trainer(self, **kwargs):
        executable_kwargs = {**kwargs, **self._tensorflow_kwargs}
        return RayTrainer(self._horovod_kwargs, executable_kwargs)

    def create_predictor(self, **kwargs):
        executable_kwargs = {**kwargs, **self._tensorflow_kwargs}
        return RayPredictor(self._horovod_kwargs, executable_kwargs)

    @property
    def df_engine(self):
        return self._df_engine

    @property
    def supports_multiprocessing(self):
        return False

    def check_lazy_load_supported(self, feature):
        raise ValueError(f'RayBackend does not support lazy loading of data files at train time. '
                         f'Set preprocessing config `in_memory: True` for feature {feature[NAME]}')
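A minimal driver sketch for the class above (hypothetical values: the `horovod_kwargs` and `gpus` settings below are illustrative, not taken from the project):

backend = RayBackend(horovod_kwargs={'num_workers': 2})  # hypothetical Horovod settings
backend.initialize()                    # attach to a running Ray cluster, or start a new one
backend.initialize_tensorflow(gpus=1)   # stored, then forwarded to remote workers
trainer = backend.create_trainer()      # RayTrainer built from the stored kwargs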
Example #4
def __init__(self, cache_format=PARQUET, engine=None, **kwargs):
    super().__init__(cache_format=cache_format, **kwargs)
    engine = engine or {}
    self._df_engine = DaskEngine(**engine)
    if cache_format not in [PARQUET, TFRECORD]:
        raise ValueError(
            f'Cache format {cache_format} is not supported when using the Dask backend. '
            f'Try setting to `parquet` or `tfrecord`.')
Example #5
def _get_df_engine(engine_config):
    if engine_config is None:
        return DaskEngine()

    engine_config = engine_config.copy()

    dtype = engine_config.pop('type', 'dask')
    engine_cls = _engine_registry.get(dtype)
    return engine_cls(**engine_config)
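For reference, a sketch of the inputs this helper handles (the `'type'` key selects a class from `_engine_registry`; any remaining keys are passed through to that engine's constructor — the commented-out kwarg below is purely illustrative):

engine = _get_df_engine(None)              # no config: default DaskEngine()
engine = _get_df_engine({'type': 'dask'})  # explicit engine selection
# leftover keys become constructor kwargs, e.g.:
# engine = _get_df_engine({'type': 'dask', 'some_engine_kwarg': 42})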
Example #6
File: ray.py Project: cxz/ludwig
def _get_df_engine(processor):
    logger.info(f"Ray processor params: {processor}")
    if processor is None:
        # TODO ray: find an informed way to set the parallelism, in practice
        #  it looks like Dask handles this well on its own most of the time
        return DaskEngine()

    processor_kwargs = processor.copy()

    dtype = processor_kwargs.pop('type', 'dask')
    engine_cls = _engine_registry.get(dtype)

    return engine_cls(**processor_kwargs)
Example #7
def __init__(self, horovod_kwargs=None):
    super().__init__()
    self._df_engine = DaskEngine()
    self._horovod_kwargs = horovod_kwargs or {}
    self._tensorflow_kwargs = {}
Example #8
from unittest.mock import Mock

import numpy as np
import pandas as pd
import pytest

from ludwig.data.dataframe.pandas import PandasEngine
from ludwig.data.split import get_splitter

try:
    from ludwig.data.dataframe.dask import DaskEngine
except ImportError:
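    # dask is optional; fall back to Mock so this module still imports without it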
    DaskEngine = Mock


@pytest.mark.parametrize(
    ("df_engine", ),
    [
        pytest.param(PandasEngine(), id="pandas"),
        pytest.param(DaskEngine(_use_ray=False),
                     id="dask",
                     marks=pytest.mark.distributed),
    ],
)
def test_random_split(df_engine):
    nrows = 100
    npartitions = 10

    df = pd.DataFrame(np.random.randint(0, 100, size=(nrows, 3)),
                      columns=["A", "B", "C"])
    if isinstance(df_engine, DaskEngine):
        df = df_engine.df_lib.from_pandas(df, npartitions=npartitions)

    probs = (0.7, 0.1, 0.2)
    split_params = {
Example #9
def __init__(self):
    super().__init__()
    self._df_engine = DaskEngine()
Example #10
def _create_dask_engine(**kwargs):
    from ludwig.data.dataframe.dask import DaskEngine

    return DaskEngine(**kwargs)
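The function-local import keeps dask an optional dependency: this module still imports cleanly when dask is not installed, and the cost is paid only when a Dask engine is actually requested. An illustrative call (no kwargs, since the accepted options depend on `DaskEngine`'s constructor):

engine = _create_dask_engine()  # dask is imported only at this point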