def __init__(self, horovod_kwargs=None, data_format=PARQUET, **kwargs):
    super().__init__(data_format=data_format, **kwargs)
    self._df_engine = DaskEngine()
    self._horovod_kwargs = horovod_kwargs or {}
    self._tensorflow_kwargs = {}
    if data_format != PARQUET:
        raise ValueError(
            f'Data format {data_format} is not supported when using the Ray backend. '
            f'Try setting to `parquet`.'
        )

def __init__(self, data_format=PARQUET, **kwargs):
    super().__init__(data_format=data_format, **kwargs)
    self._df_engine = DaskEngine()
    if data_format != PARQUET:
        raise ValueError(
            f'Data format {data_format} is not supported when using the Dask backend. '
            f'Try setting to `parquet`.')

class RayBackend(RemoteTrainingMixin, Backend):
    def __init__(self, horovod_kwargs=None, data_format=PARQUET, **kwargs):
        super().__init__(data_format=data_format, **kwargs)
        self._df_engine = DaskEngine()
        self._horovod_kwargs = horovod_kwargs or {}
        self._tensorflow_kwargs = {}
        if data_format != PARQUET:
            raise ValueError(
                f'Data format {data_format} is not supported when using the Ray backend. '
                f'Try setting to `parquet`.'
            )

    def initialize(self):
        try:
            ray.init('auto', ignore_reinit_error=True)
        except ConnectionError:
            logger.info('Initializing new Ray cluster...')
            ray.init(ignore_reinit_error=True)
        dask.config.set(scheduler=ray_dask_get)
        self._df_engine.set_parallelism(**get_dask_kwargs())

    def initialize_tensorflow(self, **kwargs):
        # Make sure we don't claim any GPU resources on the head node
        initialize_tensorflow(gpus=-1)
        self._tensorflow_kwargs = kwargs

    def create_trainer(self, **kwargs):
        executable_kwargs = {**kwargs, **self._tensorflow_kwargs}
        return RayTrainer(self._horovod_kwargs, executable_kwargs)

    def create_predictor(self, **kwargs):
        executable_kwargs = {**kwargs, **self._tensorflow_kwargs}
        return RayPredictor(self._horovod_kwargs, executable_kwargs)

    @property
    def df_engine(self):
        return self._df_engine

    @property
    def supports_multiprocessing(self):
        return False

    def check_lazy_load_supported(self, feature):
        raise ValueError(f'RayBackend does not support lazy loading of data files at train time. '
                         f'Set preprocessing config `in_memory: True` for feature {feature[NAME]}')
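A minimal driver sketch for the class above. Direct instantiation and the `num_workers` Horovod option are assumptions for illustration, not taken from the source; it also assumes a Ray cluster is reachable or can be started locally.

# Hypothetical usage of RayBackend as defined above; `num_workers` is an
# assumed Horovod setting.
backend = RayBackend(horovod_kwargs={'num_workers': 2})
backend.initialize()                    # connect to (or start) Ray, route Dask through ray_dask_get
trainer = backend.create_trainer()      # RayTrainer carrying the Horovod + TensorFlow kwargs
predictor = backend.create_predictor()  # RayPredictor built the same way
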
def __init__(self, cache_format=PARQUET, engine=None, **kwargs):
    super().__init__(cache_format=cache_format, **kwargs)
    engine = engine or {}
    self._df_engine = DaskEngine(**engine)
    if cache_format not in [PARQUET, TFRECORD]:
        raise ValueError(
            f'Data format {cache_format} is not supported when using the Dask backend. '
            f'Try setting to `parquet`.')

def _get_df_engine(engine_config):
    if engine_config is None:
        return DaskEngine()
    engine_config = engine_config.copy()
    dtype = engine_config.pop('type', 'dask')
    engine_cls = _engine_registry.get(dtype)
    return engine_cls(**engine_config)

def _get_df_engine(processor):
    logger.info(f"Ray processor params: {processor}")
    if processor is None:
        # TODO ray: find an informed way to set the parallelism, in practice
        # it looks like Dask handles this well on its own most of the time
        return DaskEngine()
    processor_kwargs = processor.copy()
    dtype = processor_kwargs.pop('type', 'dask')
    engine_cls = _engine_registry.get(dtype)
    return engine_cls(**processor_kwargs)
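A hedged sketch of how the dispatch above might be exercised. The None fallback and the 'dask' type key are grounded in the snippet; any additional engine kwargs would be assumptions about a concrete engine's constructor.

# None falls back to a default DaskEngine (parallelism left to Dask itself).
engine = _get_df_engine(None)
# A dict is copied, its 'type' key selects the class from _engine_registry,
# and the remaining keys are forwarded to that engine's constructor.
engine = _get_df_engine({'type': 'dask'})
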
def __init__(self, horovod_kwargs=None):
    super().__init__()
    self._df_engine = DaskEngine()
    self._horovod_kwargs = horovod_kwargs or {}
    self._tensorflow_kwargs = {}

from unittest.mock import Mock

import numpy as np
import pandas as pd
import pytest

from ludwig.data.dataframe.pandas import PandasEngine
from ludwig.data.split import get_splitter

try:
    from ludwig.data.dataframe.dask import DaskEngine
except ImportError:
    DaskEngine = Mock


@pytest.mark.parametrize(
    ("df_engine",),
    [
        pytest.param(PandasEngine(), id="pandas"),
        pytest.param(DaskEngine(_use_ray=False), id="dask", marks=pytest.mark.distributed),
    ],
)
def test_random_split(df_engine):
    nrows = 100
    npartitions = 10

    df = pd.DataFrame(np.random.randint(0, 100, size=(nrows, 3)), columns=["A", "B", "C"])
    if isinstance(df_engine, DaskEngine):
        df = df_engine.df_lib.from_pandas(df, npartitions=npartitions)

    probs = (0.7, 0.1, 0.2)
    split_params = {

def __init__(self):
    super().__init__()
    self._df_engine = DaskEngine()

def _create_dask_engine(**kwargs):
    from ludwig.data.dataframe.dask import DaskEngine
    return DaskEngine(**kwargs)
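The factory above defers the Dask import to call time, so a pandas-only install never pays for (or fails on) importing dask. A hypothetical registry hookup for that pattern; the registry shape mirrors the _engine_registry dispatch snippets above and is an assumption here:

# Hypothetical: registering the lazy factory under the assumed 'dask' key.
_engine_registry = {'dask': _create_dask_engine}
engine = _engine_registry['dask']()  # imports and constructs DaskEngine on demand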