Exemplo n.º 1
0
def _can_load_distributed(source: Data) -> bool:
    """Return True if it might be possible to use distributed data loading.

    The decision is made purely from the type (and, for strings, the suffix)
    of ``source``:

    * ``int``, ``float`` and ``bool`` values cannot be loaded distributedly.
    * Objects accepted by ``MLDataset.is_data_type`` or
      ``Modin.is_data_type`` can.
    * A string can only if it ends with ``".parquet"``.
    * A sequence can only if it is non-empty and its first element is a
      string.
    * pandas DataFrames/Series and numpy arrays cannot.
    * Everything else is allowed by default.

    Args:
        source: The data source to inspect.

    Returns:
        ``True`` if distributed loading may be attempted, ``False`` otherwise.
    """
    from xgboost_ray.data_sources.ml_dataset import MLDataset
    from xgboost_ray.data_sources.modin import Modin

    if isinstance(source, (int, float, bool)):
        return False
    elif MLDataset.is_data_type(source):
        return True
    elif Modin.is_data_type(source):
        return True
    elif isinstance(source, str):
        # Strings should point to files or URLs
        # Usually parquet files point to directories
        return source.endswith(".parquet")
    elif isinstance(source, Sequence):
        # Sequence of strings should point to files or URLs.
        # Only the first element is inspected. An empty sequence has no
        # first element, so report it as not loadable instead of letting
        # ``source[0]`` raise an IndexError.
        return len(source) > 0 and isinstance(source[0], str)
    elif isinstance(source, Iterable):
        # If we get an iterable but not a sequence, the best we can do
        # is check if we have a known non-distributed object
        if isinstance(source, (pd.DataFrame, pd.Series, np.ndarray)):
            return False

    # Per default, allow distributed loading.
    return True
Exemplo n.º 2
0
def _detect_distributed(source: Data) -> bool:
    """Decide whether distributed data loading should be attempted by default.

    Sources rejected by ``_can_load_distributed`` are never loaded
    distributedly. Sources accepted by ``MLDataset.is_data_type`` or
    ``Modin.is_data_type`` always are. For the remaining sources, strings and
    non-iterables default to distributed loading, while other iterables do so
    only when they are a sequence whose first element is a string.

    Args:
        source: The data source to inspect.

    Returns:
        ``True`` if distributed loading should be tried, ``False`` otherwise.
    """
    from xgboost_ray.data_sources.ml_dataset import MLDataset
    from xgboost_ray.data_sources.modin import Modin

    if not _can_load_distributed(source):
        return False

    if MLDataset.is_data_type(source) or Modin.is_data_type(source):
        return True

    # Plain strings and non-iterable objects are assumed to be loadable in a
    # distributed fashion.
    if isinstance(source, str) or not isinstance(source, Iterable):
        return True

    # What remains is a non-string iterable. Only a sequence of strings is
    # detected as distributed; any other iterable is detected as False per
    # default, which can be overridden by passing `distributed=True` to the
    # RayDMatrix object.
    is_string_sequence = isinstance(source, Sequence) and isinstance(
        source[0], str)
    return is_string_sequence