예제 #1
0
def _offsets_from_fixed_size_list(array):
    """Build the int32 offsets describing a fixed-size list array as a variable-size list.

    Fixed-size list arrays carry no offsets of their own: list ``i`` covers
    ``values[i * list_size:(i + 1) * list_size]``, so the offsets are the
    multiples of ``list_size``.

    NOTE(review): this assumes ``array.values`` starts at the first list of
    ``array`` (i.e. the array is not a slice) - confirm against callers.
    """
    list_size = array.type.list_size
    # A list comprehension (rather than range(start, stop, step)) also works for list_size == 0.
    return pa.array([i * list_size for i in range(len(array) + 1)],
                    pa.int32())


def cast_array_to_feature(array: pa.Array,
                          feature: "FeatureType",
                          allow_number_to_str=True):
    """Cast an array to the arrow type that corresponds to the requested feature type.

    For custom features like Audio or Image, it takes into account the "cast_storage" methods
    they defined to enable casting from other arrow types.

    Args:
        array (pa.Array): the PyArrow array to cast
        feature (FeatureType): the target feature type
        allow_number_to_str (bool, default ``True``): Whether to allow casting numbers to strings.
            Defaults to True.

    Raises:
        pa.ArrowInvalid: if the arrow data casting fails
        TypeError: if the target type is not supported, e.g.

            - if a field is missing
            - if casting from numbers to strings and allow_number_to_str is False

    Returns:
        pa.Array: the casted array
    """
    from .features import Sequence, get_nested_type

    # Recursive call that propagates the number-to-string policy to nested values.
    _c = partial(cast_array_to_feature,
                 allow_number_to_str=allow_number_to_str)

    if isinstance(array, pa.ExtensionArray):
        # Work on the storage array; the extension wrapper is dropped.
        array = array.storage
    if hasattr(feature, "cast_storage"):
        return feature.cast_storage(array)
    elif pa.types.is_struct(array.type):
        # feature must be a dict or Sequence(subfeatures_dict)
        if isinstance(feature, Sequence) and isinstance(feature.feature, dict):
            # A Sequence of a dict is handled as a dict of Sequences.
            feature = {
                name: Sequence(subfeature, length=feature.length)
                for name, subfeature in feature.feature.items()
            }
        if isinstance(feature, dict) and {field.name
                                          for field in array.type
                                          } == set(feature):
            # Fields are rebuilt in the order given by the feature dict.
            arrays = [
                _c(array.field(name), subfeature)
                for name, subfeature in feature.items()
            ]
            return pa.StructArray.from_arrays(arrays, names=list(feature))
    elif pa.types.is_list(array.type):
        # feature must be either [subfeature] or Sequence(subfeature)
        if isinstance(feature, list):
            return pa.ListArray.from_arrays(array.offsets,
                                            _c(array.values, feature[0]))
        elif isinstance(feature, Sequence):
            if feature.length > -1:
                # NOTE: only the total number of values is compared here;
                # the length of each individual list is not checked.
                if feature.length * len(array) == len(array.values):
                    return pa.FixedSizeListArray.from_arrays(
                        _c(array.values, feature.feature), feature.length)
            else:
                return pa.ListArray.from_arrays(
                    array.offsets, _c(array.values, feature.feature))
    elif pa.types.is_fixed_size_list(array.type):
        # feature must be either [subfeature] or Sequence(subfeature)
        if isinstance(feature, list):
            # Fixed-size list arrays have no ``offsets``; derive them from the list size.
            return pa.ListArray.from_arrays(
                _offsets_from_fixed_size_list(array),
                _c(array.values, feature[0]))
        elif isinstance(feature, Sequence):
            if feature.length > -1:
                if feature.length * len(array) == len(array.values):
                    return pa.FixedSizeListArray.from_arrays(
                        _c(array.values, feature.feature), feature.length)
            else:
                # Offsets must advance by list_size per list, not by one.
                return pa.ListArray.from_arrays(
                    _offsets_from_fixed_size_list(array),
                    _c(array.values, feature.feature))
    # Reached for non-nested arrays and for nested arrays whose feature did not match above.
    if pa.types.is_null(array.type):
        return array_cast(array,
                          get_nested_type(feature),
                          allow_number_to_str=allow_number_to_str)
    elif not isinstance(feature, (Sequence, dict, list, tuple)):
        # Non-container features are called to obtain the target arrow type.
        return array_cast(array,
                          feature(),
                          allow_number_to_str=allow_number_to_str)
    raise TypeError(
        f"Couldn't cast array of type\n{array.type}\nto\n{feature}")
예제 #2
0
def array_cast(array: pa.Array,
               pa_type: pa.DataType,
               allow_number_to_str=True):
    """Improved version of pa.Array.cast

    It supports casting pa.StructArray objects to re-order the fields.
    It also lets you control certain aspects of the casting, e.g. whether
    to disable casting numbers (floats or ints) to strings.

    Args:
        array (pa.Array): PyArrow array to cast
        pa_type (pa.DataType): target PyArrow type
        allow_number_to_str (bool, default ``True``): Whether to allow casting numbers to strings.
            Defaults to True.

    Raises:
        pa.ArrowInvalid: if the arrow data casting fails
        TypeError: if the target type is not supported, e.g.

            - if a field is missing
            - if the source and target fixed-size lists have different sizes
            - if casting from numbers to strings and allow_number_to_str is False

    Returns:
        pa.Array: the casted array
    """
    # Recursive call that propagates the number-to-string policy to nested values.
    _c = partial(array_cast, allow_number_to_str=allow_number_to_str)
    if isinstance(array, pa.ExtensionArray):
        # Work on the storage array; the extension wrapper is dropped.
        array = array.storage
    if isinstance(pa_type, pa.ExtensionType):
        return pa_type.wrap_array(array)
    elif pa.types.is_struct(array.type):
        if pa.types.is_struct(pa_type) and (
                {field.name for field in pa_type}
                == {field.name for field in array.type}):
            # Fields are rebuilt in the order of the target type.
            arrays = [
                _c(array.field(field.name), field.type) for field in pa_type
            ]
            return pa.StructArray.from_arrays(arrays, fields=list(pa_type))
    elif pa.types.is_list(array.type):
        if pa.types.is_fixed_size_list(pa_type):
            # NOTE: only the total number of values is compared here;
            # the length of each individual list is not checked.
            if pa_type.list_size * len(array) == len(array.values):
                return pa.FixedSizeListArray.from_arrays(
                    _c(array.values, pa_type.value_type),
                    pa_type.list_size,
                )
        elif pa.types.is_list(pa_type):
            return pa.ListArray.from_arrays(
                array.offsets, _c(array.values, pa_type.value_type))
    elif pa.types.is_fixed_size_list(array.type):
        if pa.types.is_fixed_size_list(pa_type):
            # Re-chunking the values with a different list size would silently
            # change the number and content of the lists, so sizes must match;
            # otherwise fall through to the TypeError below.
            if pa_type.list_size == array.type.list_size:
                return pa.FixedSizeListArray.from_arrays(
                    _c(array.values, pa_type.value_type),
                    pa_type.list_size,
                )
        elif pa.types.is_list(pa_type):
            # List i covers values[i * list_size:(i + 1) * list_size], so the
            # offsets are the multiples of list_size (not 0, 1, 2, ...).
            # NOTE(review): assumes array.values starts at the first list of
            # array (i.e. the array is not a slice) - confirm against callers.
            list_size = array.type.list_size
            offsets_arr = pa.array(
                [i * list_size for i in range(len(array) + 1)], pa.int32())
            return pa.ListArray.from_arrays(
                offsets_arr, _c(array.values, pa_type.value_type))
    else:
        if (not allow_number_to_str and pa.types.is_string(pa_type)
                and (pa.types.is_floating(array.type)
                     or pa.types.is_integer(array.type))):
            raise TypeError(
                f"Couldn't cast array of type {array.type} to {pa_type} since allow_number_to_str is set to {allow_number_to_str}"
            )
        return array.cast(pa_type)
    # Nested source arrays whose target type did not match any branch above.
    raise TypeError(
        f"Couldn't cast array of type\n{array.type}\nto\n{pa_type}")