Exemplo n.º 1
0
def sequence_to_pyseries(
    name: str,
    values: Sequence[Any],
    dtype: Optional[Type[DataType]] = None,
    strict: bool = True,
) -> "PySeries":
    """
    Construct a PySeries from a sequence.

    Parameters
    ----------
    name
        Name given to the resulting series.
    values
        Sequence of Python values; entries may be ``None``.
    dtype
        Polars dtype to construct with. When ``None`` the dtype is inferred
        from the first non-None element; an empty sequence becomes
        ``Float32``.
    strict
        Forwarded unchanged to the underlying PySeries constructors.

    Returns
    -------
    PySeries

    Raises
    ------
    ImportError
        If the values are ``date``/``datetime``/``timedelta`` objects and
        pyarrow is not available.
    ValueError
        If the element type of a nested list/tuple cannot be mapped to an
        arrow type.
    """
    # Empty sequence defaults to Float32 type
    if not values and dtype is None:
        dtype = Float32

    if dtype is not None:
        constructor = polars_type_to_constructor(dtype)
        pyseries = constructor(name, values, strict)

        # temporal and categorical dtypes get an explicit cast after
        # construction
        if dtype in (Date, Datetime, Duration, Time, Categorical):
            pyseries = pyseries.cast(dtype, True)

        return pyseries

    # no dtype given: infer from the first non-None element; a sequence
    # holding only None is treated as float
    value = _get_first_non_none(values)
    dtype_ = type(value) if value is not None else float

    if dtype_ in {date, datetime, timedelta}:
        if not _PYARROW_AVAILABLE:  # pragma: no cover
            raise ImportError(
                "'pyarrow' is required for converting a Sequence of date or datetime values to a PySeries."
            )
        # let arrow infer dtype if not timedelta
        # arrow uses microsecond durations by default, not supported yet.
        return arrow_to_pyseries(name, pa.array(values))

    if dtype_ in (list, tuple):
        # `value` is never None in this branch, so the fallback to float has
        # to be decided on the inner element (e.g. `[[None]]` or `[[]]`)
        nested_value = _get_first_non_none(value)
        nested_dtype = type(nested_value) if nested_value is not None else float

        # recursively call Series constructor
        if nested_dtype == list:
            return sequence_to_pyseries(
                name=name,
                values=[
                    sequence_to_pyseries(name, seq, dtype=None, strict=strict)
                    for seq in values
                ],
                dtype=None,
                strict=strict,
            )

        # logs will show a panic if we infer wrong dtype
        # and its hard to error from rust side
        # to reduce the likelihood of this happening
        # we check the types of the first ~50 inner elements
        # if that check fails, we will hit the PySeries.new_object
        if not _PYARROW_AVAILABLE:
            # check lists for consistent inner types
            if isinstance(value, list):
                checked = 0
                equal_to_inner = True
                for lst in values:
                    for vl in lst:
                        # exact type comparison on purpose: a subclass
                        # instance (e.g. bool among ints) is a mismatch
                        if type(vl) != nested_dtype:
                            equal_to_inner = False
                            break
                        checked += 1
                        if checked > 50:
                            break
                    # leave the outer loop as well, so a later sub-list
                    # cannot overwrite a mismatch that was already found
                    if not equal_to_inner or checked > 50:
                        break
                if equal_to_inner:
                    dtype = py_type_to_dtype(nested_dtype)
                    try:
                        return PySeries.new_list(name, values, dtype)
                    except (KeyboardInterrupt, SystemExit):
                        raise
                    except BaseException:
                        # NOTE(review): the broad catch looks deliberate -
                        # a wrong inference shows up as a Rust panic, which
                        # PyO3 presumably raises as a BaseException subclass
                        # (PanicException); confirm before narrowing.
                        pass
            # pass we create an object if we get here
        else:
            try:
                nested_arrow_dtype = py_type_to_arrow_type(nested_dtype)
            except ValueError as e:  # pragma: no cover
                raise ValueError(
                    f"Cannot construct Series from sequence of {nested_dtype}."
                ) from e

            try:
                arrow_values = pa.array(values,
                                        pa.large_list(nested_arrow_dtype))
                return arrow_to_pyseries(name, arrow_values)
            except pa.lib.ArrowInvalid:
                pass

        # Convert mixed sequences like `[[12], "foo", 9]`
        return PySeries.new_object(name, values, strict)

    if dtype_ == pli.Series:
        # unwrap each Series via `.inner()` before building the list series
        return PySeries.new_series_list(name, [v.inner() for v in values],
                                        strict)
    if dtype_ == PySeries:
        return PySeries.new_series_list(name, values, strict)

    constructor = py_type_to_constructor(dtype_)

    # when the python type only maps to the generic object constructor,
    # prefer a numpy-based constructor if one exists for this type
    if constructor == PySeries.new_object:
        np_constructor = numpy_type_to_constructor(dtype_)
        if np_constructor is not None:
            values = np.array(values)  # type: ignore
            constructor = np_constructor

    return constructor(name, values, strict)
Exemplo n.º 2
0
def sequence_to_pyseries(
    name: str,
    values: Sequence[Any],
    dtype: PolarsDataType | None = None,
    strict: bool = True,
) -> PySeries:
    """
    Construct a PySeries from a sequence.

    Parameters
    ----------
    name
        Name given to the resulting series.
    values
        Sequence of Python values; entries may be ``None``.
    dtype
        Dtype to construct with. When ``None`` the dtype is inferred from
        the first non-None element; an empty sequence becomes ``Float32``.
        A Python temporal type (``date``, ``datetime``, ``timedelta``,
        ``time``) is also accepted when the values are integers.
    strict
        Forwarded unchanged to the underlying PySeries constructors.

    Returns
    -------
    PySeries

    Raises
    ------
    ImportError
        If the values are Python temporal objects and pyarrow is not
        available.
    """
    dtype_: type | None = None
    nested_dtype: PolarsDataType | type | None = None
    temporal_unit: str | None = None

    # empty sequence defaults to Float32 type
    if not values and dtype is None:
        dtype = Float32
    # lists defer to subsequent handling; identify nested type
    elif dtype == List:
        nested_dtype = getattr(dtype, "inner", None)
        dtype_ = list

    # infer temporal type handling
    py_temporal_types = {date, datetime, timedelta, time}
    pl_temporal_types = {Date, Datetime, Duration, Time}

    value = _get_first_non_none(values)
    if value is not None:
        if dtype in py_temporal_types and isinstance(value, int):
            dtype = py_type_to_dtype(dtype)  # construct from integer
        elif (dtype in pl_temporal_types or type(dtype)
              in pl_temporal_types) and not isinstance(value, int):
            # temporal dtype with non-integer values: route through the
            # python-type branch below, remembering the time unit if any
            temporal_unit = getattr(dtype, "tu", None)
            dtype_ = dtype_to_py_type(dtype)  # type: ignore[arg-type]

    if (dtype is not None) and is_polars_dtype(dtype) and (dtype_ is None):
        constructor = polars_type_to_constructor(dtype)
        pyseries = constructor(name, values, strict)

        # temporal and categorical dtypes get an explicit cast after
        # construction
        if dtype in (Date, Datetime, Duration, Time, Categorical):
            pyseries = pyseries.cast(dtype, True)
        return pyseries

    if dtype_ is None:
        dtype_ = float if (value is None) else type(value)

    if dtype_ in py_temporal_types:
        if not _PYARROW_AVAILABLE:  # pragma: no cover
            raise ImportError(
                "'pyarrow' is required for converting a Sequence of date or datetime values to a PySeries."
            )
        # let arrow infer dtype if not timedelta
        # arrow uses microsecond durations by default, not supported yet.
        arrow_dtype = (dtype_to_arrow_type(dtype) if
                       (dtype is not None and temporal_unit) else None)
        return arrow_to_pyseries(name, pa.array(values, type=arrow_dtype))

    if dtype_ in (list, tuple):
        if nested_dtype is None:
            # `value` is None when a List dtype was requested for an empty
            # or all-None sequence; the helper is not visible here, so do
            # not hand it None
            nested_value = (_get_first_non_none(value)
                            if value is not None else None)
            nested_dtype = type(
                nested_value) if nested_value is not None else float

        # recursively call Series constructor
        if nested_dtype == list:
            return sequence_to_pyseries(
                name=name,
                values=[
                    sequence_to_pyseries(name, seq, dtype=None, strict=strict)
                    for seq in values
                ],
                dtype=None,
                strict=strict,
            )

        # logs will show a panic if we infer wrong dtype
        # and its hard to error from rust side
        # to reduce the likelihood of this happening
        # we check the types of the first ~50 inner elements
        # if that check fails, we will hit the PySeries.new_object
        if not _PYARROW_AVAILABLE:
            # check lists for consistent inner types
            if isinstance(value, list):
                checked = 0
                equal_to_inner = True
                for lst in values:
                    for vl in lst:
                        # exact type comparison on purpose: a subclass
                        # instance (e.g. bool among ints) is a mismatch
                        if type(vl) != nested_dtype:
                            equal_to_inner = False
                            break
                        checked += 1
                        if checked > 50:
                            break
                    # leave the outer loop as well, so a later sub-list
                    # cannot overwrite a mismatch that was already found
                    if not equal_to_inner or checked > 50:
                        break
                if equal_to_inner:
                    try:
                        # an inner dtype taken from `dtype.inner` is already
                        # a polars dtype; only python types need mapping
                        inner_dtype = (nested_dtype
                                       if is_polars_dtype(nested_dtype) else
                                       py_type_to_dtype(nested_dtype))
                        return PySeries.new_list(name, values, inner_dtype)
                    except (KeyboardInterrupt, SystemExit):
                        raise
                    except BaseException:
                        # NOTE(review): the broad catch looks deliberate -
                        # a wrong inference shows up as a Rust panic, which
                        # PyO3 presumably raises as a BaseException subclass
                        # (PanicException); confirm before narrowing.
                        pass
            # pass we create an object if we get here
        else:
            try:
                to_arrow_type = (dtype_to_arrow_type
                                 if is_polars_dtype(nested_dtype) else
                                 py_type_to_arrow_type)
                nested_arrow_dtype = to_arrow_type(
                    nested_dtype  # type: ignore[arg-type]
                )
            except ValueError:  # pragma: no cover
                return sequence_from_anyvalue_or_object(name, values)
            try:
                arrow_values = pa.array(values,
                                        pa.large_list(nested_arrow_dtype))
                return arrow_to_pyseries(name, arrow_values)
            except (pa.lib.ArrowInvalid, pa.lib.ArrowTypeError):
                pass

        # Convert mixed sequences like `[[12], "foo", 9]`
        return PySeries.new_object(name, values, strict)

    if dtype_ == pli.Series:
        # unwrap each Series via `.inner()` before building the list series
        return PySeries.new_series_list(name, [v.inner() for v in values],
                                        strict)
    if dtype_ == PySeries:
        return PySeries.new_series_list(name, values, strict)

    constructor = py_type_to_constructor(dtype_)

    if constructor == PySeries.new_object:
        try:
            return PySeries.new_from_anyvalues(name, values)
        # raised if we cannot convert to Wrap<AnyValue>
        except RuntimeError:
            return sequence_from_anyvalue_or_object(name, values)

    return constructor(name, values, strict)