Пример #1
0
def sequence_from_anyvalue_or_object(name: str, values: Sequence[Any]) -> "PySeries":
    """
    Build a PySeries as a final fallback when no typed constructor applies.

    The values are first handed to ``PySeries.new_from_anyvalues``. If that
    raises ``RuntimeError``, the values are stored through
    ``PySeries.new_object`` instead (called with ``False`` as its third
    argument).
    """
    try:
        pyseries = PySeries.new_from_anyvalues(name, values)
    except RuntimeError:
        # raised if we cannot convert to Wrap<AnyValue>
        pyseries = PySeries.new_object(name, values, False)
    return pyseries
Пример #2
0
def sequence_to_pyseries(
    name: str,
    values: Sequence[Any],
    dtype: Optional[Type[DataType]] = None,
    strict: bool = True,
) -> "PySeries":
    """
    Construct a PySeries from a sequence.

    Parameters
    ----------
    name
        Name passed to the selected constructor.
    values
        The sequence to convert.
    dtype
        Explicit polars dtype. When None, the type is inferred from the first
        non-None element of ``values``; an empty sequence defaults to Float32.
    strict
        Forwarded to the selected PySeries constructor.

    Raises
    ------
    ImportError
        If date/datetime values or nested sequences must be converted and
        pyarrow is not available.
    ValueError
        If the inner element type of a nested sequence has no arrow type.
    """
    # Empty sequence defaults to Float32 type
    if not values and dtype is None:
        dtype = Float32

    if dtype is not None:
        constructor = polars_type_to_constructor(dtype)
        pyseries = constructor(name, values, strict)
        # the date dtypes are built by the constructor first and cast afterwards
        if dtype == Date32:
            pyseries = pyseries.cast(str(pl.Date32), True)
        elif dtype == Date64:
            pyseries = pyseries.cast(str(pl.Date64), True)
        return pyseries

    else:
        value = _get_first_non_none(values)
        dtype_ = type(value) if value is not None else float

        if dtype_ == date or dtype_ == datetime:
            if not _PYARROW_AVAILABLE:
                raise ImportError(
                    "'pyarrow' is required for converting a Sequence of date or datetime values to a PySeries."
                )
            return arrow_to_pyseries(name, pa.array(values))

        elif dtype_ == list or dtype_ == tuple or dtype_ == pl.Series:
            nested_value = _get_first_non_none(value)
            # Test the *inner* value: an empty or all-None first element must
            # fall back to float rather than resolve to NoneType.
            nested_dtype = type(nested_value) if nested_value is not None else float

            if not _PYARROW_AVAILABLE:
                raise ImportError(
                    f"'pyarrow' is required for converting a Sequence of {nested_dtype} to a PySeries."
                )

            try:
                nested_arrow_dtype = py_type_to_arrow_type(nested_dtype)
            except ValueError as e:
                raise ValueError(
                    f"Cannot construct Series from sequence of {nested_dtype}."
                ) from e

            try:
                arrow_values = pa.array(values,
                                        pa.large_list(nested_arrow_dtype))
                return arrow_to_pyseries(name, arrow_values)
            # failure expected for mixed sequences like `[[12], "foo", 9]`
            except pa.lib.ArrowInvalid:
                return PySeries.new_object(name, values, strict)

        else:
            constructor = py_type_to_constructor(dtype_)
            return constructor(name, values, strict)
Пример #3
0
def arrow_to_pyseries(name: str,
                      values: "pa.Array",
                      rechunk: bool = True) -> "PySeries":
    """
    Construct a PySeries from an Arrow array.

    The input is first passed through ``coerce_arrow``. If the result exposes
    ``num_chunks``, every chunk is converted and appended in order, and the
    result is rechunked in place when ``rechunk`` is true. Otherwise the
    array is converted directly and ``rechunk`` is not consulted.
    """
    array = coerce_arrow(values)

    # objects without chunk information are converted in one step
    if not hasattr(array, "num_chunks"):
        return PySeries.from_arrow(name, array)

    if array.num_chunks > 1:
        chunks = array.iterchunks()
        # the first chunk seeds the series, the remaining ones are appended
        pys = PySeries.from_arrow(name, next(chunks))
        for chunk in chunks:
            pys.append(PySeries.from_arrow(name, chunk))
    else:
        pys = PySeries.from_arrow(name, array.combine_chunks())

    if rechunk:
        pys.rechunk(in_place=True)

    return pys
Пример #4
0
def sequence_to_pyseries(
    name: str,
    values: Sequence[Any],
    dtype: Optional[Type[DataType]] = None,
) -> "PySeries":
    """
    Construct a PySeries from a sequence.

    Parameters
    ----------
    name
        Name passed to the selected constructor.
    values
        The sequence to convert.
    dtype
        Explicit polars dtype. When None, the type is inferred from the first
        non-None element of ``values``; an empty sequence defaults to Float32.

    Raises
    ------
    ValueError
        If the inner element type of a nested sequence has no arrow type.
    """
    # Empty sequence defaults to Float32 type
    if not values and dtype is None:
        dtype = Float32

    if dtype is not None:
        constructor = polars_type_to_constructor(dtype)
        pyseries = constructor(name, values)
        # the date dtypes are built by the constructor first and cast afterwards
        if dtype == Date32:
            pyseries = pyseries.cast_date32()
        elif dtype == Date64:
            pyseries = pyseries.cast_date64()
        return pyseries

    else:
        value = _get_first_non_none(values)
        dtype_ = type(value) if value is not None else float

        if dtype_ == date or dtype_ == datetime:
            return arrow_to_pyseries(name, pa.array(values))

        elif dtype_ == list or dtype_ == tuple:
            nested_value = _get_first_non_none(value)
            # Test the *inner* value: an empty or all-None first element must
            # fall back to float rather than resolve to NoneType.
            nested_dtype = type(nested_value) if nested_value is not None else float

            try:
                nested_arrow_dtype = py_type_to_arrow_type(nested_dtype)
            except ValueError as e:
                raise ValueError(
                    f"Cannot construct Series from sequence of {nested_dtype}."
                ) from e

            try:
                arrow_values = pa.array(values,
                                        pa.large_list(nested_arrow_dtype))
                return arrow_to_pyseries(name, arrow_values)
            # failure expected for mixed sequences like `[[12], "foo", 9]`
            except pa.lib.ArrowInvalid:
                return PySeries.new_object(name, values)

        else:
            constructor = py_type_to_constructor(dtype_)
            return constructor(name, values)
Пример #5
0
def numpy_to_pyseries(
    name: str, values: np.ndarray, nullable: bool = True, strict: bool = True
) -> "PySeries":
    """
    Construct a PySeries from a numpy array.

    Parameters
    ----------
    name
        Name passed to the selected constructor.
    values
        Array to convert. One-dimensional arrays go through the constructor
        chosen by ``numpy_type_to_constructor``; any other shape is stored
        through ``PySeries.new_object``.
    nullable
        Passed as the third constructor argument for float32/float64 arrays.
    strict
        Passed as the third constructor argument for all other 1-D arrays.
    """
    # Read the contiguity flag instead of going through `values.data`:
    # exporting a buffer raises for dtypes the buffer protocol cannot
    # describe (e.g. datetime64), while the flag is available for any array.
    if not values.flags["C_CONTIGUOUS"]:
        values = np.array(values)

    if len(values.shape) == 1:
        dtype = values.dtype.type
        constructor = numpy_type_to_constructor(dtype)
        if dtype == np.float32 or dtype == np.float64:
            return constructor(name, values, nullable)
        else:
            return constructor(name, values, strict)
    else:
        return PySeries.new_object(name, values)
Пример #6
0
def numpy_to_pyseries(name: str,
                      values: np.ndarray,
                      strict: bool = True,
                      nan_to_null: bool = False) -> "PySeries":
    """
    Construct a PySeries from a numpy array.

    Arrays that are not C-contiguous are copied first. One-dimensional arrays
    are handed to the constructor selected by ``numpy_type_to_constructor``,
    which receives ``nan_to_null`` for float32/float64 data and ``strict``
    otherwise. Arrays of any other shape are stored through
    ``PySeries.new_object``.
    """
    if not values.flags["C_CONTIGUOUS"]:
        values = np.array(values)

    # anything that is not 1-D is stored as objects
    if len(values.shape) != 1:
        return PySeries.new_object(name, values, strict)

    scalar_type = values.dtype.type
    constructor = numpy_type_to_constructor(scalar_type)
    is_float = scalar_type == np.float32 or scalar_type == np.float64
    # float constructors take the nan-handling flag in place of `strict`
    third_arg = nan_to_null if is_float else strict
    return constructor(name, values, third_arg)
Пример #7
0
def sequence_to_pyseries(
    name: str,
    values: Sequence[Any],
    dtype: Optional[Type[DataType]] = None,
    strict: bool = True,
) -> "PySeries":
    """
    Construct a PySeries from a sequence.

    Parameters
    ----------
    name
        Name passed to the selected constructor.
    values
        The sequence to convert.
    dtype
        Explicit polars dtype. When None, the type is inferred from the first
        non-None element of ``values``; an empty sequence defaults to Float32.
    strict
        Forwarded to the selected PySeries constructor.

    Raises
    ------
    ImportError
        If date, datetime or timedelta values must be converted and pyarrow
        is not available.
    ValueError
        If pyarrow is available and the inner element type of a nested
        sequence has no arrow type.
    """
    # Empty sequence defaults to Float32 type
    if not values and dtype is None:
        dtype = Float32

    if dtype is not None:
        constructor = polars_type_to_constructor(dtype)
        pyseries = constructor(name, values, strict)

        if dtype in (Date, Datetime, Duration, Time, Categorical):
            pyseries = pyseries.cast(dtype, True)

        return pyseries

    else:
        value = _get_first_non_none(values)
        dtype_ = type(value) if value is not None else float

        if dtype_ in {date, datetime, timedelta}:
            if not _PYARROW_AVAILABLE:  # pragma: no cover
                raise ImportError(
                    "'pyarrow' is required for converting a Sequence of date or datetime values to a PySeries."
                )
            # let arrow infer dtype if not timedelta
            # arrow uses microsecond durations by default, not supported yet.
            return arrow_to_pyseries(name, pa.array(values))

        elif dtype_ == list or dtype_ == tuple:
            nested_value = _get_first_non_none(value)
            # Test the *inner* value: an empty or all-None first element must
            # fall back to float rather than resolve to NoneType.
            nested_dtype = type(nested_value) if nested_value is not None else float

            # recursively call Series constructor
            if nested_dtype == list:
                return sequence_to_pyseries(
                    name=name,
                    values=[
                        sequence_to_pyseries(name,
                                             seq,
                                             dtype=None,
                                             strict=strict) for seq in values
                    ],
                    dtype=None,
                    strict=strict,
                )

            # logs will show a panic if we infer wrong dtype
            # and its hard to error from rust side
            # to reduce the likelihood of this happening
            # we check the inner type of roughly the first 50 elements
            # if that check fails, we will hit the PySeries.new_object
            if not _PYARROW_AVAILABLE:
                # check lists for consistent inner types
                if isinstance(value, list):
                    count = 0
                    equal_to_inner = True
                    for lst in values:
                        for vl in lst:
                            equal_to_inner = type(vl) == nested_dtype
                            if not equal_to_inner or count > 50:
                                break
                            count += 1
                        # Leave the outer loop as well: otherwise the next
                        # list overwrites `equal_to_inner` and hides a
                        # mismatch, and the sample limit never ends the scan.
                        if not equal_to_inner or count > 50:
                            break
                    if equal_to_inner:
                        dtype = py_type_to_dtype(nested_dtype)
                        try:
                            return PySeries.new_list(name, values, dtype)
                        # NOTE(review): deliberately broad; a panic on the
                        # rust side presumably surfaces as an exception that
                        # does not derive from Exception - confirm.
                        except BaseException:
                            pass
                # pass we create an object if we get here
            else:
                try:
                    nested_arrow_dtype = py_type_to_arrow_type(nested_dtype)
                except ValueError as e:  # pragma: no cover
                    raise ValueError(
                        f"Cannot construct Series from sequence of {nested_dtype}."
                    ) from e

                try:
                    arrow_values = pa.array(values,
                                            pa.large_list(nested_arrow_dtype))
                    return arrow_to_pyseries(name, arrow_values)
                except pa.lib.ArrowInvalid:
                    pass

            # Convert mixed sequences like `[[12], "foo", 9]`
            return PySeries.new_object(name, values, strict)

        elif dtype_ == pli.Series:
            return PySeries.new_series_list(name, [v.inner() for v in values],
                                            strict)
        elif dtype_ == PySeries:
            return PySeries.new_series_list(name, values, strict)

        else:
            constructor = py_type_to_constructor(dtype_)

            # no native constructor for this python type: try the numpy
            # constructors before settling for an object series
            if constructor == PySeries.new_object:
                np_constructor = numpy_type_to_constructor(dtype_)
                if np_constructor is not None:
                    values = np.array(values)  # type: ignore
                    constructor = np_constructor

            return constructor(name, values, strict)
Пример #8
0
def sequence_to_pyseries(
    name: str,
    values: Sequence[Any],
    dtype: PolarsDataType | None = None,
    strict: bool = True,
) -> PySeries:
    """
    Construct a PySeries from a sequence.

    Parameters
    ----------
    name
        Name passed to the selected constructor.
    values
        The sequence to convert.
    dtype
        Requested dtype. When None, the type is inferred from the first
        non-None element of ``values``; an empty sequence defaults to Float32.
        A python temporal type combined with integer values is translated to
        the matching polars dtype.
    strict
        Forwarded to the selected PySeries constructor.

    Raises
    ------
    ImportError
        If temporal python values must be converted and pyarrow is not
        available.
    """
    dtype_: type | None = None
    nested_dtype: PolarsDataType | type | None = None
    temporal_unit: str | None = None

    # empty sequence defaults to Float32 type
    if not values and dtype is None:
        dtype = Float32
    # lists defer to subsequent handling; identify nested type
    elif dtype == List:
        nested_dtype = getattr(dtype, "inner", None)
        dtype_ = list

    # infer temporal type handling
    py_temporal_types = {date, datetime, timedelta, time}
    pl_temporal_types = {Date, Datetime, Duration, Time}

    value = _get_first_non_none(values)
    if value is not None:
        if dtype in py_temporal_types and isinstance(value, int):
            dtype = py_type_to_dtype(dtype)  # construct from integer
        elif (dtype in pl_temporal_types or type(dtype)
              in pl_temporal_types) and not isinstance(value, int):
            temporal_unit = getattr(dtype, "tu", None)
            dtype_ = dtype_to_py_type(dtype)  # type: ignore[arg-type]

    if (dtype is not None) and is_polars_dtype(dtype) and (dtype_ is None):
        constructor = polars_type_to_constructor(dtype)
        pyseries = constructor(name, values, strict)

        if dtype in (Date, Datetime, Duration, Time, Categorical):
            pyseries = pyseries.cast(dtype, True)
        return pyseries
    else:
        if dtype_ is None:
            dtype_ = float if (value is None) else type(value)

        if dtype_ in py_temporal_types:
            if not _PYARROW_AVAILABLE:  # pragma: no cover
                raise ImportError(
                    "'pyarrow' is required for converting a Sequence of date or datetime values to a PySeries."
                )
            # let arrow infer dtype if not timedelta
            # arrow uses microsecond durations by default, not supported yet.
            arrow_dtype = (dtype_to_arrow_type(dtype) if
                           (dtype is not None and temporal_unit) else None)
            return arrow_to_pyseries(name, pa.array(values, type=arrow_dtype))

        elif dtype_ == list or dtype_ == tuple:
            if nested_dtype is None:
                # `value` is None when an explicit List dtype forced this
                # branch but `values` holds no non-None entry; there is
                # nothing to inspect then, so fall back to float below.
                nested_value = (_get_first_non_none(value)
                                if value is not None else None)
                nested_dtype = type(
                    nested_value) if nested_value is not None else float

            # recursively call Series constructor
            if nested_dtype == list:
                return sequence_to_pyseries(
                    name=name,
                    values=[
                        sequence_to_pyseries(name,
                                             seq,
                                             dtype=None,
                                             strict=strict) for seq in values
                    ],
                    dtype=None,
                    strict=strict,
                )

            # logs will show a panic if we infer wrong dtype
            # and its hard to error from rust side
            # to reduce the likelihood of this happening
            # we check the inner type of roughly the first 50 elements
            # if that check fails, we will hit the PySeries.new_object
            if not _PYARROW_AVAILABLE:
                # check lists for consistent inner types
                if isinstance(value, list):
                    count = 0
                    equal_to_inner = True
                    for lst in values:
                        for vl in lst:
                            equal_to_inner = type(vl) == nested_dtype
                            if not equal_to_inner or count > 50:
                                break
                            count += 1
                        # Leave the outer loop as well: otherwise the next
                        # list overwrites `equal_to_inner` and hides a
                        # mismatch, and the sample limit never ends the scan.
                        if not equal_to_inner or count > 50:
                            break
                    if equal_to_inner:
                        dtype = py_type_to_dtype(nested_dtype)
                        try:
                            return PySeries.new_list(name, values, dtype)
                        # NOTE(review): deliberately broad; a panic on the
                        # rust side presumably surfaces as an exception that
                        # does not derive from Exception - confirm.
                        except BaseException:
                            pass
                # pass we create an object if we get here
            else:
                try:
                    to_arrow_type = (dtype_to_arrow_type
                                     if is_polars_dtype(nested_dtype) else
                                     py_type_to_arrow_type)
                    nested_arrow_dtype = to_arrow_type(
                        nested_dtype  # type: ignore[arg-type]
                    )
                except ValueError:  # pragma: no cover
                    return sequence_from_anyvalue_or_object(name, values)
                try:
                    arrow_values = pa.array(values,
                                            pa.large_list(nested_arrow_dtype))
                    return arrow_to_pyseries(name, arrow_values)
                except (pa.lib.ArrowInvalid, pa.lib.ArrowTypeError):
                    pass

            # Convert mixed sequences like `[[12], "foo", 9]`
            return PySeries.new_object(name, values, strict)

        elif dtype_ == pli.Series:
            return PySeries.new_series_list(name, [v.inner() for v in values],
                                            strict)
        elif dtype_ == PySeries:
            return PySeries.new_series_list(name, values, strict)
        else:
            constructor = py_type_to_constructor(dtype_)

            if constructor == PySeries.new_object:
                # The helper already tries the AnyValue conversion before it
                # falls back to an object series, so a single call is enough.
                return sequence_from_anyvalue_or_object(name, values)

            return constructor(name, values, strict)
Пример #9
0
def arrow_to_pyseries(name: str, values: pa.Array) -> "PySeries":
    """
    Construct a PySeries from an Arrow array.

    The array is passed through ``coerce_arrow`` and the result is handed to
    ``PySeries.from_arrow`` together with ``name``.
    """
    return PySeries.from_arrow(name, coerce_arrow(values))