Exemplo n.º 1
0
def sequence_to_pydf(
    data: Sequence[Any],
    columns: Optional[ColumnsType] = None,
    orient: Optional[str] = None,
) -> "PyDataFrame":
    """
    Build a PyDataFrame from a generic Python sequence.

    The type of the first element decides the construction path:
    Series, dicts, nested sequences, or plain scalars.
    """
    series_list: List["PySeries"]

    if not data:
        # Empty input: delegate so the columns arg is still honoured.
        return dict_to_pydf({}, columns=columns)

    first = data[0]
    if isinstance(first, pli.Series):
        names = [s.name for s in data]
        columns, dtypes = _unpack_columns(columns or names, n_expected=len(data))
        series_list = []
        for idx, srs in enumerate(data):
            if not srs.name:  # TODO: Replace by `if srs.name is None` once allowed
                srs.rename(columns[idx], in_place=True)

            target_dtype = dtypes.get(columns[idx])
            if target_dtype and target_dtype != srs.dtype:
                srs = srs.cast(target_dtype)

            series_list.append(srs.inner())

    elif isinstance(first, dict):
        pydf = PyDataFrame.read_dicts(data)
        return _post_apply_columns(pydf, columns) if columns else pydf

    elif isinstance(first, Sequence) and not isinstance(first, str):
        # Infer orientation from the columns length when not given explicitly.
        if orient is None and columns is not None:
            orient = "col" if len(columns) == len(data) else "row"

        if orient == "row":
            pydf = PyDataFrame.read_rows(data)
            return _post_apply_columns(pydf, columns) if columns else pydf

        columns, dtypes = _unpack_columns(columns, n_expected=len(data))
        series_list = [
            pli.Series(columns[idx], values, dtypes.get(columns[idx])).inner()
            for idx, values in enumerate(data)
        ]

    else:
        # Scalars: the whole sequence becomes one single column.
        columns, dtypes = _unpack_columns(columns, n_expected=1)
        series_list = [pli.Series(columns[0], data, dtypes.get(columns[0])).inner()]

    series_list = _handle_columns_arg(series_list, columns=columns)
    return PyDataFrame(series_list)
Exemplo n.º 2
0
def dict_to_pydf(data: dict[str, Sequence[Any]],
                 columns: ColumnsType | None = None) -> PyDataFrame:
    """
    Construct a PyDataFrame from a dictionary of sequences.

    Parameters
    ----------
    data
        Mapping of column name to column values.
    columns
        Optional column spec; may also carry dtypes for the series.

    Returns
    -------
    PyDataFrame
    """
    if columns is not None:
        # the columns arg may also set the dtype of the series
        columns, dtypes = _unpack_columns(columns, lookup_names=data.keys())

        if not data and dtypes:
            # no data at all: create empty, but correctly typed, columns
            data_series = [
                pli.Series(name, [], dtypes.get(name)).inner()
                for name in columns
            ]
        else:
            data_series = [
                pli.Series(name, values, dtypes.get(name)).inner()
                for name, values in data.items()
            ]
        data_series = _handle_columns_arg(data_series, columns=columns)
        return PyDataFrame(data_series)

    if _NUMPY_AVAILABLE:
        # only start a thread pool from a reasonable size:
        # every column must be a numpy array with > 1000 elements
        all_numpy = all(
            isinstance(val, np.ndarray) and len(val) > 1000
            for val in data.values()
        )

        if all_numpy:
            # yes, multi-threading was easier in python here
            # we cannot run multiple threads that run python code
            # and release the gil in pyo3
            # it will deadlock.

            # dummy is threaded
            import multiprocessing.dummy

            pool_size = threadpool_size()
            # BUGFIX: use the pool as a context manager so its worker
            # threads are closed and joined; the original leaked them.
            with multiprocessing.dummy.Pool(pool_size) as pool:
                data_series = pool.map(
                    lambda t: pli.Series(t[0], t[1]).inner(),
                    list(data.items()),
                )
            return PyDataFrame(data_series)

    # fast path
    return PyDataFrame.read_dict(data)
Exemplo n.º 3
0
def sequence_to_pydf(
    data: Sequence[Any],
    columns: Optional[Sequence[str]] = None,
    orient: Optional[str] = None,
    nullable: bool = True,
) -> "PyDataFrame":
    """
    Build a PyDataFrame from a generic Python sequence.

    Dispatches on the type of the first element: Series, dicts,
    nested sequences, or scalar values.
    """
    series_list: List["PySeries"]
    if not data:
        series_list = []

    elif isinstance(data[0], pl.Series):
        series_list = []
        for idx, srs in enumerate(data):
            if not srs.name:  # TODO: Replace by `if srs.name is None` once allowed
                srs.rename(f"column_{idx}", in_place=True)
            series_list.append(srs.inner())

    elif isinstance(data[0], dict):
        pydf = PyDataFrame.read_dicts(data)
        if columns is not None:
            pydf.set_column_names(columns)
        return pydf

    elif isinstance(data[0], Sequence) and not isinstance(data[0], str):
        # Infer orientation from the columns length when not given.
        if orient is None and columns is not None:
            orient = "col" if len(columns) == len(data) else "row"

        if orient == "row":
            pydf = PyDataFrame.read_rows(data)
            if columns is not None:
                pydf.set_column_names(columns)
            return pydf

        series_list = [
            pl.Series(f"column_{idx}", values, nullable=nullable).inner()
            for idx, values in enumerate(data)
        ]

    else:
        # Scalar values form one single column.
        series_list = [pl.Series("column_0", data, nullable=nullable).inner()]

    series_list = _handle_columns_arg(series_list,
                                      columns=columns,
                                      nullable=nullable)
    return PyDataFrame(series_list)
Exemplo n.º 4
0
def arrow_to_pydf(data: pa.Table,
                  columns: Optional[Sequence[str]] = None,
                  rechunk: bool = True) -> "PyDataFrame":
    """
    Build a PyDataFrame from an Arrow Table.
    """
    if columns is not None:
        try:
            data = data.rename_columns(columns)
        except pa.lib.ArrowInvalid as e:
            raise ValueError(
                "Dimensions of columns arg must match data dimensions.") from e

    coerced = {}
    for idx, col in enumerate(data):
        # grab the name before coercion may replace the column object
        name = f"column_{idx}" if col._name is None else col._name
        coerced[name] = coerce_arrow(col)

    record_batches = pa.table(coerced).to_batches()
    pydf = PyDataFrame.from_arrow_record_batches(record_batches)
    return pydf.rechunk() if rechunk else pydf
Exemplo n.º 5
0
def _post_apply_columns(pydf: PyDataFrame,
                        columns: ColumnsType) -> PyDataFrame:
    """
    Apply 'columns' param _after_ PyDataFrame creation (if no alternative).
    """
    current_names, current_dtypes = pydf.columns(), pydf.dtypes()
    columns, dtypes = _unpack_columns(columns or current_names)
    if columns != current_names:
        pydf.set_column_names(columns)

    # collect cast expressions for columns whose requested dtype differs
    casts = []
    for idx, name in enumerate(columns):
        if name in dtypes and dtypes[name] != current_dtypes[idx]:
            casts.append(pli.col(name).cast(dtypes[name])._pyexpr)
    if casts:
        pydf = pydf.lazy().with_columns(casts).collect()
    return pydf
Exemplo n.º 6
0
def numpy_to_pydf(
    data: np.ndarray,
    columns: Optional[ColumnsType] = None,
    orient: Optional[str] = None,
) -> "PyDataFrame":
    """
    Build a PyDataFrame from a numpy ndarray (0, 1 or 2 dimensional).
    """
    shape = data.shape

    # how many columns the columns arg must describe
    if shape == (0,):
        n_columns = 0
    elif len(shape) == 1:
        n_columns = 1
    elif orient in ("row", None):
        n_columns = shape[1]
    else:
        n_columns = shape[0]

    columns, dtypes = _unpack_columns(columns, n_expected=n_columns)
    if columns and len(columns) != n_columns:
        raise ValueError(
            "Dimensions of columns arg must match data dimensions.")

    if shape == (0,):
        data_series = []

    elif len(shape) == 1:
        data_series = [
            pli.Series(columns[0], data, dtypes.get(columns[0])).inner()
        ]

    elif len(shape) == 2:
        # Infer orientation
        if orient is None:
            warnings.warn(
                "Default orientation for constructing DataFrame from numpy "
                'array will change from "row" to "column" in a future version. '
                "Specify orientation explicitly to silence this warning.",
                DeprecationWarning,
                stacklevel=2,
            )
            orient = "row"
        # Exchange if-block above for block below when removing warning
        # if orientation is None and columns is not None:
        #     orientation = "col" if len(columns) == shape[0] else "row"

        # select per-column slices along the axis the orientation implies
        pick = (lambda i: data[:, i]) if orient == "row" else (lambda i: data[i])
        data_series = [
            pli.Series(columns[i], pick(i), dtypes.get(columns[i])).inner()
            for i in range(n_columns)
        ]
    else:
        raise ValueError(
            "A numpy array should not have more than two dimensions.")

    data_series = _handle_columns_arg(data_series, columns=columns)
    return PyDataFrame(data_series)
Exemplo n.º 7
0
def arrow_to_pydf(data: "pa.Table",
                  columns: Optional[Sequence[str]] = None,
                  rechunk: bool = True) -> "PyDataFrame":
    """
    Construct a PyDataFrame from an Arrow Table.

    Parameters
    ----------
    data
        Arrow Table to convert.
    columns
        Optional replacement column names; must match the table width.
    rechunk
        If True, make the loaded data contiguous after construction.

    Raises
    ------
    ImportError
        If pyarrow is not available.
    ValueError
        If ``columns`` does not match the table dimensions.
    """
    if not _PYARROW_AVAILABLE:  # pragma: no cover
        raise ImportError(
            "'pyarrow' is required when constructing a PyDataFrame from an Arrow Table."
        )
    if columns is not None:
        try:
            data = data.rename_columns(columns)
        except pa.lib.ArrowInvalid as e:
            raise ValueError(
                "Dimensions of columns arg must match data dimensions.") from e

    data_dict = {}
    # dictionaries cannot be build in different batches (categorical does not allow that)
    # so we rechunk them and create them separate.
    dictionary_cols = {}
    # original column order; used below to restore ordering after the
    # dictionary columns are re-attached
    names = []
    for i, column in enumerate(data):
        # extract the name before casting
        if column._name is None:
            name = f"column_{i}"
        else:
            name = column._name
        names.append(name)

        column = coerce_arrow(column)
        if pa.types.is_dictionary(column.type):
            # dictionary-encoded columns go through the Series path,
            # keyed by position so they can be re-inserted later
            ps = arrow_to_pyseries(name, column, rechunk)
            dictionary_cols[i] = pli.wrap_s(ps)
        else:
            data_dict[name] = column

    if len(data_dict) > 0:
        tbl = pa.table(data_dict)

        # path for table without rows that keeps datatype
        if tbl.shape[0] == 0:
            pydf = pli.DataFrame._from_pandas(tbl.to_pandas())._df
        else:
            pydf = PyDataFrame.from_arrow_record_batches(tbl.to_batches())
    else:
        # no non-dictionary columns: start from an empty frame
        pydf = pli.DataFrame([])._df
    if rechunk:
        pydf = pydf.rechunk()

    if len(dictionary_cols) > 0:
        # re-insert the separately built dictionary columns, then
        # restore the original column order
        df = pli.wrap_df(pydf)
        for i, s in dictionary_cols.items():
            df[s.name] = s
        df = df[names]
        pydf = df._df
    return pydf
Exemplo n.º 8
0
def series_to_pydf(
    data: "pli.Series",
    columns: Optional[Sequence[str]] = None,
) -> "PyDataFrame":
    """
    Build a one-column PyDataFrame from a Polars Series.
    """
    wrapped = _handle_columns_arg([data.inner()], columns=columns)
    return PyDataFrame(wrapped)
Exemplo n.º 9
0
def dict_to_pydf(
    data: Dict[str, Sequence[Any]],
    columns: Optional[Sequence[str]] = None,
) -> "PyDataFrame":
    """
    Build a PyDataFrame from a mapping of column name to values.
    """
    series_list = [
        pli.Series(key, vals).inner() for key, vals in data.items()
    ]
    series_list = _handle_columns_arg(series_list, columns=columns)
    return PyDataFrame(series_list)
Exemplo n.º 10
0
def numpy_to_pydf(
    data: np.ndarray,
    columns: Optional[Sequence[str]] = None,
    orient: Optional[str] = None,
    nullable: bool = True,
) -> "PyDataFrame":
    """
    Build a PyDataFrame from a numpy ndarray (0, 1 or 2 dimensional).
    """
    shape = data.shape

    if shape == (0,):
        data_series = []

    elif len(shape) == 1:
        data_series = [pl.Series("column_0", data, nullable=False).inner()]

    elif len(shape) == 2:
        # Infer orientation
        if orient is None:
            warnings.warn(
                "Default orientation for constructing DataFrame from numpy "
                'array will change from "row" to "column" in a future version. '
                "Specify orientation explicitly to silence this warning.",
                DeprecationWarning,
                stacklevel=2,
            )
            orient = "row"
        # Exchange if-block above for block below when removing warning
        # if orientation is None and columns is not None:
        #     orientation = "col" if len(columns) == shape[0] else "row"

        # choose the slicing axis according to orientation
        if orient == "row":
            n_cols, pick = shape[1], (lambda i: data[:, i])
        else:
            n_cols, pick = shape[0], (lambda i: data[i])
        data_series = [
            pl.Series(f"column_{i}", pick(i), nullable=False).inner()
            for i in range(n_cols)
        ]
    else:
        raise ValueError(
            "A numpy array should not have more than two dimensions.")

    data_series = _handle_columns_arg(data_series,
                                      columns=columns,
                                      nullable=nullable)

    return PyDataFrame(data_series)
Exemplo n.º 11
0
def series_to_pydf(data: pli.Series,
                   columns: ColumnsType | None = None) -> PyDataFrame:
    """
    Build a one-column PyDataFrame from a Polars Series.
    """
    series_list = [data.inner()]
    inferred_names = [s.name() for s in series_list]
    columns, dtypes = _unpack_columns(columns or inferred_names, n_expected=1)
    if dtypes:
        # a dtype supplied via the columns arg overrides the series dtype
        target = next(iter(dtypes.values()))
        if target != data.dtype:
            series_list[0] = series_list[0].cast(target, True)

    series_list = _handle_columns_arg(series_list, columns=columns)
    return PyDataFrame(series_list)
Exemplo n.º 12
0
def dict_to_pydf(
    data: Dict[str, Sequence[Any]],
    columns: Optional[ColumnsType] = None,
) -> "PyDataFrame":
    """
    Build a PyDataFrame from a mapping of column name to values.
    """
    if columns is None:
        # fast path
        return PyDataFrame.read_dict(data)

    # the columns arg may also set the dtype of the series
    columns, dtypes = _unpack_columns(columns, lookup_names=data.keys())

    if data or not dtypes:
        series_list = [
            pli.Series(name, values, dtypes.get(name)).inner()
            for name, values in data.items()
        ]
    else:
        # empty data with dtypes supplied: build typed empty columns
        series_list = [
            pli.Series(name, [], dtypes.get(name)).inner() for name in columns
        ]
    series_list = _handle_columns_arg(series_list, columns=columns)
    return PyDataFrame(series_list)