Пример #1
0
def dict_to_pydf(data: dict[str, Sequence[Any]],
                 columns: ColumnsType | None = None) -> PyDataFrame:
    """
    Construct a PyDataFrame from a dictionary of sequences.

    Parameters
    ----------
    data
        Mapping of column name to the values for that column.
    columns
        Optional column specification. It is unpacked with
        ``_unpack_columns`` into column names and a name -> dtype lookup,
        so it may also set the dtype of the resulting series.

    Returns
    -------
    PyDataFrame
        Built from one series per entry, or via ``PyDataFrame.read_dict``
        when no ``columns`` are given and the numpy path does not apply.
    """
    if columns is not None:
        # the columns arg may also set the dtype of the series
        columns, dtypes = _unpack_columns(columns, lookup_names=data.keys())

        if not data and dtypes:
            # no data at all: create empty, typed series from the
            # requested columns
            data_series = [
                pli.Series(name, [], dtypes.get(name)).inner()
                for name in columns
            ]
        else:
            data_series = [
                pli.Series(name, values, dtypes.get(name)).inner()
                for name, values in data.items()
            ]
        data_series = _handle_columns_arg(data_series, columns=columns)
        return PyDataFrame(data_series)

    if _NUMPY_AVAILABLE:
        # only start a thread pool from a reasonable size: every value has
        # to be a numpy array holding more than 1000 elements.
        all_numpy = all(
            isinstance(val, np.ndarray) and len(val) > 1000
            for val in data.values()
        )

        if all_numpy:
            # yes, multi-threading was easier in python here
            # we cannot run multiple threads that run python code
            # and release the gil in pyo3
            # it will deadlock.

            # dummy is threaded
            import multiprocessing.dummy

            pool_size = threadpool_size()
            # use the pool as a context manager so its worker threads are
            # shut down once `map` has returned; previously the pool was
            # never closed and its threads leaked on every call.
            with multiprocessing.dummy.Pool(pool_size) as pool:
                data_series = pool.map(
                    lambda t: pli.Series(t[0], t[1]).inner(),
                    list(data.items()),
                )
            return PyDataFrame(data_series)

    # fast path
    return PyDataFrame.read_dict(data)
Пример #2
0
def dict_to_pydf(
    data: Dict[str, Sequence[Any]],
    columns: Optional[ColumnsType] = None,
) -> "PyDataFrame":
    """
    Build a PyDataFrame out of a mapping of column name -> sequence of values.

    When ``columns`` is omitted the dictionary is handed straight to
    ``PyDataFrame.read_dict``. Otherwise ``columns`` is unpacked into column
    names and a dtype lookup, one series is created per column, and the
    series are passed through ``_handle_columns_arg`` before the frame is
    constructed.
    """
    # fast path: nothing to rename or cast
    if columns is None:
        return PyDataFrame.read_dict(data)

    # the columns arg may carry dtypes next to the column names
    column_names, dtype_lookup = _unpack_columns(columns, lookup_names=data.keys())

    if data or not dtype_lookup:
        # regular case: one series per dictionary entry
        series = []
        for key, column_values in data.items():
            built = pli.Series(key, column_values, dtype_lookup.get(key))
            series.append(built.inner())
    else:
        # no data but dtypes are known: create empty, typed series
        series = []
        for key in column_names:
            built = pli.Series(key, [], dtype_lookup.get(key))
            series.append(built.inner())

    series = _handle_columns_arg(series, columns=column_names)
    return PyDataFrame(series)