def dict_to_pydf(
    data: dict[str, Sequence[Any]], columns: ColumnsType | None = None
) -> PyDataFrame:
    """
    Construct a PyDataFrame from a dictionary of sequences.

    Parameters
    ----------
    data
        Mapping of column name -> sequence of column values.
    columns
        Optional column specification; besides renaming, it may also carry a
        dtype per column (resolved via ``_unpack_columns``).

    Returns
    -------
    PyDataFrame
    """
    if columns is not None:
        # the columns arg may also set the dtype of the series
        columns, dtypes = _unpack_columns(columns, lookup_names=data.keys())

        if not data and dtypes:
            # no data, but dtypes supplied: build empty, typed series
            data_series = [
                pli.Series(name, [], dtypes.get(name)).inner() for name in columns
            ]
        else:
            data_series = [
                pli.Series(name, values, dtypes.get(name)).inner()
                for name, values in data.items()
            ]

        data_series = _handle_columns_arg(data_series, columns=columns)
        return PyDataFrame(data_series)

    if _NUMPY_AVAILABLE:
        all_numpy = True
        for val in data.values():
            # only start a thread pool from a reasonable size.
            all_numpy = all_numpy and isinstance(val, np.ndarray) and len(val) > 1000
            if not all_numpy:
                break

        if all_numpy:
            # yes, multi-threading was easier in python here
            # we cannot run multiple threads that run python code
            # and release the gil in pyo3
            # it will deadlock.

            # dummy is threaded
            import multiprocessing.dummy

            pool_size = threadpool_size()
            # FIX: use the pool as a context manager so its worker threads
            # are terminated/joined on exit; the original leaked the pool.
            with multiprocessing.dummy.Pool(pool_size) as pool:
                data_series = pool.map(
                    lambda t: pli.Series(t[0], t[1]).inner(),
                    list(data.items()),
                )
            return PyDataFrame(data_series)

    # fast path
    return PyDataFrame.read_dict(data)
def dict_to_pydf(
    data: Dict[str, Sequence[Any]],
    columns: Optional[ColumnsType] = None,
) -> "PyDataFrame":
    """
    Construct a PyDataFrame from a dictionary of sequences.

    With no ``columns`` argument the dict is handed straight to the native
    reader; otherwise the columns spec (which may also carry per-column
    dtypes) is unpacked and each column is built as a Series first.
    """
    if columns is None:
        # fast path
        return PyDataFrame.read_dict(data)

    # the columns arg may also set the dtype of the series
    columns, dtypes = _unpack_columns(columns, lookup_names=data.keys())

    series_cols = []
    if data or not dtypes:
        for name, values in data.items():
            series_cols.append(pli.Series(name, values, dtypes.get(name)).inner())
    else:
        # empty data with dtypes supplied: materialise empty typed series
        for name in columns:
            series_cols.append(pli.Series(name, [], dtypes.get(name)).inner())

    series_cols = _handle_columns_arg(series_cols, columns=columns)
    return PyDataFrame(series_cols)