def from_mem_views(data_mem, mask_mem=None, null_count=None, name=None):
    """Build a Column from device memory.

    Parameters
    ----------
    data_mem : device array or nvstrings.nvstrings
        The column data. An ``nvstrings`` object is passed to
        ``build_column`` directly with an ``object`` dtype; anything else is
        wrapped in a ``Buffer`` and its own ``dtype`` attribute is used.
    mask_mem : device array, optional
        Validity mask. Only used when ``data_mem`` is not an ``nvstrings``
        object; it is wrapped in a ``Buffer`` when given.
    null_count : optional
        Forwarded unchanged to ``build_column``.
    name : optional
        Forwarded unchanged to ``build_column``.

    Returns
    -------
    Whatever ``columnops.build_column`` returns for the given arguments.
    """
    from cudf.dataframe import columnops

    # String data: the nvstrings object itself acts as the buffer and no
    # separate mask is forwarded.
    if isinstance(data_mem, nvstrings.nvstrings):
        return columnops.build_column(
            name=name,
            buffer=data_mem,
            dtype=np.dtype("object"),
            null_count=null_count,
        )

    wrapped_data = Buffer(data_mem)
    wrapped_mask = Buffer(mask_mem) if mask_mem is not None else None
    return columnops.build_column(
        name=name,
        buffer=wrapped_data,
        dtype=data_mem.dtype,
        mask=wrapped_mask,
        null_count=null_count,
    )
def from_cffi_view(cffi_view):
    """Build a Column from a cffi ``gdf_column*`` struct.

    The data and mask memory are obtained from
    ``_gdf.cffi_view_to_column_mem`` and the dtype from
    ``_gdf.gdf_to_np_dtype(cffi_view.dtype)``.

    Parameters
    ----------
    cffi_view : cffi struct (gdf_column*)
        The view to convert.

    Returns
    -------
    Whatever ``columnops.build_column`` returns for the extracted memory.
    """
    from cudf.dataframe import columnops

    data_mem, mask_mem = _gdf.cffi_view_to_column_mem(cffi_view)
    dtype = _gdf.gdf_to_np_dtype(cffi_view.dtype)

    # nvstrings data is passed through as-is, without a mask.
    if isinstance(data_mem, nvstrings.nvstrings):
        return columnops.build_column(data_mem, dtype)

    wrapped_data = Buffer(data_mem)
    wrapped_mask = Buffer(mask_mem) if mask_mem is not None else None
    return columnops.build_column(wrapped_data, dtype, mask=wrapped_mask)
def column_empty(row_count, dtype, masked, categories=None):
    """Allocate a new, uninitialised column of ``row_count`` rows.

    Parameters
    ----------
    row_count : int
        Number of rows to allocate.
    dtype : anything accepted by ``pd.api.types.pandas_dtype``
        Requested dtype of the column.
    masked : bool
        When true, a mask is created with ``cudautils.make_empty_mask``.
    categories : optional
        Categories for a categorical column. When omitted and ``dtype`` is
        categorical, the dtype's own categories are used (an empty list if
        it has none).

    Returns
    -------
    The result of ``build_column(data, dtype, mask, categories)``.
    """
    dtype = pd.api.types.pandas_dtype(dtype)
    mask = cudautils.make_empty_mask(row_count) if masked else None

    if categories is None and is_categorical_dtype(dtype):
        if dtype.categories is None:
            categories = []
        else:
            categories = dtype.categories

    if categories is not None:
        # Categorical: the codes use the smallest scalar type that can hold
        # len(categories).
        codes_dtype = min_scalar_type(len(categories))
        data = Buffer(rmm.device_array((row_count, ), dtype=codes_dtype))
        dtype = "category"
    elif dtype.kind in "OU":
        # Object / unicode dtypes are backed by nvstrings.
        if row_count == 0:
            data = nvstrings.to_device([])
        else:
            scratch = rmm.device_array((row_count, ), dtype="float64")
            data = nvstrings.dtos(
                scratch, len(scratch), nulls=mask, bdevmem=True
            )
    else:
        data = Buffer(rmm.device_array((row_count, ), dtype=dtype))

    if mask is not None:
        mask = Buffer(mask)

    from cudf.dataframe.columnops import build_column

    return build_column(data, dtype, mask, categories)
def __init__(self, values, name=None):
    """Initialise the index from string-like ``values``.

    Parameters
    ----------
    values : StringColumn, StringIndex or other
        A ``StringColumn`` is copied. For a ``StringIndex`` its ``values``
        are copied and, when ``name`` is None, its name is adopted. Anything
        else is sent through ``nvstrings.to_device`` and built into an
        ``object`` column.
    name : optional
        Name stored on ``self.name``.
    """
    if isinstance(values, StringColumn):
        column = values.copy()
    elif isinstance(values, StringIndex):
        # Inherit the source index's name unless one was given explicitly.
        name = values.name if name is None else name
        column = values.values.copy()
    else:
        device_strings = nvstrings.to_device(values)
        column = columnops.build_column(device_strings, dtype='object')

    self._values = column
    self.name = name
def __init__(self, values, **kwargs):
    """Initialise the index from string-like ``values``.

    Parameters
    ----------
    values : StringColumn, StringIndex or other
        A ``StringColumn`` is copied. For a ``StringIndex`` its ``_values``
        are copied. Anything else is sent through ``nvstrings.to_device``
        and built into an ``object`` column.
    **kwargs
        First passed through ``_setdefault_name(values, kwargs)``, then
        forwarded to the parent initialiser.
    """
    kwargs = _setdefault_name(values, kwargs)

    if isinstance(values, StringColumn):
        column = values.copy()
    elif isinstance(values, StringIndex):
        column = values._values.copy()
    else:
        device_strings = nvstrings.to_device(values)
        column = columnops.build_column(device_strings, dtype="object")

    super(StringIndex, self).__init__(column, **kwargs)
    # The resulting index is expected to contain no nulls.
    assert self._values.null_count == 0
def sort_by_values(self, ascending=True, na_position="last"):
    """Sort the column and return the sorted values with their positions.

    Parameters
    ----------
    ascending : bool, default True
        Forwarded to ``self.data.order`` as ``asc``.
    na_position : {"last", "first"}, default "last"
        Where nulls are placed; translated into the ``nullfirst`` flag of
        ``self.data.order``.

    Returns
    -------
    (col_keys, col_inds)
        ``col_inds`` is an int32 column holding the ordering written by
        ``self.data.order``; ``col_keys`` is ``self`` indexed by it.

    Raises
    ------
    ValueError
        If ``na_position`` is neither ``"last"`` nor ``"first"``.
    """
    # Validate before allocating anything. Previously an unknown value left
    # `nullfirst` unbound and failed later with an UnboundLocalError.
    if na_position == "last":
        nullfirst = False
    elif na_position == "first":
        nullfirst = True
    else:
        raise ValueError(
            "invalid na_position: {!r} (expected 'last' or 'first')".format(
                na_position
            )
        )

    idx_dev_arr = rmm.device_array(len(self), dtype="int32")
    dev_ptr = get_ctype_ptr(idx_dev_arr)
    # The ordering is written into idx_dev_arr through dev_ptr. The leading
    # positional argument (2) is kept from the original call; its meaning is
    # defined by the order() API -- TODO confirm.
    self.data.order(2, asc=ascending, nullfirst=nullfirst, devptr=dev_ptr)

    col_inds = columnops.build_column(
        Buffer(idx_dev_arr), idx_dev_arr.dtype, mask=None
    )
    col_keys = self[col_inds.data.mem]
    return col_keys, col_inds
def from_dlpack(pycapsule_obj):
    """Converts from a DLPack tensor to a cuDF object.

    DLPack is an open-source memory tensor structure:
    `dmlc/dlpack <https://github.com/dmlc/dlpack>`_.

    The PyCapsule passed in wraps a pointer to a DLPack tensor. The tensor
    data is deep copied into the returned cuDF object.

    Parameters
    ----------
    pycapsule_obj : PyCapsule
        Input DLPack tensor pointer which is encapsulated in a PyCapsule
        object.

    Returns
    -------
    A cuDF Series when the conversion yields a single column, otherwise a
    cuDF DataFrame whose columns are labelled by their position.

    Raises
    ------
    ValueError
        If the underlying conversion reports an empty dataset.
    """
    try:
        res, valids = cpp_dlpack.from_dlpack(pycapsule_obj)
    except GDFError as err:
        # Only the "empty dataset" error is translated; anything else is
        # re-raised unchanged.
        if str(err) != "b'GDF_DATASET_EMPTY'":
            raise err
        raise ValueError(
            "Cannot create a cuDF Object from a DLPack tensor of 0 size")

    cols = []
    for position in range(len(valids)):
        validity = valids[position]
        mask = Buffer(validity) if validity else None
        column_mem = res[position]
        cols.append(
            columnops.build_column(
                Buffer(column_mem), dtype=column_mem.dtype, mask=mask
            )
        )

    if len(cols) == 1:
        return Series(cols[0])

    df = DataFrame()
    for position, col in enumerate(cols):
        df[position] = col
    return df
def len(self):
    """
    Computes the length of each element in the Series/Index.

    Returns
    -------
      Series or Index of int: A Series or Index of integer values
        indicating the length of each element in the Series or Index.
        The parent's mask is reused when the parent contains nulls.
    """
    from cudf.dataframe.series import Series

    parent = self._parent

    # The string backend writes the per-element lengths into this int32
    # device array through the raw pointer.
    lengths = rmm.device_array(len(parent), dtype='int32')
    parent.data.len(get_ctype_ptr(lengths))

    mask = parent.mask if parent.null_count > 0 else None

    column = columnops.build_column(
        Buffer(lengths), np.dtype('int32'), mask=mask
    )
    return Series(column, index=self._index)
def fillna(self, fill_value, inplace=False):
    """
    Replace null entries with *fill_value*.

    Parameters
    ----------
    fill_value : scalar or column-like
        A scalar equal to ``self.default_na_value()`` is cast to the dtype
        of ``self.data``. Any other scalar is encoded against this column's
        categories first. Non-scalars are converted to a column, re-coded
        onto this column's categories and cast to the dtype of
        ``self.data``.
    inplace : bool, default False
        Forwarded to ``self._mimic_inplace``.

    Returns
    -------
    ``self`` unchanged when the column has no null mask; otherwise the
    result of ``self._mimic_inplace`` applied to the filled column with its
    mask removed.

    Raises
    ------
    ValueError
        "fill value must be in categories" when encoding a scalar raises a
        ValueError.
    """
    if not self.has_null_mask:
        return self

    if np.isscalar(fill_value):
        if fill_value == self.default_na_value():
            fill_value = self.data.dtype.type(fill_value)
        else:
            try:
                encoded = self._encode(
                    pd.Categorical(
                        fill_value, categories=self.cat().categories
                    )
                )
                fill_value = self.data.dtype.type(encoded)
            except ValueError as err:
                raise ValueError("fill value must be in categories") from err
    else:
        fill_column = columnops.as_column(fill_value, nan_as_null=False)
        # TODO: only required if fill_value has a subset of the categories:
        recoded = fill_column.cat()._set_categories(self.cat().categories)
        fill_value = columnops.as_column(recoded.data).astype(
            self.data.dtype
        )

    filled = cpp_replace.apply_replace_nulls(self, fill_value)

    rebuilt = columnops.build_column(
        filled.data,
        "category",
        filled.mask,
        categories=self.cat().categories,
    )

    # The mask is dropped from the rebuilt column before it is returned.
    return self._mimic_inplace(rebuilt.replace(mask=None), inplace)
def column_empty(row_count, dtype, masked, categories=None):
    """Allocate a new, uninitialised column of ``row_count`` rows.

    Parameters
    ----------
    row_count : int
        Number of rows to allocate.
    dtype : anything accepted by ``pd.api.types.pandas_dtype``
        Requested dtype of the column.
    masked : bool
        When true, a mask is created with ``cudautils.make_mask`` and
        filled with 0.
    categories : optional
        Categories; when given (or when ``dtype`` is categorical) the
        column is built with the ``'category'`` dtype.

    Returns
    -------
    The result of ``build_column(data, dtype, mask, categories)``.
    """
    dtype = pd.api.types.pandas_dtype(dtype)

    mask = None
    if masked:
        mask = cudautils.make_mask(row_count)
        cudautils.fill_value(mask, 0)

    is_categorical = (
        categories is not None
        or pd.api.types.is_categorical_dtype(dtype)
    )

    if is_categorical:
        data = Buffer(rmm.device_array((row_count,), dtype=dtype))
        dtype = 'category'
    elif dtype.kind in 'OU':
        # Object / unicode dtypes are backed by nvstrings.
        if row_count == 0:
            data = nvstrings.to_device([])
        else:
            scratch = rmm.device_array((row_count,), dtype='float64')
            data = nvstrings.dtos(
                scratch, len(scratch), nulls=mask, bdevmem=True
            )
    else:
        data = Buffer(rmm.device_array((row_count,), dtype=dtype))

    if mask is not None:
        mask = Buffer(mask)

    from cudf.dataframe.columnops import build_column

    return build_column(data, dtype, mask, categories)
elif flags != 0: raise NotImplementedError("`flags` parameter is not yet supported") elif na is not np.nan: raise NotImplementedError("`na` parameter is not yet supported") from cudf.dataframe import Series out_dev_arr = rmm.device_array(len(self._parent), dtype='bool') ptr = get_ctype_ptr(out_dev_arr) self._parent.data.contains(pat, regex=regex, devptr=ptr) mask = None if self._parent.null_count > 0: mask = self._parent.mask column = columnops.build_column(Buffer(out_dev_arr), np.dtype('bool'), mask=mask) return Series(column, index=self._index) def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): """ Replace occurences of pattern/regex in the Series/Index with some other string. Parameters ---------- pat : str String to be replaced as a character sequence or regular expression. repl : str
def as_column(arbitrary, nan_as_null=True, dtype=None):
    """Create a Column from an arbitrary object

    The input is dispatched on its type; most branches normalise the input
    into a simpler representation and recurse into ``as_column``.

    Currently supported inputs are:

    * ``Column``
    * ``Buffer``
    * ``Series``
    * ``Index``
    * numba device array
    * cuda array interface
    * numpy array
    * pyarrow array
    * pandas.Categorical

    Parameters
    ----------
    arbitrary : object
        The object to convert.
    nan_as_null : bool, default True
        For floating point device arrays, when true a mask is computed with
        ``cudautils.mask_from_devary`` and applied. It is also forwarded as
        ``from_pandas`` to ``pa.array`` in several branches.
    dtype : optional
        Only consulted in the ``pa.NullArray``, ``pa.ChunkedArray``,
        ``memoryview`` and generic-fallback branches. The string
        ``'empty'`` is treated like ``None`` for null and chunked arrays.

    Returns
    -------
    result : subclass of TypedColumnBase
        - CategoricalColumn for pandas.Categorical input.
        - DatetimeColumn for datetime input
        - NumericalColumn for all other inputs.
    """
    from cudf.dataframe import numerical, categorical, datetime, string
    from cudf.dataframe.series import Series
    from cudf.dataframe.index import Index

    if isinstance(arbitrary, Column):
        # Rebuild the column from its parts, carrying categories if present
        categories = None
        if hasattr(arbitrary, "categories"):
            categories = arbitrary.categories
        data = build_column(
            arbitrary.data,
            arbitrary.dtype,
            mask=arbitrary.mask,
            categories=categories
        )

    elif isinstance(arbitrary, Series):
        data = arbitrary._column

    elif isinstance(arbitrary, Index):
        data = arbitrary._values

    elif isinstance(arbitrary, Buffer):
        data = numerical.NumericalColumn(data=arbitrary, dtype=arbitrary.dtype)

    elif isinstance(arbitrary, nvstrings.nvstrings):
        data = string.StringColumn(data=arbitrary)

    elif cuda.devicearray.is_cuda_ndarray(arbitrary):
        data = as_column(Buffer(arbitrary))
        # Only non-empty float arrays get a mask derived from their values
        if (data.dtype in [np.float16, np.float32, np.float64] and
                arbitrary.size > 0):
            if nan_as_null:
                mask = cudautils.mask_from_devary(arbitrary)
                data = data.set_mask(mask)

    elif cuda.is_cuda_array(arbitrary):
        # Use the cuda array interface to create a numba device array by
        # reference
        new_dev_array = cuda.as_cuda_array(arbitrary)

        # Allocate new output array using rmm and copy the numba device array
        # to an rmm owned device array
        out_dev_array = rmm.device_array_like(new_dev_array)
        out_dev_array.copy_to_device(new_dev_array)

        data = as_column(out_dev_array)

    elif isinstance(arbitrary, np.ndarray):
        # CUDF assumes values are always contiguous
        if not arbitrary.flags['C_CONTIGUOUS']:
            arbitrary = np.ascontiguousarray(arbitrary)

        if arbitrary.dtype.kind == 'M':
            data = datetime.DatetimeColumn.from_numpy(arbitrary)
        elif arbitrary.dtype.kind in ('O', 'U'):
            # Object / unicode arrays go through pyarrow
            data = as_column(pa.Array.from_pandas(arbitrary))
        else:
            data = as_column(rmm.to_device(arbitrary), nan_as_null=nan_as_null)

    elif isinstance(arbitrary, pa.Array):
        if isinstance(arbitrary, pa.StringArray):
            count = len(arbitrary)
            null_count = arbitrary.null_count

            buffers = arbitrary.buffers()
            # Buffer of actual strings values
            if buffers[2] is not None:
                sbuf = np.frombuffer(buffers[2], dtype='int8')
            else:
                sbuf = np.empty(0, dtype='int8')
            # Buffer of offsets values
            obuf = np.frombuffer(buffers[1], dtype='int32')
            # Buffer of null bitmask
            nbuf = None
            if null_count > 0:
                nbuf = np.frombuffer(buffers[0], dtype='int8')

            data = as_column(
                nvstrings.from_offsets(sbuf, obuf, count, nbuf=nbuf,
                                       ncount=null_count)
            )
        elif isinstance(arbitrary, pa.NullArray):
            # 'empty' and None both mean "derive the dtype from the array"
            new_dtype = dtype
            if (type(dtype) == str and dtype == 'empty') or dtype is None:
                new_dtype = np.dtype(arbitrary.type.to_pandas_dtype())

            if pd.api.types.is_categorical_dtype(new_dtype):
                arbitrary = arbitrary.dictionary_encode()
            else:
                if nan_as_null:
                    arbitrary = arbitrary.cast(np_to_pa_dtype(new_dtype))
                else:
                    # casting a null array doesn't make nans valid
                    # so we create one with valid nans from scratch:
                    if new_dtype == np.dtype("object"):
                        arbitrary = utils.scalar_broadcast_to(
                            None,
                            (len(arbitrary),),
                            dtype=new_dtype
                        )
                    else:
                        arbitrary = utils.scalar_broadcast_to(
                            np.nan,
                            (len(arbitrary),),
                            dtype=new_dtype
                        )
            data = as_column(arbitrary, nan_as_null=nan_as_null)
        elif isinstance(arbitrary, pa.DictionaryArray):
            pamask, padata = buffers_from_pyarrow(arbitrary)
            data = categorical.CategoricalColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                categories=arbitrary.dictionary.to_pylist(),
                ordered=arbitrary.type.ordered,
            )
        elif isinstance(arbitrary, pa.TimestampArray):
            # Timestamps are always cast to millisecond resolution here
            arbitrary = arbitrary.cast(pa.timestamp('ms'))
            pamask, padata = buffers_from_pyarrow(arbitrary, dtype='M8[ms]')
            data = datetime.DatetimeColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                dtype=np.dtype('M8[ms]')
            )
        elif isinstance(arbitrary, pa.Date64Array):
            pamask, padata = buffers_from_pyarrow(arbitrary, dtype='M8[ms]')
            data = datetime.DatetimeColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                dtype=np.dtype('M8[ms]')
            )
        elif isinstance(arbitrary, pa.Date32Array):
            # No equivalent np dtype and not yet supported
            warnings.warn("Date32 values are not yet supported so this will "
                          "be typecast to a Date64 value", UserWarning)
            arbitrary = arbitrary.cast(pa.date64())
            data = as_column(arbitrary)
        elif isinstance(arbitrary, pa.BooleanArray):
            # Arrow uses 1 bit per value while we use int8
            dtype = np.dtype(np.bool)
            # Needed because of bug in PyArrow
            # https://issues.apache.org/jira/browse/ARROW-4766
            if len(arbitrary) > 0:
                arbitrary = arbitrary.cast(pa.int8())
            else:
                arbitrary = pa.array([], type=pa.int8())
            pamask, padata = buffers_from_pyarrow(arbitrary, dtype=dtype)
            data = numerical.NumericalColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                dtype=dtype
            )
        else:
            # Any other pyarrow array is treated as numerical
            pamask, padata = buffers_from_pyarrow(arbitrary)
            data = numerical.NumericalColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                dtype=np.dtype(arbitrary.type.to_pandas_dtype())
            )

    elif isinstance(arbitrary, pa.ChunkedArray):
        # Convert each chunk separately, then concatenate the columns
        gpu_cols = [as_column(chunk, dtype=dtype) for chunk in
                    arbitrary.chunks]

        if dtype and dtype != 'empty':
            new_dtype = dtype
        else:
            pa_type = arbitrary.type
            if pa.types.is_dictionary(pa_type):
                new_dtype = 'category'
            else:
                new_dtype = np.dtype(pa_type.to_pandas_dtype())

        data = Column._concat(gpu_cols, dtype=new_dtype)

    elif isinstance(arbitrary, (pd.Series, pd.Categorical)):
        if pd.api.types.is_categorical_dtype(arbitrary):
            data = as_column(pa.array(arbitrary, from_pandas=True))
        elif arbitrary.dtype == np.bool:
            # Bug in PyArrow or HDF that requires us to do this
            data = as_column(pa.array(np.array(arbitrary), from_pandas=True))
        else:
            data = as_column(pa.array(arbitrary, from_pandas=nan_as_null))

    elif isinstance(arbitrary, pd.Timestamp):
        arbitrary = arbitrary.ceil('ms')
        # This will always treat NaTs as nulls since it's not technically a
        # discrete value like NaN
        data = as_column(pa.array(pd.Series([arbitrary]), from_pandas=True))

    elif np.isscalar(arbitrary) and not isinstance(arbitrary, memoryview):
        if hasattr(arbitrary, 'dtype'):
            data_type = np_to_pa_dtype(arbitrary.dtype)
            if data_type in (pa.date64(), pa.date32()):
                # PyArrow can't construct date64 or date32 arrays from np
                # datetime types
                arbitrary = arbitrary.astype('int64')
            data = as_column(pa.array([arbitrary], type=data_type))
        else:
            data = as_column(pa.array([arbitrary]), nan_as_null=nan_as_null)

    elif isinstance(arbitrary, memoryview):
        data = as_column(np.array(arbitrary), dtype=dtype,
                         nan_as_null=nan_as_null)

    else:
        # Generic fallback: try the buffer protocol first, then pyarrow,
        # and finally numpy / pandas.
        try:
            data = as_column(memoryview(arbitrary))
        except TypeError:
            try:
                pa_type = None
                if dtype is not None:
                    if pd.api.types.is_categorical_dtype(dtype):
                        # Skip pyarrow; handled in the except branch below
                        raise TypeError
                    else:
                        np_type = np.dtype(dtype).type
                        if np_type == np.bool_:
                            pa_type = pa.bool_()
                        else:
                            pa_type = np_to_pa_dtype(np.dtype(dtype).type)
                data = as_column(
                    pa.array(arbitrary, type=pa_type, from_pandas=nan_as_null),
                    nan_as_null=nan_as_null
                )
            except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError):
                np_type = None
                if pd.api.types.is_categorical_dtype(dtype):
                    data = as_column(
                        pd.Series(arbitrary, dtype='category'),
                        nan_as_null=nan_as_null
                    )
                else:
                    if dtype is None:
                        np_type = None
                    else:
                        np_type = np.dtype(dtype)
                    data = as_column(
                        np.array(arbitrary, dtype=np_type),
                        nan_as_null=nan_as_null
                    )

    return data
def as_column(arbitrary, nan_as_null=True, dtype=None, name=None):
    """Create a Column from an arbitrary object

    The input is dispatched on its type; most branches normalise the input
    into a simpler representation and recurse into ``as_column``.

    Currently supported inputs are:

    * ``Column``
    * ``Buffer``
    * ``Series``
    * ``Index``
    * numba device array
    * cuda array interface
    * numpy array
    * pyarrow array
    * pandas.Categorical
    * Object exposing ``__cuda_array_interface__``

    Parameters
    ----------
    arbitrary : object
        The object to convert.
    nan_as_null : bool, default True
        For floating point device arrays, when true a mask is computed with
        ``mask_from_devary`` and applied. It is also forwarded as
        ``from_pandas`` to ``pa.array`` in several branches.
    dtype : optional
        Target dtype. ``Series``, ``Index`` and numpy inputs are cast to
        it; for ``pa.NullArray`` and ``pa.ChunkedArray`` the string
        ``"empty"`` is treated like ``None`` (derive the dtype from the
        array). Not consulted by every branch.
    name : optional
        Name for the resulting column. Defaults to ``arbitrary.name`` when
        the input has such an attribute.

    Returns
    -------
    result : subclass of TypedColumnBase
        - CategoricalColumn for pandas.Categorical input.
        - DatetimeColumn for datetime input.
        - StringColumn for string input.
        - NumericalColumn for all other inputs.
    """
    from cudf.dataframe import numerical, categorical, datetime, string
    from cudf.dataframe.series import Series
    from cudf.dataframe.index import Index
    from cudf.bindings.cudf_cpp import np_to_pa_dtype

    if name is None and hasattr(arbitrary, "name"):
        name = arbitrary.name

    if isinstance(arbitrary, Column):
        # Rebuild the column from its parts, carrying categories if present
        categories = None
        if hasattr(arbitrary, "categories"):
            categories = arbitrary.categories
        data = build_column(
            arbitrary.data,
            arbitrary.dtype,
            mask=arbitrary.mask,
            categories=categories,
        )

    elif isinstance(arbitrary, Series):
        data = arbitrary._column
        if dtype is not None:
            data = data.astype(dtype)

    elif isinstance(arbitrary, Index):
        data = arbitrary._values
        if dtype is not None:
            data = data.astype(dtype)

    elif isinstance(arbitrary, Buffer):
        data = numerical.NumericalColumn(data=arbitrary, dtype=arbitrary.dtype)

    elif isinstance(arbitrary, nvstrings.nvstrings):
        data = string.StringColumn(data=arbitrary)

    elif cuda.devicearray.is_cuda_ndarray(arbitrary):
        data = as_column(Buffer(arbitrary))
        # Only non-empty float arrays get a mask derived from their values
        if (data.dtype in [np.float16, np.float32, np.float64]
                and arbitrary.size > 0):
            if nan_as_null:
                mask = cudf.bindings.utils.mask_from_devary(data)
                data = data.set_mask(mask)

    elif hasattr(arbitrary, "__cuda_array_interface__"):
        from cudf.bindings.cudf_cpp import count_nonzero_mask

        desc = arbitrary.__cuda_array_interface__
        data = _data_from_cuda_array_interface_desc(desc)
        mask = _mask_from_cuda_array_interface_desc(desc)

        # Null count = number of elements minus the count reported for the
        # mask by count_nonzero_mask.
        if mask is not None:
            nelem = len(data.mem)
            nnz = count_nonzero_mask(mask.mem, size=nelem)
            null_count = nelem - nnz
        else:
            null_count = 0

        # This branch returns directly; `dtype` is not applied here.
        return build_column(data, dtype=data.dtype, mask=mask, name=name,
                            null_count=null_count)

    elif isinstance(arbitrary, np.ndarray):
        # CUDF assumes values are always contiguous
        if not arbitrary.flags["C_CONTIGUOUS"]:
            arbitrary = np.ascontiguousarray(arbitrary)

        if dtype is not None:
            arbitrary = arbitrary.astype(dtype)

        if arbitrary.dtype.kind == "M":
            data = datetime.DatetimeColumn.from_numpy(arbitrary)
        elif arbitrary.dtype.kind in ("O", "U"):
            # Object / unicode arrays go through pyarrow
            data = as_column(pa.Array.from_pandas(arbitrary))
        else:
            data = as_column(rmm.to_device(arbitrary), nan_as_null=nan_as_null)

    elif isinstance(arbitrary, pa.Array):
        if isinstance(arbitrary, pa.StringArray):
            count = len(arbitrary)
            null_count = arbitrary.null_count

            buffers = arbitrary.buffers()
            # Buffer of actual strings values
            if buffers[2] is not None:
                sbuf = np.frombuffer(buffers[2], dtype="int8")
            else:
                sbuf = np.empty(0, dtype="int8")
            # Buffer of offsets values
            obuf = np.frombuffer(buffers[1], dtype="int32")
            # Buffer of null bitmask
            nbuf = None
            if null_count > 0:
                nbuf = np.frombuffer(buffers[0], dtype="int8")

            data = as_column(
                nvstrings.from_offsets(sbuf, obuf, count, nbuf=nbuf,
                                       ncount=null_count))
        elif isinstance(arbitrary, pa.NullArray):
            # "empty" is a sentinel, not a real dtype: check for it (and for
            # None) BEFORE calling pandas_dtype, which raises TypeError on
            # strings it does not understand.
            if dtype is None or (isinstance(dtype, str) and dtype == "empty"):
                new_dtype = pd.api.types.pandas_dtype(
                    arbitrary.type.to_pandas_dtype())
            else:
                new_dtype = pd.api.types.pandas_dtype(dtype)

            if is_categorical_dtype(new_dtype):
                arbitrary = arbitrary.dictionary_encode()
            else:
                if nan_as_null:
                    arbitrary = arbitrary.cast(np_to_pa_dtype(new_dtype))
                else:
                    # casting a null array doesn't make nans valid
                    # so we create one with valid nans from scratch:
                    if new_dtype == np.dtype("object"):
                        arbitrary = utils.scalar_broadcast_to(
                            None, (len(arbitrary), ), dtype=new_dtype)
                    else:
                        arbitrary = utils.scalar_broadcast_to(
                            np.nan, (len(arbitrary), ), dtype=new_dtype)
            data = as_column(arbitrary, nan_as_null=nan_as_null)
        elif isinstance(arbitrary, pa.DictionaryArray):
            pamask, padata = buffers_from_pyarrow(arbitrary)
            data = categorical.CategoricalColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                categories=arbitrary.dictionary,
                ordered=arbitrary.type.ordered,
            )
        elif isinstance(arbitrary, pa.TimestampArray):
            # Keep the array's own time unit
            dtype = np.dtype("M8[{}]".format(arbitrary.type.unit))
            pamask, padata = buffers_from_pyarrow(arbitrary, dtype=dtype)
            data = datetime.DatetimeColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                dtype=dtype,
            )
        elif isinstance(arbitrary, pa.Date64Array):
            pamask, padata = buffers_from_pyarrow(arbitrary, dtype="M8[ms]")
            data = datetime.DatetimeColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                dtype=np.dtype("M8[ms]"),
            )
        elif isinstance(arbitrary, pa.Date32Array):
            # No equivalent np dtype and not yet supported
            warnings.warn(
                "Date32 values are not yet supported so this will "
                "be typecast to a Date64 value",
                UserWarning,
            )
            # NOTE(review): the values are cast to int32 and then to
            # "M8[ms]"; confirm that astype rescales them as intended.
            data = as_column(arbitrary.cast(pa.int32())).astype("M8[ms]")
        elif isinstance(arbitrary, pa.BooleanArray):
            # Arrow uses 1 bit per value while we use int8
            # (np.bool_ rather than the np.bool alias removed in NumPy 1.24)
            dtype = np.dtype(np.bool_)
            # Needed because of bug in PyArrow
            # https://issues.apache.org/jira/browse/ARROW-4766
            if len(arbitrary) > 0:
                arbitrary = arbitrary.cast(pa.int8())
            else:
                arbitrary = pa.array([], type=pa.int8())
            pamask, padata = buffers_from_pyarrow(arbitrary, dtype=dtype)
            data = numerical.NumericalColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                dtype=dtype,
            )
        else:
            # Any other pyarrow array is treated as numerical
            pamask, padata = buffers_from_pyarrow(arbitrary)
            data = numerical.NumericalColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                dtype=np.dtype(arbitrary.type.to_pandas_dtype()),
            )

    elif isinstance(arbitrary, pa.ChunkedArray):
        # Convert each chunk separately, then concatenate the columns
        gpu_cols = [
            as_column(chunk, dtype=dtype) for chunk in arbitrary.chunks
        ]

        if dtype and dtype != "empty":
            new_dtype = dtype
        else:
            pa_type = arbitrary.type
            if pa.types.is_dictionary(pa_type):
                new_dtype = "category"
            else:
                new_dtype = np.dtype(pa_type.to_pandas_dtype())

        data = Column._concat(gpu_cols, dtype=new_dtype)

    elif isinstance(arbitrary, (pd.Series, pd.Categorical)):
        if is_categorical_dtype(arbitrary):
            data = as_column(pa.array(arbitrary, from_pandas=True))
        elif arbitrary.dtype == np.bool_:
            # Bug in PyArrow or HDF that requires us to do this
            data = as_column(pa.array(np.array(arbitrary), from_pandas=True))
        else:
            data = as_column(pa.array(arbitrary, from_pandas=nan_as_null))

    elif isinstance(arbitrary, pd.Timestamp):
        # This will always treat NaTs as nulls since it's not technically a
        # discrete value like NaN
        data = as_column(pa.array(pd.Series([arbitrary]), from_pandas=True))

    elif np.isscalar(arbitrary) and not isinstance(arbitrary, memoryview):
        if hasattr(arbitrary, "dtype"):
            data_type = np_to_pa_dtype(arbitrary.dtype)
            # PyArrow can't construct date64 or date32 arrays from np
            # datetime types
            if pa.types.is_date64(data_type) or pa.types.is_date32(data_type):
                arbitrary = arbitrary.astype("int64")
            data = as_column(pa.array([arbitrary], type=data_type))
        else:
            data = as_column(pa.array([arbitrary]), nan_as_null=nan_as_null)

    elif isinstance(arbitrary, memoryview):
        data = as_column(np.array(arbitrary), dtype=dtype,
                         nan_as_null=nan_as_null)

    else:
        # Generic fallback: try the buffer protocol first, then pyarrow,
        # and finally numpy / pandas.
        try:
            data = as_column(memoryview(arbitrary), dtype=dtype,
                             nan_as_null=nan_as_null)
        except TypeError:
            pa_type = None
            np_type = None
            try:
                if dtype is not None:
                    dtype = pd.api.types.pandas_dtype(dtype)
                    if is_categorical_dtype(dtype):
                        # Skip pyarrow; handled in the except branch below
                        raise TypeError
                    else:
                        np_type = np.dtype(dtype).type
                        if np_type == np.bool_:
                            pa_type = pa.bool_()
                        else:
                            pa_type = np_to_pa_dtype(np.dtype(dtype))
                data = as_column(
                    pa.array(arbitrary, type=pa_type, from_pandas=nan_as_null),
                    dtype=dtype,
                    nan_as_null=nan_as_null,
                )
            except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError):
                if is_categorical_dtype(dtype):
                    data = as_column(
                        pd.Series(arbitrary, dtype="category"),
                        nan_as_null=nan_as_null,
                    )
                else:
                    data = as_column(
                        np.array(arbitrary, dtype=np_type),
                        nan_as_null=nan_as_null,
                    )

    if hasattr(data, "name") and (name is not None):
        data.name = name

    return data