def as_column(arbitrary, nan_as_null=True, dtype=None):
    """Create a Column from an arbitrary object

    Currently supported inputs are:

    * ``Column``
    * ``Buffer``
    * ``Series``
    * ``Index``
    * numba device array
    * cuda array interface
    * numpy array
    * pyarrow array
    * pandas.Categorical

    Returns
    -------
    result : subclass of TypedColumnBase
        - CategoricalColumn for pandas.Categorical input.
        - DatetimeColumn for datetime input.
        - NumericalColumn for all other inputs.
    """
    from cudf.dataframe import numerical, categorical, datetime, string
    from cudf.dataframe.series import Series
    from cudf.dataframe.index import Index

    if isinstance(arbitrary, Column):
        categories = None
        if hasattr(arbitrary, "categories"):
            categories = arbitrary.categories
        data = build_column(arbitrary.data,
                            arbitrary.dtype,
                            mask=arbitrary.mask,
                            categories=categories)

    elif isinstance(arbitrary, Series):
        data = arbitrary._column

    elif isinstance(arbitrary, Index):
        data = arbitrary._values

    elif isinstance(arbitrary, Buffer):
        data = numerical.NumericalColumn(data=arbitrary, dtype=arbitrary.dtype)

    elif isinstance(arbitrary, nvstrings.nvstrings):
        data = string.StringColumn(data=arbitrary)

    elif cuda.devicearray.is_cuda_ndarray(arbitrary):
        data = as_column(Buffer(arbitrary))
        if (data.dtype in [np.float16, np.float32, np.float64]
                and arbitrary.size > 0):
            if nan_as_null:
                mask = cudautils.mask_from_devary(arbitrary)
                data = data.set_mask(mask)

    elif cuda.is_cuda_array(arbitrary):
        # Use the cuda array interface to create a numba device array by
        # reference
        new_dev_array = cuda.as_cuda_array(arbitrary)

        # Allocate a new output array using rmm and copy the numba device
        # array to an rmm owned device array
        out_dev_array = rmm.device_array_like(new_dev_array)
        out_dev_array.copy_to_device(new_dev_array)

        data = as_column(out_dev_array)

    elif isinstance(arbitrary, np.ndarray):
        # CUDF assumes values are always contiguous
        if not arbitrary.flags['C_CONTIGUOUS']:
            arbitrary = np.ascontiguousarray(arbitrary)
        if arbitrary.dtype.kind == 'M':
            data = datetime.DatetimeColumn.from_numpy(arbitrary)
        elif arbitrary.dtype.kind in ('O', 'U'):
            data = as_column(pa.Array.from_pandas(arbitrary))
        else:
            data = as_column(rmm.to_device(arbitrary),
                             nan_as_null=nan_as_null)

    elif isinstance(arbitrary, pa.Array):
        if isinstance(arbitrary, pa.StringArray):
            count = len(arbitrary)
            null_count = arbitrary.null_count

            buffers = arbitrary.buffers()
            # Buffer of actual string values
            if buffers[2] is not None:
                sbuf = np.frombuffer(buffers[2], dtype='int8')
            else:
                sbuf = np.empty(0, dtype='int8')
            # Buffer of offset values
            obuf = np.frombuffer(buffers[1], dtype='int32')
            # Buffer of null bitmask
            nbuf = None
            if null_count > 0:
                nbuf = np.frombuffer(buffers[0], dtype='int8')

            data = as_column(
                nvstrings.from_offsets(sbuf, obuf, count, nbuf=nbuf,
                                       ncount=null_count))
        elif isinstance(arbitrary, pa.NullArray):
            new_dtype = dtype
            if (type(dtype) == str and dtype == 'empty') or dtype is None:
                new_dtype = np.dtype(arbitrary.type.to_pandas_dtype())

            if pd.api.types.is_categorical_dtype(new_dtype):
                arbitrary = arbitrary.dictionary_encode()
            else:
                if nan_as_null:
                    arbitrary = arbitrary.cast(_gdf.np_to_pa_dtype(new_dtype))
                else:
                    # casting a null array doesn't make nans valid,
                    # so we create one with valid nans from scratch:
                    if new_dtype == np.dtype("object"):
                        arbitrary = utils.scalar_broadcast_to(
                            None, (len(arbitrary),), dtype=new_dtype)
                    else:
                        arbitrary = utils.scalar_broadcast_to(
                            np.nan, (len(arbitrary),), dtype=new_dtype)
            data = as_column(arbitrary, nan_as_null=nan_as_null)
        elif isinstance(arbitrary, pa.DictionaryArray):
            pamask, padata = buffers_from_pyarrow(arbitrary)
            data = categorical.CategoricalColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                categories=arbitrary.dictionary.to_pylist(),
                ordered=arbitrary.type.ordered,
            )
        elif isinstance(arbitrary, pa.TimestampArray):
            arbitrary = arbitrary.cast(pa.timestamp('ms'))
            pamask, padata = buffers_from_pyarrow(arbitrary, dtype='M8[ms]')
            data = datetime.DatetimeColumn(data=padata,
                                           mask=pamask,
                                           null_count=arbitrary.null_count,
                                           dtype=np.dtype('M8[ms]'))
        elif isinstance(arbitrary, pa.Date64Array):
            pamask, padata = buffers_from_pyarrow(arbitrary, dtype='M8[ms]')
            data = datetime.DatetimeColumn(data=padata,
                                           mask=pamask,
                                           null_count=arbitrary.null_count,
                                           dtype=np.dtype('M8[ms]'))
        elif isinstance(arbitrary, pa.Date32Array):
            # No equivalent np dtype and not yet supported
            warnings.warn("Date32 values are not yet supported so this will "
                          "be typecast to a Date64 value", UserWarning)
            arbitrary = arbitrary.cast(pa.date64())
            data = as_column(arbitrary)
        elif isinstance(arbitrary, pa.BooleanArray):
            # Arrow uses 1 bit per value while we use int8
            dtype = np.dtype(np.bool)
            # Needed because of bug in PyArrow
            # https://issues.apache.org/jira/browse/ARROW-4766
            if len(arbitrary) > 0:
                arbitrary = arbitrary.cast(pa.int8())
            else:
                arbitrary = pa.array([], type=pa.int8())
            pamask, padata = buffers_from_pyarrow(arbitrary, dtype=dtype)
            data = numerical.NumericalColumn(data=padata,
                                             mask=pamask,
                                             null_count=arbitrary.null_count,
                                             dtype=dtype)
        else:
            pamask, padata = buffers_from_pyarrow(arbitrary)
            data = numerical.NumericalColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                dtype=np.dtype(arbitrary.type.to_pandas_dtype()))

    elif isinstance(arbitrary, pa.ChunkedArray):
        gpu_cols = [as_column(chunk, dtype=dtype)
                    for chunk in arbitrary.chunks]

        if dtype and dtype != 'empty':
            new_dtype = dtype
        else:
            pa_type = arbitrary.type
            if pa.types.is_dictionary(pa_type):
                new_dtype = 'category'
            else:
                new_dtype = np.dtype(pa_type.to_pandas_dtype())

        data = Column._concat(gpu_cols, dtype=new_dtype)

    elif isinstance(arbitrary, (pd.Series, pd.Categorical)):
        if pd.api.types.is_categorical_dtype(arbitrary):
            data = as_column(pa.array(arbitrary, from_pandas=True))
        elif arbitrary.dtype == np.bool:
            # Bug in PyArrow or HDF that requires us to do this
            data = as_column(pa.array(np.array(arbitrary), from_pandas=True))
        else:
            data = as_column(pa.array(arbitrary, from_pandas=nan_as_null))

    elif isinstance(arbitrary, pd.Timestamp):
        # This will always treat NaTs as nulls since it's not technically a
        # discrete value like NaN
        data = as_column(pa.array(pd.Series([arbitrary]), from_pandas=True))

    elif np.isscalar(arbitrary) and not isinstance(arbitrary, memoryview):
        if hasattr(arbitrary, 'dtype'):
            data_type = _gdf.np_to_pa_dtype(arbitrary.dtype)
            if data_type in (pa.date64(), pa.date32()):
                # PyArrow can't construct date64 or date32 arrays from np
                # datetime types
                arbitrary = arbitrary.astype('int64')
            data = as_column(pa.array([arbitrary], type=data_type))
        else:
            data = as_column(pa.array([arbitrary]), nan_as_null=nan_as_null)

    elif isinstance(arbitrary, memoryview):
        data = as_column(np.array(arbitrary), dtype=dtype,
                         nan_as_null=nan_as_null)

    else:
        try:
            data = as_column(memoryview(arbitrary))
        except TypeError:
            try:
                pa_type = None
                if dtype is not None:
                    if pd.api.types.is_categorical_dtype(dtype):
                        raise TypeError
                    else:
                        np_type = np.dtype(dtype).type
                        if np_type == np.bool_:
                            pa_type = pa.bool_()
                        else:
                            pa_type = _gdf.np_to_pa_dtype(np.dtype(dtype).type)
                data = as_column(pa.array(arbitrary, type=pa_type,
                                          from_pandas=nan_as_null),
                                 nan_as_null=nan_as_null)
            except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError):
                np_type = None
                if pd.api.types.is_categorical_dtype(dtype):
                    data = as_column(pd.Series(arbitrary, dtype='category'),
                                     nan_as_null=nan_as_null)
                else:
                    if dtype is None:
                        np_type = None
                    else:
                        np_type = np.dtype(dtype)
                    data = as_column(np.array(arbitrary, dtype=np_type),
                                     nan_as_null=nan_as_null)

    return data
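

# Illustrative usage sketch for ``as_column`` above. This helper is not part
# of the original module; it assumes the module-level imports used throughout
# this file (numpy as np, pandas as pd, pyarrow as pa) and a working cudf/GPU
# environment, and it only exists to show which dispatch branch each kind of
# input takes.
def _as_column_usage_sketch():
    """Minimal, hypothetical examples of inputs ``as_column`` normalizes."""
    # Float numpy data: NaNs become nulls when nan_as_null=True (the default),
    # via the device-array branch and cudautils.mask_from_devary.
    numeric = as_column(np.array([1.0, 2.0, np.nan]))

    # pandas.Categorical goes through pyarrow dictionary encoding and lands in
    # a CategoricalColumn.
    categories = as_column(pd.Categorical(['a', 'b', 'a']))

    # datetime64 numpy arrays dispatch to DatetimeColumn.from_numpy.
    datetimes = as_column(np.array(['2019-01-01'], dtype='datetime64[ms]'))

    # A pyarrow array with a null is unpacked from its Arrow buffers and keeps
    # its null mask.
    from_arrow = as_column(pa.array([1, 2, None]))

    return numeric, categories, datetimes, from_arrow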
def as_column(arbitrary, nan_as_null=True):
    """Create a Column from an arbitrary object

    Currently supported inputs are:

    * ``Column``
    * ``Buffer``
    * numba device array
    * numpy array
    * pandas.Categorical

    Returns
    -------
    result : subclass of TypedColumnBase
        - CategoricalColumn for pandas.Categorical input.
        - NumericalColumn for all other inputs.
    """
    from . import numerical, categorical, datetime

    if isinstance(arbitrary, Column):
        if not isinstance(arbitrary, TypedColumnBase):
            # interpret as numeric
            data = arbitrary.view(numerical.NumericalColumn,
                                  dtype=arbitrary.dtype)
        else:
            data = arbitrary

    elif isinstance(arbitrary, Buffer):
        data = numerical.NumericalColumn(data=arbitrary, dtype=arbitrary.dtype)

    elif cuda.devicearray.is_cuda_ndarray(arbitrary):
        data = as_column(Buffer(arbitrary))
        if (data.dtype in [np.float16, np.float32, np.float64]
                and arbitrary.size > 0):
            if nan_as_null:
                mask = cudautils.mask_from_devary(arbitrary)
                data = data.set_mask(mask)

    elif isinstance(arbitrary, np.ndarray):
        if arbitrary.dtype.kind == 'M':
            data = datetime.DatetimeColumn.from_numpy(arbitrary)
        else:
            data = as_column(rmm.to_device(arbitrary),
                             nan_as_null=nan_as_null)

    elif isinstance(arbitrary, pa.Array):
        if isinstance(arbitrary, pa.StringArray):
            raise NotImplementedError("Strings are not yet supported")
        elif isinstance(arbitrary, pa.NullArray):
            pamask = Buffer(np.empty(0, dtype='int8'))
            padata = Buffer(
                np.empty(0, dtype=arbitrary.type.to_pandas_dtype()))
            data = numerical.NumericalColumn(
                data=padata,
                mask=pamask,
                null_count=0,
                dtype=np.dtype(arbitrary.type.to_pandas_dtype()))
        elif isinstance(arbitrary, pa.DictionaryArray):
            if arbitrary.buffers()[0]:
                pamask = Buffer(np.array(arbitrary.buffers()[0]))
            else:
                pamask = None
            padata = Buffer(
                np.array(arbitrary.buffers()[1]).view(
                    arbitrary.indices.type.to_pandas_dtype()))
            data = categorical.CategoricalColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                categories=arbitrary.dictionary.to_pylist(),
                ordered=arbitrary.type.ordered,
            )
        elif isinstance(arbitrary, pa.TimestampArray):
            arbitrary = arbitrary.cast(pa.timestamp('ms'))
            if arbitrary.buffers()[0]:
                pamask = Buffer(np.array(arbitrary.buffers()[0]))
            else:
                pamask = None
            padata = Buffer(
                np.array(arbitrary.buffers()[1]).view(np.dtype('M8[ms]')))
            data = datetime.DatetimeColumn(data=padata,
                                           mask=pamask,
                                           null_count=arbitrary.null_count,
                                           dtype=np.dtype('M8[ms]'))
        elif isinstance(arbitrary, pa.Date64Array):
            if arbitrary.buffers()[0]:
                pamask = Buffer(np.array(arbitrary.buffers()[0]))
            else:
                pamask = None
            padata = Buffer(
                np.array(arbitrary.buffers()[1]).view(np.dtype('M8[ms]')))
            data = datetime.DatetimeColumn(data=padata,
                                           mask=pamask,
                                           null_count=arbitrary.null_count,
                                           dtype=np.dtype('M8[ms]'))
        elif isinstance(arbitrary, pa.Date32Array):
            # No equivalent np dtype and not yet supported
            warnings.warn("Date32 values are not yet supported so this will "
                          "be typecast to a Date64 value", UserWarning)
            arbitrary = arbitrary.cast(pa.date64())
            data = as_column(arbitrary)
        elif isinstance(arbitrary, pa.BooleanArray):
            # Arrow uses 1 bit per value while we use int8
            dtype = np.dtype(np.bool)
            arbitrary = arbitrary.cast(pa.int8())
            if arbitrary.buffers()[0]:
                pamask = Buffer(np.array(arbitrary.buffers()[0]))
            else:
                pamask = None
            padata = Buffer(np.array(arbitrary.buffers()[1]).view(dtype))
            data = numerical.NumericalColumn(data=padata,
                                             mask=pamask,
                                             null_count=arbitrary.null_count,
                                             dtype=dtype)
        else:
            if arbitrary.buffers()[0]:
                pamask = Buffer(np.array(arbitrary.buffers()[0]))
            else:
                pamask = None
            padata = Buffer(
                np.array(arbitrary.buffers()[1]).view(
                    np.dtype(arbitrary.type.to_pandas_dtype())))
            data = numerical.NumericalColumn(
                data=padata,
                mask=pamask,
                null_count=arbitrary.null_count,
                dtype=np.dtype(arbitrary.type.to_pandas_dtype()))

    elif isinstance(arbitrary, (pd.Series, pd.Categorical)):
        if pd.core.common.is_categorical_dtype(arbitrary):
            data = as_column(pa.array(arbitrary, from_pandas=True))
        else:
            data = as_column(pa.array(arbitrary, from_pandas=nan_as_null))

    elif np.isscalar(arbitrary) and not isinstance(arbitrary, memoryview):
        if hasattr(arbitrary, 'dtype'):
            data_type = _gdf.np_to_pa_dtype(arbitrary.dtype)
            if data_type in (pa.date64(), pa.date32()):
                # PyArrow can't construct date64 or date32 arrays from np
                # datetime types
                arbitrary = arbitrary.astype('int64')
            data = as_column(pa.array([arbitrary], type=data_type))
        else:
            data = as_column(pa.array([arbitrary]))

    elif isinstance(arbitrary, memoryview):
        data = as_column(np.array(arbitrary))

    else:
        try:
            data = as_column(memoryview(arbitrary))
        except TypeError:
            data = as_column(pa.array(arbitrary))

    return data
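

# Illustrative sketch, not part of the original module: it demonstrates the
# Arrow buffer layout the pa.Array branches above rely on. For a primitive
# pyarrow array, buffers()[0] is the optional validity bitmap and buffers()[1]
# holds the raw values, which can be reinterpreted with the matching numpy
# dtype. Only numpy and pyarrow are assumed.
def _pyarrow_buffer_unpacking_sketch():
    """Show how a primitive pyarrow array splits into mask and data buffers."""
    import numpy as np
    import pyarrow as pa

    arr = pa.array([1, 2, None], type=pa.int64())
    validity, values = arr.buffers()

    # Raw int64 values; Arrow may pad the buffer, so slice to the array length.
    # The slot behind the null is undefined.
    data = np.frombuffer(values, dtype=np.int64)[:len(arr)]

    # One bit per value; a cleared bit marks a null (here the third element).
    mask = (np.frombuffer(validity, dtype=np.uint8)
            if validity is not None else None)

    return data, mask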