def test_dtype_np_bool_to_pa_bool():
    """Check that the utility np_to_pa_dtype maps np.bool_ to pa.bool_;
    any bit-width differences between the two representations are
    handled elsewhere.
    """
    assert np_to_pa_dtype(np.dtype("bool")) == pa.bool_()
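# For context, a minimal sketch of what a mapping utility like
# np_to_pa_dtype could look like (hypothetical; the real cudf helper
# covers many more dtypes, including datetimes): a plain lookup table
# from NumPy dtypes to pyarrow types.
import numpy as np
import pyarrow as pa

_NP_TO_PA = {
    np.dtype("bool"): pa.bool_(),
    np.dtype("int8"): pa.int8(),
    np.dtype("int32"): pa.int32(),
    np.dtype("int64"): pa.int64(),
    np.dtype("float32"): pa.float32(),
    np.dtype("float64"): pa.float64(),
}


def _np_to_pa_dtype_sketch(dtype):
    # Normalize to np.dtype first so "int64", np.int64, etc. all work.
    return _NP_TO_PA[np.dtype(dtype)]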
def to_arrow(self):
    mask = None
    if self.nullable:
        # Copy the validity bitmask to host memory; Arrow buffers are
        # host-resident.
        mask = pa.py_buffer(self.mask_array_view.copy_to_host())
    data = pa.py_buffer(self.as_numerical.data_array_view.copy_to_host())
    pa_dtype = np_to_pa_dtype(self.dtype)
    return pa.Array.from_buffers(
        type=pa_dtype,
        length=len(self),
        buffers=[mask, data],
        null_count=self.null_count,
    )
def to_arrow(self):
    mask = None
    if self.has_null_mask:
        mask = pa.py_buffer(self.nullmask.mem.copy_to_host())
    data = pa.py_buffer(self.data.mem.copy_to_host())
    pa_dtype = np_to_pa_dtype(self.dtype)
    out = pa.Array.from_buffers(
        type=pa_dtype,
        length=len(self),
        buffers=[mask, data],
        null_count=self.null_count,
    )
    if self.dtype == np.bool_:
        # cudf stores booleans as one byte per value, while Arrow packs
        # one bit per value, so cast to the bit-packed representation.
        return out.cast(pa.bool_())
    else:
        return out
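# A hedged usage sketch for the to_arrow() conversions above, via the
# public cudf API (assumes a working cudf install). Booleans exercise
# the bit-width nuance: cudf stores one byte per value, Arrow one bit.
import cudf

sr = cudf.Series([True, False, None], dtype="bool")
arr = sr.to_arrow()
assert arr.type == pa.bool_()
assert arr.null_count == 1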
def assert_column_array_dtype_equal(column: ColumnBase, array: pa.Array):
    """
    In cudf, each column holds its own dtype, and a column may have child
    columns that hold their own dtypes as well. This method checks,
    recursively, that every level of `column` matches the type of the
    given `array`.
    """
    if isinstance(column.dtype, ListDtype):
        return array.type.equals(
            column.dtype.to_arrow()
        ) and assert_column_array_dtype_equal(
            column.base_children[1], array.values
        )
    elif isinstance(column.dtype, StructDtype):
        return array.type.equals(column.dtype.to_arrow()) and all(
            assert_column_array_dtype_equal(child, array.field(i))
            for i, child in enumerate(column.base_children)
        )
    elif isinstance(column.dtype, Decimal64Dtype):
        return array.type.equals(column.dtype.to_arrow())
    elif isinstance(column.dtype, CategoricalDtype):
        raise NotImplementedError()
    else:
        return array.type.equals(np_to_pa_dtype(column.dtype))
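# A hedged usage sketch for assert_column_array_dtype_equal: build a
# pyarrow array and the matching cudf column, then check the dtypes
# agree. Accessing Series._column is internal API (assumption: it is
# available wherever this helper lives).
import cudf

arr = pa.array([1.5, 2.5, None], type=pa.float64())
col = cudf.Series(arr)._column
assert assert_column_array_dtype_equal(col, arr)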
def as_column(arbitrary, nan_as_null=True, dtype=None, length=None):
    """Create a Column from an arbitrary object

    Parameters
    ----------
    arbitrary : object
        Object to construct the Column from. See *Notes*.
    nan_as_null : bool, optional
        If True (default), treat NaN values in arbitrary as null.
    dtype : optional
        Optionally typecast the constructed Column to the given dtype.
    length : int, optional
        If `arbitrary` is a scalar, broadcast into a Column of the given
        length.

    Returns
    -------
    A Column of the appropriate type and size.

    Notes
    -----
    Currently supported inputs are:

    * ``Column``
    * ``Series``
    * ``Index``
    * Scalars (can be broadcasted to a specified `length`)
    * Objects exposing ``__cuda_array_interface__`` (e.g., numba device
      arrays)
    * Objects exposing ``__array_interface__`` (e.g., numpy arrays)
    * pyarrow array
    * pandas.Categorical objects
    """
    from cudf.core.column import numerical, categorical, datetime, string
    from cudf.core.series import Series
    from cudf.core.index import Index

    if isinstance(arbitrary, ColumnBase):
        if dtype is not None:
            return arbitrary.astype(dtype)
        else:
            return arbitrary

    elif isinstance(arbitrary, Series):
        data = arbitrary._column
        if dtype is not None:
            data = data.astype(dtype)
    elif isinstance(arbitrary, Index):
        data = arbitrary._values
        if dtype is not None:
            data = data.astype(dtype)
    elif isinstance(arbitrary, nvstrings.nvstrings):
        byte_count = arbitrary.byte_count()
        if byte_count > libcudfxx.MAX_STRING_COLUMN_BYTES:
            raise MemoryError(
                "Cannot construct string columns "
                "containing > {} bytes. "
                "Consider using dask_cudf to partition "
                "your data.".format(libcudfxx.MAX_STRING_COLUMN_BYTES_STR)
            )
        sbuf = Buffer.empty(arbitrary.byte_count())
        obuf = Buffer.empty(
            (arbitrary.size() + 1) * np.dtype("int32").itemsize
        )

        nbuf = None
        if arbitrary.null_count() > 0:
            mask_size = calc_chunk_size(arbitrary.size(), mask_bitsize)
            nbuf = Buffer.empty(mask_size)
            arbitrary.set_null_bitmask(nbuf.ptr, bdevmem=True)
        arbitrary.to_offsets(sbuf.ptr, obuf.ptr, None, bdevmem=True)
        children = (
            build_column(obuf, dtype="int32"),
            build_column(sbuf, dtype="int8"),
        )
        data = build_column(
            data=None, dtype="object", mask=nbuf, children=children
        )
        data._nvstrings = arbitrary

    elif isinstance(arbitrary, Buffer):
        if dtype is None:
            raise TypeError("dtype cannot be None if 'arbitrary' is a Buffer")
        data = build_column(arbitrary, dtype=dtype)

    elif cuda.devicearray.is_cuda_ndarray(arbitrary):
        data = as_column(Buffer(arbitrary), dtype=arbitrary.dtype)
        if (
            data.dtype in [np.float16, np.float32, np.float64]
            and arbitrary.size > 0
        ):
            if nan_as_null:
                mask = libcudf.unaryops.nans_to_nulls(data)
                data = data.set_mask(mask)
        elif data.dtype.kind == "M":
            null = column_empty_like(data, masked=True, newsize=1)
            col = libcudf.replace.replace(
                as_column(Buffer(arbitrary), dtype=arbitrary.dtype),
                as_column(
                    Buffer(
                        np.array([np.datetime64("NaT")], dtype=data.dtype)
                    ),
                    dtype=arbitrary.dtype,
                ),
                null,
            )
            data = datetime.DatetimeColumn(
                data=Buffer(arbitrary), dtype=data.dtype, mask=col.mask
            )

    elif hasattr(arbitrary, "__cuda_array_interface__"):
        desc = arbitrary.__cuda_array_interface__
        data = _data_from_cuda_array_interface_desc(arbitrary)
        mask = _mask_from_cuda_array_interface_desc(arbitrary)
        dtype = np.dtype(desc["typestr"])
        col = build_column(data, dtype=dtype, mask=mask)
        return col

    elif isinstance(arbitrary, np.ndarray):
        # cudf assumes values are always contiguous
        if not arbitrary.flags["C_CONTIGUOUS"]:
            arbitrary = np.ascontiguousarray(arbitrary)

        if dtype is not None:
            arbitrary = arbitrary.astype(dtype)

        if arbitrary.dtype.kind == "M":
            data = datetime.DatetimeColumn.from_numpy(arbitrary)
        elif arbitrary.dtype.kind in ("O", "U"):
            data = as_column(pa.Array.from_pandas(arbitrary))
        else:
            data = as_column(
                rmm.to_device(arbitrary), nan_as_null=nan_as_null
            )

    elif isinstance(arbitrary, pa.Array):
        if isinstance(arbitrary, pa.StringArray):
            pa_size, pa_offset, nbuf, obuf, sbuf = buffers_from_pyarrow(
                arbitrary
            )
            children = (
                build_column(data=obuf, dtype="int32"),
                build_column(data=sbuf, dtype="int8"),
            )

            data = string.StringColumn(
                mask=nbuf, children=children, size=pa_size, offset=pa_offset
            )
        elif isinstance(arbitrary, pa.NullArray):
            new_dtype = pd.api.types.pandas_dtype(dtype)
            if (type(dtype) == str and dtype == "empty") or dtype is None:
                new_dtype = pd.api.types.pandas_dtype(
                    arbitrary.type.to_pandas_dtype()
                )

            if is_categorical_dtype(new_dtype):
                arbitrary = arbitrary.dictionary_encode()
            else:
                if nan_as_null:
                    arbitrary = arbitrary.cast(np_to_pa_dtype(new_dtype))
                else:
                    # casting a null array doesn't make nans valid
                    # so we create one with valid nans from scratch:
                    if new_dtype == np.dtype("object"):
                        arbitrary = utils.scalar_broadcast_to(
                            None, (len(arbitrary),), dtype=new_dtype
                        )
                    else:
                        arbitrary = utils.scalar_broadcast_to(
                            np.nan, (len(arbitrary),), dtype=new_dtype
                        )
            data = as_column(arbitrary, nan_as_null=nan_as_null)
        elif isinstance(arbitrary, pa.DictionaryArray):
            codes = as_column(arbitrary.indices)
            if isinstance(arbitrary.dictionary, pa.NullArray):
                categories = as_column([], dtype="object")
            else:
                categories = as_column(arbitrary.dictionary)
            dtype = CategoricalDtype(
                categories=categories, ordered=arbitrary.type.ordered
            )
            data = categorical.CategoricalColumn(
                dtype=dtype,
                mask=codes.base_mask,
                children=(codes,),
                size=codes.size,
                offset=codes.offset,
            )
        elif isinstance(arbitrary, pa.TimestampArray):
            dtype = np.dtype("M8[{}]".format(arbitrary.type.unit))
            pa_size, pa_offset, pamask, padata, _ = buffers_from_pyarrow(
                arbitrary, dtype=dtype
            )

            data = datetime.DatetimeColumn(
                data=padata,
                mask=pamask,
                dtype=dtype,
                size=pa_size,
                offset=pa_offset,
            )
        elif isinstance(arbitrary, pa.Date64Array):
            raise NotImplementedError
            # NOTE: the code below is unreachable until Date64 support
            # is implemented.
            pa_size, pa_offset, pamask, padata, _ = buffers_from_pyarrow(
                arbitrary, dtype="M8[ms]"
            )
            data = datetime.DatetimeColumn(
                data=padata,
                mask=pamask,
                dtype=np.dtype("M8[ms]"),
                size=pa_size,
                offset=pa_offset,
            )
        elif isinstance(arbitrary, pa.Date32Array):
            # No equivalent np dtype and not yet supported
            warnings.warn(
                "Date32 values are not yet supported so this will "
                "be typecast to a Date64 value",
                UserWarning,
            )
            data = as_column(arbitrary.cast(pa.int32())).astype("M8[ms]")
        elif isinstance(arbitrary, pa.BooleanArray):
            # Arrow uses 1 bit per value while we use int8
            dtype = np.dtype(np.bool_)
            # Needed because of bug in PyArrow
            # https://issues.apache.org/jira/browse/ARROW-4766
            if len(arbitrary) > 0:
                arbitrary = arbitrary.cast(pa.int8())
            else:
                arbitrary = pa.array([], type=pa.int8())

            pa_size, pa_offset, pamask, padata, _ = buffers_from_pyarrow(
                arbitrary, dtype=dtype
            )
            data = numerical.NumericalColumn(
                data=padata,
                mask=pamask,
                dtype=dtype,
                size=pa_size,
                offset=pa_offset,
            )
        else:
            pa_size, pa_offset, pamask, padata, _ = buffers_from_pyarrow(
                arbitrary
            )
            data = numerical.NumericalColumn(
                data=padata,
                dtype=np.dtype(arbitrary.type.to_pandas_dtype()),
                mask=pamask,
                size=pa_size,
                offset=pa_offset,
            )

    elif isinstance(arbitrary, pa.ChunkedArray):
        gpu_cols = [
            as_column(chunk, dtype=dtype) for chunk in arbitrary.chunks
        ]

        if dtype and dtype != "empty":
            new_dtype = dtype
        else:
            pa_type = arbitrary.type
            if pa.types.is_dictionary(pa_type):
                new_dtype = "category"
            else:
                new_dtype = np.dtype(pa_type.to_pandas_dtype())

        data = ColumnBase._concat(gpu_cols, dtype=new_dtype)

    elif isinstance(arbitrary, (pd.Series, pd.Categorical)):
        if is_categorical_dtype(arbitrary):
            data = as_column(pa.array(arbitrary, from_pandas=True))
        elif arbitrary.dtype == np.bool_:
            # Bug in PyArrow or HDF that requires us to do this
            data = as_column(
                pa.array(np.asarray(arbitrary), from_pandas=True)
            )
        else:
            data = as_column(pa.array(arbitrary, from_pandas=nan_as_null))

    elif isinstance(arbitrary, pd.Timestamp):
        # This will always treat NaTs as nulls since it's not technically
        # a discrete value like NaN
        data = as_column(pa.array(pd.Series([arbitrary]), from_pandas=True))

    elif np.isscalar(arbitrary) and not isinstance(arbitrary, memoryview):
        length = length or 1
        data = as_column(
            utils.scalar_broadcast_to(arbitrary, length, dtype=dtype)
        )
        if not nan_as_null:
            data = data.fillna(np.nan)

    elif isinstance(arbitrary, memoryview):
        data = as_column(
            np.asarray(arbitrary), dtype=dtype, nan_as_null=nan_as_null
        )

    else:
        try:
            data = as_column(
                memoryview(arbitrary), dtype=dtype, nan_as_null=nan_as_null
            )
        except TypeError:
            pa_type = None
            np_type = None
            try:
                if dtype is not None:
                    dtype = pd.api.types.pandas_dtype(dtype)
                    if is_categorical_dtype(dtype):
                        raise TypeError
                    else:
                        np_type = np.dtype(dtype).type
                        if np_type == np.bool_:
                            pa_type = pa.bool_()
                        else:
                            pa_type = np_to_pa_dtype(np.dtype(dtype))
                data = as_column(
                    pa.array(
                        arbitrary, type=pa_type, from_pandas=nan_as_null
                    ),
                    dtype=dtype,
                    nan_as_null=nan_as_null,
                )
            except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError):
                if is_categorical_dtype(dtype):
                    sr = pd.Series(arbitrary, dtype="category")
                    data = as_column(sr, nan_as_null=nan_as_null)
                elif np_type == np.str_:
                    sr = pd.Series(arbitrary, dtype="str")
                    data = as_column(sr, nan_as_null=nan_as_null)
                else:
                    data = as_column(
                        np.asarray(arbitrary, dtype=np.dtype(dtype)),
                        nan_as_null=nan_as_null,
                    )
    return data
def _generate_column(column_params, num_rows):
    # If cardinality is specified, we create a set to sample from.
    # Otherwise, we simply use the given generator to generate each value.
    if column_params.cardinality is not None:
        # Construct set of values to sample from where
        # set size = cardinality
        if (
            isinstance(column_params.dtype, str)
            and column_params.dtype == "category"
        ):
            vals = pa.array(
                column_params.generator,
                size=column_params.cardinality,
                safe=False,
            )
            return pa.DictionaryArray.from_arrays(
                dictionary=vals,
                indices=np.random.randint(
                    low=0, high=len(vals), size=num_rows
                ),
                mask=np.random.choice(
                    [True, False],
                    size=num_rows,
                    p=[
                        column_params.null_frequency,
                        1 - column_params.null_frequency,
                    ],
                )
                if column_params.null_frequency > 0.0
                else None,
            )

        if hasattr(column_params.dtype, "to_arrow"):
            arrow_type = column_params.dtype.to_arrow()
        elif column_params.dtype is not None:
            arrow_type = np_to_pa_dtype(cudf.dtype(column_params.dtype))
        else:
            arrow_type = None

        if not isinstance(arrow_type, pa.lib.Decimal128Type):
            vals = pa.array(
                column_params.generator,
                size=column_params.cardinality,
                safe=False,
                type=arrow_type,
            )
        vals = pa.array(
            np.random.choice(column_params.generator, size=num_rows)
            if isinstance(arrow_type, pa.lib.Decimal128Type)
            else np.random.choice(vals, size=num_rows),
            mask=np.random.choice(
                [True, False],
                size=num_rows,
                p=[
                    column_params.null_frequency,
                    1 - column_params.null_frequency,
                ],
            )
            if column_params.null_frequency > 0.0
            else None,
            size=num_rows,
            safe=False,
            type=None
            if isinstance(arrow_type, pa.lib.Decimal128Type)
            else arrow_type,
        )
        if isinstance(arrow_type, pa.lib.Decimal128Type):
            vals = vals.cast(arrow_type, safe=False)
        return vals
    else:
        # Generate data for current column
        return pa.array(
            column_params.generator,
            mask=np.random.choice(
                [True, False],
                size=num_rows,
                p=[
                    column_params.null_frequency,
                    1 - column_params.null_frequency,
                ],
            )
            if column_params.null_frequency > 0.0
            else None,
            size=num_rows,
            safe=False,
        )
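# A hedged usage sketch for _generate_column. It expects an object with
# .dtype, .cardinality, .null_frequency and .generator attributes; a
# SimpleNamespace stands in here for cudf's ColumnParameters (assumption).
from types import SimpleNamespace

params = SimpleNamespace(
    dtype="int64",
    cardinality=None,      # take every value straight from the generator
    null_frequency=0.2,    # roughly 20% of rows are masked out as null
    generator=(np.int64(i) for i in range(1000)),
)
vals = _generate_column(params, num_rows=1000)  # -> pa.Array of length 1000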
def get_dataframe(parameters, use_threads):
    # Initialize seeds
    if parameters.seed is not None:
        np.random.seed(parameters.seed)

    # For each column, use a generic Mimesis producer to create an
    # Iterable for generating data
    for i, column_params in enumerate(parameters.column_parameters):
        if column_params.dtype is None:
            column_params.generator = column_params.generator(
                Generic("en", seed=parameters.seed)
            )
        else:
            column_params.generator = column_params.generator()

    # Get schema for each column
    table_fields = []
    for i, column_params in enumerate(parameters.column_parameters):
        if (
            isinstance(column_params.dtype, str)
            and column_params.dtype == "category"
        ):
            arrow_type = pa.dictionary(
                index_type=pa.int64(),
                value_type=np_to_pa_dtype(
                    cudf.dtype(type(next(iter(column_params.generator))))
                ),
            )
        elif hasattr(column_params.dtype, "to_arrow"):
            arrow_type = column_params.dtype.to_arrow()
        else:
            arrow_type = np_to_pa_dtype(
                cudf.dtype(type(next(iter(column_params.generator))))
                if column_params.dtype is None
                else column_params.dtype
            )
        table_fields.append(
            pa.field(
                name=str(i),
                type=arrow_type,
                nullable=column_params.null_frequency > 0,
            )
        )

    schema = pa.schema(table_fields)

    # Initialize column data and which columns should be sorted
    column_data = [None] * len(parameters.column_parameters)
    columns_to_sort = [
        str(i)
        for i, column_params in enumerate(parameters.column_parameters)
        if column_params.is_sorted
    ]

    # Generate data
    if not use_threads:
        for i, column_params in enumerate(parameters.column_parameters):
            column_data[i] = _generate_column(
                column_params, parameters.num_rows
            )
    else:
        pool = Pool(pa.cpu_count())
        column_data = pool.starmap(
            _generate_column,
            [
                (column_params, parameters.num_rows)
                for i, column_params in enumerate(
                    parameters.column_parameters
                )
            ],
        )
        pool.close()
        pool.join()

    # Convert to a pyarrow Table, sorting via pandas where requested
    tbl = pa.Table.from_arrays(
        column_data,
        schema=schema,
    )
    if columns_to_sort:
        tbl = tbl.to_pandas()
        tbl = tbl.sort_values(columns_to_sort)
        tbl = pa.Table.from_pandas(tbl, schema)
    return tbl
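# A hedged usage sketch for get_dataframe. `parameters` is assumed to be
# cudf's datagen Parameters object (exposing .seed, .num_rows and
# .column_parameters); its constructor is not shown here, so treat this
# as illustrative only.
tbl = get_dataframe(parameters, use_threads=False)  # -> pa.Table
df = cudf.DataFrame.from_arrow(tbl)                 # move onto the GPU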