class DataMapping:

    """
    Map primary data between different supported data frameworks, preserving equivalent data types.

    DataMapping is for primary data, to map metadata types and values use
    :py:class:`TypeMapping <tracdap.rt.impl.type_system.TypeMapping>` and
    :py:class:`MetadataCodec <tracdap.rt.impl.type_system.MetadataCodec>`.
    """

    __log = _util.logger_for_namespace(_DataInternal.__module__ + ".DataMapping")

    # Matches TRAC_ARROW_TYPE_MAPPING in ArrowSchema, tracdap-lib-data

    __TRAC_DECIMAL_PRECISION = 38
    __TRAC_DECIMAL_SCALE = 12
    __TRAC_TIMESTAMP_UNIT = "ms"
    __TRAC_TIMESTAMP_ZONE = None

    __TRAC_TO_ARROW_BASIC_TYPE_MAPPING = {
        _meta.BasicType.BOOLEAN: pa.bool_(),
        _meta.BasicType.INTEGER: pa.int64(),
        _meta.BasicType.FLOAT: pa.float64(),
        _meta.BasicType.DECIMAL: pa.decimal128(__TRAC_DECIMAL_PRECISION, __TRAC_DECIMAL_SCALE),
        _meta.BasicType.STRING: pa.utf8(),
        _meta.BasicType.DATE: pa.date32(),
        _meta.BasicType.DATETIME: pa.timestamp(__TRAC_TIMESTAMP_UNIT, __TRAC_TIMESTAMP_ZONE)
    }

    # Check the Pandas dtypes for handling floats are available before setting up the type mapping
    __PANDAS_FLOAT_DTYPE_CHECK = _DataInternal.float_dtype_check()
    __PANDAS_DATETIME_TYPE = pd.to_datetime([]).dtype

    # Only partial mapping is possible, decimal and temporal dtypes cannot be mapped this way
    __ARROW_TO_PANDAS_TYPE_MAPPING = {
        pa.bool_(): pd.BooleanDtype(),
        pa.int8(): pd.Int8Dtype(),
        pa.int16(): pd.Int16Dtype(),
        pa.int32(): pd.Int32Dtype(),
        pa.int64(): pd.Int64Dtype(),
        pa.uint8(): pd.UInt8Dtype(),
        pa.uint16(): pd.UInt16Dtype(),
        pa.uint32(): pd.UInt32Dtype(),
        pa.uint64(): pd.UInt64Dtype(),
        # There is no 16-bit float dtype in this mapping, so float16 is widened to Float32
        pa.float16(): pd.Float32Dtype(),
        pa.float32(): pd.Float32Dtype(),
        pa.float64(): pd.Float64Dtype(),
        pa.utf8(): pd.StringDtype()
    }

    @staticmethod
    def arrow_to_python_type(arrow_type: pa.DataType) -> type:
        """
        Return the Python type that corresponds to an Arrow data type.

        :param arrow_type: The Arrow type to look up
        :return: One of bool, int, float, decimal.Decimal, str, dt.date or dt.datetime
        :raises ETracInternal: If the Arrow type does not fall into any supported category
        """

        if pa.types.is_boolean(arrow_type):
            return bool

        if pa.types.is_integer(arrow_type):
            return int

        if pa.types.is_floating(arrow_type):
            return float

        if pa.types.is_decimal(arrow_type):
            return decimal.Decimal

        # large_string holds the same values as string (utf8), only with wider offsets
        if pa.types.is_string(arrow_type) or pa.types.is_large_string(arrow_type):
            return str

        if pa.types.is_date(arrow_type):
            return dt.date

        if pa.types.is_timestamp(arrow_type):
            return dt.datetime

        raise _ex.ETracInternal(f"No Python type mapping available for Arrow type [{arrow_type}]")

    @classmethod
    def python_to_arrow_type(cls, python_type: type) -> pa.DataType:
        """
        Return the Arrow data type used to hold values of a Python type.

        Decimals and datetimes use the precision, scale, unit and zone constants of this class.

        :param python_type: The Python type to look up (compared by equality, subclasses do not match)
        :return: The corresponding Arrow data type
        :raises ETracInternal: If there is no mapping for the Python type
        """

        if python_type == bool:
            return pa.bool_()

        if python_type == int:
            return pa.int64()

        if python_type == float:
            return pa.float64()

        if python_type == decimal.Decimal:
            return pa.decimal128(cls.__TRAC_DECIMAL_PRECISION, cls.__TRAC_DECIMAL_SCALE)

        if python_type == str:
            return pa.utf8()

        if python_type == dt.date:
            return pa.date32()

        if python_type == dt.datetime:
            return pa.timestamp(cls.__TRAC_TIMESTAMP_UNIT, cls.__TRAC_TIMESTAMP_ZONE)

        raise _ex.ETracInternal(f"No Arrow type mapping available for Python type [{python_type}]")

    @classmethod
    def trac_to_arrow_type(cls, trac_type: _meta.TypeDescriptor) -> pa.DataType:
        """
        Return the Arrow data type for a TRAC type descriptor, based on its basic type.

        :raises ETracInternal: If there is no mapping for the descriptor's basic type
        """

        return cls.trac_to_arrow_basic_type(trac_type.basicType)

    @classmethod
    def trac_to_arrow_basic_type(cls, trac_basic_type: _meta.BasicType) -> pa.DataType:
        """
        Return the Arrow data type for a TRAC basic type.

        :raises ETracInternal: If there is no mapping for the basic type
        """

        arrow_type = cls.__TRAC_TO_ARROW_BASIC_TYPE_MAPPING.get(trac_basic_type)

        if arrow_type is None:
            raise _ex.ETracInternal(f"No Arrow type mapping available for TRAC type [{trac_basic_type}]")

        return arrow_type

    @classmethod
    def trac_to_arrow_schema(cls, trac_schema: _meta.SchemaDefinition) -> pa.Schema:
        """
        Build an Arrow schema from a TRAC table schema, one Arrow field per TRAC field.

        :raises ETracInternal: If the schema is not a TABLE schema or a field type cannot be mapped
        """

        if trac_schema.schemaType != _meta.SchemaType.TABLE:
            raise _ex.ETracInternal(f"Schema type [{trac_schema.schemaType}] cannot be converted for Apache Arrow")

        arrow_fields = [
            (f.fieldName, cls.trac_to_arrow_basic_type(f.fieldType))
            for f in trac_schema.table.fields]

        return pa.schema(arrow_fields, metadata={})

    @classmethod
    def trac_arrow_decimal_type(cls) -> pa.Decimal128Type:
        """Return the Arrow decimal type built from this class's precision and scale constants."""

        return pa.decimal128(
            cls.__TRAC_DECIMAL_PRECISION,
            cls.__TRAC_DECIMAL_SCALE)

    @classmethod
    def pandas_datetime_type(cls):
        """Return the dtype Pandas assigns to an empty ``pd.to_datetime`` result."""

        return cls.__PANDAS_DATETIME_TYPE

    @classmethod
    def view_to_pandas(cls, view: DataView, part: DataPartKey) -> pd.DataFrame:
        """
        Convert one part of a data view into a Pandas data frame.

        A part with a single delta is converted directly from that item.
        A part with several deltas has their record batches concatenated in delta order.

        :raises ETracInternal: If the view has no Arrow schema, or the part has no items
        """

        deltas = view.parts.get(part)

        # Sanity checks

        if not view.arrow_schema:
            raise _ex.ETracInternal("Data view schema not set")

        if not deltas:
            raise _ex.ETracInternal(f"Data view for part [{part.opaque_key}] does not contain any items")

        if len(deltas) == 1:
            return cls.item_to_pandas(deltas[0])

        # Collect batches in a list, not a set: the order of deltas (and of batches within
        # each delta) determines the row order of the result, batches that compare equal
        # must not be collapsed into one, and a set would require batches to be hashable
        batches = [
            batch
            for delta in deltas
            for batch in (
                delta.batches
                if delta.batches
                else delta.table.to_batches())]

        table = pa.Table.from_batches(batches)  # noqa

        # Use the same conversion as the single-delta path, so the resulting dtypes
        # do not depend on how many deltas the part happens to contain
        return cls.arrow_to_pandas(table)

    @classmethod
    def item_to_pandas(cls, item: DataItem) -> pd.DataFrame:
        """
        Convert a data item into a Pandas data frame.

        Pandas content is preferred (returned as a copy), then an Arrow table, then Arrow batches.

        :raises ETracInternal: If the item holds none of these
        """

        if item.pandas is not None:
            return item.pandas.copy()

        if item.table is not None:
            return cls.arrow_to_pandas(item.table)

        if item.batches is not None:
            table = pa.Table.from_batches(item.batches, item.schema)  # noqa
            return cls.arrow_to_pandas(table)

        raise _ex.ETracInternal("Data item does not contain any usable data")

    @classmethod
    def arrow_to_pandas(cls, table: pa.Table) -> pd.DataFrame:
        """
        Convert an Arrow table into a Pandas data frame.

        Arrow types listed in the Arrow-to-Pandas mapping of this class are converted to the
        mapped Pandas dtypes; any other type is left to the default Arrow conversion.
        """

        return table.to_pandas(
            ignore_metadata=True,  # noqa
            date_as_object=False,  # noqa
            timestamp_as_object=False,  # noqa
            types_mapper=cls.__ARROW_TO_PANDAS_TYPE_MAPPING.get)

    @classmethod
    def pandas_to_view(cls, df: pd.DataFrame, prior_view: DataView, part: DataPartKey):
        """
        Convert a data frame using the prior view's Arrow schema and add it to that view.

        :return: A new view with the converted item appended to the given part
        """

        item = cls.pandas_to_item(df, prior_view.arrow_schema)
        return cls.add_item_to_view(prior_view, part, item)

    @classmethod
    def pandas_to_item(cls, df: pd.DataFrame, schema: tp.Optional[pa.Schema]) -> DataItem:
        """Convert a data frame into a data item holding an Arrow table and its schema."""

        table = cls.pandas_to_arrow(df, schema)
        return DataItem(table.schema, table)

    @classmethod
    def pandas_to_arrow(cls, df: pd.DataFrame, schema: tp.Optional[pa.Schema] = None) -> pa.Table:
        """
        Convert a Pandas data frame into an Arrow table.

        :param df: The data frame to convert (the index is not preserved)
        :param schema: Optional schema, if supplied the table is passed through schema conformance
        :return: The converted table, exactly as produced from Pandas when no schema is supplied
        """

        # Here we convert the whole Pandas df and then pass it to conformance
        # An optimization would be to filter columns before applying conformance
        # To do this, we'd need the case-insensitive field matching logic, including output of warnings

        # Also, note that schema is not applied in from_pandas
        # This is because the conformance logic allows for a wider range of conversions
        # Applying the schema directly would fail for some types where casting is possible

        if len(df) == 0:
            df_schema = pa.Schema.from_pandas(df, preserve_index=False)  # noqa
            table = pa.Table.from_batches(list(), df_schema)  # noqa
        else:
            table = pa.Table.from_pandas(df, preserve_index=False)  # noqa

        # If there is no explicit schema, give back the table exactly as it was received from Pandas
        # There could be an option here to coerce types to the appropriate TRAC standard types
        # E.g. unsigned int 32 -> signed int 64, TRAC standard integer type

        if schema is None:
            return table
        else:
            return DataConformance.conform_to_schema(table, schema, df.dtypes)

    @classmethod
    def add_item_to_view(cls, view: DataView, part: DataPartKey, item: DataItem) -> DataView:
        """
        Return a new view with an item appended to the deltas of one part.

        The original view, its parts mapping and its delta lists are not modified.
        """

        prior_deltas = view.parts.get(part) or list()
        deltas = [*prior_deltas, item]
        parts = {**view.parts, part: deltas}

        return DataView(view.trac_schema, view.arrow_schema, parts)
), ], ) def test_as_column_buffer(data, expected): actual_column = cudf.core.column.as_column(cudf.core.Buffer(data), dtype=data.dtype) assert_eq(cudf.Series(actual_column), cudf.Series(expected)) @pytest.mark.parametrize( "pd_dtype,expect_dtype", [ # TODO: Nullable float is coming (pd.StringDtype(), np.dtype("O")), (pd.UInt8Dtype(), np.dtype("uint8")), (pd.UInt16Dtype(), np.dtype("uint16")), (pd.UInt32Dtype(), np.dtype("uint32")), (pd.UInt64Dtype(), np.dtype("uint64")), (pd.Int8Dtype(), np.dtype("int8")), (pd.Int16Dtype(), np.dtype("int16")), (pd.Int32Dtype(), np.dtype("int32")), (pd.Int64Dtype(), np.dtype("int64")), (pd.BooleanDtype(), np.dtype("bool")), ], ) def test_build_df_from_nullable_pandas_dtype(pd_dtype, expect_dtype): if pd_dtype == pd.StringDtype(): data = ["a", pd.NA, "c", pd.NA, "e"] elif pd_dtype == pd.BooleanDtype(): data = [True, pd.NA, False, pd.NA, True] else:
assert sr.memory_usage() == 44 assert sr[3:].memory_usage() == 9 # z assert sr[:1].memory_usage() == 19 # hello world @pytest.mark.parametrize( "sr,expected_psr", [ ( cudf.Series([1, 2, None, 3], dtype="uint8"), pd.Series([1, 2, None, 3], dtype=pd.UInt8Dtype()), ), ( cudf.Series([23, None, None, 32], dtype="uint16"), pd.Series([23, None, None, 32], dtype=pd.UInt16Dtype()), ), ( cudf.Series([None, 123, None, 1], dtype="uint32"), pd.Series([None, 123, None, 1], dtype=pd.UInt32Dtype()), ), ( cudf.Series([234, 2323, 23432, None, None, 224], dtype="uint64"), pd.Series( [234, 2323, 23432, None, None, 224], dtype=pd.UInt64Dtype() ), ), ( cudf.Series([-10, 1, None, -1, None, 3], dtype="int8"), pd.Series([-10, 1, None, -1, None, 3], dtype=pd.Int8Dtype()), ),
np.int32: pa.int32(), np.int16: pa.int16(), np.int8: pa.int8(), np.bool_: pa.int8(), np.uint64: pa.uint64(), np.uint32: pa.uint32(), np.uint16: pa.uint16(), np.uint8: pa.uint8(), np.datetime64: pa.date64(), np.object_: pa.string(), np.str_: pa.string(), } cudf_dtypes_to_pandas_dtypes = { np.dtype("uint8"): pd.UInt8Dtype(), np.dtype("uint16"): pd.UInt16Dtype(), np.dtype("uint32"): pd.UInt32Dtype(), np.dtype("uint64"): pd.UInt64Dtype(), np.dtype("int8"): pd.Int8Dtype(), np.dtype("int16"): pd.Int16Dtype(), np.dtype("int32"): pd.Int32Dtype(), np.dtype("int64"): pd.Int64Dtype(), np.dtype("bool_"): pd.BooleanDtype(), np.dtype("object"): pd.StringDtype(), } pyarrow_dtypes_to_pandas_dtypes = { pa.uint8(): pd.UInt8Dtype(), pa.uint16(): pd.UInt16Dtype(), pa.uint32(): pd.UInt32Dtype(), pa.uint64(): pd.UInt64Dtype(),
def generate_logs(self, num_offline_users: int, agent: Agent = None, num_organic_offline_users: int = 0):
    """
    Produce logs of applying an Agent in the Environment for the specified amount of Users.

    If an Agent is provided it temporarily replaces ``self.agent`` for the duration of the
    call; the previous agent is restored afterwards, even if stepping the environment raises.
    If the Agent is not provided, the environment's current ``self.agent`` is used
    (per the original documentation, a default Agent that randomly selects an Action).

    :param num_offline_users: Number of users to simulate with both organic and bandit events.
    :param agent: Optional agent to use for this call only.
    :param num_organic_offline_users: Number of users simulated first, for which only the
        organic sessions from a single initial step are logged.
    :return: A DataFrame with columns 't', 'u', 'z', 'v', 'a', 'c', 'ps' and 'ps-a', where
        'z' is either 'organic' or 'bandit' and fields that do not apply to a row are None
        before dtype conversion.
    """
    data = {
        't': [],
        'u': [],
        'z': [],
        'v': [],
        'a': [],
        'c': [],
        'ps': [],
        'ps-a': [],
    }

    def _store_organic(observation):
        # One row per organic session; bandit-only columns are left empty
        assert (observation is not None)
        assert (observation.sessions() is not None)
        for session in observation.sessions():
            data['t'].append(session['t'])
            data['u'].append(session['u'])
            data['z'].append('organic')
            data['v'].append(session['v'])
            data['a'].append(None)
            data['c'].append(None)
            data['ps'].append(None)
            data['ps-a'].append(None)

    def _store_bandit(action, reward):
        # A falsy action (None or empty) means there is nothing to log for this step
        if action:
            assert (reward is not None)
            data['t'].append(action['t'])
            data['u'].append(action['u'])
            data['z'].append('bandit')
            data['v'].append(None)
            data['a'].append(action['a'])
            data['c'].append(reward)
            data['ps'].append(action['ps'])
            data['ps-a'].append(action.get('ps-a', ()))

    # Test against None rather than truthiness, so an agent object that happens
    # to evaluate as falsy is still honoured
    override_agent = agent is not None
    if override_agent:
        old_agent = self.agent
        self.agent = agent

    try:
        unique_user_id = 0

        for _ in trange(num_organic_offline_users, desc='Organic Users'):
            self.reset(unique_user_id)
            unique_user_id += 1
            observation, _, _, _ = self.step(None)
            _store_organic(observation)

        for _ in trange(num_offline_users, desc='Users'):
            self.reset(unique_user_id)
            unique_user_id += 1
            observation, reward, done, _ = self.step(None)

            while not done:
                _store_organic(observation)
                action, observation, reward, done, _ = self.step_offline(
                    observation, reward, done)
                _store_bandit(action, reward)

            # Flush the final observation and action once the episode is done
            _store_organic(observation)
            action, _, reward, done, _ = self.step_offline(
                observation, reward, done)
            assert done, 'Done must not be changed!'
            _store_bandit(action, reward)
    finally:
        # Always put the original agent back, otherwise a failure part-way through
        # would leave the environment permanently bound to the temporary agent
        if override_agent:
            self.agent = old_agent

    data['t'] = np.array(data['t'], dtype=np.float32)
    data['u'] = pd.array(data['u'], dtype=pd.UInt16Dtype())
    data['v'] = pd.array(data['v'], dtype=pd.UInt16Dtype())
    data['a'] = pd.array(data['a'], dtype=pd.UInt16Dtype())
    data['c'] = np.array(data['c'], dtype=np.float32)

    # from_dict is a classmethod, there is no need to build an empty frame to call it
    return pd.DataFrame.from_dict(data)
np.int32: pa.int32(), np.int16: pa.int16(), np.int8: pa.int8(), np.bool_: pa.int8(), np.uint64: pa.uint64(), np.uint32: pa.uint32(), np.uint16: pa.uint16(), np.uint8: pa.uint8(), np.datetime64: pa.date64(), np.object_: pa.string(), np.str_: pa.string(), } cudf_dtypes_to_pandas_dtypes = { np.dtype("uint8"): pd.UInt8Dtype(), np.dtype("uint16"): pd.UInt16Dtype(), np.dtype("uint32"): pd.UInt32Dtype(), np.dtype("uint64"): pd.UInt64Dtype(), np.dtype("int8"): pd.Int8Dtype(), np.dtype("int16"): pd.Int16Dtype(), np.dtype("int32"): pd.Int32Dtype(), np.dtype("int64"): pd.Int64Dtype(), np.dtype("bool_"): pd.BooleanDtype(), np.dtype("object"): pd.StringDtype(), } SIGNED_INTEGER_TYPES = {"int8", "int16", "int32", "int64"} UNSIGNED_TYPES = {"uint8", "uint16", "uint32", "uint64"} INTEGER_TYPES = SIGNED_INTEGER_TYPES | UNSIGNED_TYPES FLOAT_TYPES = {"float32", "float64"} SIGNED_TYPES = SIGNED_INTEGER_TYPES | FLOAT_TYPES