def coerce_not_float_cols_nans(self):  # stray `cls` parameter dropped
    """Coerce float columns holding only integers and NaNs to the smallest
    nullable integer dtype that can represent them."""
    cols = self.not_float_cols_nans
    for col in cols:
        # avoid shadowing the builtins `min` and `max`
        col_min = self.df[col].min()
        col_max = self.df[col].max()
        if col_min >= 0:
            # use inclusive bounds; the original `max < 255` etc. pushed
            # boundary values into the next-wider dtype
            if col_max <= np.iinfo(np.uint8).max:
                self.df[col] = self.df[col].astype(pd.UInt8Dtype())
            elif col_max <= np.iinfo(np.uint16).max:
                self.df[col] = self.df[col].astype(pd.UInt16Dtype())
            elif col_max <= np.iinfo(np.uint32).max:
                self.df[col] = self.df[col].astype(pd.UInt32Dtype())
        else:
            # the original mixed max values (127, 32767) with max + 1
            # (2147483648) and silently skipped columns whose minimum sat
            # exactly on a type boundary
            if np.iinfo(np.int8).min <= col_min and col_max <= np.iinfo(np.int8).max:
                self.df[col] = self.df[col].astype(pd.Int8Dtype())
            elif np.iinfo(np.int16).min <= col_min and col_max <= np.iinfo(np.int16).max:
                self.df[col] = self.df[col].astype(pd.Int16Dtype())
            elif np.iinfo(np.int32).min <= col_min and col_max <= np.iinfo(np.int32).max:
                self.df[col] = self.df[col].astype(pd.Int32Dtype())
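# A minimal usage sketch (my addition, not part of the snippet above): the
# nullable integer dtypes let a float column that only carries integers and
# NaNs be downcast losslessly, with NaN surviving as pd.NA.
import numpy as np
import pandas as pd

demo = pd.DataFrame({"a": [1.0, np.nan, 3.0], "b": [-5.0, np.nan, 120.0]})
out = demo.astype(pd.Int8Dtype())
print(out.dtypes.tolist())    # [Int8Dtype(), Int8Dtype()]
print(out["a"].isna().sum())  # 1 -- the NaN is preserved as pd.NA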
def pyarrow2pandas_extension(  # pylint: disable=too-many-branches,too-many-return-statements
    dtype: pa.DataType,
) -> Optional[pd.api.extensions.ExtensionDtype]:
    """PyArrow to pandas data type conversion."""
    if pa.types.is_int8(dtype):
        return pd.Int8Dtype()
    if pa.types.is_int16(dtype):
        return pd.Int16Dtype()
    if pa.types.is_int32(dtype):
        return pd.Int32Dtype()
    if pa.types.is_int64(dtype):
        return pd.Int64Dtype()
    if pa.types.is_uint8(dtype):
        return pd.UInt8Dtype()
    if pa.types.is_uint16(dtype):
        return pd.UInt16Dtype()
    if pa.types.is_uint32(dtype):
        return pd.UInt32Dtype()
    if pa.types.is_uint64(dtype):
        return pd.UInt64Dtype()
    if pa.types.is_boolean(dtype):
        return pd.BooleanDtype()
    if pa.types.is_string(dtype):
        return pd.StringDtype()
    return None
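# Usage sketch (my addition): pyarrow's Table.to_pandas accepts a
# `types_mapper` callable with exactly this signature (pa.DataType ->
# ExtensionDtype or None), so the function above can drive an
# Arrow-to-nullable-pandas conversion directly.
import pyarrow as pa

table = pa.table({"x": pa.array([1, None, 3], type=pa.int8())})
df = table.to_pandas(types_mapper=pyarrow2pandas_extension)
assert str(df["x"].dtype) == "Int8"  # the null arrives as pd.NA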
def integer_type_mapping(
        use_extension_types: bool) -> Mapping[IntegerType, DtypeObj]:
    if use_extension_types:
        return {
            IntegerType.INT8: pd.Int8Dtype(),
            IntegerType.UINT8: pd.UInt8Dtype(),
            IntegerType.INT16: pd.Int16Dtype(),
            IntegerType.UINT16: pd.UInt16Dtype(),
            IntegerType.INT24: pd.Int32Dtype(),
            # UINT24 widens to an unsigned 32-bit dtype, mirroring the
            # np.uint32 choice in the branch below (the original used
            # pd.Int32Dtype() here, inconsistently)
            IntegerType.UINT24: pd.UInt32Dtype(),
            IntegerType.INT32: pd.Int32Dtype(),
            IntegerType.UINT32: pd.UInt32Dtype(),
            IntegerType.INT64: pd.Int64Dtype(),
            IntegerType.UINT64: pd.UInt64Dtype(),
        }
    else:
        return {
            IntegerType.INT8: np.int8,
            IntegerType.UINT8: np.uint8,
            IntegerType.INT16: np.int16,
            IntegerType.UINT16: np.uint16,
            IntegerType.INT24: np.int32,
            IntegerType.UINT24: np.uint32,
            IntegerType.INT32: np.int32,
            IntegerType.UINT32: np.uint32,
            IntegerType.INT64: np.int64,
            IntegerType.UINT64: np.uint64,
        }
def _dtypes(self, categories=None):
    """Implied types of the columns in the schema."""
    import pandas as pd
    if self.has_pandas_metadata:
        md = self.pandas_metadata['columns']
        tz = {
            c['name']: c['metadata']['timezone']
            for c in md
            if (c.get('metadata', {}) or {}).get('timezone', None)
        }
    else:
        tz = None
    self.tz = tz
    categories = self.check_categories(categories)
    dtype = OrderedDict(
        (name, (converted_types.typemap(f)
                if f.num_children in [None, 0] else np.dtype("O")))
        for name, f in self.schema.root.children.items()
        if getattr(f, 'isflat', False) is False)
    for i, (col, dt) in enumerate(dtype.copy().items()):
        if dt.kind in ['i', 'b']:
            # int/bool columns that may hold nulls become nullable
            # extension columns (renamed `num_nulls` -> `has_nulls`; it is
            # a flag, not a count)
            has_nulls = False
            for rg in self.row_groups:
                chunk = rg.columns[i]
                if (chunk.meta_data.statistics is None
                        or chunk.meta_data.statistics.null_count is None
                        or chunk.meta_data.statistics.null_count):
                    # missing statistics are treated as "may contain nulls"
                    has_nulls = True
                    break
            if has_nulls:
                if dt.kind == "b":
                    dtype[col] = pd.BooleanDtype()
                elif dtype[col].itemsize == 1:
                    dtype[col] = pd.Int8Dtype()
                elif dtype[col].itemsize == 2:
                    dtype[col] = pd.Int16Dtype()
                elif dtype[col].itemsize == 4:
                    dtype[col] = pd.Int32Dtype()
                else:
                    dtype[col] = pd.Int64Dtype()
        elif dt.kind == "M":
            if tz is not None and tz.get(col, False):
                dtype[col] = pd.Series([], dtype='M8[ns]').dt.tz_localize(
                    tz[col]).dtype
        elif dt == 'S12':
            dtype[col] = 'M8[ns]'
    for field in categories:
        dtype[field] = 'category'
    for cat in self.cats:
        dtype[cat] = "category"
    self.dtypes = dtype
    return dtype
def __init__(self, pandas_obj):
    # validate and assign object
    self._validate(pandas_obj)
    self._obj = pandas_obj

    # incorporated modules - columns consisting of others will not have
    # their dtype changed
    self._INCORPORATED_MODULES = ['builtins', 'numpy', 'pandas']

    # possible null values
    self._NULL_VALS = [
        None, np.nan, 'np.nan', 'nan', np.inf, 'np.inf', 'inf', -np.inf,
        '-np.inf', '', 'n/a', 'na', 'N/A', 'NA', 'unknown', 'unk',
        'UNKNOWN', 'UNK'
    ]

    # assign dtypes and limits
    # boolean
    BOOL_STRINGS_TRUE = ['t', 'true', 'yes', 'on']
    BOOL_STRINGS_FALSE = ['f', 'false', 'no', 'off']
    # NOTE: dict.update() returns None, so the original
    # `{...}.update({...})` assigned None; merge the two dicts instead
    self._BOOL_MAP_DICT = {
        **{i: True for i in BOOL_STRINGS_TRUE},
        **{i: False for i in BOOL_STRINGS_FALSE},
    }
    self._DTYPE_BOOL_BASE = np.bool_  # np.bool is a removed alias
    self._DTYPE_BOOL_NULLABLE = pd.BooleanDtype()

    # unsigned integers - base and nullable
    self._DTYPES_UINT_BASE = [np.uint8, np.uint16, np.uint32, np.uint64]
    self._DTYPES_UINT_NULLABLE = [
        pd.UInt8Dtype(), pd.UInt16Dtype(), pd.UInt32Dtype(), pd.UInt64Dtype()
    ]
    self._LIMIT_LOW_UINT = [np.iinfo(i).min for i in self._DTYPES_UINT_BASE]
    self._LIMIT_HIGH_UINT = [np.iinfo(i).max for i in self._DTYPES_UINT_BASE]

    # signed integers - base and nullable
    self._DTYPES_INT_BASE = [np.int8, np.int16, np.int32, np.int64]
    self._DTYPES_INT_NULLABLE = [
        pd.Int8Dtype(), pd.Int16Dtype(), pd.Int32Dtype(), pd.Int64Dtype()
    ]
    self._LIMIT_LOW_INT = [np.iinfo(i).min for i in self._DTYPES_INT_BASE]
    self._LIMIT_HIGH_INT = [np.iinfo(i).max for i in self._DTYPES_INT_BASE]

    # floats - nullable by default
    self._DTYPES_FLOAT = [np.float16, np.float32, np.float64]

    # datetime - nullable by default
    self._DTYPE_DATETIME = np.datetime64

    # string
    self._DTYPE_STRING = pd.StringDtype()

    # categorical - nullable by default
    self._DTYPE_CATEGORICAL = pd.CategoricalDtype()
def test_intdtypes() -> None:
    pd.Int8Dtype()
    pd.Int16Dtype()
    pd.Int32Dtype()
    pd.Int64Dtype()
    pd.UInt8Dtype()
    pd.UInt16Dtype()
    pd.UInt32Dtype()
    pd.UInt64Dtype()
def read(
    self,
    path,
    columns=None,
    use_nullable_dtypes=False,
    storage_options: StorageOptions = None,
    **kwargs,
):
    kwargs["use_pandas_metadata"] = True
    to_pandas_kwargs = {}
    if use_nullable_dtypes:
        if LooseVersion(self.api.__version__) >= "0.16":
            import pandas as pd

            mapping = {
                self.api.int8(): pd.Int8Dtype(),
                self.api.int16(): pd.Int16Dtype(),
                self.api.int32(): pd.Int32Dtype(),
                self.api.int64(): pd.Int64Dtype(),
                self.api.uint8(): pd.UInt8Dtype(),
                self.api.uint16(): pd.UInt16Dtype(),
                self.api.uint32(): pd.UInt32Dtype(),
                self.api.uint64(): pd.UInt64Dtype(),
                self.api.bool_(): pd.BooleanDtype(),
                self.api.string(): pd.StringDtype(),
            }
            to_pandas_kwargs["types_mapper"] = mapping.get
        else:
            raise ValueError(
                "'use_nullable_dtypes=True' is only supported for pyarrow >= 0.16 "
                f"({self.api.__version__} is installed)"  # closing paren was missing
            )
    manager = get_option("mode.data_manager")
    if manager == "array":
        to_pandas_kwargs["split_blocks"] = True  # type: ignore[assignment]

    path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle(
        path,
        kwargs.pop("filesystem", None),
        storage_options=storage_options,
        mode="rb",
    )
    try:
        result = self.api.parquet.read_table(
            path_or_handle, columns=columns, **kwargs
        ).to_pandas(**to_pandas_kwargs)
        if manager == "array":
            result = result._as_manager("array", copy=False)
        return result
    finally:
        if handles is not None:
            handles.close()
def test_numeric_dtypes(self):
    dtypes = [
        bool,
        np.byte,
        np.ubyte,
        np.short,
        np.ushort,
        np.single,
        np.int32,
        np.intc,
        np.half,
        np.float16,
        np.double,
        np.float64,
        pd.StringDtype(),
        pd.Int64Dtype(),
        pd.UInt64Dtype(),
        pd.Int32Dtype(),
        pd.UInt32Dtype(),
        pd.Int16Dtype(),
        pd.UInt16Dtype(),
        pd.Int8Dtype(),
        pd.UInt8Dtype(),
    ]
    for suffix, fn in [
        (".snappy", "parquet"),
        (".feather", "feather"),
        (".xml", "xml"),
        (".csv", "csv"),
        (".tsv", "tsv"),
        (".json", "json"),
        (".xlsx", "xlsx"),
        (".xls", "xls"),
        (".xlsb", "xlsb"),
        (".ods", "ods"),
        (".pickle", "pickle"),
    ]:
        with tmpfile(suffix) as path:
            for dtype in dtypes:
                try:
                    df = Ind2Col2.convert(
                        Ind2Col2(sample_data_ind2_col2())).astype(dtype)
                    assert list(df.index.names) == ["qqq", "rrr"]
                    assert list(df.columns) == ["abc", "xyz"]
                    getattr(df, "to_" + fn)(path)
                    df2 = getattr(Ind2Col2, "read_" + fn)(path)
                    assert list(df2.index.names) == ["qqq", "rrr"]
                    assert list(df2.columns) == ["abc", "xyz"]
                except Exception:
                    logger.error(f"Failed on path {path}, dtype {dtype}")
                    raise
def read(
    self,
    path,
    columns=None,
    use_nullable_dtypes=False,
    storage_options: StorageOptions = None,
    **kwargs,
) -> DataFrame:
    kwargs["use_pandas_metadata"] = True
    to_pandas_kwargs = {}
    if use_nullable_dtypes:
        import pandas as pd

        mapping = {
            self.api.int8(): pd.Int8Dtype(),
            self.api.int16(): pd.Int16Dtype(),
            self.api.int32(): pd.Int32Dtype(),
            self.api.int64(): pd.Int64Dtype(),
            self.api.uint8(): pd.UInt8Dtype(),
            self.api.uint16(): pd.UInt16Dtype(),
            self.api.uint32(): pd.UInt32Dtype(),
            self.api.uint64(): pd.UInt64Dtype(),
            self.api.bool_(): pd.BooleanDtype(),
            self.api.string(): pd.StringDtype(),
            self.api.float32(): pd.Float32Dtype(),
            self.api.float64(): pd.Float64Dtype(),
        }
        to_pandas_kwargs["types_mapper"] = mapping.get
    manager = get_option("mode.data_manager")
    if manager == "array":
        to_pandas_kwargs["split_blocks"] = True  # type: ignore[assignment]

    path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle(
        path,
        kwargs.pop("filesystem", None),
        storage_options=storage_options,
        mode="rb",
    )
    try:
        result = self.api.parquet.read_table(
            path_or_handle, columns=columns,
            **kwargs).to_pandas(**to_pandas_kwargs)
        if manager == "array":
            result = result._as_manager("array", copy=False)
        return result
    finally:
        if handles is not None:
            handles.close()
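# Usage sketch via the public entry point (my addition; "demo.parquet" is a
# hypothetical path). In the pandas versions this reader targets, the flag is
# exposed as pd.read_parquet(..., use_nullable_dtypes=True):
import pandas as pd

frame = pd.DataFrame({"n": pd.array([1, None, 3], dtype=pd.Int64Dtype())})
frame.to_parquet("demo.parquet")
back = pd.read_parquet("demo.parquet", use_nullable_dtypes=True)
assert back["n"].dtype == pd.Int64Dtype()  # round-trips as Int64, not float64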
def test_numeric_nullable_dtypes(self):
    dtypes = [
        pd.StringDtype(),
        pd.BooleanDtype(),
        pd.Float64Dtype(),
        pd.Float32Dtype(),
        pd.Int64Dtype(),
        pd.UInt64Dtype(),
        pd.Int32Dtype(),
        pd.UInt32Dtype(),
        pd.Int16Dtype(),
        pd.UInt16Dtype(),
        pd.Int8Dtype(),
        pd.UInt8Dtype(),
    ]  # (a second, redundant pd.StringDtype() entry was dropped)
    # TODO: Re-add (".xml", "xml")
    # TODO: See https://github.com/dmyersturnbull/typed-dfs/issues/46
    for suffix, fn in [
        (".snappy", "parquet"),
        (".feather", "feather"),
        (".csv", "csv"),
        (".tsv", "tsv"),
        (".json", "json"),
        (".xlsx", "xlsx"),
        (".xls", "xls"),
        (".xlsb", "xlsb"),
        (".ods", "ods"),
        (".pickle", "pickle"),
    ]:
        for dtype in dtypes:
            with tmpfile(suffix) as path:
                try:
                    df = Ind2Col2.convert(
                        Ind2Col2(sample_data_ind2_col2_pd_na())).astype(dtype)
                    assert list(df.index.names) == ["qqq", "rrr"]
                    assert list(df.columns) == ["abc", "xyz"]
                    getattr(df, "to_" + fn)(path)
                    df2 = getattr(Ind2Col2, "read_" + fn)(path)
                    assert list(df2.index.names) == ["qqq", "rrr"]
                    assert list(df2.columns) == ["abc", "xyz"]
                except Exception:
                    logger.error(f"Failed on path {path}, dtype {dtype}")
                    raise
def test_to_pandas_dtype_integer_nullable():
    expectations = {
        (-100, 100): pd.Int8Dtype(),
        (0, 240): pd.UInt8Dtype(),
        (-10000, 10000): pd.Int16Dtype(),
        (500, 40000): pd.UInt16Dtype(),
        (-200000000, 200000000): pd.Int32Dtype(),
        (25, 4000000000): pd.UInt32Dtype(),
        (-9000000000000000000, 2000000000): pd.Int64Dtype(),
        (25, 10000000000000000000): pd.UInt64Dtype(),
        (25, 1000000000000000000000000000): np.float128,
        (None, None): pd.Int64Dtype(),
    }
    for (min_, max_), expected_pandas_type in expectations.items():
        constraints = RecordsSchemaFieldIntegerConstraints(
            required=True, unique=None, min_=min_, max_=max_)
        yield (with_nullable(True, check_dtype), "integer", constraints,
               expected_pandas_type)
def gettags(filepath, track_id):
    """Fetch the ID3 tags from the mp3 file and extract the cover image.

    Returns:
        extracted tags and cover image link
    """
    try:
        track = EasyID3(filepath)
    except Exception:  # a bare `except:` also swallowed SystemExit etc.
        track = []
    logging.critical(track)
    coverimg = track['website'] if 'website' in track else ['']
    artist = track['artist'] if 'artist' in track else ['Unknown Artist']
    track_title = track['title'] if 'title' in track else ['Unknown Track']
    album = track['album'] if 'album' in track else ['']
    if 'date' in track:
        releaseyear = track['date']
        year = [int(releaseyear[0])]
    else:
        # Int8 could never hold an actual year, but here it only carries
        # a single null placeholder
        year = pd.array([None], dtype=pd.Int8Dtype())
    genre = track['genre'] if 'genre' in track else ['']
    url = track['website'] if 'website' in track else ['']
    return coverimg, artist, track_title, album, year, genre, url
def test_df_pdv1_types():
    pdv1_test_mapping = {
        'int8col': {'vals': [1, 2, 3], 'pd_type': pd.Int8Dtype()},
        'int16col': {'vals': [1, 2, 3], 'pd_type': pd.Int16Dtype()},
        'int32col': {'vals': [1, 2, 3], 'pd_type': pd.Int32Dtype()},
        'int64col': {'vals': [1, 2, 3], 'pd_type': pd.Int64Dtype()},
        'stringcol': {'vals': ['one', 'two', 'three'],
                      'pd_type': pd.StringDtype()},
        'boolcol': {'vals': [True, False, True],
                    'pd_type': pd.BooleanDtype()},
    }
    pdv1_df = pd.DataFrame({
        col_name: col_meta['vals']
        for col_name, col_meta in pdv1_test_mapping.items()
    })
    pdv1_df = pdv1_df.astype({
        col_name: col_meta['pd_type']
        for col_name, col_meta in pdv1_test_mapping.items()
    })
    return pdv1_df
def test_to_table_nullable(self):
    boolean_array = pd.array([True, False, None], dtype=pd.BooleanDtype())
    int8_array = pd.array([1, 2, None], dtype=pd.Int8Dtype())
    int16_array = pd.array([1, 2, None], dtype=pd.Int16Dtype())
    int32_array = pd.array([1, 2, None], dtype=pd.Int32Dtype())
    int64_array = pd.array([1, 2, None], dtype=pd.Int64Dtype())
    float_array = pd.array([1.1, 2.2, None], dtype=pd.Float32Dtype())
    double_array = pd.array([1.1, 2.2, None], dtype=pd.Float64Dtype())
    string_array = pd.array(["s11", "s22", None], dtype=pd.StringDtype())
    object_array = pd.array([pd.NA, "s22", None], dtype=object)
    df = pd.DataFrame({
        "NullableBoolean": boolean_array,
        "NullableInt8": int8_array,
        "NullableInt16": int16_array,
        "NullableInt32": int32_array,
        "NullableInt64": int64_array,
        "NullableFloat": float_array,
        "NullableDouble": double_array,
        "NullableString": string_array,
        "NullableObject": object_array,
    })
    table = to_table(df)
    self.assertIs(table.columns[0].data_type, dtypes.bool_)
    self.assertIs(table.columns[1].data_type, dtypes.int8)
    self.assertIs(table.columns[2].data_type, dtypes.int16)
    self.assertIs(table.columns[3].data_type, dtypes.int32)
    self.assertIs(table.columns[4].data_type, dtypes.int64)
    self.assertIs(table.columns[5].data_type, dtypes.float32)
    self.assertIs(table.columns[6].data_type, dtypes.double)
    self.assertIs(table.columns[7].data_type, dtypes.string)
    self.assertIs(table.columns[8].data_type, dtypes.PyObject)
    self.assertEqual(table.size, 3)
    table_string = table.to_string()
    self.assertEqual(9, table_string.count("null"))
    assert_eq(gdf7, pdf7)

    # dict input:
    pdf8 = pdf1.replace({"a": 0, "b": 0}, {"a": 4, "b": 5})
    gdf8 = gdf1.replace({"a": 0, "b": 0}, {"a": None, "b": 5}).fillna(4)
    assert_eq(gdf8, pdf8)

    gdf1 = DataFrame({"a": [0, 1, 2, 3], "b": [0, 1, 2, None]})
    gdf9 = gdf1.replace([0, 1], [4, 5]).fillna(3)
    assert_eq(gdf9, pdf6)


@pytest.mark.parametrize(
    "psr",
    [
        pd.Series([0, 1, None, 2, None], dtype=pd.Int8Dtype()),
        pd.Series([0, 1, np.nan, 2, np.nan]),
    ],
)
@pytest.mark.parametrize("data_dtype", NUMERIC_TYPES)
@pytest.mark.parametrize("fill_value", [10, pd.Series([10, 20, 30, 40, 50])])
@pytest.mark.parametrize("inplace", [True, False])
def test_series_fillna_numerical(psr, data_dtype, fill_value, inplace):
    test_psr = psr.copy(deep=True)
    # TODO: These tests should use Pandas' nullable int type
    # when we support a recent enough version of Pandas
    # https://pandas.pydata.org/pandas-docs/stable/user_guide/integer_na.html
    if np.dtype(data_dtype).kind not in ("f") and test_psr.dtype.kind == "i":
        test_psr = test_psr.astype(
            cudf.utils.dtypes.cudf_dtypes_to_pandas_dtypes[
                np.dtype(data_dtype)])
from datetime import timedelta, datetime, timezone

import numpy as np
import pandas as pd

from dask_sql.java import SqlTypeName

# Default mapping between python types and SQL types
_PYTHON_TO_SQL = {
    np.float64: SqlTypeName.DOUBLE,
    np.float32: SqlTypeName.FLOAT,
    np.int64: SqlTypeName.BIGINT,
    pd.Int64Dtype(): SqlTypeName.BIGINT,
    np.int32: SqlTypeName.INTEGER,
    pd.Int32Dtype(): SqlTypeName.INTEGER,
    np.int16: SqlTypeName.SMALLINT,
    pd.Int16Dtype(): SqlTypeName.SMALLINT,
    np.int8: SqlTypeName.TINYINT,
    pd.Int8Dtype(): SqlTypeName.TINYINT,
    np.uint64: SqlTypeName.BIGINT,
    pd.UInt64Dtype(): SqlTypeName.BIGINT,
    np.uint32: SqlTypeName.INTEGER,
    pd.UInt32Dtype(): SqlTypeName.INTEGER,
    np.uint16: SqlTypeName.SMALLINT,
    pd.UInt16Dtype(): SqlTypeName.SMALLINT,
    np.uint8: SqlTypeName.TINYINT,
    pd.UInt8Dtype(): SqlTypeName.TINYINT,
    np.bool8: SqlTypeName.BOOLEAN,
    pd.BooleanDtype(): SqlTypeName.BOOLEAN,
    np.object_: SqlTypeName.VARCHAR,
    pd.StringDtype(): SqlTypeName.VARCHAR,
    np.datetime64: SqlTypeName.TIMESTAMP,
}
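# Lookup sketch (my addition; `python_to_sql_type` is a hypothetical helper
# written against the mapping above, not necessarily dask_sql's public API):
def python_to_sql_type(dtype):
    """Return the SqlTypeName for a numpy type or pandas extension dtype."""
    try:
        return _PYTHON_TO_SQL[dtype]
    except KeyError:
        raise NotImplementedError(f"No SQL type known for {dtype}")

# python_to_sql_type(pd.Int8Dtype())  -> SqlTypeName.TINYINT
# python_to_sql_type(np.int8)         -> SqlTypeName.TINYINT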
def __init__(
    self,
    assertions=None,
    strings=None,
    nodes=None,
    edges=None,
    node_types=None,
    link_types=None,
    assertion_tags=None,
    edge_tags=None,
    node_metadata_tables=None,
    big_id_dtype=pd.Int32Dtype(),
    small_id_dtype=pd.Int8Dtype()
):
    self.assertions = assertions
    self.strings = strings
    self.nodes = nodes
    self.edges = edges
    self.node_types = node_types
    self.link_types = link_types
    self.assertion_tags = assertion_tags
    self.edge_tags = edge_tags
    # the original `node_metadata_tables = node_metadata_tables` was a
    # no-op that silently dropped any caller-supplied value
    self.node_metadata_tables = (
        {} if node_metadata_tables is None else node_metadata_tables
    )
    self.big_id_dtype = big_id_dtype
    self.small_id_dtype = small_id_dtype

    self._string_side_tables = [
        'strings', 'assertions', 'link_types', 'assertion_tags'
    ]
    self._node_side_tables = [
        'nodes', 'edges', 'node_types', 'edge_tags'
    ]

    self._assertions_dtypes = {
        'inp_string_id': self.big_id_dtype,
        'src_string_id': self.big_id_dtype,
        'tgt_string_id': self.big_id_dtype,
        'ref_string_id': self.big_id_dtype,
        'link_type_id': self.small_id_dtype,
        'date_inserted': 'object',
        'date_modified': 'object'
    }
    self._assertions_index_dtype = self.big_id_dtype

    self._strings_dtypes = {
        'node_id': self.big_id_dtype,
        'string': str,
        'date_inserted': 'object',
        'date_modified': 'object'
    }
    self._strings_index_dtype = self.big_id_dtype

    self._nodes_dtypes = {
        'node_type_id': self.small_id_dtype,
        'name_string_id': self.big_id_dtype,
        'abbr_string_id': self.big_id_dtype,
        'date_inserted': 'object',
        'date_modified': 'object'
    }
    self._nodes_index_dtype = self.big_id_dtype

    self._edges_dtypes = {
        'src_node_id': self.big_id_dtype,
        'tgt_node_id': self.big_id_dtype,
        'ref_node_id': self.big_id_dtype,
        'link_type_id': self.small_id_dtype,
        'date_inserted': 'object',
        'date_modified': 'object'
    }
    self._edges_index_dtype = self.big_id_dtype

    self._node_types_dtypes = {
        'node_type': 'object',
        'description': 'object'
    }
    self._node_types_index_dtype = self.small_id_dtype

    self._link_types_dtypes = {
        'link_type': 'object',
        'description': 'object'
    }
    self._link_types_index_dtype = self.small_id_dtype

    self._assertion_tags_dtypes = {
        'assertion_id': self.big_id_dtype,
        'tag_string_id': self.big_id_dtype
    }
    self._assertion_tags_index_dtype = self.big_id_dtype

    self._edge_tags_dtypes = {
        'edge_id': self.big_id_dtype,
        'tag_string_id': self.big_id_dtype
    }
    self._edge_tags_index_dtype = self.big_id_dtype
    np.bool_: pa.int8(),
    np.uint64: pa.uint64(),
    np.uint32: pa.uint32(),
    np.uint16: pa.uint16(),
    np.uint8: pa.uint8(),
    np.datetime64: pa.date64(),
    np.object_: pa.string(),
    np.str_: pa.string(),
}

cudf_dtypes_to_pandas_dtypes = {
    np.dtype("uint8"): pd.UInt8Dtype(),
    np.dtype("uint16"): pd.UInt16Dtype(),
    np.dtype("uint32"): pd.UInt32Dtype(),
    np.dtype("uint64"): pd.UInt64Dtype(),
    np.dtype("int8"): pd.Int8Dtype(),
    np.dtype("int16"): pd.Int16Dtype(),
    np.dtype("int32"): pd.Int32Dtype(),
    np.dtype("int64"): pd.Int64Dtype(),
    np.dtype("bool_"): pd.BooleanDtype(),
    np.dtype("object"): pd.StringDtype(),
}

pyarrow_dtypes_to_pandas_dtypes = {
    pa.uint8(): pd.UInt8Dtype(),
    pa.uint16(): pd.UInt16Dtype(),
    pa.uint32(): pd.UInt32Dtype(),
    pa.uint64(): pd.UInt64Dtype(),
    pa.int8(): pd.Int8Dtype(),
    pa.int16(): pd.Int16Dtype(),
    pa.int32(): pd.Int32Dtype(),
class INT8(INT16):
    """Semantic representation of a :class:`pandas.Int8Dtype`."""

    type = pd.Int8Dtype()
    bit_width: int = 8
"""Semantic representation of a :class:`pandas.Int32Dtype`.""" type = pd.Int32Dtype() bit_width: int = 32 @Engine.register_dtype(equivalents=[pd.Int16Dtype, pd.Int16Dtype()]) @immutable class INT16(INT32): """Semantic representation of a :class:`pandas.Int16Dtype`.""" type = pd.Int16Dtype() bit_width: int = 16 @Engine.register_dtype(equivalents=[pd.Int8Dtype, pd.Int8Dtype()]) @immutable class INT8(INT16): """Semantic representation of a :class:`pandas.Int8Dtype`.""" type = pd.Int8Dtype() bit_width: int = 8 ############################################################################### # unsigned integer ############################################################################### _register_numpy_numbers( builtin_name="uint", pandera_name="UInt",
"uint16": "UInt16", "uint8": "UInt8", "float64": "Float64", "float32": "Float32", "int64": "Int64", "int32": "Int32", "int16": "Int16", "int8": "Int8", "datetime64[D]": "Date", "datetime64[ns]": "DateTime", } PD2CH = keymap(np.dtype, MAPPING) PD_INT_TYPES = [ pd.Int8Dtype(), pd.Int16Dtype(), pd.Int32Dtype(), pd.Int64Dtype(), pd.UInt8Dtype(), pd.UInt16Dtype(), pd.UInt32Dtype(), pd.UInt64Dtype(), ] for typ in PD_INT_TYPES: PD2CH[typ] = f"Nullable({typ.name})" CH2PD = itemmap(reversed, MAPPING) CH2PD["Null"] = "object" CH2PD["Nothing"] = "object"
# %%
# Aggregate valid trial counts.
df_counts = df.loc[~df['outlier']].groupby(
    ['user', 'session', 'block_id', 'block', 'condition'], observed=True
).size().rename('valid trials count').reset_index()

# Display some more information about users.
users = pd.read_csv(data_path / 'raw/users.csv')
# gaming_exp is left as read: using Int8Dtype here would yield NAType
# values, which cause a TypeError in the plot.
df_counts['gender'] = df_counts['user'].map(users['gender'])
df_counts['age_group'] = df_counts['user'].map(users['age_group'])
df_counts['gaming_exp'] = df_counts['user'].map(users['gaming_exp'])
# How did they rate the block? Int8Dtype is safe here since the answer is
# mandatory, so no NAType is present.
blocks = pd.read_csv(data_path / 'raw/blocks.csv', index_col='id',
                     dtype={'rating': pd.Int8Dtype()})
df_counts['rating'] = df_counts['block_id'].map(blocks['rating'])

# %%
# Bar plot.
fig_exclusions = px.bar(
    df_counts,
    x='user',
    y='valid trials count',
    color='block',
    barmode='group',
    opacity=0.9,
    hover_data=['condition', 'gender', 'age_group', 'gaming_exp', 'rating'],
    labels=dict(zip(df_counts.columns, df_counts.columns.str.title())),
    width=800)
fig_exclusions.update_layout(bargap=0.3, bargroupgap=0.01)
__all__ = (
    'BatchRowsAsDataFrame',
    'generate_proxy',
    'UnbatchPandas',
    'element_type_from_dataframe')

T = TypeVar('T', bound=NamedTuple)

PD_MAJOR = int(pd.__version__.split('.')[0])

# Generate type map (presented visually in the docstring).
# `np.bool` and `np.object` were deprecated aliases of the builtins and have
# since been removed from NumPy, so the builtins are used here.
_BIDIRECTIONAL = [
    (bool, bool),
    (np.int8, np.int8),
    (np.int16, np.int16),
    (np.int32, np.int32),
    (np.int64, np.int64),
    (pd.Int8Dtype(), Optional[np.int8]),
    (pd.Int16Dtype(), Optional[np.int16]),
    (pd.Int32Dtype(), Optional[np.int32]),
    (pd.Int64Dtype(), Optional[np.int64]),
    (np.float32, Optional[np.float32]),
    (np.float64, Optional[np.float64]),
    (object, Any),
]

if PD_MAJOR >= 1:
    _BIDIRECTIONAL.extend([
        (pd.StringDtype(), Optional[str]),
        (pd.BooleanDtype(), Optional[bool]),
    ])

PANDAS_TO_BEAM = {
    pyarrow_data = pa.array(data, **pyarrow_kwargs)
    cudf_from_pyarrow = as_column(pyarrow_data)
    expected = as_column(data, **cudf_kwargs)
    assert_eq(cudf.Series(cudf_from_pyarrow), cudf.Series(expected))


@pytest.mark.parametrize(
    "pd_dtype,expect_dtype",
    [
        # TODO: Nullable float is coming
        (pd.StringDtype(), np.dtype("O")),
        (pd.UInt8Dtype(), np.dtype("uint8")),
        (pd.UInt16Dtype(), np.dtype("uint16")),
        (pd.UInt32Dtype(), np.dtype("uint32")),
        (pd.UInt64Dtype(), np.dtype("uint64")),
        (pd.Int8Dtype(), np.dtype("int8")),
        (pd.Int16Dtype(), np.dtype("int16")),
        (pd.Int32Dtype(), np.dtype("int32")),
        (pd.Int64Dtype(), np.dtype("int64")),
        (pd.BooleanDtype(), np.dtype("bool")),
    ],
)
def test_build_df_from_nullable_pandas_dtype(pd_dtype, expect_dtype):
    if pd_dtype == pd.StringDtype():
        data = ["a", pd.NA, "c", pd.NA, "e"]
    elif pd_dtype == pd.BooleanDtype():
        data = [True, pd.NA, False, pd.NA, True]
    else:
        data = [1, pd.NA, 3, pd.NA, 5]
    pd_data = pd.DataFrame.from_dict({"a": data}, dtype=pd_dtype)
    (
        cudf.Series([23, None, None, 32], dtype="uint16"),
        pd.Series([23, None, None, 32], dtype=pd.UInt16Dtype()),
    ),
    (
        cudf.Series([None, 123, None, 1], dtype="uint32"),
        pd.Series([None, 123, None, 1], dtype=pd.UInt32Dtype()),
    ),
    (
        cudf.Series([234, 2323, 23432, None, None, 224], dtype="uint64"),
        pd.Series([234, 2323, 23432, None, None, 224],
                  dtype=pd.UInt64Dtype()),
    ),
    (
        cudf.Series([-10, 1, None, -1, None, 3], dtype="int8"),
        pd.Series([-10, 1, None, -1, None, 3], dtype=pd.Int8Dtype()),
    ),
    (
        cudf.Series([111, None, 222, None, 13], dtype="int16"),
        pd.Series([111, None, 222, None, 13], dtype=pd.Int16Dtype()),
    ),
    (
        cudf.Series([11, None, 22, 33, None, 2, None, 3], dtype="int32"),
        pd.Series([11, None, 22, 33, None, 2, None, 3],
                  dtype=pd.Int32Dtype()),
    ),
    (
        cudf.Series([32431, None, None, 32322, 0, 10, -32324, None],
                    dtype="int64"),
        pd.Series(
            [32431, None, None, 32322, 0, 10, -32324, None],
    parquet_thrift.ConvertedType.UINT_8: np.dtype("uint8"),
    parquet_thrift.ConvertedType.UINT_16: np.dtype("uint16"),
    parquet_thrift.ConvertedType.UINT_32: np.dtype('uint32'),
    parquet_thrift.ConvertedType.UINT_64: np.dtype('uint64'),
    parquet_thrift.ConvertedType.INT_8: np.dtype("int8"),
    parquet_thrift.ConvertedType.INT_16: np.dtype("int16"),
    parquet_thrift.ConvertedType.INT_32: np.dtype('int32'),
    parquet_thrift.ConvertedType.INT_64: np.dtype('int64'),
    parquet_thrift.ConvertedType.TIME_MILLIS: np.dtype('<m8[ns]'),
    parquet_thrift.ConvertedType.DATE: np.dtype('<M8[ns]'),
    parquet_thrift.ConvertedType.TIMESTAMP_MILLIS: np.dtype('<M8[ns]'),
    parquet_thrift.ConvertedType.TIME_MICROS: np.dtype('<m8[ns]'),
    parquet_thrift.ConvertedType.TIMESTAMP_MICROS: np.dtype('<M8[ns]')
}

nullable = {
    np.dtype('int8'): pd.Int8Dtype(),
    np.dtype('int16'): pd.Int16Dtype(),
    np.dtype('int32'): pd.Int32Dtype(),
    np.dtype('int64'): pd.Int64Dtype(),
    np.dtype('uint8'): pd.UInt8Dtype(),
    np.dtype('uint16'): pd.UInt16Dtype(),
    np.dtype('uint32'): pd.UInt32Dtype(),
    np.dtype('uint64'): pd.UInt64Dtype(),
    np.dtype('bool'): pd.BooleanDtype()
}

pandas_nullable = {
    "Int8": pd.Int8Dtype(),
    "Int16": pd.Int16Dtype(),
    "Int32": pd.Int32Dtype(),
    "Int64": pd.Int64Dtype(),
    "UInt8": pd.UInt8Dtype(),
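# Usage sketch (my addition; `to_nullable` is my own helper, not part of the
# original module): promote plain numpy columns to their nullable
# counterparts using the `nullable` map above.
import pandas as pd

def to_nullable(df):
    """Cast every column with a known numpy dtype to its nullable twin."""
    return df.astype({
        col: nullable[df[col].dtype]
        for col in df.columns if df[col].dtype in nullable
    })

# to_nullable(pd.DataFrame({"x": [1, 2]}).astype("int8"))["x"].dtype -> Int8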
# Copyright (c) 2020, NVIDIA CORPORATION.

import random

import pandas as pd
import pyarrow as pa

pyarrow_dtypes_to_pandas_dtypes = {
    pa.uint8(): pd.UInt8Dtype(),
    pa.uint16(): pd.UInt16Dtype(),
    pa.uint32(): pd.UInt32Dtype(),
    pa.uint64(): pd.UInt64Dtype(),
    pa.int8(): pd.Int8Dtype(),
    pa.int16(): pd.Int16Dtype(),
    pa.int32(): pd.Int32Dtype(),
    pa.int64(): pd.Int64Dtype(),
    pa.bool_(): pd.BooleanDtype(),
    pa.string(): pd.StringDtype(),
}


def _generate_rand_meta(obj, dtypes_list):
    obj._current_params = {}
    num_rows = obj._rand(obj._max_rows)
    num_cols = obj._rand(obj._max_columns)
    dtypes_meta = []
    for _ in range(num_cols):
        dtype = random.choice(dtypes_list)
        null_frequency = random.uniform(0, 1)
    np.bool_: pa.int8(),
    np.uint64: pa.uint64(),
    np.uint32: pa.uint32(),
    np.uint16: pa.uint16(),
    np.uint8: pa.uint8(),
    np.datetime64: pa.date64(),
    np.object_: pa.string(),
    np.str_: pa.string(),
}

cudf_dtypes_to_pandas_dtypes = {
    np.dtype("uint8"): pd.UInt8Dtype(),
    np.dtype("uint16"): pd.UInt16Dtype(),
    np.dtype("uint32"): pd.UInt32Dtype(),
    np.dtype("uint64"): pd.UInt64Dtype(),
    np.dtype("int8"): pd.Int8Dtype(),
    np.dtype("int16"): pd.Int16Dtype(),
    np.dtype("int32"): pd.Int32Dtype(),
    np.dtype("int64"): pd.Int64Dtype(),
    np.dtype("bool_"): pd.BooleanDtype(),
    np.dtype("object"): pd.StringDtype(),
}

SIGNED_INTEGER_TYPES = {"int8", "int16", "int32", "int64"}
UNSIGNED_TYPES = {"uint8", "uint16", "uint32", "uint64"}
INTEGER_TYPES = SIGNED_INTEGER_TYPES | UNSIGNED_TYPES
FLOAT_TYPES = {"float32", "float64"}
SIGNED_TYPES = SIGNED_INTEGER_TYPES | FLOAT_TYPES
NUMERIC_TYPES = SIGNED_TYPES | UNSIGNED_TYPES
DATETIME_TYPES = {
    "datetime64[s]",
import numpy as np
import pandas as pd
import pyorc

import cudf
from cudf.testing._utils import assert_eq
from cudf.utils.dtypes import (
    pandas_dtypes_to_cudf_dtypes,
    pyarrow_dtypes_to_pandas_dtypes,
)

ALL_POSSIBLE_VALUES = "ALL_POSSIBLE_VALUES"

_PANDAS_TO_AVRO_SCHEMA_MAP = {
    cudf.dtype("int8"): "int",
    pd.Int8Dtype(): ["int", "null"],
    pd.Int16Dtype(): ["int", "null"],
    pd.Int32Dtype(): ["int", "null"],
    pd.Int64Dtype(): ["long", "null"],
    pd.BooleanDtype(): ["boolean", "null"],
    pd.StringDtype(): ["string", "null"],
    cudf.dtype("bool_"): "boolean",
    cudf.dtype("int16"): "int",
    cudf.dtype("int32"): "int",
    cudf.dtype("int64"): "long",
    cudf.dtype("O"): "string",
    cudf.dtype("str"): "string",
    cudf.dtype("float32"): "float",
    cudf.dtype("float64"): "double",
    cudf.dtype("<M8[ns]"): {"type": "long", "logicalType": "timestamp-millis"},
    cudf.dtype("<M8[ms]"): {"type": "long", "logicalType": "timestamp-millis"},
# This will take quite some time!
for user in model_comp.df['user'].unique():
    model_comp.compare_models(user)

# %%
# Augment posterior data.
columns = model_comp.posteriors.columns
# Condition.
conditions = trial_data.loc[
    trial_data['user'].isin(model_comp.posteriors.index),
    ['user', 'condition']].drop_duplicates().set_index('user')
model_comp.posteriors = model_comp.posteriors.join(conditions)
# Gaming experience.
users_path = data_path / 'raw/users.csv'
exp = pd.read_csv(users_path, dtype={
    'gaming_exp': pd.Int8Dtype()
}).loc[model_comp.posteriors.index, 'gaming_exp']
model_comp.posteriors = model_comp.posteriors.join(exp)

# %% [markdown]
# ## Visualize Results

# %%
fig_posteriors = px.imshow(
    model_comp.posteriors.drop(
        ['condition', 'gaming_exp'], axis='columns').reset_index(drop=True),
    labels=dict(x="Model", y="Participant",
                color="Posterior<br>Probability"),
    color_continuous_scale='Greys',
    zmin=0,
    zmax=1,