def from_pandas(cls, df):
    """Build an instance of ``cls`` from a DataFrame or a chunked reader.

    Parameters
    ----------
    df : pandas.DataFrame or TextFileReader
        A single frame, or an iterable reader yielding frames. Chunks are
        modified in place (text/bool columns are converted to ``category``).

    Returns
    -------
    cls
        ``cls(datacols)`` where ``datacols`` holds one ``Covariate`` per
        numeric column and one ``Factor`` per categorical column. Columns of
        any other dtype are dropped silently.
    """
    text_like = ("object", "str", "bool")
    numeric = (
        np.float32, np.float64,
        np.int8, np.int16, np.int32, np.int64,
        np.uint8, np.uint16, np.uint32, np.uint64,
    )

    chunks = df if isinstance(df, TextFileReader) else [df]
    merged = []
    for part in chunks:
        # Normalise text-like and boolean columns to categoricals.
        for name in part.columns:
            kind = part[name].dtype
            if any(kind == label for label in text_like):
                part[name] = part[name].astype("category")

        if len(merged) == 0:
            merged = part
            continue

        # Give both sides identical categories so that concat keeps the
        # categorical dtype.
        for name in part.columns:
            if part[name].dtype.name != "category":
                continue
            shared = union_categoricals([merged[name], part[name]]).categories
            merged[name] = pd.Categorical(merged[name], categories=shared)
            part[name] = pd.Categorical(part[name], categories=shared)
        merged = pd.concat([merged, part])

    columns = []
    for name in merged.columns:
        kind = merged[name].dtype
        if any(kind == candidate for candidate in numeric):
            columns.append(Covariate.from_array(merged[name].values, name))
        elif kind.name == "category":
            series = merged[name]
            codes = series.cat.codes.values
            levels = [MISSINGLEVEL]
            levels.extend(str(value) for value in series.cat.categories.values)
            # 8-bit storage is used while the level table fits in 256 entries.
            width = np.uint8 if len(levels) <= 256 else np.uint16
            data = np.empty(len(codes), dtype=width)
            # NOTE(review): incr_ presumably writes codes + 1 into `data` so
            # that the missing code -1 maps to MISSINGLEVEL at position 0 --
            # confirm against its definition.
            incr_(codes, data)
            columns.append(Factor.from_array(name, levels, data))
    return cls(columns)
def concatenate(dfs):
    """
    Concatenate pandas DataFrames with saving 'category' dtype.

    All dataframes' columns must be equal to each other.

    Parameters
    ----------
    dfs : list
        List of pandas DataFrames to concatenate. The frames are modified
        in place: every column that is categorical in ``dfs[0]`` is recoded
        to the union of that column's categories across all frames.

    Returns
    -------
    pandas.DataFrame
        A pandas DataFrame.

    Raises
    ------
    AssertionError
        If any frame's columns differ from those of ``dfs[0]``.
    """
    first = dfs[0] if dfs else None
    for df in dfs:
        assert df.columns.equals(first.columns), "all frames must share the same columns"

    if first is not None:
        for i, dtype in enumerate(first.dtypes):
            if dtype.name != "category":
                continue
            union = union_categoricals([df.iloc[:, i] for df in dfs])
            for df in dfs:
                recoded = pandas.Categorical(df.iloc[:, i], categories=union.categories)
                # `df.iloc[:, i] = ...` tries to write into the existing
                # column, which cannot hold a Categorical with different
                # categories; newer pandas deprecates and then rejects the
                # implicit dtype change. `isetitem` replaces the column.
                if hasattr(df, "isetitem"):
                    df.isetitem(i, recoded)
                else:  # pandas < 1.5 has no isetitem; iloc replaces the column there
                    df.iloc[:, i] = recoded
    return pandas.concat(dfs)
def get_result(self):
    """
    Drain the packet generator into storage and return what was stored.

    In columnar mode the stored blocks are transposed into columns and the
    chunks of each column are joined into a single object.

    :return: stored query result; a ``(data, columns_with_types)`` tuple
             when ``with_column_types`` is set.
    """
    for packet in self.packet_generator:
        self.store(packet)

    def join_chunks(chunks):
        # The type of the first chunk decides how the column is joined.
        head = chunks[0]
        if isinstance(head, np.ndarray):
            return np.concatenate(chunks)
        if isinstance(head, pd.Categorical):
            return union_categoricals(chunks)
        return tuple(chain.from_iterable(chunks))

    if self.columnar:
        # zip(*blocks) regroups the data as one tuple of chunks per column.
        data = [join_chunks(chunks) for chunks in zip(*self.data)]
    else:
        data = self.data

    if not self.with_column_types:
        return data
    return data, self.columns_with_types
def _union(dfs: List[pd.DataFrame], ignore_index: bool) -> pd.DataFrame: cats: Tuple[Set[str], ...] = tuple(set(df.select_dtypes(include="category").columns) for df in dfs) for col in set.intersection(*cats): cat = union_categoricals([df[col] for df in dfs]) for df in dfs: df[col] = pd.Categorical(df[col].values, categories=cat.categories) return pd.concat(objs=dfs, sort=False, copy=False, ignore_index=ignore_index)
def concat_categorical(df_a, df_b, ignore_index=True):
    """Concatenate two frames while preserving categorical dtypes.

    For every categorical column of ``df_a`` the same-named column of
    ``df_b`` is aligned to the union of both category sets, so that
    ``pd.concat`` does not fall back to ``object`` dtype.

    Parameters
    ----------
    df_a, df_b : pandas.DataFrame
        Frames to concatenate. Their categorical columns are updated in
        place to carry the unioned categories.
    ignore_index : bool, default True
        Forwarded to ``pd.concat``.

    Returns
    -------
    pandas.DataFrame
        ``df_a`` followed by ``df_b``.
    """
    for cat_col in df_a.select_dtypes(["category"]):
        merged = union_categoricals([df_a[cat_col], df_b[cat_col]], ignore_order=True)
        # `set_categories(..., inplace=True)` was removed in pandas 2.0;
        # assign the re-categorised column back to the frame instead.
        df_a[cat_col] = df_a[cat_col].cat.set_categories(merged.categories)
        df_b[cat_col] = df_b[cat_col].cat.set_categories(merged.categories)
    return pd.concat([df_a, df_b], ignore_index=ignore_index)
def union_categoricals(to_union, sort_categories=False, ignore_order=False):
    """Deprecated alias that forwards to ``pandas.api.types.union_categoricals``.

    Emits a ``FutureWarning`` attributed to the caller, then delegates with
    the arguments unchanged.
    """
    message = (
        "pandas.types.concat.union_categoricals is "
        "deprecated and will be removed in a future version.\n"
        "use pandas.api.types.union_categoricals"
    )
    warnings.warn(message, FutureWarning, stacklevel=2)

    # Imported under an alias so the real implementation does not shadow
    # this shim's own name.
    from pandas.api.types import union_categoricals as _implementation

    return _implementation(
        to_union,
        sort_categories=sort_categories,
        ignore_order=ignore_order,
    )
def merge_chunk(lhs, *args, **kwargs):
    """Merge ``lhs`` with the first positional argument via ``DataFrame.merge``.

    Parameters
    ----------
    lhs : pandas.DataFrame
        Left frame. Its categorical key columns/index may be re-typed in place.
    *args
        The first element is the right frame (``rhs``); the remainder is
        forwarded positionally to ``lhs.merge``.
    **kwargs
        Forwarded to ``lhs.merge`` after two private options are removed:

        empty_index_dtype
            If given and the result is empty, the result index is cast to
            this dtype.
        categorical_columns
            Names whose left/right dtypes are unified to one categorical
            dtype before merging (only when ``PANDAS_GT_100`` is truthy).

    Returns
    -------
    pandas.DataFrame
        The merged frame.
    """
    empty_index_dtype = kwargs.pop("empty_index_dtype", None)
    categorical_columns = kwargs.pop("categorical_columns", None)

    rhs, *args = args
    left_index = kwargs.get("left_index", False)
    right_index = kwargs.get("right_index", False)

    if categorical_columns is not None and PANDAS_GT_100:
        for col in categorical_columns:
            left = None
            right = None

            # The key may live in a column, or in the index when merging on
            # the index of one side against a named column of the other.
            if col in lhs:
                left = lhs[col]
            elif col == kwargs.get("right_on", None) and left_index:
                # isinstance check replaces the deprecated
                # pandas.api.types.is_categorical_dtype.
                if isinstance(lhs.index.dtype, pd.CategoricalDtype):
                    left = lhs.index

            if col in rhs:
                right = rhs[col]
            elif col == kwargs.get("left_on", None) and right_index:
                if isinstance(rhs.index.dtype, pd.CategoricalDtype):
                    right = rhs.index

            dtype = "category"
            if left is not None and right is not None:
                # Both sides present: use one dtype covering both category sets.
                dtype = union_categoricals(
                    [left.astype("category").values, right.astype("category").values]
                ).dtype

            if left is not None:
                if isinstance(left, pd.Index):
                    lhs.index = left.astype(dtype)
                else:
                    lhs[col] = left.astype(dtype)
            if right is not None:
                if isinstance(right, pd.Index):
                    rhs.index = right.astype(dtype)
                else:
                    rhs[col] = right.astype(dtype)

    out = lhs.merge(rhs, *args, **kwargs)

    # Workaround pandas bug where if the output result of a merge operation is
    # an empty dataframe, the output index is `int64` in all cases, regardless
    # of input dtypes.
    if len(out) == 0 and empty_index_dtype is not None:
        out.index = out.index.astype(empty_index_dtype)
    return out
def concatenate(dfs):
    """
    Concatenate DataFrames while preserving categorical columns.

    Adapted from
    https://stackoverflow.com/questions/45639350/retaining-categorical-dtype-upon-dataframe-concatenation

    NB: the categories of the input dataframes are changed in place.
    """
    from pandas.api.types import union_categoricals
    import pandas as pd

    # Only columns that are categorical in every frame are harmonised.
    categorical_per_frame = [
        set(frame.select_dtypes(include='category').columns) for frame in dfs
    ]
    shared_columns = set.intersection(*categorical_per_frame)

    for name in shared_columns:
        # Union of the categories seen for this column across all frames.
        merged = union_categoricals([frame[name] for frame in dfs])
        # Re-encode every frame against the unioned categories.
        for frame in dfs:
            frame[name] = pd.Categorical(frame[name], categories=merged.categories)

    return pd.concat(dfs)
def convert(cls, df, outdir):
    """Import a DataFrame (or chunked reader) column by column into ``outdir``.

    Text-like and boolean columns are stored as categoricals, the listed
    numeric dtypes are cast to ``float32``, and a ``header.json`` describing
    every imported column is written at the end.

    Parameters
    ----------
    df : pandas.DataFrame or TextFileReader
        A single frame, or an iterable reader yielding frames. Chunks are
        modified in place.
    outdir : str or path-like
        Target directory; created when it does not exist yet.

    Raises
    ------
    AssertionError
        If ``outdir`` exists but is not a directory.
    """
    path = Path(outdir)
    # The directory must be created *before* it is validated; asserting
    # first made the creation branch unreachable.
    if not path.exists():
        path.mkdir(parents=True, exist_ok=True)
    assert path.is_dir(), "outdir must be a directory"

    text_like = ("object", "str", "bool")
    numeric = (
        np.float32, np.float64,
        np.int8, np.int16, np.int32, np.int64,
        np.uint8, np.uint16, np.uint32, np.uint64,
    )

    df = df if isinstance(df, TextFileReader) else [df]
    colimporters = {}
    # First chunk seen; its categorical columns accumulate the categories of
    # every later chunk.
    reference = []
    for chunk in df:
        for col in chunk.columns:
            dtype = chunk[col].dtype
            if any(dtype == label for label in text_like):
                chunk[col] = chunk[col].astype("category")
            elif any(dtype == candidate for candidate in numeric):
                chunk[col] = chunk[col].astype(np.float32)

        if len(reference) == 0:
            reference = chunk
        else:
            for col in chunk.columns:
                if chunk[col].dtype.name == "category":
                    uc = union_categoricals([reference[col], chunk[col]])
                    reference[col] = pd.Categorical(reference[col], categories=uc.categories)
                    chunk[col] = pd.Categorical(chunk[col], categories=uc.categories)

        for col in chunk.columns:
            if col not in colimporters:
                iscat = chunk[col].dtype.name == "category"
                colimporters[col] = ColImporter(col, iscat, path)
            colimporter = colimporters[col]
            if colimporter.iscategorical:
                factor = chunk[col]
                levels = [MISSINGLEVEL] + [str(level) for level in factor.cat.categories.values]
                colimporter.importcategorical(factor.cat.codes.values, levels)
            else:
                colimporter.importnumeric(chunk[col].values)

    header = {"datacolumns": [importer.asdict() for importer in colimporters.values()]}
    jsonstr = json.dumps(header)
    with open((path / "header.json").resolve(), "w") as f:
        f.write(jsonstr)
def _union(dfs: List[pd.DataFrame], ignore_index: Optional[bool]) -> pd.DataFrame: if ignore_index is None: ignore_index = False for df in dfs: if hasattr(df, "_awswrangler_ignore_index"): if df._awswrangler_ignore_index is True: # pylint: disable=protected-access ignore_index = True break cats: Tuple[Set[str], ...] = tuple( set(df.select_dtypes(include="category").columns) for df in dfs) for col in set.intersection(*cats): cat = union_categoricals([df[col] for df in dfs]) for df in dfs: df[col] = pd.Categorical(df[col].values, categories=cat.categories) return pd.concat(objs=dfs, sort=False, copy=False, ignore_index=ignore_index)
def merge_categories(results: Sequence[pd.DataFrame]):
    """Give every frame's categorical columns one shared set of categories.

    The categorical columns are determined from ``results[0]``; for each of
    them the categories are unioned across all frames and every frame is
    recoded in place. Returns ``None``.
    """
    first = results[0]
    categorical_cols = [
        name for name in first if isinstance(first[name].dtype, pd.CategoricalDtype)
    ]
    if not categorical_cols:
        return

    # Gather the per-frame columns first, then compute one union per column.
    gathered = {
        name: [frame[name] for frame in results] for name in categorical_cols
    }
    shared = {
        name: union_categoricals(parts).categories for name, parts in gathered.items()
    }

    for frame in results:
        for name in categorical_cols:
            frame[name] = pd.Categorical(frame[name].values, categories=shared[name])
def concatenate(cls, dfs):
    """
    Concatenate Pandas DataFrames while keeping the 'category' dtype.

    Columns that are categorical in every frame are recoded, in place on
    the inputs, to the union of their categories before concatenation.

    Parameters
    ----------
        dfs: list of DataFrames

    Returns
    -------
        A Pandas DataFrame
    """
    categorical_per_frame = [
        set(frame.select_dtypes("category").columns.tolist()) for frame in dfs
    ]
    shared_columns = set.intersection(*categorical_per_frame)

    for name in shared_columns:
        merged = union_categoricals([frame[name] for frame in dfs])
        for frame in dfs:
            frame[name] = pandas.Categorical(frame[name], categories=merged.categories)
    return pandas.concat(dfs)
def _read(
    self, name, index_col=None, filter_: Optional[Filter] = None
) -> Optional[pd.DataFrame]:
    """Load every ``{name}_*.csv`` split under ``DATA_DIR`` into one frame.

    Returns ``None`` when no file matches. ``filter_``, when given, is called
    on each frame as it is read and its return value is ignored. When
    ``index_col`` is given the result is indexed by it (the column is kept)
    and sorted by index.
    """
    # Locate all split files for this name.
    pattern = os.path.join(self.DATA_DIR, f"{name}_*.csv")
    paths = glob.glob(pattern)
    if not paths:
        return None

    # Read each split file.
    frames = []
    for path in paths:
        frame = pd.read_csv(
            path,
            dtype=self.DTYPE[name],
            parse_dates=self.PARSE_DATES[name],
            encoding=self.ENCODING,
        )
        if filter_ is not None:
            filter_(frame)
        frames.append(frame)

    if len(frames) == 1:
        combined = frames[0]
    else:
        # Align categories across the splits so concat keeps the dtype.
        for column, declared in self.DTYPE[name].items():
            if declared != "category":
                continue
            merged = types.union_categoricals([frame[column] for frame in frames])
            for frame in frames:
                frame[column] = pd.Categorical(frame[column], categories=merged.categories)
        combined = pd.concat(frames)

    if index_col is not None:
        combined.set_index(index_col, drop=False, inplace=True)
        combined.sort_index(inplace=True)
    return combined
def concat_pandas(dfs, axis=0, join='outer', uniform=False, filter_warning=True):
    """Concatenate pandas objects, unioning categoricals between the pieces.

    Parameters
    ----------
    dfs : list of DataFrame, Series, or Index
    axis : int or str, optional
        With ``axis == 1`` the call is delegated directly to ``pd.concat``.
    join : str, optional
    uniform : bool, optional
        Whether to treat ``dfs[0]`` as representative of ``dfs[1:]`` when
        deciding between the DataFrame and Series paths and when detecting
        categorical columns.
    filter_warning : bool, optional
        Whether to silence ``FutureWarning`` around the ``pd.concat`` calls.
    """
    if axis == 1:
        return pd.concat(dfs, axis=axis, join=join, **concat_kwargs)

    # Support concatenating indices along axis 0
    if isinstance(dfs[0], pd.Index):
        if isinstance(dfs[0], pd.CategoricalIndex):
            return pd.CategoricalIndex(union_categoricals(dfs), name=dfs[0].name)
        elif isinstance(dfs[0], pd.MultiIndex):
            first, rest = dfs[0], dfs[1:]
            if all((isinstance(o, pd.MultiIndex) and o.nlevels >= first.nlevels)
                   for o in rest):
                arrays = [concat([_get_level_values(i, n) for i in dfs])
                          for n in range(first.nlevels)]
                return pd.MultiIndex.from_arrays(arrays, names=first.names)

            to_concat = (first.values, ) + tuple(k._values for k in rest)
            new_tuples = np.concatenate(to_concat)

            try:
                return pd.MultiIndex.from_tuples(new_tuples, names=first.names)
            except Exception:
                return pd.Index(new_tuples)
        return dfs[0].append(dfs[1:])

    # Handle categorical index separately
    dfs0_index = dfs[0].index

    has_categoricalindex = (
        isinstance(dfs0_index, pd.CategoricalIndex) or
        (isinstance(dfs0_index, pd.MultiIndex) and
         any(isinstance(i, pd.CategoricalIndex) for i in dfs0_index.levels)))

    if has_categoricalindex:
        dfs2 = [df.reset_index(drop=True) for df in dfs]
        ind = concat([df.index for df in dfs])
    else:
        dfs2 = dfs
        ind = None

    # Concatenate the partitions together, handling categories as needed
    if (isinstance(dfs2[0], pd.DataFrame) if uniform else
            any(isinstance(df, pd.DataFrame) for df in dfs2)):
        if uniform:
            dfs3 = dfs2
            cat_mask = dfs2[0].dtypes == 'category'
        else:
            # When concatenating mixed dataframes and series on axis 1, Pandas
            # converts series to dataframes with a single column named 0, then
            # concatenates.
            dfs3 = [df if isinstance(df, pd.DataFrame) else
                    df.to_frame().rename(columns={df.name: 0}) for df in dfs2]
            # pandas may raise a RuntimeWarning for comparing ints and strs
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", RuntimeWarning)
                if filter_warning:
                    warnings.simplefilter('ignore', FutureWarning)
                cat_mask = pd.concat([(df.dtypes == 'category').to_frame().T
                                      for df in dfs3], join=join,
                                     **concat_kwargs).any()

        if cat_mask.any():
            not_cat = cat_mask[~cat_mask].index
            # this should be aligned, so no need to filter warning
            out = pd.concat([df[df.columns.intersection(not_cat)]
                             for df in dfs3], join=join, **concat_kwargs)
            temp_ind = out.index
            for col in cat_mask.index.difference(not_cat):
                # Find an example of categoricals in this column
                for df in dfs3:
                    sample = df.get(col)
                    if sample is not None:
                        break
                # Extract partitions, subbing in missing if needed
                parts = []
                for df in dfs3:
                    if col in df.columns:
                        parts.append(df[col])
                    else:
                        codes = np.full(len(df), -1, dtype='i8')
                        data = pd.Categorical.from_codes(codes,
                                                         sample.cat.categories,
                                                         sample.cat.ordered)
                        parts.append(data)
                out[col] = union_categoricals(parts)
                # Pandas resets index type on assignment if frame is empty
                # https://github.com/pandas-dev/pandas/issues/17101
                if not len(temp_ind):
                    out.index = temp_ind
            out = out.reindex(columns=cat_mask.index)
        else:
            # pandas may raise a RuntimeWarning for comparing ints and strs
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", RuntimeWarning)
                if filter_warning:
                    warnings.simplefilter("ignore", FutureWarning)
                out = pd.concat(dfs3, join=join, **concat_kwargs)
    else:
        # isinstance check replaces the deprecated
        # pandas.api.types.is_categorical_dtype.
        if isinstance(dfs2[0].dtype, pd.CategoricalDtype):
            if ind is None:
                ind = concat([df.index for df in dfs2])
            return pd.Series(union_categoricals(dfs2),
                             index=ind, name=dfs2[0].name)
        with warnings.catch_warnings():
            if filter_warning:
                warnings.simplefilter('ignore', FutureWarning)
            out = pd.concat(dfs2, join=join, **concat_kwargs)
    # Re-add the index if needed
    if ind is not None:
        out.index = ind
    return out
def time_union(self):
    """Run ``union_categoricals`` on the instance's ``a`` and ``b`` inputs."""
    operands = [self.a, self.b]
    union_categoricals(operands)
def concat(dfs, axis=0, join='outer', uniform=False):
    """Concatenate, handling some edge cases:

    - Unions categoricals between partitions
    - Ignores empty partitions

    Parameters
    ----------
    dfs : list of DataFrame, Series, or Index
    axis : int or str, optional
    join : str, optional
    uniform : bool, optional
        Whether to treat ``dfs[0]`` as representative of ``dfs[1:]``. Set to
        True if all arguments have the same columns and dtypes (but not
        necessarily categories). Default is False.
    """
    if axis == 1:
        return pd.concat(dfs, axis=axis, join=join)

    if len(dfs) == 1:
        return dfs[0]

    # Support concatenating indices along axis 0
    if isinstance(dfs[0], pd.Index):
        if isinstance(dfs[0], pd.CategoricalIndex):
            return pd.CategoricalIndex(union_categoricals(dfs), name=dfs[0].name)
        return dfs[0].append(dfs[1:])

    # Handle categorical index separately
    if isinstance(dfs[0].index, pd.CategoricalIndex):
        dfs2 = [df.reset_index(drop=True) for df in dfs]
        ind = concat([df.index for df in dfs])
    else:
        dfs2 = dfs
        ind = None

    # Concatenate the partitions together, handling categories as needed
    if (isinstance(dfs2[0], pd.DataFrame) if uniform else
            any(isinstance(df, pd.DataFrame) for df in dfs2)):
        if uniform:
            dfs3 = dfs2
            cat_mask = dfs2[0].dtypes == 'category'
        else:
            # When concatenating mixed dataframes and series on axis 1, Pandas
            # converts series to dataframes with a single column named 0, then
            # concatenates.
            dfs3 = [df if isinstance(df, pd.DataFrame) else
                    df.to_frame().rename(columns={df.name: 0}) for df in dfs2]
            cat_mask = pd.concat([(df.dtypes == 'category').to_frame().T
                                  for df in dfs3], join=join).any()

        if cat_mask.any():
            not_cat = cat_mask[~cat_mask].index
            out = pd.concat([df[df.columns.intersection(not_cat)]
                             for df in dfs3], join=join)
            for col in cat_mask.index.difference(not_cat):
                # Find an example of categoricals in this column
                for df in dfs3:
                    sample = df.get(col)
                    if sample is not None:
                        break
                # Extract partitions, subbing in missing if needed
                parts = []
                for df in dfs3:
                    if col in df.columns:
                        parts.append(df[col])
                    else:
                        codes = np.full(len(df), -1, dtype='i8')
                        data = pd.Categorical.from_codes(codes,
                                                         sample.cat.categories,
                                                         sample.cat.ordered)
                        parts.append(data)
                out[col] = union_categoricals(parts)
            # DataFrame.reindex_axis was removed in pandas 1.0;
            # reindex(columns=...) restores the original column order.
            out = out.reindex(columns=cat_mask.index)
        else:
            out = pd.concat(dfs3, join=join)
    else:
        # isinstance check replaces the deprecated
        # pandas.api.types.is_categorical_dtype.
        if isinstance(dfs2[0].dtype, pd.CategoricalDtype):
            if ind is None:
                ind = concat([df.index for df in dfs2])
            return pd.Series(union_categoricals(dfs2),
                             index=ind, name=dfs2[0].name)
        out = pd.concat(dfs2, join=join)
    # Re-add the index if needed
    if ind is not None:
        out.index = ind
    return out
def concat(dfs, axis=0, join='outer', uniform=False):
    """Concatenate, handling some edge cases:

    - Unions categoricals between partitions
    - Ignores empty partitions

    Parameters
    ----------
    dfs : list of DataFrame, Series, or Index
    axis : int or str, optional
    join : str, optional
    uniform : bool, optional
        Whether to treat ``dfs[0]`` as representative of ``dfs[1:]``. Set to
        True if all arguments have the same columns and dtypes (but not
        necessarily categories). Default is False.
    """
    if axis == 1:
        return pd.concat(dfs, axis=axis, join=join)

    if len(dfs) == 1:
        return dfs[0]

    # Support concatenating indices along axis 0
    if isinstance(dfs[0], pd.Index):
        if isinstance(dfs[0], pd.CategoricalIndex):
            return pd.CategoricalIndex(union_categoricals(dfs), name=dfs[0].name)
        elif isinstance(dfs[0], pd.MultiIndex):
            first, rest = dfs[0], dfs[1:]
            if all((isinstance(o, pd.MultiIndex) and o.nlevels >= first.nlevels)
                   for o in rest):
                arrays = [concat([_get_level_values(i, n) for i in dfs])
                          for n in range(first.nlevels)]
                return pd.MultiIndex.from_arrays(arrays, names=first.names)

            to_concat = (first.values, ) + tuple(k._values for k in rest)
            new_tuples = np.concatenate(to_concat)

            try:
                return pd.MultiIndex.from_tuples(new_tuples, names=first.names)
            except Exception:
                return pd.Index(new_tuples)
        return dfs[0].append(dfs[1:])

    # Handle categorical index separately
    dfs0_index = dfs[0].index

    has_categoricalindex = (
        isinstance(dfs0_index, pd.CategoricalIndex) or
        (isinstance(dfs0_index, pd.MultiIndex) and
         any(isinstance(i, pd.CategoricalIndex) for i in dfs0_index.levels)))

    if has_categoricalindex:
        dfs2 = [df.reset_index(drop=True) for df in dfs]
        ind = concat([df.index for df in dfs])
    else:
        dfs2 = dfs
        ind = None

    # Concatenate the partitions together, handling categories as needed
    if (isinstance(dfs2[0], pd.DataFrame) if uniform else
            any(isinstance(df, pd.DataFrame) for df in dfs2)):
        if uniform:
            dfs3 = dfs2
            cat_mask = dfs2[0].dtypes == 'category'
        else:
            # When concatenating mixed dataframes and series on axis 1, Pandas
            # converts series to dataframes with a single column named 0, then
            # concatenates.
            dfs3 = [df if isinstance(df, pd.DataFrame) else
                    df.to_frame().rename(columns={df.name: 0}) for df in dfs2]
            # pandas may raise a RuntimeWarning for comparing ints and strs
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", RuntimeWarning)
                cat_mask = pd.concat([(df.dtypes == 'category').to_frame().T
                                      for df in dfs3], join=join).any()

        if cat_mask.any():
            not_cat = cat_mask[~cat_mask].index
            out = pd.concat([df[df.columns.intersection(not_cat)]
                             for df in dfs3], join=join)
            temp_ind = out.index
            for col in cat_mask.index.difference(not_cat):
                # Find an example of categoricals in this column
                for df in dfs3:
                    sample = df.get(col)
                    if sample is not None:
                        break
                # Extract partitions, subbing in missing if needed
                parts = []
                for df in dfs3:
                    if col in df.columns:
                        parts.append(df[col])
                    else:
                        codes = np.full(len(df), -1, dtype='i8')
                        data = pd.Categorical.from_codes(codes,
                                                         sample.cat.categories,
                                                         sample.cat.ordered)
                        parts.append(data)
                out[col] = union_categoricals(parts)
                # Pandas resets index type on assignment if frame is empty
                # https://github.com/pandas-dev/pandas/issues/17101
                if not len(temp_ind):
                    out.index = temp_ind
            out = out.reindex(columns=cat_mask.index)
        else:
            # pandas may raise a RuntimeWarning for comparing ints and strs
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", RuntimeWarning)
                out = pd.concat(dfs3, join=join)
    else:
        # isinstance check replaces the deprecated
        # pandas.api.types.is_categorical_dtype.
        if isinstance(dfs2[0].dtype, pd.CategoricalDtype):
            if ind is None:
                ind = concat([df.index for df in dfs2])
            return pd.Series(union_categoricals(dfs2),
                             index=ind, name=dfs2[0].name)
        out = pd.concat(dfs2, join=join)
    # Re-add the index if needed
    if ind is not None:
        out.index = ind
    return out
def concat(dfs, axis=0, join='outer', uniform=False):
    """Concatenate, handling some edge cases:

    - Unions categoricals between partitions
    - Ignores empty partitions

    Parameters
    ----------
    dfs : list of DataFrame, Series, or Index
    axis : int or str, optional
    join : str, optional
    uniform : bool, optional
        Whether to treat ``dfs[0]`` as representative of ``dfs[1:]``. Set to
        True if all arguments have the same columns and dtypes (but not
        necessarily categories). Default is False.
    """
    if axis == 1:
        return pd.concat(dfs, axis=axis, join=join)

    if len(dfs) == 1:
        return dfs[0]

    # Support concatenating indices along axis 0
    if isinstance(dfs[0], pd.Index):
        if isinstance(dfs[0], pd.CategoricalIndex):
            return pd.CategoricalIndex(union_categoricals(dfs),
                                       name=dfs[0].name)
        elif isinstance(dfs[0], pd.MultiIndex):
            first, rest = dfs[0], dfs[1:]
            if all(
                (isinstance(o, pd.MultiIndex) and o.nlevels >= first.nlevels)
                    for o in rest):
                arrays = [
                    concat([_get_level_values(i, n) for i in dfs])
                    for n in range(first.nlevels)
                ]
                return pd.MultiIndex.from_arrays(arrays, names=first.names)

            to_concat = (first.values, ) + tuple(k._values for k in rest)
            new_tuples = np.concatenate(to_concat)

            try:
                return pd.MultiIndex.from_tuples(new_tuples, names=first.names)
            except Exception:
                return pd.Index(new_tuples)
        return dfs[0].append(dfs[1:])

    # Handle categorical index separately
    dfs0_index = dfs[0].index
    if (isinstance(dfs0_index, pd.CategoricalIndex) or
        (isinstance(dfs0_index, pd.MultiIndex) and any(
            isinstance(i, pd.CategoricalIndex) for i in dfs0_index.levels))):
        dfs2 = [df.reset_index(drop=True) for df in dfs]
        ind = concat([df.index for df in dfs])
    else:
        dfs2 = dfs
        ind = None

    # Concatenate the partitions together, handling categories as needed
    if (isinstance(dfs2[0], pd.DataFrame) if uniform else any(
            isinstance(df, pd.DataFrame) for df in dfs2)):
        if uniform:
            dfs3 = dfs2
            cat_mask = dfs2[0].dtypes == 'category'
        else:
            # When concatenating mixed dataframes and series on axis 1, Pandas
            # converts series to dataframes with a single column named 0, then
            # concatenates.
            dfs3 = [
                df if isinstance(df, pd.DataFrame) else df.to_frame().rename(
                    columns={df.name: 0}) for df in dfs2
            ]
            # pandas may raise a RuntimeWarning for comparing ints and strs
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", RuntimeWarning)
                cat_mask = pd.concat([(df.dtypes == 'category').to_frame().T
                                      for df in dfs3],
                                     join=join).any()

        if cat_mask.any():
            not_cat = cat_mask[~cat_mask].index
            out = pd.concat(
                [df[df.columns.intersection(not_cat)] for df in dfs3],
                join=join)
            for col in cat_mask.index.difference(not_cat):
                # Find an example of categoricals in this column
                for df in dfs3:
                    sample = df.get(col)
                    if sample is not None:
                        break
                # Extract partitions, subbing in missing if needed
                parts = []
                for df in dfs3:
                    if col in df.columns:
                        parts.append(df[col])
                    else:
                        codes = np.full(len(df), -1, dtype='i8')
                        data = pd.Categorical.from_codes(
                            codes, sample.cat.categories, sample.cat.ordered)
                        parts.append(data)
                out[col] = union_categoricals(parts)
            out = out.reindex(columns=cat_mask.index)
        else:
            # pandas may raise a RuntimeWarning for comparing ints and strs
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", RuntimeWarning)
                out = pd.concat(dfs3, join=join)
    else:
        # isinstance check replaces the deprecated
        # pandas.api.types.is_categorical_dtype.
        if isinstance(dfs2[0].dtype, pd.CategoricalDtype):
            if ind is None:
                ind = concat([df.index for df in dfs2])
            return pd.Series(union_categoricals(dfs2),
                             index=ind,
                             name=dfs2[0].name)
        out = pd.concat(dfs2, join=join)
    # Re-add the index if needed
    if ind is not None:
        out.index = ind
    return out
print(user_info.cat.rename_categories(['A+', 'AB+', 'B+', 'O+']))
print('-------------------snip---------------')

# Besides renaming, you may also need to add or remove categories; both can be
# done with .cat.add_categories and .cat.remove_categories.
# Categorical data also supports value_counts for inspecting the distribution.
print(user_info.value_counts())
print('-------------------snip---------------')

# Categorical data can also be accessed through the .str accessor.
print(user_info.str.contains('O'))
print('-------------------snip---------------')

# Combine data with pd.concat.
blood_type1 = pd.Categorical(['A', 'AB'])
blood_type2 = pd.Categorical(['B', 'O'])
print(pd.concat([pd.Series(blood_type1), pd.Series(blood_type2)]))
print('-------------------snip---------------')

# After pd.concat the categorical data has been converted to object dtype.
# To keep the categorical dtype, use union_categoricals instead.
from pandas.api.types import union_categoricals
print(union_categoricals([blood_type1, blood_type2]))
print('-------------------snip---------------')

# 3. Memory-usage pitfalls
# Per the pandas documentation, the memory usage of a Categorical is
# proportional to the number of categories plus the length of the data, while
# object dtype costs a constant times the length of the data.
blood_type = pd.Series(['AB', 'O', 'A', 'B', np.nan, 'A'] * 1000)
print('object 类型的数据的长度:\n', blood_type.nbytes)
print('-------------------snip---------------')
print('转换成分类数据的长度:\n', blood_type.astype('category').nbytes)
print('-------------------snip---------------')

# When the number of categories approaches the length of the data, a
# Categorical uses almost the same or more memory than the equivalent object
# representation.
blood_type = pd.Series(['AB%04d' % i for i in range(2000)])
print('object 类型的数据的长度:\n', blood_type.nbytes)
print('-------------------snip---------------')
print('转换成分类数据的长度:\n', blood_type.astype('category').nbytes)
'type': 'category', 'id': 'category', 'io': 'category', 'timestamp': 'datetime64[s]' }) #df.loc['timestamp'] = df['timestamp'].str[:-4] #df.loc['timestamp'] = pd.to_datetime(df['timestamp'], format="%Y-%m-%d %H:%M:%S") log_dfs.append(df) # get levels for categories print('key levels') key_cats = union_categoricals([x['key'] for x in log_dfs]).categories print('type levels') type_cats = union_categoricals([x['type'] for x in log_dfs]).categories print('id levels') id_cats = union_categoricals([x['id'] for x in log_dfs]).categories print('io levels') io_cats = union_categoricals([x.io for x in log_dfs]).categories # set levels for x in tqdm(log_dfs, desc="setting levels"): x['key'] = pd.Categorical(x['key'], categories=key_cats) x['type'] = pd.Categorical(x['type'], categories=type_cats) x['id'] = pd.Categorical(x['id'], categories=id_cats) x['io'] = pd.Categorical(x['io'], categories=io_cats) great_df = pd.concat(log_dfs, axis=0, ignore_index=True)
# %% df2 = pd.read_csv(c.predicted_18M) # %% df2 = df2[['monolayer1', 'monolayer2']] # %% cats2 = pd.unique(df2.to_numpy().ravel()) print(f"size of cats2: {cats2.size:,}") # %% cats1_type = pd.CategoricalDtype(categories=cats1, ordered=False) cats2_type = pd.CategoricalDtype(categories=cats2, ordered=False) # %% cats1 = pd.Categorical(cats1) cats2 = pd.Categorical(cats2) cats_union = union_categoricals([cats1, cats2]).unique() cats_union.sort_values(inplace=True) print(f"size of cats union: {cats_union.size:,}") # %% # save cats to file catsdf = pd.DataFrame( {"codes": cats_union.codes}, index=cats_union.categories.to_series(), ) catsdf.index.names = ['categories'] catsdf.to_csv(c.layer_categories)