Exemplo n.º 1
0
    def from_pandas(cls, df):
        """Build an instance of ``cls`` from a DataFrame or a chunked reader.

        ``df`` is iterated chunk by chunk when it is a ``TextFileReader``;
        anything else is treated as a single chunk. Columns whose dtype
        compares equal to "object", "str" or "bool" are converted to
        ``category`` (the chunks are modified in place). Chunks are then
        appended one after another, re-coding categorical columns against the
        union of categories seen so far so that the concatenated column stays
        categorical.

        Numeric columns become ``Covariate`` objects, categorical columns
        become ``Factor`` objects; columns of any other dtype are dropped.
        """
        numeric_dtypes = (
            np.float32, np.float64,
            np.int8, np.int16, np.int32, np.int64,
            np.uint8, np.uint16, np.uint32, np.uint64,
        )

        chunks = df if isinstance(df, TextFileReader) else [df]
        merged = []
        for part in chunks:
            for name in part.columns:
                kind = part[name].dtype
                if kind == "object" or kind == "str" or kind == "bool":
                    part[name] = part[name].astype("category")

            # Nothing accumulated yet (or only empty data): start from this chunk.
            if len(merged) == 0:
                merged = part
                continue

            for name in part.columns:
                if part[name].dtype.name != "category":
                    continue
                shared = union_categoricals([merged[name], part[name]]).categories
                merged[name] = pd.Categorical(merged[name], categories=shared)
                part[name] = pd.Categorical(part[name], categories=shared)
            merged = pd.concat([merged, part])

        datacols = []
        for name in merged.columns:
            kind = merged[name].dtype
            if any(kind == candidate for candidate in numeric_dtypes):
                datacols.append(Covariate.from_array(merged[name].values, name))
            elif kind.name == "category":
                factor = merged[name]
                codes = factor.cat.codes.values
                # Level 0 is reserved for MISSINGLEVEL, real categories follow.
                levels = [MISSINGLEVEL] + list(map(str, factor.cat.categories.values))
                storage = np.uint8 if len(levels) <= 256 else np.uint16
                data = np.empty(len(codes), dtype=storage)
                # presumably writes codes + 1 into ``data`` so that the pandas
                # missing code (-1) maps to level 0 -- confirm against incr_
                incr_(codes, data)
                datacols.append(Factor.from_array(name, levels, data))
        return cls(datacols)
Exemplo n.º 2
0
def concatenate(dfs):
    """
    Concatenate pandas DataFrames while preserving 'category' dtype.

    All dataframes' columns must be equal to each other. Categorical columns
    of the *input* frames are re-coded in place against the union of the
    categories found at that position across all frames, so that
    ``pandas.concat`` does not fall back to ``object`` dtype.

    Parameters
    ----------
    dfs : list
        List of pandas DataFrames to concatenate.

    Returns
    -------
    pandas.DataFrame
        A pandas DataFrame.

    Raises
    ------
    AssertionError
        If any frame's columns differ from those of the first frame.
    """
    for df in dfs:
        assert df.columns.equals(dfs[0].columns)
    for i, dtype in enumerate(dfs[0].dtypes):
        if dtype.name != "category":
            continue
        columns = [df.iloc[:, i] for df in dfs]
        union = union_categoricals(columns)
        for df in dfs:
            recoded = pandas.Categorical(
                df.iloc[:, i], categories=union.categories
            )
            # ``df.iloc[:, i] = ...`` tries to write into the existing column,
            # which cannot hold a categorical with different categories;
            # ``isetitem`` replaces the column outright. Fall back to the
            # positional assignment on pandas versions without ``isetitem``.
            if hasattr(df, "isetitem"):
                df.isetitem(i, recoded)
            else:
                df.iloc[:, i] = recoded
    return pandas.concat(dfs)
Exemplo n.º 3
0
    def get_result(self):
        """
        Drain the packet generator into the store and return what was stored.

        In columnar mode the stored blocks are transposed so that each output
        entry is one whole column, built by joining that column's chunks.

        :return: stored query result, paired with ``columns_with_types`` when
                 ``with_column_types`` is set.
        """

        def _join_chunks(chunks):
            # The first chunk decides how the whole column is joined.
            head = chunks[0]
            if isinstance(head, np.ndarray):
                return np.concatenate(chunks)
            if isinstance(head, pd.Categorical):
                return union_categoricals(chunks)
            return tuple(chain.from_iterable(chunks))

        for packet in self.packet_generator:
            self.store(packet)

        if not self.columnar:
            data = self.data
        else:
            # zip(*blocks) turns a list of blocks into per-column chunk tuples.
            data = [_join_chunks(chunks) for chunks in zip(*self.data)]

        if not self.with_column_types:
            return data
        return data, self.columns_with_types
Exemplo n.º 4
0
def _union(dfs: List[pd.DataFrame], ignore_index: bool) -> pd.DataFrame:
    cats: Tuple[Set[str], ...] = tuple(set(df.select_dtypes(include="category").columns) for df in dfs)
    for col in set.intersection(*cats):
        cat = union_categoricals([df[col] for df in dfs])
        for df in dfs:
            df[col] = pd.Categorical(df[col].values, categories=cat.categories)
    return pd.concat(objs=dfs, sort=False, copy=False, ignore_index=ignore_index)
Exemplo n.º 5
0
def concat_categorical(df_a, df_b, ignore_index=True):
    """Concatenate two DataFrames, keeping ``df_a``'s categorical columns categorical.

    For every categorical column of ``df_a`` the same-named column of
    ``df_b`` is looked up, the union of both category sets is computed
    (ignoring category order), and both input frames are updated in place to
    use that union before they are concatenated.

    Parameters
    ----------
    df_a, df_b : pandas.DataFrame
        Frames to concatenate; ``df_b`` must contain every categorical column
        of ``df_a`` as a categorical column.
    ignore_index : bool, default True
        Forwarded to ``pd.concat``.

    Returns
    -------
    pandas.DataFrame
    """
    for cat_col in df_a.select_dtypes(["category"]):
        a_b = union_categoricals(
            [df_a[cat_col], df_b[cat_col]], ignore_order=True
        )
        # ``set_categories(..., inplace=True)`` no longer exists in pandas 2;
        # assign the re-coded column back explicitly instead.
        df_a[cat_col] = df_a[cat_col].cat.set_categories(a_b.categories)
        df_b[cat_col] = df_b[cat_col].cat.set_categories(a_b.categories)
    return pd.concat([df_a, df_b], ignore_index=ignore_index)
def union_categoricals(to_union, sort_categories=False, ignore_order=False):
    """Deprecated shim: warn, then delegate to ``pandas.api.types.union_categoricals``."""
    message = (
        "pandas.types.concat.union_categoricals is "
        "deprecated and will be removed in a future version.\n"
        "use pandas.api.types.union_categoricals"
    )
    # stacklevel=2 attributes the warning to the caller of this shim.
    warnings.warn(message, FutureWarning, stacklevel=2)

    # Imported under an alias so the real implementation does not shadow
    # this function's own name inside its body.
    from pandas.api.types import union_categoricals as _public_union

    return _public_union(
        to_union,
        sort_categories=sort_categories,
        ignore_order=ignore_order,
    )
Exemplo n.º 7
0
def merge_chunk(lhs, *args, **kwargs):
    """Merge ``lhs`` with the first positional argument via ``lhs.merge``.

    Two extra keyword arguments are consumed here and not forwarded:

    ``empty_index_dtype``
        When given and the merge result is empty, the result index is cast
        to this dtype.
    ``categorical_columns``
        When given (and ``PANDAS_GT_100`` is truthy), each listed column is
        cast on both sides to a common categorical dtype before merging. The
        inputs ``lhs`` and ``rhs`` are modified in place by this step.

    All remaining positional and keyword arguments go to ``DataFrame.merge``.
    """
    empty_index_dtype = kwargs.pop("empty_index_dtype", None)
    categorical_columns = kwargs.pop("categorical_columns", None)

    rhs, *args = args
    left_index = kwargs.get("left_index", False)
    right_index = kwargs.get("right_index", False)

    def _locate(frame, col, opposite_on, joins_on_index):
        # The key is either a regular column, or -- when this side joins on
        # its index against ``col`` on the other side -- a categorical index.
        if col in frame:
            return frame[col]
        if (
            col == kwargs.get(opposite_on, None)
            and joins_on_index
            and is_categorical_dtype(frame.index)
        ):
            return frame.index
        return None

    def _recast(frame, col, values, target):
        if values is None:
            return
        if isinstance(values, pd.Index):
            frame.index = values.astype(target)
        else:
            frame[col] = values.astype(target)

    if categorical_columns is not None and PANDAS_GT_100:
        for col in categorical_columns:
            left = _locate(lhs, col, "right_on", left_index)
            right = _locate(rhs, col, "left_on", right_index)

            if left is None or right is None:
                dtype = "category"
            else:
                # Both sides present: agree on the union of their categories.
                dtype = union_categoricals([
                    left.astype("category").values,
                    right.astype("category").values,
                ]).dtype

            _recast(lhs, col, left, dtype)
            _recast(rhs, col, right, dtype)

    out = lhs.merge(rhs, *args, **kwargs)

    # Workaround pandas bug where if the output result of a merge operation is
    # an empty dataframe, the output index is `int64` in all cases, regardless
    # of input dtypes.
    if len(out) == 0 and empty_index_dtype is not None:
        out.index = out.index.astype(empty_index_dtype)
    return out
Exemplo n.º 8
0
def concatenate(dfs):
    """
    From https://stackoverflow.com/questions/45639350/retaining-categorical-dtype-upon-dataframe-concatenation
    Concatenate a list of DataFrames without losing categorical dtypes.
    NB: the categories of the input dataframes are changed in-place."""
    import pandas as pd
    from pandas.api.types import union_categoricals

    # Only columns that are categorical in *every* frame are unified.
    categorical_sets = [
        set(frame.select_dtypes(include='category').columns) for frame in dfs
    ]
    common = set.intersection(*categorical_sets)

    for name in common:
        # Union of this column's categories across all frames ...
        merged = union_categoricals([frame[name] for frame in dfs])
        # ... applied back to each frame so the dtypes match exactly.
        for frame in dfs:
            frame[name] = pd.Categorical(frame[name], categories=merged.categories)

    return pd.concat(dfs)
Exemplo n.º 9
0
    def convert(cls, df, outdir):
        """Import a DataFrame (or chunked reader) column by column into ``outdir``.

        ``df`` is iterated chunk by chunk when it is a ``TextFileReader``;
        anything else is treated as a single chunk. Within each chunk,
        object/str/bool columns are converted to ``category`` and numeric
        columns to ``float32`` (the chunk is modified in place). Each column
        is handed to its own ``ColImporter``, and a ``header.json``
        describing all columns is written to ``outdir`` at the end.

        Parameters
        ----------
        df : pandas.DataFrame or TextFileReader
            Data to import.
        outdir : path-like
            Target directory; created if it does not exist.

        Raises
        ------
        AssertionError
            If ``outdir`` exists but is not a directory.
        """
        path = Path(outdir)
        # Create the directory *before* validating it; checking is_dir()
        # first would reject a missing directory and never reach mkdir().
        if not path.exists():
            path.mkdir()
        assert path.is_dir()

        numeric_dtypes = (
            np.float32, np.float64,
            np.int8, np.int16, np.int32, np.int64,
            np.uint8, np.uint16, np.uint32, np.uint64,
        )

        df = df if isinstance(df, TextFileReader) else [df]
        colimporters = {}
        # The first chunk is kept as the running holder of every category
        # seen so far; later chunks are re-coded against it.
        category_holder = []
        for chunk in df:
            for col in chunk.columns:
                dtype = chunk[col].dtype
                if dtype == "object" or dtype == "str" or dtype == "bool":
                    chunk[col] = chunk[col].astype("category")
                elif any(dtype == candidate for candidate in numeric_dtypes):
                    chunk[col] = chunk[col].astype(np.float32)

            if len(category_holder) == 0:
                category_holder = chunk
            else:
                for col in chunk.columns:
                    if chunk[col].dtype.name == "category":
                        uc = union_categoricals([category_holder[col], chunk[col]])
                        category_holder[col] = pd.Categorical(
                            category_holder[col], categories=uc.categories)
                        chunk[col] = pd.Categorical(
                            chunk[col], categories=uc.categories)

            for col in chunk.columns:
                if col not in colimporters:
                    iscat = chunk[col].dtype.name == "category"
                    colimporters[col] = ColImporter(col, iscat, path)
                colimporter = colimporters[col]
                if colimporter.iscategorical:
                    factor = chunk[col]
                    # Level 0 is reserved for MISSINGLEVEL, real categories follow.
                    levels = [MISSINGLEVEL] + [
                        str(level) for level in factor.cat.categories.values
                    ]
                    colimporter.importcategorical(factor.cat.codes.values, levels)
                else:
                    colimporter.importnumeric(chunk[col].values)

        d = {"datacolumns": [c.asdict() for c in colimporters.values()]}
        jsonstr = json.dumps(d)
        with open((path / "header.json").resolve(), "w") as f:
            f.write(jsonstr)
Exemplo n.º 10
0
def _union(dfs: List[pd.DataFrame],
           ignore_index: Optional[bool]) -> pd.DataFrame:
    if ignore_index is None:
        ignore_index = False
        for df in dfs:
            if hasattr(df, "_awswrangler_ignore_index"):
                if df._awswrangler_ignore_index is True:  # pylint: disable=protected-access
                    ignore_index = True
                    break
    cats: Tuple[Set[str], ...] = tuple(
        set(df.select_dtypes(include="category").columns) for df in dfs)
    for col in set.intersection(*cats):
        cat = union_categoricals([df[col] for df in dfs])
        for df in dfs:
            df[col] = pd.Categorical(df[col].values, categories=cat.categories)
    return pd.concat(objs=dfs,
                     sort=False,
                     copy=False,
                     ignore_index=ignore_index)
Exemplo n.º 11
0
def merge_categories(results: Sequence[pd.DataFrame]):
    """Give every frame in ``results`` identical categories, in place.

    The categorical columns are taken from the first frame. For each of
    them the union of categories across all frames is computed and every
    frame's column is re-coded against that union. Returns ``None``.
    """
    reference = results[0]
    cat_cols = [
        name for name in reference
        if isinstance(reference[name].dtype, pd.CategoricalDtype)
    ]
    if not cat_cols:
        return

    # Collect the per-frame columns first, then compute each union once.
    gathered = {
        name: [frame[name] for frame in results]
        for name in cat_cols
    }
    unified = {
        name: union_categoricals(columns).categories
        for name, columns in gathered.items()
    }

    for frame in results:
        for name in cat_cols:
            frame[name] = pd.Categorical(frame[name].values,
                                         categories=unified[name])
Exemplo n.º 12
0
    def concatenate(cls, dfs):
        """
        Concatenate Pandas DataFrames, keeping 'category' dtype intact.

        Columns that are categorical in every frame are re-coded (on the
        input frames themselves) against the union of their categories.

        Parameters
        ----------
            dfs: list of DataFrames

        Returns
        -------
            A Pandas DataFrame
        """
        per_frame = [
            set(frame.select_dtypes("category").columns.tolist())
            for frame in dfs
        ]
        shared = set.intersection(*per_frame)

        for name in shared:
            merged = union_categoricals([frame[name] for frame in dfs])
            for frame in dfs:
                frame[name] = pandas.Categorical(
                    frame[name], categories=merged.categories
                )

        return pandas.concat(dfs)
Exemplo n.º 13
0
    def _read(
        self, name, index_col=None, filter_: Optional[Filter] = None
    ) -> Optional[pd.DataFrame]:
        """Load every ``{name}_*.csv`` split under ``DATA_DIR`` as one frame.

        Each file is read with the dtypes, date columns and encoding
        configured for ``name``. ``filter_``, when given, is called on each
        frame right after it is read (its return value is not used). With
        more than one file, columns declared as "category" are re-coded to a
        shared set of categories before the frames are concatenated. When
        ``index_col`` is given, it becomes the (sorted) index and is also
        kept as a column.

        Returns ``None`` when no matching file exists.
        """
        # Get all paths for split files.
        pattern = os.path.join(self.DATA_DIR, f"{name}_*.csv")
        srcs = glob.glob(pattern)
        if not srcs:
            return None

        def _load(src):
            frame = pd.read_csv(src, dtype=self.DTYPE[name],
                                parse_dates=self.PARSE_DATES[name],
                                encoding=self.ENCODING)
            if filter_ is not None:
                filter_(frame)
            return frame

        # Read data from each file.
        dfs = [_load(src) for src in srcs]

        if len(dfs) != 1:
            # Combine categoricals so the concatenation keeps them categorical.
            for cname, dtype in self.DTYPE[name].items():
                if dtype != "category":
                    continue
                shared = types.union_categoricals([df[cname] for df in dfs])
                for df in dfs:
                    df[cname] = pd.Categorical(
                        df[cname], categories=shared.categories)
            concatenated_df = pd.concat(dfs)
        else:
            concatenated_df = dfs[0]

        if index_col is not None:
            concatenated_df.set_index(index_col, drop=False, inplace=True)
            concatenated_df.sort_index(inplace=True)
        return concatenated_df
Exemplo n.º 14
0
def concat_pandas(dfs, axis=0, join='outer', uniform=False, filter_warning=True):
    """Concatenate pandas objects, unioning categoricals instead of
    degrading them to ``object`` dtype.

    Parameters
    ----------
    dfs : list of DataFrame, Series, or Index
        Objects to concatenate; ``dfs[0]`` decides which code path is taken.
    axis : int or str, optional
        ``axis == 1`` is delegated directly to ``pd.concat``.
    join : str, optional
        Forwarded to ``pd.concat``.
    uniform : bool, optional
        When True, ``dfs[0]`` alone is inspected to decide whether the inputs
        are DataFrames and which columns are categorical.
    filter_warning : bool, optional
        When True, ``FutureWarning`` is silenced around the ``pd.concat``
        calls that are wrapped in ``warnings.catch_warnings()``.

    Returns
    -------
    DataFrame, Series, or Index
        The concatenated object.

    Notes
    -----
    ``concat_kwargs``, ``concat`` and ``_get_level_values`` are not defined
    in this function; they are presumably module-level names -- confirm.
    """
    if axis == 1:
        return pd.concat(dfs, axis=axis, join=join, **concat_kwargs)

    # Support concatenating indices along axis 0
    if isinstance(dfs[0], pd.Index):
        if isinstance(dfs[0], pd.CategoricalIndex):
            return pd.CategoricalIndex(union_categoricals(dfs),
                                       name=dfs[0].name)
        elif isinstance(dfs[0], pd.MultiIndex):
            first, rest = dfs[0], dfs[1:]
            # Level-by-level concatenation when every other index is a
            # MultiIndex with at least as many levels as the first one.
            if all((isinstance(o, pd.MultiIndex) and o.nlevels >= first.nlevels)
                    for o in rest):
                arrays = [concat([_get_level_values(i, n) for i in dfs])
                          for n in range(first.nlevels)]
                return pd.MultiIndex.from_arrays(arrays, names=first.names)

            # Otherwise concatenate the raw values and try to rebuild a
            # MultiIndex, falling back to a plain Index on any failure.
            to_concat = (first.values, ) + tuple(k._values for k in rest)
            new_tuples = np.concatenate(to_concat)
            try:
                return pd.MultiIndex.from_tuples(new_tuples, names=first.names)
            except Exception:
                return pd.Index(new_tuples)
        return dfs[0].append(dfs[1:])

    # Handle categorical index separately
    dfs0_index = dfs[0].index

    has_categoricalindex = (
        isinstance(dfs0_index, pd.CategoricalIndex) or
        (isinstance(dfs0_index, pd.MultiIndex) and
         any(isinstance(i, pd.CategoricalIndex) for i in dfs0_index.levels)))

    if has_categoricalindex:
        # Concatenate the data with the index dropped; the index is
        # concatenated on its own and re-attached at the end.
        dfs2 = [df.reset_index(drop=True) for df in dfs]
        ind = concat([df.index for df in dfs])
    else:
        dfs2 = dfs
        ind = None

    # Concatenate the partitions together, handling categories as needed
    if (isinstance(dfs2[0], pd.DataFrame) if uniform else
            any(isinstance(df, pd.DataFrame) for df in dfs2)):
        if uniform:
            dfs3 = dfs2
            cat_mask = dfs2[0].dtypes == 'category'
        else:
            # When concatenating mixed dataframes and series on axis 1, Pandas
            # converts series to dataframes with a single column named 0, then
            # concatenates.
            dfs3 = [df if isinstance(df, pd.DataFrame) else
                    df.to_frame().rename(columns={df.name: 0}) for df in dfs2]
            # pandas may raise a RuntimeWarning for comparing ints and strs
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", RuntimeWarning)
                if filter_warning:
                    warnings.simplefilter('ignore', FutureWarning)
                # A column counts as categorical if it is categorical in
                # any of the inputs.
                cat_mask = pd.concat([(df.dtypes == 'category').to_frame().T
                                      for df in dfs3], join=join,
                                     **concat_kwargs).any()

        if cat_mask.any():
            not_cat = cat_mask[~cat_mask].index
            # Concatenate the non-categorical columns first, then add each
            # categorical column as a union of its per-input parts.
            # this should be aligned, so no need to filter warning
            out = pd.concat([df[df.columns.intersection(not_cat)]
                             for df in dfs3], join=join, **concat_kwargs)
            temp_ind = out.index
            for col in cat_mask.index.difference(not_cat):
                # Find an example of categoricals in this column
                for df in dfs3:
                    sample = df.get(col)
                    if sample is not None:
                        break
                # Extract partitions, subbing in missing if needed
                parts = []
                for df in dfs3:
                    if col in df.columns:
                        parts.append(df[col])
                    else:
                        # Code -1 marks a missing value in a Categorical, so
                        # this builds an all-missing stand-in of matching length.
                        codes = np.full(len(df), -1, dtype='i8')
                        data = pd.Categorical.from_codes(codes,
                                                         sample.cat.categories,
                                                         sample.cat.ordered)
                        parts.append(data)
                out[col] = union_categoricals(parts)
                # Pandas resets index type on assignment if frame is empty
                # https://github.com/pandas-dev/pandas/issues/17101
                if not len(temp_ind):
                    out.index = temp_ind
            # Restore the column order recorded in cat_mask.
            out = out.reindex(columns=cat_mask.index)
        else:
            # pandas may raise a RuntimeWarning for comparing ints and strs
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", RuntimeWarning)
                if filter_warning:
                    warnings.simplefilter("ignore", FutureWarning)
                out = pd.concat(dfs3, join=join, **concat_kwargs)
    else:
        # Series-only input: a categorical Series is unioned directly.
        if is_categorical_dtype(dfs2[0].dtype):
            if ind is None:
                ind = concat([df.index for df in dfs2])
            return pd.Series(union_categoricals(dfs2), index=ind,
                             name=dfs2[0].name)
        with warnings.catch_warnings():
            if filter_warning:
                warnings.simplefilter('ignore', FutureWarning)
            out = pd.concat(dfs2, join=join, **concat_kwargs)
    # Re-add the index if needed
    if ind is not None:
        out.index = ind
    return out
Exemplo n.º 15
0
 def time_union(self):
     """Run ``union_categoricals`` on ``[self.a, self.b]``; the result is discarded."""
     operands = [self.a, self.b]
     union_categoricals(operands)
Exemplo n.º 16
0
 def time_union(self):
     """Run ``union_categoricals`` on ``[self.a, self.b]``; the result is discarded."""
     operands = [self.a, self.b]
     union_categoricals(operands)
Exemplo n.º 17
0
def concat(dfs, axis=0, join='outer', uniform=False):
    """Concatenate, handling some edge cases:

    - Unions categoricals between partitions
    - Ignores empty partitions

    Parameters
    ----------
    dfs : list of DataFrame, Series, or Index
    axis : int or str, optional
    join : str, optional
    uniform : bool, optional
        Whether to treat ``dfs[0]`` as representative of ``dfs[1:]``. Set to
        True if all arguments have the same columns and dtypes (but not
        necessarily categories). Default is False.

    Returns
    -------
    DataFrame, Series, or Index
        The concatenated object. A single-element ``dfs`` is returned as is.
    """
    if axis == 1:
        return pd.concat(dfs, axis=axis, join=join)

    if len(dfs) == 1:
        return dfs[0]

    # Support concatenating indices along axis 0
    if isinstance(dfs[0], pd.Index):
        if isinstance(dfs[0], pd.CategoricalIndex):
            return pd.CategoricalIndex(union_categoricals(dfs),
                                       name=dfs[0].name)
        return dfs[0].append(dfs[1:])

    # Handle categorical index separately: concatenate the data with the
    # index dropped, and re-attach the separately concatenated index below.
    if isinstance(dfs[0].index, pd.CategoricalIndex):
        dfs2 = [df.reset_index(drop=True) for df in dfs]
        ind = concat([df.index for df in dfs])
    else:
        dfs2 = dfs
        ind = None

    # Concatenate the partitions together, handling categories as needed
    if (isinstance(dfs2[0], pd.DataFrame) if uniform else
            any(isinstance(df, pd.DataFrame) for df in dfs2)):
        if uniform:
            dfs3 = dfs2
            cat_mask = dfs2[0].dtypes == 'category'
        else:
            # When concatenating mixed dataframes and series on axis 1, Pandas
            # converts series to dataframes with a single column named 0, then
            # concatenates.
            dfs3 = [df if isinstance(df, pd.DataFrame) else
                    df.to_frame().rename(columns={df.name: 0}) for df in dfs2]
            # A column counts as categorical if it is categorical in any input.
            cat_mask = pd.concat([(df.dtypes == 'category').to_frame().T
                                  for df in dfs3], join=join).any()

        if cat_mask.any():
            not_cat = cat_mask[~cat_mask].index
            out = pd.concat([df[df.columns.intersection(not_cat)]
                             for df in dfs3], join=join)
            for col in cat_mask.index.difference(not_cat):
                # Find an example of categoricals in this column
                for df in dfs3:
                    sample = df.get(col)
                    if sample is not None:
                        break
                # Extract partitions, subbing in missing if needed
                parts = []
                for df in dfs3:
                    if col in df.columns:
                        parts.append(df[col])
                    else:
                        # Code -1 marks a missing value in a Categorical.
                        codes = np.full(len(df), -1, dtype='i8')
                        data = pd.Categorical.from_codes(codes,
                                                         sample.cat.categories,
                                                         sample.cat.ordered)
                        parts.append(data)
                out[col] = union_categoricals(parts)
            # Restore the original column order. ``DataFrame.reindex_axis``
            # no longer exists in pandas >= 1.0; ``reindex(columns=...)`` is
            # the equivalent call.
            out = out.reindex(columns=cat_mask.index)
        else:
            out = pd.concat(dfs3, join=join)
    else:
        if is_categorical_dtype(dfs2[0].dtype):
            if ind is None:
                ind = concat([df.index for df in dfs2])
            return pd.Series(union_categoricals(dfs2), index=ind,
                             name=dfs2[0].name)
        out = pd.concat(dfs2, join=join)
    # Re-add the index if needed
    if ind is not None:
        out.index = ind
    return out
Exemplo n.º 18
0
def concat(dfs, axis=0, join='outer', uniform=False):
    """Concatenate, handling some edge cases:

    - Unions categoricals between partitions
    - Ignores empty partitions

    Parameters
    ----------
    dfs : list of DataFrame, Series, or Index
    axis : int or str, optional
    join : str, optional
    uniform : bool, optional
        Whether to treat ``dfs[0]`` as representative of ``dfs[1:]``. Set to
        True if all arguments have the same columns and dtypes (but not
        necessarily categories). Default is False.

    Returns
    -------
    DataFrame, Series, or Index
        The concatenated object. A single-element ``dfs`` is returned as is.
    """
    if axis == 1:
        return pd.concat(dfs, axis=axis, join=join)

    if len(dfs) == 1:
        return dfs[0]

    # Support concatenating indices along axis 0
    if isinstance(dfs[0], pd.Index):
        if isinstance(dfs[0], pd.CategoricalIndex):
            return pd.CategoricalIndex(union_categoricals(dfs),
                                       name=dfs[0].name)
        elif isinstance(dfs[0], pd.MultiIndex):
            first, rest = dfs[0], dfs[1:]
            # Level-by-level concatenation when every other index is a
            # MultiIndex with at least as many levels as the first one.
            # NOTE(review): _get_level_values is defined outside this block.
            if all((isinstance(o, pd.MultiIndex) and o.nlevels >= first.nlevels)
                    for o in rest):
                arrays = [concat([_get_level_values(i, n) for i in dfs])
                          for n in range(first.nlevels)]
                return pd.MultiIndex.from_arrays(arrays, names=first.names)

            # Otherwise concatenate the raw values and try to rebuild a
            # MultiIndex, falling back to a plain Index on any failure.
            to_concat = (first.values, ) + tuple(k._values for k in rest)
            new_tuples = np.concatenate(to_concat)
            try:
                return pd.MultiIndex.from_tuples(new_tuples, names=first.names)
            except Exception:
                return pd.Index(new_tuples)
        return dfs[0].append(dfs[1:])

    # Handle categorical index separately
    dfs0_index = dfs[0].index

    has_categoricalindex = (
        isinstance(dfs0_index, pd.CategoricalIndex) or
        (isinstance(dfs0_index, pd.MultiIndex) and
         any(isinstance(i, pd.CategoricalIndex) for i in dfs0_index.levels)))

    if has_categoricalindex:
        # Concatenate the data with the index dropped; the index is
        # concatenated on its own and re-attached at the end.
        dfs2 = [df.reset_index(drop=True) for df in dfs]
        ind = concat([df.index for df in dfs])
    else:
        dfs2 = dfs
        ind = None

    # Concatenate the partitions together, handling categories as needed
    if (isinstance(dfs2[0], pd.DataFrame) if uniform else
            any(isinstance(df, pd.DataFrame) for df in dfs2)):
        if uniform:
            dfs3 = dfs2
            cat_mask = dfs2[0].dtypes == 'category'
        else:
            # When concatenating mixed dataframes and series on axis 1, Pandas
            # converts series to dataframes with a single column named 0, then
            # concatenates.
            dfs3 = [df if isinstance(df, pd.DataFrame) else
                    df.to_frame().rename(columns={df.name: 0}) for df in dfs2]
            # pandas may raise a RuntimeWarning for comparing ints and strs
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", RuntimeWarning)
                # A column counts as categorical if it is categorical in
                # any of the inputs.
                cat_mask = pd.concat([(df.dtypes == 'category').to_frame().T
                                      for df in dfs3], join=join).any()

        if cat_mask.any():
            not_cat = cat_mask[~cat_mask].index
            # Concatenate the non-categorical columns first, then add each
            # categorical column as a union of its per-input parts.
            out = pd.concat([df[df.columns.intersection(not_cat)]
                             for df in dfs3], join=join)
            temp_ind = out.index
            for col in cat_mask.index.difference(not_cat):
                # Find an example of categoricals in this column
                for df in dfs3:
                    sample = df.get(col)
                    if sample is not None:
                        break
                # Extract partitions, subbing in missing if needed
                parts = []
                for df in dfs3:
                    if col in df.columns:
                        parts.append(df[col])
                    else:
                        # Code -1 marks a missing value in a Categorical, so
                        # this builds an all-missing stand-in of matching length.
                        codes = np.full(len(df), -1, dtype='i8')
                        data = pd.Categorical.from_codes(codes,
                                                         sample.cat.categories,
                                                         sample.cat.ordered)
                        parts.append(data)
                out[col] = union_categoricals(parts)
                # Pandas resets index type on assignment if frame is empty
                # https://github.com/pandas-dev/pandas/issues/17101
                if not len(temp_ind):
                    out.index = temp_ind
            # Restore the column order recorded in cat_mask.
            out = out.reindex(columns=cat_mask.index)
        else:
            # pandas may raise a RuntimeWarning for comparing ints and strs
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", RuntimeWarning)
                out = pd.concat(dfs3, join=join)
    else:
        # Series-only input: a categorical Series is unioned directly.
        if is_categorical_dtype(dfs2[0].dtype):
            if ind is None:
                ind = concat([df.index for df in dfs2])
            return pd.Series(union_categoricals(dfs2), index=ind,
                             name=dfs2[0].name)
        out = pd.concat(dfs2, join=join)
    # Re-add the index if needed
    if ind is not None:
        out.index = ind
    return out
def concat(dfs, axis=0, join='outer', uniform=False):
    """Concatenate, handling some edge cases:

    - Unions categoricals between partitions
    - Ignores empty partitions

    Parameters
    ----------
    dfs : list of DataFrame, Series, or Index
    axis : int or str, optional
    join : str, optional
    uniform : bool, optional
        Whether to treat ``dfs[0]`` as representative of ``dfs[1:]``. Set to
        True if all arguments have the same columns and dtypes (but not
        necessarily categories). Default is False.

    Returns
    -------
    DataFrame, Series, or Index
        The concatenated object. A single-element ``dfs`` is returned as is.
    """
    if axis == 1:
        return pd.concat(dfs, axis=axis, join=join)

    if len(dfs) == 1:
        return dfs[0]

    # Support concatenating indices along axis 0
    if isinstance(dfs[0], pd.Index):
        if isinstance(dfs[0], pd.CategoricalIndex):
            return pd.CategoricalIndex(union_categoricals(dfs),
                                       name=dfs[0].name)
        elif isinstance(dfs[0], pd.MultiIndex):
            first, rest = dfs[0], dfs[1:]
            # Level-by-level concatenation when every other index is a
            # MultiIndex with at least as many levels as the first one.
            # NOTE(review): _get_level_values is defined outside this block.
            if all(
                (isinstance(o, pd.MultiIndex) and o.nlevels >= first.nlevels)
                    for o in rest):
                arrays = [
                    concat([_get_level_values(i, n) for i in dfs])
                    for n in range(first.nlevels)
                ]
                return pd.MultiIndex.from_arrays(arrays, names=first.names)

            # Otherwise concatenate the raw values and try to rebuild a
            # MultiIndex, falling back to a plain Index on any failure.
            to_concat = (first.values, ) + tuple(k._values for k in rest)
            new_tuples = np.concatenate(to_concat)
            try:
                return pd.MultiIndex.from_tuples(new_tuples, names=first.names)
            except Exception:
                return pd.Index(new_tuples)
        return dfs[0].append(dfs[1:])

    # Handle categorical index separately
    dfs0_index = dfs[0].index
    if (isinstance(dfs0_index, pd.CategoricalIndex)
            or (isinstance(dfs0_index, pd.MultiIndex) and any(
                isinstance(i, pd.CategoricalIndex)
                for i in dfs0_index.levels))):
        # Concatenate the data with the index dropped; the index is
        # concatenated on its own and re-attached at the end.
        dfs2 = [df.reset_index(drop=True) for df in dfs]
        ind = concat([df.index for df in dfs])
    else:
        dfs2 = dfs
        ind = None

    # Concatenate the partitions together, handling categories as needed
    if (isinstance(dfs2[0], pd.DataFrame) if uniform else any(
            isinstance(df, pd.DataFrame) for df in dfs2)):
        if uniform:
            dfs3 = dfs2
            cat_mask = dfs2[0].dtypes == 'category'
        else:
            # When concatenating mixed dataframes and series on axis 1, Pandas
            # converts series to dataframes with a single column named 0, then
            # concatenates.
            dfs3 = [
                df if isinstance(df, pd.DataFrame) else df.to_frame().rename(
                    columns={df.name: 0}) for df in dfs2
            ]
            # pandas may raise a RuntimeWarning for comparing ints and strs
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", RuntimeWarning)
                # A column counts as categorical if it is categorical in
                # any of the inputs.
                cat_mask = pd.concat([(df.dtypes == 'category').to_frame().T
                                      for df in dfs3],
                                     join=join).any()

        if cat_mask.any():
            not_cat = cat_mask[~cat_mask].index
            # Concatenate the non-categorical columns first, then add each
            # categorical column as a union of its per-input parts.
            out = pd.concat(
                [df[df.columns.intersection(not_cat)] for df in dfs3],
                join=join)
            for col in cat_mask.index.difference(not_cat):
                # Find an example of categoricals in this column
                for df in dfs3:
                    sample = df.get(col)
                    if sample is not None:
                        break
                # Extract partitions, subbing in missing if needed
                parts = []
                for df in dfs3:
                    if col in df.columns:
                        parts.append(df[col])
                    else:
                        # Code -1 marks a missing value in a Categorical, so
                        # this builds an all-missing stand-in of matching length.
                        codes = np.full(len(df), -1, dtype='i8')
                        data = pd.Categorical.from_codes(
                            codes, sample.cat.categories, sample.cat.ordered)
                        parts.append(data)
                out[col] = union_categoricals(parts)
            # Restore the column order recorded in cat_mask.
            out = out.reindex(columns=cat_mask.index)
        else:
            # pandas may raise a RuntimeWarning for comparing ints and strs
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", RuntimeWarning)
                out = pd.concat(dfs3, join=join)
    else:
        # Series-only input: a categorical Series is unioned directly.
        if is_categorical_dtype(dfs2[0].dtype):
            if ind is None:
                ind = concat([df.index for df in dfs2])
            return pd.Series(union_categoricals(dfs2),
                             index=ind,
                             name=dfs2[0].name)
        out = pd.concat(dfs2, join=join)
    # Re-add the index if needed
    if ind is not None:
        out.index = ind
    return out
Exemplo n.º 20
0
print(user_info.cat.rename_categories(['A+', 'AB+', 'B+', 'O+']))
print('-------------------snip---------------')
# Besides renaming, categories can also be added or removed; use
# .cat.add_categories and .cat.remove_categories for that.
# Categorical data also supports value_counts to inspect the distribution.
print(user_info.value_counts())
print('-------------------snip---------------')
# Categorical data can also be accessed through the .str accessor.
print(user_info.str.contains('O'))
print('-------------------snip---------------')
# Combine data with pd.concat.
blood_type1 = pd.Categorical(['A', 'AB'])
blood_type2 = pd.Categorical(['B', 'O'])
print(pd.concat([pd.Series(blood_type1), pd.Series(blood_type2)]))
print('-------------------snip---------------')
# After pd.concat the result has object dtype, because the two inputs have
# different categories. To keep the categorical dtype, use union_categoricals.
from pandas.api.types import union_categoricals
print(union_categoricals([blood_type1, blood_type2]))
print('-------------------snip---------------')
# 3. Memory-usage pitfalls
# The memory usage of a Categorical is proportional to the number of
# categories plus the length of the data, whereas object dtype costs a
# constant times the length of the data.
blood_type = pd.Series(['AB', 'O', 'A', 'B', np.nan, 'A'] * 1000)
print('object 类型的数据的长度:\n', blood_type.nbytes)
print('-------------------snip---------------')
print('转换成分类数据的长度:\n', blood_type.astype('category').nbytes)
print('-------------------snip---------------')
# When the number of categories approaches the length of the data, a
# Categorical uses about as much memory as, or more than, the equivalent
# object representation.
blood_type = pd.Series(['AB%04d' % i for i in range(2000)])
print('object 类型的数据的长度:\n', blood_type.nbytes)
print('-------------------snip---------------')
print('转换成分类数据的长度:\n', blood_type.astype('category').nbytes)
Exemplo n.º 21
0
        'type':
        'category',
        'id':
        'category',
        'io':
        'category',
        'timestamp':
        'datetime64[s]'
    })
    #df.loc['timestamp'] = df['timestamp'].str[:-4]
    #df.loc['timestamp'] = pd.to_datetime(df['timestamp'], format="%Y-%m-%d %H:%M:%S")
    log_dfs.append(df)

# Compute, for each categorical column, the union of its categories across
# every frame in log_dfs (log_dfs is built before this section).
print('key levels')
key_cats = union_categoricals([x['key'] for x in log_dfs]).categories
print('type levels')
type_cats = union_categoricals([x['type'] for x in log_dfs]).categories
print('id levels')
id_cats = union_categoricals([x['id'] for x in log_dfs]).categories
print('io levels')
io_cats = union_categoricals([x.io for x in log_dfs]).categories

# Re-code each frame against the shared categories so that every frame has
# an identical categorical dtype per column before concatenation.
for x in tqdm(log_dfs, desc="setting levels"):
    x['key'] = pd.Categorical(x['key'], categories=key_cats)
    x['type'] = pd.Categorical(x['type'], categories=type_cats)
    x['id'] = pd.Categorical(x['id'], categories=id_cats)
    x['io'] = pd.Categorical(x['io'], categories=io_cats)

# Stack all frames row-wise under a fresh 0..n-1 index.
great_df = pd.concat(log_dfs, axis=0, ignore_index=True)
Exemplo n.º 22
0
# %%

# Load the second dataset; ``c`` is a configuration object defined outside
# this section.
df2 = pd.read_csv(c.predicted_18M)

# %%
# Keep only the two monolayer columns.
df2 = df2[['monolayer1', 'monolayer2']]

# %%
# Distinct values appearing in either column.
cats2 = pd.unique(df2.to_numpy().ravel())
print(f"size of cats2: {cats2.size:,}")
# %%
# NOTE(review): cats1 is defined before this section; these two dtypes are
# not used again in the visible code -- confirm they are needed.
cats1_type = pd.CategoricalDtype(categories=cats1, ordered=False)
cats2_type = pd.CategoricalDtype(categories=cats2, ordered=False)

# %%
# Merge both value sets into a single categorical with one entry per value.
cats1 = pd.Categorical(cats1)
cats2 = pd.Categorical(cats2)
cats_union = union_categoricals([cats1, cats2]).unique()
cats_union.sort_values(inplace=True)
print(f"size of cats union: {cats_union.size:,}")

# %%
# save cats to file
# One row per category: the category label is the index, its integer code
# is the only column.
catsdf = pd.DataFrame(
    {"codes": cats_union.codes},
    index=cats_union.categories.to_series(),
)
catsdf.index.names = ['categories']

catsdf.to_csv(c.layer_categories)