Example #1
    def sum(
        self,
        axis=None,
        skipna=None,
        level=None,
        numeric_only=None,
        min_count=0,
        **kwargs,
    ):
        axis = self._get_axis_number(axis) if axis is not None else 0
        if numeric_only not in (
                None,
                True,
        ):
            raise err._unsupported_error("numeric_only", numeric_only)
        if min_count > 0:
            raise err._unsupported_error("min_count", min_count)
        return self._unary_reduction(
            [("sum", numeric_only)],
            axis=axis,
            skipna=skipna,
            level=level,
            min_count=min_count,
            **kwargs,
        )
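A hypothetical usage sketch for the wrapper above; the legate.pandas import alias and the sample frame are assumptions, not taken from the snippet:

# Hypothetical usage (assumes legate.pandas mirrors the pandas API).
import legate.pandas as lp

df = lp.DataFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})
print(df.sum())                # column-wise sums; axis defaults to 0
# df.sum(min_count=1)          # raises: min_count > 0 is unsupported
# df.sum(numeric_only=False)   # raises: only None/True are accepted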
Example #2
    def var(
        self,
        axis=None,
        skipna=None,
        level=None,
        ddof=1,
        numeric_only=None,
        **kwargs,
    ):
        axis = self._get_axis_number(axis) if axis is not None else 0
        if numeric_only not in (
                None,
                True,
        ):
            raise err._unsupported_error("numeric_only", numeric_only)
        if ddof != 1:
            raise err._unsupported_error("ddof", ddof)
        return self._unary_reduction(
            [("var", numeric_only)],
            axis=axis,
            skipna=skipna,
            level=level,
            ddof=ddof,
            **kwargs,
        )
Example #3
    def sum(self, numeric_only=True, min_count=0):
        if numeric_only not in (
                True,
                None,
        ):
            raise err._unsupported_error("numeric_only", numeric_only)
        if min_count > 0:
            raise err._unsupported_error("min_count", min_count)
        return self._groupby_reduce(ops=[("sum", numeric_only)])
Example #4
def unary_reduction(df, ops, axis=0, skipna=True, level=None):
    if isinstance(ops, list):
        ops = [_maybe_convert_to_default(desc) for desc in ops]

    else:
        # TODO: We will hit this case once we add agg/aggregate
        assert False

    if axis != 0:
        raise err._unsupported_error("axis", axis)
    if skipna not in (
            True,
            None,
    ):
        raise err._unsupported_error("skipna", skipna)
    if level is not None:
        raise err._unsupported_error("level", level)

    columns = df._frame._columns

    indexer = []
    for idx, column in enumerate(columns):
        if incompatible_ops(ops, column.dtype.to_pandas()):
            continue
        indexer.append(idx)

    valid_columns = [columns[idx] for idx in indexer]
    ops = [desc[0] for desc in ops]

    if df._is_series:
        if len(valid_columns) == 0:
            raise TypeError(f"Cannot perform reduction '{ops[0]}' "
                            f"with {columns[0].dtype} dtype")
        result = valid_columns[0].unary_reduction(ops[0], skipna)
        return result.get_scalar().value

    else:
        frame = df._frame.replace_columns(valid_columns)
        columns = df.columns[indexer]
        new_frame = frame.unary_reduction(
            ops[0],
            columns,
            axis=axis,
            skipna=skipna,
            level=level,
        )

        if len(new_frame._columns) > 1:
            from .dataframe import DataFrame

            return DataFrame(frame=new_frame, columns=df.columns)

        else:
            from .series import Series

            return Series(frame=new_frame)
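To make the column-filtering step concrete, here is a self-contained miniature in stock pandas; _incompatible is an illustrative stand-in for incompatible_ops:

import pandas as pd

def _incompatible(op, dtype):
    # Illustrative stand-in for incompatible_ops: e.g. no 'sum' on strings.
    return op == "sum" and dtype == object

def sketch_unary_reduction(df, op):
    valid = [c for c in df.columns if not _incompatible(op, df[c].dtype)]
    if not valid:
        raise TypeError(f"Cannot perform reduction '{op}'")
    return df[valid].agg(op)  # Series of per-column results

df = pd.DataFrame({"x": [1, 2, 3], "s": ["a", "b", "c"]})
print(sketch_unary_reduction(df, "sum"))  # only 'x' participates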
Example #5
    def _copy_if_else(
        self,
        cond,
        other=None,
        inplace=False,
        axis=None,
        level=None,
        errors="raise",
        try_cast=False,
        negate=False,
    ):
        inplace = validate_bool_kwarg(inplace, "inplace")
        axis = self._get_axis_number(axis, 0)

        if level is not None:
            raise err._unsupported_error("level", level)

        if axis not in (0, ):
            raise err._unsupported_error("axis", axis)

        if try_cast not in (False, ):
            raise err._unsupported_error("try_cast", try_cast)

        # Checks on cond
        cond = self._ensure_valid_frame(cond)

        if self.ndim < cond.ndim:
            raise ValueError(
                "cannot use the higher dimensional dataframe for 'cond'")
        _, cond = self._align_frame(cond, join="left", broadcast_axis=1)

        if any(not is_bool_dtype(dtype) for dtype in cond._get_dtypes()):
            raise ValueError("'cond' must have only boolean values")

        # Checks on other
        if not is_scalar(other):
            other = self._ensure_valid_frame(other)

            if self.ndim < other.ndim:
                raise ValueError(
                    "cannot use the higher dimensional dataframe for 'other'")
            _, other = self._align_frame(other, join="left", broadcast_axis=1)

            for l_dtype, r_dtype in zip(self._get_dtypes(),
                                        other._get_dtypes()):
                if l_dtype != r_dtype:
                    raise ValueError("'other' must have the same type as self")

            other = other._frame

        else:
            other = util.sanitize_scalar(other)

        frame = self._frame.copy_if_else(cond._frame, other, negate=negate)
        return self._create_or_update_frame(frame, inplace)
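The negate flag is what lets this one method back both where (negate=False) and mask (negate=True); a minimal numpy sketch of just that selection step, with all names hypothetical:

import numpy as np

def copy_if_else(values, cond, other, negate=False):
    # where: keep values where cond holds; mask: replace them instead.
    return np.where(~cond if negate else cond, values, other)

vals = np.array([1, 2, 3, 4])
print(copy_if_else(vals, vals > 2, 0))               # [0 0 3 4] (where)
print(copy_if_else(vals, vals > 2, 0, negate=True))  # [1 2 0 0] (mask)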
Example #6
    def __getitem__(self, key):
        (row_loc, row_scalar, out_ndim) = self._validate_locator(key)

        sr = self.sr
        if row_scalar:
            result = sr._frame.read_at(row_loc)

        elif isinstance(row_loc, slice):
            if row_loc == slice(None):
                result = sr._frame
            else:
                result = sr._frame.slice_rows_by_slice(row_loc, False)

        else:
            row_loc = sr._ensure_valid_frame(row_loc)

            if not row_loc._is_series:
                raise ValueError("indexer must be 1-dimensional")

            if not is_bool_dtype(row_loc.dtype):
                raise err._unsupported_error(
                    "only boolean indexers are supported now")

            # This may raise an exception if the indexer size doesn't match
            # with the index of the LHS.
            row_loc = row_loc._frame.update_legate_index(sr._raw_index)

            result = sr._frame.select(row_loc)

        try:
            return super().construct_result(result, out_ndim, row_scalar)
        except _NotFoundError:
            raise KeyError(row_loc)
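The three dispatch paths above (scalar read, slice, boolean mask) correspond to these stock-pandas accesses, assuming this backs a loc-style indexer:

import pandas as pd

s = pd.Series([10, 20, 30])
print(s.loc[1])        # scalar path: a single element
print(s.loc[0:1])      # slice path
print(s.loc[s > 15])   # boolean-mask path; mask must align with the index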
Example #7
    def mean(self, numeric_only=True):
        if numeric_only not in (
                True,
                None,
        ):
            raise err._unsupported_error("numeric_only", numeric_only)
        return self._groupby_reduce(ops=[("mean", numeric_only)])
Example #8
def to_datetime(
    arg,
    errors="raise",
    dayfirst=False,
    yearfirst=False,
    utc=None,
    format=None,
    exact=True,
    unit=None,
    infer_datetime_format=False,
    origin="unix",
    cache=True,
):
    if not isinstance(arg, Frame):
        result = pandas.to_datetime(
            arg,
            errors=errors,
            dayfirst=dayfirst,
            yearfirst=yearfirst,
            utc=utc,
            format=format,
            exact=exact,
            unit=unit,
            infer_datetime_format=infer_datetime_format,
            origin=origin,
            cache=cache,
        )
        return util.sanitize_scalar(result)

    if not (arg._is_series and is_string_dtype(arg.dtype)):
        raise err._unsupported_error("to_datetime handles only string columns")

    return arg.str.to_datetime(format)
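For comparison, the stock-pandas call the string-column path ultimately emulates (sample data assumed):

import pandas as pd

s = pd.Series(["2021-01-01", "2021-06-15"])
print(pd.to_datetime(s, format="%Y-%m-%d"))  # datetime64[ns] Series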
Example #9
def _uncompress_files(paths, compressions):
    new_paths = []
    to_remove = []

    for path, compression in zip(paths, compressions):
        if compression == CompressionType.UNCOMPRESSED:
            new_paths.append(path)
            continue

        import tempfile

        out = os.path.join(
            tempfile.gettempdir(),
            f"_lg_uncompressed_{os.path.basename(path).replace('.gz', '')}",
        )
        new_paths.append(out)
        to_remove.append(out)

        # gzip and bz2 both expose a file-like `open()`, so aliasing the
        # imported module keeps the copy loop below codec-agnostic.
        if compression == CompressionType.GZIP:
            import gzip as decompress

        elif compression == CompressionType.BZ2:
            import bz2 as decompress

        else:
            from legate.pandas.common import errors as err

            raise err._unsupported_error(
                f"unsupported compression method '{compression.name.lower()}'")

        with open(out, "wb") as f_out:
            with decompress.open(path, "rb") as f_in:
                shutil.copyfileobj(f_in, f_out)
    return new_paths, [CompressionType.UNCOMPRESSED] * len(paths), to_remove
Example #10
    def dropna(self,
               axis=0,
               how="any",
               thresh=None,
               subset=None,
               inplace=False):
        axis = self._get_axis_number(axis, 0)
        inplace = validate_bool_kwarg(inplace, "inplace")

        if axis not in (0, ):
            raise err._unsupported_error("axis", axis)

        if how is None and thresh is None:
            raise TypeError("must specify how or thresh")

        if how is not None and how not in ("any", "all"):
            raise ValueError("invalid how option: %s" % how)

        if subset is not None:
            idxr = self.columns.get_indexer_for(subset)
            mask = idxr == -1
            if mask.any():
                raise KeyError(list(np.compress(mask, subset)))
        else:
            idxr = list(range(len(self.columns)))

        if thresh is None:
            thresh = len(idxr) if how == "any" else 1

        new_frame = self._frame.dropna(axis, idxr, thresh)
        return self._create_or_update_frame(new_frame, inplace)
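The how-to-thresh translation is the core of the method: "any" demands every inspected column be non-null (thresh = len(idxr)), while "all" demands at least one. A runnable stock-pandas comparison with assumed sample data:

import pandas as pd

df = pd.DataFrame({"a": [1.0, None, None], "b": [1.0, 2.0, None]})
print(df.dropna(how="any"))  # row survives only if all columns are non-null
print(df.dropna(how="all"))  # row is dropped only if every column is null
print(df.dropna(thresh=1))   # same as how="all" for this frame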
Example #11
    def sort_values(
        self,
        axis=0,
        ascending=True,
        inplace=False,
        kind="quicksort",
        na_position="last",
        ignore_index: bool = False,
    ):
        axis = self._get_axis_number(axis)
        if axis not in (0, ):
            raise err._unsupported_error("axis", axis)

        if na_position not in (
                "first",
                "last",
        ):
            raise err._invalid_value_error("na_position", na_position)

        ascending = self._get_ascending(ascending, 1)
        new_frame = self._frame.sort_values(
            [0],
            axis,
            ascending,
            kind,
            na_position,
            ignore_index,
        )
        return self._create_or_update_frame(new_frame, inplace)
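For the supported arguments, the wrapper matches stock pandas; for instance (sample data assumed):

import pandas as pd

s = pd.Series([3.0, 1.0, None, 2.0])
print(s.sort_values(na_position="last"))  # 1.0, 2.0, 3.0, then NaN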
Example #12
    def squeeze(self, axis=None):
        axis = self._get_axis_number(axis, None)
        if axis not in (
                1,
                None,
        ):
            raise err._unsupported_error("axis", axis)

        result = self

        if (axis in (
                1,
                None,
        ) and len(result.columns) == 1):
            result = Series(frame=result._frame, name=result.columns[0])

        if (axis in (
                0,
                None,
        ) and len(result) == 1):
            if result._is_series:
                result = result.to_pandas().squeeze()
            else:
                # TODO: We want to handle this case once we support series
                #       of mixed type values (which would be either expressed
                #       by its transpose or backed by a Pandas series).
                warnings.warn(
                    "Squeezing a dataframe on both axes is currently "
                    "unsupported unless the size is 1. Squeeze for axis=0 "
                    "will be ignored.")

        return result
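The stock-pandas behavior being mirrored, for reference (sample frames assumed); note the version above only squeezes axis 0 when the frame has exactly one element:

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})
print(type(df.squeeze(axis=1)))            # one column -> Series
print(pd.DataFrame({"a": [7]}).squeeze())  # 1x1 frame -> scalar 7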
Example #13
    def __getattr__(self, key):
        try:
            return object.__getattribute__(self, key)
        except AttributeError as e:
            if hasattr(pandas.Series, key):
                raise err._unsupported_error(
                    f"Series.{key} is not yet implemented in Legate Pandas.")
            raise e
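The same delegation pattern in isolation, stripped of the Legate specifics; SeriesShim and the error types are illustrative:

import pandas

class SeriesShim:
    def __getattr__(self, key):
        # Reached only after normal lookup fails; consulting pandas.Series
        # distinguishes "not yet implemented" from a plain typo.
        if hasattr(pandas.Series, key):
            raise NotImplementedError(f"Series.{key} is not yet implemented")
        raise AttributeError(key)

s = SeriesShim()
# s.interpolate   -> NotImplementedError (exists on pandas.Series)
# s.not_a_method  -> AttributeError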
Example #14
def convert_agg_func(agg_func):
    if isinstance(agg_func, str):
        if agg_func not in _SUPPORTED_AGGS:
            raise err._unsupported_error(
                f"Unsupported aggregation method: {agg_func}")
        return (agg_func, _NUMERIC_ONLY[agg_func])
    elif is_dict_like(agg_func):
        converted = {}
        for col, func in agg_func.items():
            funcs = util.to_list_if_scalar(convert_agg_func(func))
            converted[col] = funcs
        return converted
    elif is_list_like(agg_func):
        return [convert_agg_func(func) for func in agg_func]
    else:
        raise err._unsupported_error(
            f"Unsupported aggregation descriptor: {agg_func}")
Example #15
    def sort_index(
        self,
        axis=0,
        level=None,
        ascending=True,
        inplace=False,
        kind="quicksort",
        na_position="last",
        sort_remaining=True,
        ignore_index: bool = False,
    ):
        axis = self._get_axis_number(axis)
        if axis not in (0, ):
            raise err._unsupported_error("axis", axis)

        nlevels = self._raw_index.nlevels
        if nlevels == 1:
            # Pandas ignores level and sort_remaining for single-level indices,
            levels = [0] if level is None else util.to_list_if_scalar(level)
            # and it casts ascending to a boolean value...
            ascending = [bool(ascending)]
        else:
            if level is None:
                levels = list(range(nlevels))
                # When level is None, Pandas crops the ascending list
                # to match its length to the number of levels...
                ascending = self._get_ascending(ascending, nlevels)[:nlevels]
            else:
                levels = util.to_list_if_scalar(level)
                levels = [
                    self._raw_index._get_level_number(lvl) for lvl in levels
                ]
                default_asc = bool(ascending)
                ascending = self._get_ascending(ascending, len(levels))
                if len(ascending) != len(levels):
                    raise ValueError(
                        "level must have same length as ascending")
                # XXX: Pandas ignores sort_remaining for multi-level indices
                #      (GH #24247), and always sorts the levels monotonically
                #      before the actual sorting...
                #      Here we do the right thing and hopefully Pandas fixes
                #      its bug in the future.
                if sort_remaining:
                    already_added = set(levels)
                    for lvl in range(nlevels):
                        if lvl not in already_added:
                            levels.append(lvl)
                            ascending.append(default_asc)

        new_frame = self._frame.sort_index(
            axis=axis,
            levels=levels,
            ascending=ascending,
            kind=kind,
            na_position=na_position,
            ignore_index=ignore_index,
        )
        return self._create_or_update_frame(new_frame, inplace)
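For reference, the stock-pandas call whose semantics the level/ascending plumbing above reproduces (sample index assumed):

import pandas as pd

idx = pd.MultiIndex.from_product([[1, 0], ["b", "a"]], names=["x", "y"])
s = pd.Series(range(4), index=idx)
# Sort on level "y" first, then let the remaining level "x" break ties.
print(s.sort_index(level="y", sort_remaining=True))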
Example #16
    def binary_op(self, op, other):
        reverse = False
        if op in _REVERSED_OPS:
            op = op[1:]
            reverse = True

        # Perform binary operation
        rhs1 = self._columns
        if is_scalar(other):
            other = self._runtime.create_scalar(other, ty.infer_dtype(other))
            rhs2 = [other] * len(rhs1)
        else:
            rhs2 = other._columns

        results = []
        for rh1, rh2 in zip(rhs1, rhs2):
            # If the right operand is integer, we convert it to the left
            # operand's dtype
            if isinstance(rh2, Scalar):
                if ty.is_integer_dtype(rh2.dtype):
                    rh2 = rh2.astype(rh1.dtype)
                elif ty.is_categorical_dtype(rh1.dtype):
                    rh2 = rh1.dtype.encode(rh2, unwrap=False, can_fail=True)
                else:
                    common_dtype = ty.find_common_dtype(rh1.dtype, rh2.dtype)
                    rh1 = rh1.astype(common_dtype)
                    rh2 = rh2.astype(common_dtype)

            elif not (
                ty.is_categorical_dtype(rh1.dtype)
                or ty.is_categorical_dtype(rh2.dtype)
            ):
                common_dtype = ty.find_common_dtype(rh1.dtype, rh2.dtype)
                rh1 = rh1.astype(common_dtype)
                rh2 = rh2.astype(common_dtype)

            lh_dtype = ty.get_binop_result_type(op, rh1.dtype, rh2.dtype)

            if ty.is_string_dtype(rh1.dtype) and op in (
                "add",
                "mul",
            ):
                raise err._unsupported_error(
                    f"unsupported operand type(s) for {op}: "
                    f"'{rh1.dtype}' and '{rh2.dtype}'"
                )

            if reverse:
                rh1, rh2 = rh2, rh1

            swapped = False
            if isinstance(rh1, Scalar):
                rh1, rh2 = rh2, rh1
                swapped = True

            results.append(rh1.binary_op(op, rh2, lh_dtype, swapped=swapped))

        return Table(self._runtime, self._index, results)
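The promotion rule the loop applies, in isolation: both operands are cast to a common dtype before the kernel runs. numpy's promote_types stands in for ty.find_common_dtype here:

import numpy as np

a = np.array([1, 2, 3], dtype=np.int32)
b = np.array([0.5, 1.5, 2.5], dtype=np.float64)
common = np.promote_types(a.dtype, b.dtype)         # float64
print((a.astype(common) + b.astype(common)).dtype)  # float64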
Example #17
    def count(self, axis=0, level=None, numeric_only=False):
        axis = self._get_axis_number(axis) if axis is not None else 0
        if numeric_only not in (
                None,
                False,
        ):
            raise err._unsupported_error("numeric_only", numeric_only)
        return self._unary_reduction(
            [("count", numeric_only)],
            axis=axis,
            level=level,
        )
Example #18
    def from_stores(type, stores, children=None):
        from .bitmask import Bitmask
        from .runtime import _runtime as rt

        if children is not None:
            raise err._unsupported_error("Only accept flat stores for now")

        slices = [Column._import_store(rt, store) for store in stores]

        if len(stores) > 2:
            raise err._unsupported_error(
                f"Unsupported Legate Array type: {type}")

        dtype = ty.to_legate_dtype(type)

        assert dtype == slices[1].dtype

        bitmask = None if slices[0] is None else Bitmask(rt, slices[0])
        return Column(rt, slices[1], bitmask)
Example #19
    def _import_store(rt, store):
        if store is None:
            return None
        kind = store.kind

        if kind not in ((Region, FieldID), (Region, int)):
            raise err._unsupported_error(
                f"Unsupported Legate Store kind: {kind}")

        (region, fid) = store.storage

        if region.index_space.get_dim() != 1:
            raise err._unsupported_error("All Legate Arrays must be 1-D")

        dtype = ty.to_legate_dtype(store.type)
        if kind[1] is FieldID:
            fid = fid.fid

        storage = rt._create_external_storage(region)
        return storage.import_field(region, fid, dtype)
Example #20
    def from_pandas(cls, runtime, dtype):
        if dtype.categories.dtype != object:
            raise err._unsupported_error("Categories must be strings for now")
        categories_storage = runtime.create_storage(len(dtype.categories))
        categories_column = runtime._create_string_column_from_pandas(
            categories_storage,
            dtype.categories,
            num_pieces=1,
        ).as_replicated_column()

        return cls(categories_column, dtype.ordered)
Example #21
    def to_csv(
        self,
        path_or_buf=None,
        sep=",",
        na_rep="",
        columns=None,
        header=True,
        index=True,
        line_terminator=None,
        chunksize=None,
        partition=False,
    ):
        if not isinstance(path_or_buf, str):
            raise err._unsupported_error("path must be a string for now")

        if len(sep) != 1:
            raise err._unsupported_error("separator must be a character")

        line_terminator = (os.linesep
                           if line_terminator is None else line_terminator)

        # The default chunk size is 8
        chunksize = 8 if chunksize is None else chunksize

        new_self = self
        if columns is not None:
            new_self = self[util.to_list_if_scalar(columns)]

        new_self._frame.to_csv(
            path=path_or_buf,
            sep=sep,
            na_rep=na_rep,
            header=header,
            index=index,
            line_terminator=line_terminator,
            chunksize=chunksize,
            partition=partition,
            column_names=new_self.columns.to_list(),
        )
Example #22
def find_common_dtype(dtype1, dtype2):
    if is_categorical_dtype(dtype1) and is_categorical_dtype(dtype2):
        from legate.pandas.common import errors as err

        raise err._unsupported_error(
            "categorical dtypes are not supported yet")

    if dtype1 == dtype2:
        return dtype1
    else:
        return to_legate_dtype(
            np.find_common_type(
                [dtype1.to_pandas(), dtype2.to_pandas()], []))
Example #23
    def __setitem__(self, key, item):
        (row_loc, row_scalar, _) = self._validate_locator(key)

        sr = self.sr

        self._validate_lhs(sr)

        if row_scalar:
            row_loc = sr._raw_index == row_loc

            index = sr._frame.slice_index_by_boolean_mask(row_loc)

            item = self._align_rhs(sr, index, item)

            result = sr._frame.scatter_by_boolean_mask(row_loc, index, item)

        elif isinstance(row_loc, slice):
            if row_loc == slice(None):
                index = sr._frame._index

                item = self._align_rhs(sr, index, item)

                result = item

            else:
                (index, bounds) = sr._frame.slice_index_by_slice(row_loc, True)

                item = self._align_rhs(sr, index, item)

                result = sr._frame.scatter_by_slice(index, bounds, item)

        else:
            row_loc = sr._ensure_valid_frame(row_loc)
            _, row_loc = sr._align_frame(row_loc, join="left", axis=0)

            if not row_loc._is_series:
                raise ValueError("indexer must be 1-dimensional")

            if not is_bool_dtype(row_loc.dtype):
                raise err._unsupported_error(
                    "only boolean indexers are supported now")

            row_loc = row_loc._frame

            index = sr._frame.slice_index_by_boolean_mask(row_loc)

            item = self._align_rhs(sr, index, item)

            result = sr._frame.scatter_by_boolean_mask(row_loc, index, item)

        self.update_column(result)
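The three assignment paths above (scalar label, slice, boolean mask) mirror these stock-pandas statements, with sample data assumed:

import pandas as pd

s = pd.Series([1, 2, 3], index=["a", "b", "c"])
s["a"] = 10       # scalar-label path
s[:] = [7, 8, 9]  # full-slice path replaces every value
s[s > 7] = 0      # boolean-mask path scatters where the mask holds
print(s)          # a=7, b=0, c=0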