Пример #1
0
def maybe_mangle_lambdas(agg_spec: Any) -> Any:
    """
    Make new lambdas with unique names.

    Parameters
    ----------
    agg_spec : Any
        An argument to GroupBy.agg.
        `agg_spec` that is neither dict-like nor list-like is passed
        through as is.
        For dict-like `agg_spec` a new spec of the same type is returned in
        which every value that is list-like (but not dict-like) has been
        run through ``_managle_lambda_list``; all other values are carried
        over unchanged.
        Any other list-like `agg_spec` is itself handed to
        ``_managle_lambda_list``.

    Returns
    -------
    mangled : Any
        For dict-like input, a new instance of the input's type.
        For other list-like input, whatever ``_managle_lambda_list``
        returns. Otherwise the input object itself.

    Examples
    --------
    >>> maybe_mangle_lambdas('sum')
    'sum'
    >>> maybe_mangle_lambdas([lambda: 1, lambda: 2])  # doctest: +SKIP
    [<function __main__.<lambda_0>,
     <function pandas...._make_lambda.<locals>.f(*args, **kwargs)>]
    """
    is_dict = is_dict_like(agg_spec)
    if not (is_dict or is_list_like(agg_spec)):
        return agg_spec

    if not is_dict:
        # Plain list-likes are mangled directly. No empty container of the
        # input's type is built here: list-likes such as generators or
        # ndarrays cannot be instantiated without arguments, and the
        # container would be discarded anyway.
        return _managle_lambda_list(agg_spec)

    mangled_aggspec = type(agg_spec)()  # dict or OrderedDict
    for key, aggfuncs in agg_spec.items():
        if is_list_like(aggfuncs) and not is_dict_like(aggfuncs):
            mangled_aggfuncs = _managle_lambda_list(aggfuncs)
        else:
            mangled_aggfuncs = aggfuncs

        mangled_aggspec[key] = mangled_aggfuncs

    return mangled_aggspec
Пример #2
0
    def _get_empty_meta(
        self, columns, index_col, index_names, dtype: DtypeArg | None = None
    ):
        """
        Build the index, column labels and empty columns of an empty result.

        Parameters
        ----------
        columns : iterable
            Column labels; copied into a new list before anything is removed.
        index_col : list, None or False
            Positions within ``columns`` of the index columns. When it is
            used it is sorted in place.
        index_names : sequence or None
            Labels of the index levels.
        dtype : DtypeArg or None, default None
            Either a dict-like keyed by column label or integer position,
            or a single value used for every column. Columns without an
            entry fall back to ``object``.

        Returns
        -------
        tuple
            ``(index, columns, col_dict)`` where ``columns`` no longer
            contains the index columns and ``col_dict`` maps every remaining
            label to an empty Series.
        """
        columns = list(columns)

        # Normalise `dtype` into a defaultdict so that any label can be
        # looked up below without a KeyError.
        dtype_lookup: defaultdict
        if is_dict_like(dtype):
            dtype = cast(dict, dtype)
            explicit = {}
            for key, value in dtype.items():
                # integer keys are positions and are translated to labels
                label = columns[key] if is_integer(key) else key
                explicit[label] = value
            dtype_lookup = defaultdict(lambda: object, explicit)
        else:
            # a missing (or otherwise falsy) dtype means object
            fallback = dtype or object
            dtype_lookup = defaultdict(lambda: fallback)

        # Even without data the index may still be, for example, an empty
        # MultiIndex. It can only be constructed when both the index
        # positions (index_col) and the index labels (index_names) are
        # given; otherwise a generic empty Index is used.
        lacks_index_info = (
            index_col is None or index_col is False or index_names is None
        )
        if lacks_index_info:
            index = Index([])
        else:
            level_data = [
                Series([], dtype=dtype_lookup[name]) for name in index_names
            ]
            index = ensure_index_from_sequences(level_data, names=index_names)

            # NOTE: this sorts the list object passed in by the caller.
            index_col.sort()
            # every earlier removal shifts the later positions left by one
            for removed, position in enumerate(index_col):
                columns.pop(position - removed)

        col_dict = {}
        for col_name in columns:
            col_dict[col_name] = Series([], dtype=dtype_lookup[col_name])

        return index, columns, col_dict
Пример #3
0
    def _get_empty_meta(self,
                        columns,
                        index_col,
                        index_names,
                        dtype: DtypeArg | None = None):
        """
        Assemble the index, column labels and empty columns of an empty frame.

        Parameters
        ----------
        columns : iterable
            Column labels; a list copy is made before index columns are
            dropped from it.
        index_col : list, None or False
            Positions within ``columns`` of the index columns. Sorted in
            place when it is used.
        index_names : sequence or None
            Labels of the index levels.
        dtype : DtypeArg or None, default None
            A dict-like keyed by label or integer position, or one value
            applied to all columns. ``object`` is the fallback.

        Returns
        -------
        tuple
            ``(index, columns, col_dict)``; ``col_dict`` maps each remaining
            column label to an empty Series.
        """
        columns = list(columns)

        # Wrap `dtype` in a defaultdict so that `resolved[label]` never
        # raises KeyError further down.
        resolved: defaultdict[Hashable, Any]
        if is_dict_like(dtype):
            dtype = cast(dict, dtype)
            by_label = {
                (columns[key] if is_integer(key) else key): value
                for key, value in dtype.items()
            }
            resolved = defaultdict(lambda: object, by_label)
        else:
            # a missing (or otherwise falsy) dtype means object
            default_dtype = dtype or object
            resolved = defaultdict(lambda: default_dtype)

        # An empty frame may still carry, say, an empty MultiIndex. Building
        # one needs both the index positions (index_col) and the index
        # labels (index_names); with either missing, fall back to a generic
        # empty Index.
        can_build_index = (index_col is not None and index_col is not False
                           and index_names is not None)
        if can_build_index:
            levels = [Series([], dtype=resolved[name]) for name in index_names]
            index = ensure_index_from_sequences(levels, names=index_names)

            # NOTE: sorts the caller's list object in place.
            index_col.sort()
            # positions shift left by one for every column already removed
            for already_removed, position in enumerate(index_col):
                columns.pop(position - already_removed)
        else:
            index = Index([])

        col_dict = {}
        for label in columns:
            col_dict[label] = Series([], dtype=resolved[label])

        return index, columns, col_dict
Пример #4
0
def aggregate(
    obj,
    arg: AggFuncType,
    *args,
    **kwargs,
):
    """
    Dispatch an aggregation request according to the kind of ``arg``.

    Parameters
    ----------
    obj : Pandas object to compute aggregation on.
    arg : string, dict-like, list-like or callable.
    *args : positional arguments, forwarded to the string dispatch.
    **kwargs : keyword arguments; ``_axis`` is popped off first, the rest
        is forwarded to the string dispatch.

    Returns
    -------
    tuple of result, how.

    Notes
    -----
    ``how`` is ``None`` on the string, list-like and cython-function paths.
    It is ``True`` on the dict-like path and on the fallback path, where
    ``result`` is ``None`` so that the caller can react.
    """
    axis = kwargs.pop("_axis", None)
    if axis is None:
        axis = getattr(obj, "axis", 0)

    if isinstance(arg, str):
        return obj._try_aggregate_string_function(arg, *args, **kwargs), None

    if is_dict_like(arg):
        arg = cast(Dict[Label, Union[AggFuncTypeBase, List[AggFuncTypeBase]]],
                   arg)
        return agg_dict_like(obj, arg, axis), True

    if is_list_like(arg):
        # strings never get here; they were dispatched above
        arg = cast(List[AggFuncTypeBase], arg)
        return agg_list_like(obj, arg, _axis=axis), None

    if callable(arg):
        cython_name = obj._get_cython_func(arg)
        # the shortcut only applies when no extra arguments were supplied
        if cython_name and not (args or kwargs):
            return getattr(obj, cython_name)(), None

    # nothing matched: hand control back to the caller
    return None, True
Пример #5
0
    def _validate_parse_dates_presence(self, columns: list[str]) -> None:
        """
        Ensure every column named in ``parse_dates`` exists in ``columns``.

        Only references given as strings are checked; any other kind of
        reference (for example an integer) is skipped.

        Parameters
        ----------
        columns : list
            List of names of the dataframe.

        Raises
        ------
        ValueError
            If a column named in ``parse_dates`` is not in ``columns``.

        """
        parse_dates = self.parse_dates

        cols_needed: Iterable
        if is_dict_like(parse_dates):
            # every value of the mapping is treated as a group of references
            cols_needed = itertools.chain.from_iterable(parse_dates.values())
        elif is_list_like(parse_dates):
            # Entries are either one reference or a group of references:
            # ColReference = Union[int, str]
            # DateGroups = List[ColReference]
            # ParseDates = Union[DateGroups, List[DateGroups],
            #     Dict[ColReference, DateGroups]]
            # Tuples count as a single reference and are not flattened.
            cols_needed = []
            for entry in parse_dates:
                if is_list_like(entry) and not isinstance(entry, tuple):
                    cols_needed.extend(entry)
                else:
                    cols_needed.append(entry)
        else:
            cols_needed = []

        # keep only references made by name (str), not by position
        absent = set()
        for col in cols_needed:
            if isinstance(col, str) and col not in columns:
                absent.add(col)

        missing_cols = ", ".join(sorted(absent))
        if missing_cols:
            raise ValueError(
                f"Missing column provided to 'parse_dates': '{missing_cols}'"
            )
Пример #6
0
    def rename(self, index=None, **kwargs):
        """
        Change the name of this object or relabel its index.

        A scalar ``index``, or a list-like one that is not dict-like, is
        taken as the new name: with ``inplace=True`` the name is set on
        ``self`` and nothing is returned, otherwise a renamed copy is
        returned. Any other ``index`` is forwarded, together with
        ``kwargs``, to ``DataFrame.rename``; the squeezed result gets this
        object's name and is returned.
        """
        if is_scalar(index):
            is_new_name = True
        else:
            is_new_name = is_list_like(index) and not is_dict_like(index)

        if not is_new_name:
            from .dataframe import DataFrame

            relabelled = DataFrame(self).rename(index=index, **kwargs).squeeze()
            relabelled.name = self.name
            return relabelled

        if kwargs.get("inplace", False):
            self.name = index
            return None

        duplicate = self.copy()
        duplicate.name = index
        return duplicate
Пример #7
0
    def apply(self) -> FrameOrSeriesUnion:
        """
        Compute the result of applying ``self.f`` to ``self.obj``.

        The checks run in this order: list-like or dict-like ``f`` (handed
        to ``aggregate``), completely empty object, string ``f`` (looked up
        as a method of ``self.obj``), ufunc ``f``, broadcast result type,
        one empty axis, raw mode; the standard apply is the fallback.
        """
        func = self.f

        # dispatch to agg
        if is_list_like(func) or is_dict_like(func):
            # pandas\core\apply.py:144: error: "aggregate" of "DataFrame" gets
            # multiple values for keyword argument "axis"
            return self.obj.aggregate(  # type: ignore[misc]
                func, *self.args, axis=self.axis, **self.kwds
            )

        # all empty
        if not (len(self.columns) or len(self.index)):
            return self.apply_empty_result()

        # string dispatch
        if isinstance(func, str):
            # Support for `frame.transform('method')`
            # Only some methods (shift, etc.) accept an axis argument, so
            # it is inserted only when the signature lists it.
            method = getattr(self.obj, func)
            if "axis" in inspect.getfullargspec(method).args:
                self.kwds["axis"] = self.axis
            return method(*self.args, **self.kwds)

        # ufunc
        if isinstance(func, np.ufunc):
            with np.errstate(all="ignore"):
                results = self.obj._mgr.apply("apply", func=func)
            # _constructor will retain self.index and self.columns
            return self.obj._constructor(data=results)

        # broadcasting
        if self.result_type == "broadcast":
            return self.apply_broadcast(self.obj)

        # one axis empty
        if not all(self.obj.shape):
            return self.apply_empty_result()

        # raw
        if self.raw:
            return self.apply_raw()

        return self.apply_standard()
Пример #8
0
    def get_result(self):
        """
        Compute the result of applying ``self.f`` to ``self.obj``.

        The checks run in this order: list-like or dict-like ``f`` (handed
        to ``aggregate``), completely empty object, string ``f`` (looked up
        as a method of ``self.obj``), ufunc ``f``, broadcast result type,
        one empty axis, raw mode on a non-mixed object; the standard apply
        is the fallback.
        """
        func = self.f

        # dispatch to agg
        if is_list_like(func) or is_dict_like(func):
            return self.obj.aggregate(
                func, *self.args, axis=self.axis, **self.kwds
            )

        # all empty
        if not (len(self.columns) or len(self.index)):
            return self.apply_empty_result()

        # string dispatch
        if isinstance(func, str):
            # Support for `frame.transform('method')`
            # Only some methods (shift, etc.) accept an axis argument, so
            # it is inserted only when the signature lists it.
            method = getattr(self.obj, func)
            if "axis" in inspect.getfullargspec(method).args:
                self.kwds["axis"] = self.axis
            return method(*self.args, **self.kwds)

        # ufunc
        if isinstance(func, np.ufunc):
            with np.errstate(all="ignore"):
                results = self.obj._data.apply("apply", func=func)
            return self.obj._constructor(
                data=results,
                index=self.index,
                columns=self.columns,
                copy=False,
            )

        # broadcasting
        if self.result_type == "broadcast":
            return self.apply_broadcast()

        # one axis empty
        if not all(self.obj.shape):
            return self.apply_empty_result()

        # raw
        if self.raw and not self.obj._is_mixed_type:
            return self.apply_raw()

        return self.apply_standard()
    def normalize_dictlike_arg(
        self, how: str, obj: FrameOrSeriesUnion, func: AggFuncTypeDict
    ) -> AggFuncTypeDict:
        """
        Validate and normalize a dict-like argument.

        Raises ``SpecificationError`` for a nested renamer and, when
        ``obj`` is not one-dimensional, ``KeyError`` for keys that are not
        columns of ``obj``. If at least one value is a list, tuple or dict,
        a new dict is returned in which every other value is wrapped in a
        one-element list; otherwise ``func`` is returned unchanged.
        """
        assert how in ("apply", "agg", "transform")

        def _is_aggregator(value) -> bool:
            return isinstance(value, (list, tuple, dict))

        # func.items() is used throughout instead of func.values(), which
        # wouldn't work for a Series
        series_agg_with_lists = (
            how == "agg"
            and isinstance(obj, ABCSeries)
            and any(is_list_like(value) for _, value in func.items())
        )
        if series_agg_with_lists or any(
            is_dict_like(value) for _, value in func.items()
        ):
            # GH 15931 - deprecation of renaming keys
            raise SpecificationError("nested renamer is not supported")

        if obj.ndim != 1:
            # Check for missing columns on a frame
            missing = set(func.keys()) - set(obj.columns)
            if missing:
                cols_sorted = list(safe_sort(list(missing)))
                raise KeyError(f"Column(s) {cols_sorted} do not exist")

        # no value is a list, tuple or dict: nothing to normalize
        if not any(_is_aggregator(value) for _, value in func.items()):
            return func

        # e.g. {'A': ['mean'], 'B': 'sum'} -> {'A': ['mean'], 'B': ['sum']}
        normalized: AggFuncTypeDict = {}
        for key, value in func.items():
            if _is_aggregator(value):
                normalized[key] = value
            else:
                # mypy can't realize value is not a list here
                normalized[key] = [value]  # type:ignore[list-item]
        return normalized
Пример #10
0
    def agg(self) -> Tuple[Optional[FrameOrSeriesUnion], Optional[bool]]:
        """
        Dispatch the aggregation according to the kind of ``self.f``.

        Returns
        -------
        tuple of result, how.

        Notes
        -----
        ``how`` is ``None`` on the string, list-like and cython-function
        paths. It is ``True`` on the dict-like path and on the fallback
        path, where ``result`` is ``None`` so that the caller can react.
        """
        target = self.obj
        spec = self.f
        extra_args = self.args
        extra_kwargs = self.kwds

        # `_axis` is removed from the stored keyword arguments before
        # anything else looks at them
        axis = extra_kwargs.pop("_axis", None)
        if axis is None:
            axis = getattr(target, "axis", 0)

        str_result = self.maybe_apply_str()
        if str_result is not None:
            return str_result, None

        if is_dict_like(spec):
            spec = cast(AggFuncTypeDict, spec)
            return agg_dict_like(target, spec, axis), True

        if is_list_like(spec):
            # we require a list, but not a 'str'
            spec = cast(List[AggFuncTypeBase], spec)
            return agg_list_like(target, spec, _axis=axis), None

        if callable(spec):
            cython_name = target._get_cython_func(spec)
            # the shortcut only applies when no extra arguments were supplied
            if cython_name and not (extra_args or extra_kwargs):
                return getattr(target, cython_name)(), None

        # caller can react
        return None, True
Пример #11
0
    def get_result(self):
        """
        Compute the result of applying ``self.f`` to ``self.obj``.

        The checks run in this order: list-like or dict-like ``f`` (handed
        to ``aggregate``), completely empty object, string ``f`` (looked up
        as a method of ``self.obj``), ufunc ``f`` (called on
        ``self.values``), broadcast result type, one empty axis, raw mode on
        a non-mixed object; the standard apply is the fallback.
        """
        func = self.f

        # dispatch to agg
        if is_list_like(func) or is_dict_like(func):
            return self.obj.aggregate(func, *self.args, axis=self.axis,
                                      **self.kwds)

        # all empty
        if not (len(self.columns) or len(self.index)):
            return self.apply_empty_result()

        # string dispatch
        if isinstance(func, compat.string_types):
            # Support for `frame.transform('method')`
            # Only some methods (shift, etc.) accept an axis argument, so
            # it is inserted only when the signature lists it.
            method = getattr(self.obj, func)
            if 'axis' in compat.signature(method).args:
                self.kwds['axis'] = self.axis
            return method(*self.args, **self.kwds)

        # ufunc
        if isinstance(func, np.ufunc):
            with np.errstate(all='ignore'):
                results = func(self.values)
            return self.obj._constructor(data=results, index=self.index,
                                         columns=self.columns, copy=False)

        # broadcasting
        if self.result_type == 'broadcast':
            return self.apply_broadcast()

        # one axis empty
        if not all(self.obj.shape):
            return self.apply_empty_result()

        # raw
        if self.raw and not self.obj._is_mixed_type:
            return self.apply_raw()

        return self.apply_standard()
Пример #12
0
def transform_dict_like(
    obj: FrameOrSeries,
    func: AggFuncTypeDict,
    *args,
    **kwargs,
):
    """
    Compute transform in the case of a dict-like func.

    Each key of ``func`` selects a one-dimensional piece of ``obj`` which is
    transformed with the matching value; the per-key results are
    concatenated along axis 1.

    Raises
    ------
    ValueError
        If ``func`` is empty, or if no key produced a result.
    SpecificationError
        If ``obj`` is not one-dimensional and a key is not one of its
        columns, or if any value of ``func`` is itself dict-like.
    """
    from pandas.core.reshape.concat import concat

    # len() rather than truthiness: func may be a Series
    if len(func) == 0:
        raise ValueError("No transform functions were provided")

    if obj.ndim != 1:
        # Check for missing columns on a frame
        missing = set(func.keys()) - set(obj.columns)
        if missing:
            cols_sorted = list(safe_sort(list(missing)))
            raise SpecificationError(f"Column(s) {cols_sorted} do not exist")

    # Can't use func.values(); wouldn't work for a Series
    for _, how in func.items():
        if is_dict_like(how):
            # GH 15931 - deprecation of renaming keys
            raise SpecificationError("nested renamer is not supported")

    # Only failures carrying one of these messages are propagated; any
    # other exception raised for a key just leaves that key out.
    propagated_messages = (
        "Function did not transform",
        "No transform functions were provided",
    )

    results: Dict[Hashable, FrameOrSeriesUnion] = {}
    for name, how in func.items():
        colg = obj._gotitem(name, ndim=1)
        try:
            results[name] = transform(colg, how, 0, *args, **kwargs)
        except Exception as err:
            if str(err) in propagated_messages:
                raise err

    # combine results
    if not results:
        raise ValueError("Transform function failed")
    return concat(results, axis=1)
Пример #13
0
    def apply(self) -> FrameOrSeriesUnion:
        """
        Compute the result of applying ``self.f`` to ``self.obj``.

        The checks run in this order: list-like or dict-like ``f`` (handed
        to ``aggregate``), completely empty object, string dispatch through
        ``maybe_apply_str``, ufunc ``f``, broadcast result type, one empty
        axis, raw mode; the standard apply is the fallback.
        """
        func = self.f

        # dispatch to agg
        if is_list_like(func) or is_dict_like(func):
            # pandas\core\apply.py:144: error: "aggregate" of "DataFrame" gets
            # multiple values for keyword argument "axis"
            return self.obj.aggregate(  # type: ignore[misc]
                func, *self.args, axis=self.axis, **self.kwds
            )

        # all empty
        if not (len(self.columns) or len(self.index)):
            return self.apply_empty_result()

        # string dispatch
        str_result = self.maybe_apply_str()
        if str_result is not None:
            return str_result

        # ufunc
        if isinstance(self.f, np.ufunc):
            with np.errstate(all="ignore"):
                results = self.obj._mgr.apply("apply", func=self.f)
            # _constructor will retain self.index and self.columns
            return self.obj._constructor(data=results)

        # broadcasting
        if self.result_type == "broadcast":
            return self.apply_broadcast(self.obj)

        # one axis empty
        if not all(self.obj.shape):
            return self.apply_empty_result()

        # raw
        if self.raw:
            return self.apply_raw()

        return self.apply_standard()
Пример #14
0
    def validate_dictlike_arg(self, how: str, obj: FrameOrSeriesUnion,
                              func: AggFuncTypeDict) -> None:
        """
        Raise if dict-like argument is invalid.

        Raises ``SpecificationError`` for a nested renamer and, when
        ``obj`` is not one-dimensional, ``KeyError`` for keys of ``func``
        that are not columns of ``obj``.
        """
        assert how in ("apply", "agg", "transform")

        def _any_value(predicate) -> bool:
            # func.items() rather than func.values(), which wouldn't work
            # for a Series
            return any(predicate(value) for _, value in func.items())

        series_agg_with_lists = (how == "agg" and isinstance(obj, ABCSeries)
                                 and _any_value(is_list_like))
        if series_agg_with_lists or _any_value(is_dict_like):
            # GH 15931 - deprecation of renaming keys
            raise SpecificationError("nested renamer is not supported")

        if obj.ndim == 1:
            return

        # Check for missing columns on a frame
        missing = set(func.keys()) - set(obj.columns)
        if missing:
            cols_sorted = list(safe_sort(list(missing)))
            raise KeyError(f"Column(s) {cols_sorted} do not exist")
Пример #15
0
    def _map_values(self, mapper, na_action=None):
        """
        An internal function that maps values using the input
        correspondence (which can be a dict, Series, or function).

        Dict-like mappers are first turned into either a lookup function
        (dict subclasses defining ``__missing__``) or a Series. A Series
        mapper is then applied through an indexer lookup, or through the
        categorical ``map`` when ``self`` has a categorical dtype. Anything
        else is treated as a function and applied to the values.

        Parameters
        ----------
        mapper : function, dict, or Series
            The input correspondence object
        na_action : {None, 'ignore'}
            If 'ignore', propagate NA values, without passing them to the
            mapping function. Only consulted when ``mapper`` ends up being
            applied as a function.

        Returns
        -------
        Union[Index, MultiIndex], inferred
            The output of the mapping function applied to the index.
            If the function returns a tuple with more than one element
            a MultiIndex will be returned.
            NOTE(review): the code returns whatever the chosen branch
            produces (categorical ``map``, ``algorithms.take_nd`` or the
            element-wise mapping); confirm the documented type against
            the callers.

        Raises
        ------
        NotImplementedError
            If ``na_action`` is not None and the values are an extension
            array that provides its own ``map``.
        ValueError
            If ``na_action`` is neither 'ignore' nor None on the
            object-dtype path.
        """
        # we can fastpath dict/Series to an efficient map
        # as we know that we are not going to have to yield
        # python types
        if is_dict_like(mapper):
            if isinstance(mapper, dict) and hasattr(mapper, "__missing__"):
                # If a dictionary subclass defines a default value method,
                # convert mapper to a lookup function (GH #15999).
                # The lambda is handled by the function path at the bottom.
                dict_with_default = mapper
                mapper = lambda x: dict_with_default[x]
            else:
                # Dictionary does not have a default. Thus it's safe to
                # convert to a Series for efficiency.
                # we specify the keys here to handle the
                # possibility that they are tuples

                # The return value of mapping with an empty mapper is
                # expected to be pd.Series(np.nan, ...). As np.nan is
                # of dtype float64 the return value of this method should
                # be float64 as well
                mapper = create_series_with_explicit_dtype(
                    mapper, dtype_if_empty=np.float64)

        if isinstance(mapper, ABCSeries):
            # Since values were input this means we came from either
            # a dict or a series and mapper should be an index
            if is_categorical_dtype(self.dtype):
                # use the built in categorical series mapper which saves
                # time by mapping the categories instead of all values

                # error: Incompatible types in assignment (expression has type
                # "Categorical", variable has type "IndexOpsMixin")
                self = cast("Categorical", self)  # type: ignore[assignment]
                # error: Item "ExtensionArray" of "Union[ExtensionArray, Any]" has no
                # attribute "map"
                return self._values.map(mapper)  # type: ignore[union-attr]

            values = self._values

            # Locate each value in the mapper's index, then take the
            # mapper's values at those positions.
            indexer = mapper.index.get_indexer(values)
            new_values = algorithms.take_nd(mapper._values, indexer)

            return new_values

        # From here on `mapper` is applied as a function.
        # we must convert to python types
        if is_extension_array_dtype(self.dtype) and hasattr(
                self._values, "map"):
            # GH#23179 some EAs do not have `map`
            values = self._values
            if na_action is not None:
                raise NotImplementedError
            map_f = lambda values, f: values.map(f)
        else:
            # Fall back to an object-dtype copy of the values.
            # error: "IndexOpsMixin" has no attribute "astype"
            values = self.astype(object)._values  # type: ignore[attr-defined]
            if na_action == "ignore":
                # the mask marks NA positions, passed as uint8
                map_f = lambda values, f: lib.map_infer_mask(
                    values, f,
                    isna(values).view(np.uint8))
            elif na_action is None:
                map_f = lib.map_infer
            else:
                msg = ("na_action must either be 'ignore' or None, "
                       f"{na_action} was passed")
                raise ValueError(msg)

        # mapper is a function
        new_values = map_f(values, mapper)

        return new_values
Пример #16
0
def transform(
    obj: FrameOrSeries, func: AggFuncType, axis: Axis, *args, **kwargs
) -> FrameOrSeriesUnion:
    """
    Transform a DataFrame or Series.

    Parameters
    ----------
    obj : DataFrame or Series
        Object to compute the transform on.
    func : string, function, list, or dictionary
        Function(s) to compute the transform with.
    axis : {0 or 'index', 1 or 'columns'}
        Axis along which the function is applied:

        * 0 or 'index': apply function to each column.
        * 1 or 'columns': apply function to each row.
    *args, **kwargs
        Forwarded unchanged to ``transform_dict_like`` or
        ``transform_str_or_callable``.

    Returns
    -------
    DataFrame or Series
        Result of applying ``func`` along the given axis of the
        Series or DataFrame.

    Raises
    ------
    ValueError
        If the transform function fails or does not transform. When the
        failure comes from an exception raised while applying ``func``,
        that exception is attached as ``__cause__``.
    """
    is_series = obj.ndim == 1

    if obj._get_axis_number(axis) == 1:
        # Row-wise transform: transpose, transform column-wise, transpose back.
        assert not is_series
        return transform(obj.T, func, 0, *args, **kwargs).T

    if is_list_like(func) and not is_dict_like(func):
        func = cast(List[AggFuncTypeBase], func)
        # Convert func to the equivalent dict so that a single code path
        # (transform_dict_like) handles both lists and dicts.
        if is_series:
            func = {com.get_callable_name(v) or v: v for v in func}
        else:
            func = {col: func for col in obj}

    if is_dict_like(func):
        func = cast(Dict[Label, Union[AggFuncTypeBase, List[AggFuncTypeBase]]], func)
        return transform_dict_like(obj, func, *args, **kwargs)

    # func is either str or callable
    func = cast(AggFuncTypeBase, func)
    try:
        result = transform_str_or_callable(obj, func, *args, **kwargs)
    except Exception as err:
        # Chain the original exception so the real cause stays visible.
        raise ValueError("Transform function failed") from err

    # Functions that transform may return empty Series/DataFrame
    # when the dtype is not appropriate. An empty result is only a
    # failure signal when the input itself was not empty.
    if (
        isinstance(result, (ABCSeries, ABCDataFrame))
        and result.empty
        and not obj.empty
    ):
        raise ValueError("Transform function failed")
    if not isinstance(result, (ABCSeries, ABCDataFrame)) or not result.index.equals(
        obj.index
    ):
        raise ValueError("Function did not transform")

    return result
Пример #17
0
    def check_syntax(self):
        """
        Validate the structure of ``self.data_configs``.

        The checks run in order; every failure is reported through
        ``logger.error``. All failures except the ``clip_outliers`` type
        check then call ``exit(0)``:

        1. ``input`` must define non-empty ``fact_train`` and ``fact_test``
           entries, and every value under ``input`` must be dict-like.
        2. ``process_sequence`` must be non-empty, and each process key it
           names must have a non-empty configuration.
        3. Each process must define a non-empty ``action_sequence`` that
           contains both ``get_data`` and ``result``.
        4. Each action key must have a configuration and must contain the
           name of a supported action as a substring.
        5. Action-specific checks for ``aggregations``, ``clip_outliers``,
           ``get_data``, ``replace_values`` and ``interaction_columns``.

        Returns
        -------
        None
        """
        # NOTE(review): exit(0) reports success to the calling shell even
        # though validation failed; kept as-is because callers may rely on it.
        possible_actions = [
            'aggregations', 'change_dtype', 'clip_outliers',
            'drop_columns', 'drop_rows', 'factorize_columns', 'get_data',
            'interaction_columns', 'kbins', 'one_hot_encoder',
            'onehot_encoding', 'pca', 'reduce_mem_usage',
            'remove_duplicate', 'replace_values', 'result',
            'select_columns', 'simple_impute', 'standardization'
        ]
        possible_interactions = [
            'add', 'subtract', 'subtract_positive', 'multiply',
            'divide', 'datetime', 'function'
        ]

        # check input
        # A missing 'input' section is treated like an empty one so that it
        # is reported through the normal error path instead of crashing.
        input_configs = self.data_configs.get('input') or {}
        if not input_configs.get('fact_train', {}) or not input_configs.get(
                'fact_test', {}):
            logger.error(
                'Both fact_train and fact_test must be configured for {}!'.
                format('input'))
            exit(0)
        if not all(is_dict_like(v) for v in input_configs.values()):
            logger.error(
                'items and nested items for input must be dictionaries!')
            exit(0)

        # check process_sequence
        process_sequence = self.data_configs.get('process_sequence', [])
        if not process_sequence:
            logger.error('process_sequence is mandantory!')
            exit(0)
        for process_key in process_sequence:
            process_configs = self.data_configs.get(process_key, {})
            if not process_configs:
                logger.error(
                    'There is no configruations for process_key {}!'.format(
                        process_key))
                exit(0)

            # check action sequence
            action_sequence = process_configs.get('action_sequence', [])
            if not action_sequence:
                logger.error(
                    'action_sequence is mandantory for process {}!'.format(
                        process_key))
                exit(0)

            if 'get_data' not in action_sequence or 'result' not in action_sequence:
                logger.error(
                    'get_data and result must be in action_sequence for {}'.
                    format(process_key))
                exit(0)

            # check actions
            for action_key in action_sequence:
                action_configs = process_configs.get(action_key)
                if action_configs is None:
                    logger.error('No {} configuration for {}'.format(
                        action_key, process_key))
                    exit(0)

                # Substring match: an action key only has to contain the
                # name of a supported action.
                if not any(name in action_key for name in possible_actions):
                    logger.error(
                        '{} is not supported. Only below actions are supported at the moment:{}'
                        .format(action_key, possible_actions))
                    exit(0)

                # check every action
                if 'aggregations' in action_key:
                    for action_config in action_configs:
                        groupby_cols = action_config.get('groupby', [])
                        if not groupby_cols:
                            logger.error("No columns for groupby field")
                            exit(0)
                        metrics_cols = action_config.get('metrics', {})
                        if not (metrics_cols
                                or action_config.get('count', False)
                                or action_config.get('percent', False)):
                            logger.error(
                                "There should be at least one of below three: columns for metrics field , count or percent "
                            )
                            exit(0)

                elif 'clip_outliers' in action_key:
                    # Check the configuration of THIS action with a real
                    # type test. This failure is only logged, not fatal.
                    if not isinstance(action_configs, list):
                        logger.error("clip_outliers should be a list!")

                elif 'get_data' in action_key:
                    if not all(is_dict_like(v) for v in action_configs):
                        logger.error(
                            'items in {} must be dictionaries!'.format(
                                action_key))
                        exit(0)

                elif 'replace_values' in action_key:
                    if not all(
                            is_dict_like(v) for v in action_configs.values()):
                        logger.error(
                            'items in replace_values must be dictionaries!')
                        exit(0)

                elif 'interaction_columns' in action_key:
                    for v in action_configs:
                        interaction = v.get('mode', None)
                        if interaction not in possible_interactions:
                            logger.error(
                                "interaction {} is not supported. Only below interactions are supported at the moment:{}"
                                .format(v, possible_interactions))
                            exit(0)
        return
    def transform(self) -> FrameOrSeriesUnion:
        """
        Apply the stored function(s) to the stored object as a transform.

        Reads ``obj``, ``orig_f``, ``axis``, ``args`` and ``kwargs`` from
        the instance.

        Returns
        -------
        DataFrame or Series
            Output of the transform; its index equals the index of
            ``self.obj``.

        Raises
        ------
        TypeError
            Propagated unchanged when raised while applying the function.
        ValueError
            If applying the function raises any other exception, if it
            yields an empty result for non-empty input, or if the result
            is not a Series/DataFrame sharing the input's index.
        """
        target = self.obj
        spec = self.orig_f
        which_axis = self.axis
        extra_args = self.args
        extra_kwargs = self.kwargs

        one_dimensional = target.ndim == 1

        if target._get_axis_number(which_axis) == 1:
            # Column-axis request: work on the transpose, then flip back.
            assert not one_dimensional
            transposed = target.T.transform(spec, 0, *extra_args, **extra_kwargs)
            return transposed.T

        if is_list_like(spec) and not is_dict_like(spec):
            func_list = cast(List[AggFuncTypeBase], spec)
            # Express the list as a dict so only one code path is needed.
            if one_dimensional:
                spec = {com.get_callable_name(fn) or fn: fn for fn in func_list}
            else:
                spec = {label: func_list for label in target}

        if is_dict_like(spec):
            return self.transform_dict_like(cast(AggFuncTypeDict, spec))

        # Anything left is a single string or callable.
        spec = cast(AggFuncTypeBase, spec)
        try:
            result = self.transform_str_or_callable(spec)
        except Exception as err:
            if isinstance(err, TypeError):
                # Type errors are surfaced to the caller untouched.
                raise
            raise ValueError("Transform function failed") from err

        is_pandas_result = isinstance(result, (ABCSeries, ABCDataFrame))

        # An empty result for non-empty input means the function could not
        # handle the data (e.g. inappropriate dtype).
        if is_pandas_result and result.empty and not target.empty:
            raise ValueError("Transform function failed")

        # A transform must hand back an object aligned with the input index.
        if not (is_pandas_result and result.index.equals(target.index)):
            raise ValueError("Function did not transform")

        return result
Пример #19
0
    def transform(self) -> DataFrame | Series:
        """
        Transform a DataFrame or Series.

        Returns
        -------
        DataFrame or Series
            Result of applying ``func`` along the given axis of the
            Series or DataFrame.

        Raises
        ------
        ValueError
            If the transform function fails or does not transform.
        """
        obj = self.obj
        func = self.orig_f
        axis = self.axis
        args = self.args
        kwargs = self.kwargs

        is_series = obj.ndim == 1

        if obj._get_axis_number(axis) == 1:
            assert not is_series
            return obj.T.transform(func, 0, *args, **kwargs).T

        if is_list_like(func) and not is_dict_like(func):
            func = cast(List[AggFuncTypeBase], func)
            # Convert func equivalent dict
            if is_series:
                func = {com.get_callable_name(v) or v: v for v in func}
            else:
                func = {col: func for col in obj}

        if is_dict_like(func):
            func = cast(AggFuncTypeDict, func)
            return self.transform_dict_like(func)

        # func is either str or callable
        func = cast(AggFuncTypeBase, func)
        try:
            result = self.transform_str_or_callable(func)
        except TypeError:
            raise
        except Exception as err:
            raise ValueError("Transform function failed") from err

        # Functions that transform may return empty Series/DataFrame
        # when the dtype is not appropriate
        if (isinstance(result, (ABCSeries, ABCDataFrame)) and result.empty
                and not obj.empty):
            raise ValueError("Transform function failed")
        # error: Argument 1 to "__get__" of "AxisProperty" has incompatible type
        # "Union[Series, DataFrame, GroupBy[Any], SeriesGroupBy,
        # DataFrameGroupBy, BaseWindow, Resampler]"; expected "Union[DataFrame,
        # Series]"
        if not isinstance(
                result, (ABCSeries, ABCDataFrame)) or not result.index.equals(
                    obj.index  # type:ignore[arg-type]
                ):
            raise ValueError("Function did not transform")

        return result
Пример #20
0
    def to_sql(self,
               frame,
               name,
               if_exists='fail',
               index=True,
               index_label=None,
               schema=None,
               chunksize=None,
               dtype=None,
               pkcs=None):
        """
        Write records stored in a DataFrame to a SQL database.

        Parameters
        ----------
        frame : DataFrame
        name : string
            Name of SQL table.
        if_exists : {'fail', 'replace', 'append'}, default 'fail'
            - fail: If table exists, do nothing.
            - replace: If table exists, drop it, recreate it, and insert data.
            - append: If table exists, insert data. Create if does not exist.
        index : boolean, default True
            Write DataFrame index as a column.
        index_label : string or sequence, default None
            Column label for index column(s). If None is given (default) and
            `index` is True, then the index names are used.
            A sequence should be given if the DataFrame uses MultiIndex.
        schema : string, default None
            Name of SQL schema in database to write to (if database flavor
            supports this). If specified, this overwrites the default
            schema of the SQLDatabase object.
        chunksize : int, default None
            If not None, then rows will be written in batches of this size at a
            time.  If None, all rows will be written at once.
        dtype : single type or dict of column name to SQL type, default None
            Optional specifying the datatype for columns. The SQL type should
            be a SQLAlchemy type. If all columns are of the same type, one
            single value can be used.
        pkcs : default None
            Forwarded unchanged to ``SQLTable_extend``.
            NOTE(review): presumably the primary key column(s) -- confirm
            against ``SQLTable_extend``.

        Raises
        ------
        ValueError
            If any entry of `dtype` is not a SQLAlchemy type.

        Warns
        -----
        UserWarning
            If, after writing, `name` is not found verbatim among the
            database's table names (only checked when `name` is neither
            all digits nor all lower case).
        """
        if dtype and not is_dict_like(dtype):
            # A single type applies to every column of the frame.
            dtype = {col_name: dtype for col_name in frame}

        if dtype is not None:
            from sqlalchemy.types import to_instance, TypeEngine
            for col, my_type in dtype.items():
                if not isinstance(to_instance(my_type), TypeEngine):
                    # str.format instead of "%": a tuple column label (e.g.
                    # from MultiIndex columns) would otherwise be unpacked
                    # as format arguments and raise TypeError.
                    raise ValueError('The type of {} is not a SQLAlchemy '
                                     'type '.format(col))

        table = SQLTable_extend(name,
                                self,
                                frame=frame,
                                index=index,
                                if_exists=if_exists,
                                index_label=index_label,
                                schema=schema,
                                dtype=dtype,
                                pkcs=pkcs)
        table.create()
        table.insert(chunksize)
        if (not name.isdigit() and not name.islower()):
            # check for potentially case sensitivity issues (GH7815)
            # Only check when name is not a number and name is not lower case
            engine = self.connectable.engine
            with self.connectable.connect() as conn:
                table_names = engine.table_names(
                    schema=schema or self.meta.schema,
                    connection=conn,
                )
            if name not in table_names:
                msg = (
                    "The provided table name '{0}' is not found exactly as "
                    "such in the database after writing the table, possibly "
                    "due to case sensitivity issues. Consider using lower "
                    "case table names.").format(name)
                warnings.warn(msg, UserWarning)