Exemplo n.º 1
0
def _select(__data, *args, **kwargs):
    """Restrict (and optionally relabel) the columns of the last query op.

    Positional ``args`` are column selections; any callable argument is first
    called with a fresh ``VarList`` and its result used in its place. The
    selections are resolved with ``var_select`` against the keys of the
    columns exposed by ``__data.last_op``.

    Returns the result of ``__data.append_op`` applied to the last op narrowed
    to the selected columns. A column whose resolved value is not ``None`` is
    relabelled to that value.

    Raises:
        NotImplementedError: if any keyword argument is supplied.
    """
    # see https://stackoverflow.com/questions/25914329/rearrange-columns-in-sqlalchemy-select-object
    if kwargs:
        raise NotImplementedError(
            "Using kwargs in select not currently supported. "
            "Use _.newname == _.oldname instead")

    prev_op = __data.last_op

    # Index the available columns by key so selections can be mapped back.
    by_key = {}
    for column in prev_op.inner_columns:
        by_key[column.key] = column

    # same as for DataFrame
    available = Series(list(by_key))
    var_list = VarList()
    resolved = (arg(var_list) if callable(arg) else arg for arg in args)
    selection = var_select(available, *resolved)

    # A value of None means "keep the original name"; anything else is the
    # new label for that column.
    chosen = [
        by_key[key] if alias is None else by_key[key].label(alias)
        for key, alias in selection.items()
    ]

    return __data.append_op(prev_op.with_only_columns(chosen))
Exemplo n.º 2
0
def pivot_longer(
        __data,
        *args,
        names_to: Union[str, Tuple[str, ...]] = "name",
        names_prefix: Optional[str] = None,
        names_sep: Optional[str] = None,
        names_pattern: Optional[str] = None,
        names_ptypes: Optional[Tuple] = None,
        names_repair: str = "check_unique",
        values_to: str = "value",
        values_drop_na: bool = False,
        values_ptypes: Optional[Union[str, Tuple[str, ...]]] = None,
        values_transform: Optional[Dict] = None,
):
    """Pivot the selected columns of ``__data`` from wide to long form.

    The signature mirrors tidyr's ``pivot_longer``. The selected columns are
    stacked and the result is merged back (on the index) onto the columns
    that were not selected.

    Args:
        __data: The frame to reshape; its ``columns``, ``loc`` and ``stack``
            are used.
        *args: Column selections, resolved via ``var_create`` and
            ``var_select``. At least one column must end up selected.
        names_to: Name(s) for the column(s) receiving the former column
            labels. A single string is treated as a one-element tuple. When
            splitting, the entry ``"_value"`` is replaced by ``None`` as the
            level name and that level is not stacked.
        names_prefix: Accepted but not used by this implementation.
        names_sep: Separator handed to ``str.split`` on the column labels.
            Mutually exclusive with ``names_pattern``.
        names_pattern: Pattern handed to ``str.split`` on the column labels;
            empty pieces of the split are discarded. Mutually exclusive with
            ``names_sep``.
        names_ptypes: Accepted but not used by this implementation.
        names_repair: Accepted but not used by this implementation.
        values_to: Name given to the stacked values when neither
            ``names_sep`` nor ``names_pattern`` is supplied. Also the key
            looked up in ``values_transform``.
        values_drop_na: Forwarded as ``dropna`` to ``stack`` in the plain
            case; in the splitting case triggers ``dropna(axis=1)``.
        values_ptypes: Accepted but not used by this implementation.
        values_transform: Optional mapping; if it contains ``values_to``, the
            mapped callable is applied to the stacked data. ``None`` is
            treated as an empty mapping.

    Returns:
        The stacked data, merged with the unselected columns when there are
        any.

    Raises:
        ValueError: if both ``names_sep`` and ``names_pattern`` are given, if
            no column is selected, if splitting the column labels yields
            pieces of differing counts, or if the number of pieces differs
            from ``len(names_to)``.
    """
    if names_sep is not None and names_pattern is not None:
        raise ValueError("You may only use either `names_sep` or "
                         "`names_pattern`.")

    # Avoid a shared mutable default and tolerate an explicit None, which the
    # Optional annotation advertises as valid.
    if values_transform is None:
        values_transform = {}

    if isinstance(names_to, str):
        names_to = (names_to, )

    # Copied selection over from gather, maybe this can be compartmentalised?
    var_list = var_create(*args)
    od = var_select(__data.columns, *var_list)

    value_vars = list(od) or None

    id_vars = [col for col in __data.columns if col not in od]

    keep_data = __data.loc[:, id_vars]
    if value_vars is None:
        # While stack works in this case, it will later on merge in to the
        # original dataframe. To copy tidyr behaviour, we need to raise a
        # ValueError
        raise ValueError("Please provide at least 1 column or all columns "
                         "(shorthand: _[:]).")
    elif names_sep is not None or names_pattern is not None:
        to_stack = __data.loc[:, value_vars]
        column_index = (
            to_stack.columns.str.split(names_sep).map(tuple)
            if names_sep is not None
            # Split by names_pattern, and remove empty strings using filter
            else to_stack.columns.str.split(names_pattern).map(
                lambda x: tuple(list(filter(None, x)))))
        split_lengths = np.array(column_index.map(len))

        if not np.all(split_lengths == split_lengths[0]):
            # Both placeholders must be filled: the original supplied only
            # one argument, so this path raised IndexError from str.format
            # instead of the intended ValueError.
            raise ValueError(
                "Splitting by {} leads to unequal lengths ({}).".format(
                    names_sep if names_sep is not None else names_pattern,
                    ", ".join(str(n) for n in split_lengths.tolist())))

        if split_lengths[0] != len(names_to):
            raise ValueError("Splitting provided more values than provided in "
                             "`names_to`")

        # TODO: To set names for the new index, we need to feed in a list.
        # There's no particular reason to use a tuples as input in the first
        # place, might be worth reconsidering the choice of input format?
        # TODO: What if we don't use '_value' in the tuple? Need to check tidyr
        # Stack every level except the "_value" one; if "_value" is the only
        # entry, stack the last level (-1) instead.
        stack_idx = (
            [i for i, x in enumerate(list(names_to))
             if x != "_value"] if names_to != ('_value', ) else -1)
        names_to = [x if x != "_value" else None for x in names_to]

        column_index = column_index.set_names(names_to)

        to_stack.columns = column_index
        stacked = to_stack.stack(stack_idx)
        # Move the innermost index level back into a regular column.
        stacked = stacked.reset_index(level=stacked.index.nlevels - 1)

        if stack_idx == -1:
            stacked = stacked.drop(columns='level_1')
        # NOTE(review): names_to holds None (not NaN) for "_value" at this
        # point, so this membership test looks like it can never be true --
        # confirm before relying on it.
        if np.nan in names_to:
            stacked = stacked.drop(columns=[np.nan])
        if values_drop_na:
            # NOTE(review): axis=1 drops columns containing NA, not rows --
            # confirm this matches the intended tidyr behaviour.
            stacked = stacked.dropna(axis=1)
    else:
        stacked = __data.loc[:, value_vars].stack(dropna=values_drop_na)
        # Set column names for stack
        # As in tidyr `values_to` is ignored if `names_sep` or `names_pattern`
        # is provided.
        stacked.index.rename(names_to[0], level=1, inplace=True)
        stacked.name = values_to

    # values_transform was introduced in tidyr 1.1.0
    if values_to in values_transform:
        # TODO: error handling -- this won't work for dictionaries
        # list needs special handling, as it can only be applied to iterables,
        # not integers.
        if values_transform[values_to] == list:
            stacked = stacked.apply(lambda x: [x])
        else:
            stacked = stacked.apply(lambda x: values_transform[values_to](x))

    stacked_df = (
        # if `names_sep` or `names_pattern` are not provided `stacked` will
        # be a pd.Series and needs its index reset.
        stacked.reset_index(1)
        if names_sep is None and names_pattern is None else stacked)

    # If we want to pivot all but one, we are left with a `pd.Series`.
    # This needs to be converted to a DataFrame to serve as left element in a
    # merge
    if isinstance(keep_data, pd.Series):
        output_df = keep_data.to_frame().merge(stacked_df,
                                               left_index=True,
                                               right_index=True)
    elif keep_data.empty:
        # Nothing to keep alongside: the stacked data is the whole result.
        output_df = stacked_df
    else:
        output_df = keep_data.merge(stacked_df,
                                    left_index=True,
                                    right_index=True)

    return output_df