Exemplo n.º 1
0
def group_by_to_index(index: tp.Index, group_by: tp.GroupByLike) -> GroupByT:
    """Convert mapper `group_by` to `pd.Index`.

    Args:
        index (pd.Index): Index to be grouped.
        group_by (any): Mapper, resolved as follows:

            * `None` or `False`: returned unchanged.
            * `True`: every element is put into a single group labeled 0.
            * Integer or string: passed together with `index` to `index_fns.select_levels`.
            * Non-empty sequence whose length differs from that of `index`, whose first
                element is an integer or string, and which has no more elements than
                the `pd.MultiIndex` `index` has level names: first tried as a selection
                of levels via `index_fns.select_levels`; if that raises `IndexError` or
                `KeyError`, the sequence is used as is.
            * Anything else is wrapped with `pd.Index`.

    Returns:
        `group_by` itself if it is `None` or `False`, otherwise a `pd.Index`
        of the same length as `index`.

    Raises:
        ValueError: If the resulting index and `index` differ in length.

    !!! note
        Index and mapper must have the same length."""
    if group_by is None or group_by is False:
        return group_by
    if group_by is True:
        group_by = pd.Index(np.full(len(index), 0))  # one group
    elif isinstance(group_by, (int, str)):
        group_by = index_fns.select_levels(index, group_by)
    elif checks.is_sequence(group_by):
        # An empty sequence has no first element to inspect; skip the level
        # lookup so that the length check below reports the mismatch instead
        # of an IndexError leaking out of `group_by[0]`.
        if len(group_by) > 0 \
                and len(group_by) != len(index) \
                and isinstance(group_by[0], (int, str)) \
                and isinstance(index, pd.MultiIndex) \
                and len(group_by) <= len(index.names):
            try:
                group_by = index_fns.select_levels(index, group_by)
            except (IndexError, KeyError):
                # Best effort: not valid levels, so treat the sequence as
                # explicit group labels and let the length check decide.
                pass
    if not isinstance(group_by, pd.Index):
        group_by = pd.Index(group_by)
    if len(group_by) != len(index):
        raise ValueError("group_by and index must have the same length")
    return group_by
Exemplo n.º 2
0
    def download(cls: tp.Type[DataT],
                 symbols: tp.Union[tp.Label, tp.Labels],
                 tz_localize: tp.Optional[tp.TimezoneLike] = None,
                 tz_convert: tp.Optional[tp.TimezoneLike] = None,
                 missing_index: tp.Optional[str] = None,
                 missing_columns: tp.Optional[str] = None,
                 wrapper_kwargs: tp.KwargsLike = None,
                 **kwargs) -> DataT:
        """Download every symbol with `download_symbol` and build an instance with `from_data`.

        Args:
            symbols (hashable or sequence of hashable): One or multiple symbols.

                !!! note
                    A hashable value (such as a tuple) is treated as a single symbol.
            tz_localize (any): Forwarded to `from_data`.
            tz_convert (any): Forwarded to `from_data`.
            missing_index (str): Forwarded to `from_data`.
            missing_columns (str): Forwarded to `from_data`.
            wrapper_kwargs (dict): Forwarded to `from_data`.
            **kwargs: Keyword arguments for `download_symbol`.

                They are filtered per symbol by `select_symbol_kwargs` before each download
                and also forwarded unfiltered to `from_data` as `download_kwargs`.

        Returns:
            Whatever `from_data` returns for the dict of downloaded data keyed by symbol.

        Raises:
            TypeError: If `symbols` is neither hashable nor a sequence.
        """
        if checks.is_hashable(symbols):
            # A single symbol: wrap it so the loop below is uniform
            symbol_list = [symbols]
        else:
            if not checks.is_sequence(symbols):
                raise TypeError("Symbols must be either hashable or sequence of hashable")
            symbol_list = symbols

        # Download each symbol using only the keyword arguments selected for it
        downloaded = {
            symbol: cls.download_symbol(symbol, **cls.select_symbol_kwargs(symbol, kwargs))
            for symbol in symbol_list
        }

        # Build the new instance from the downloaded data
        return cls.from_data(
            downloaded,
            tz_localize=tz_localize,
            tz_convert=tz_convert,
            missing_index=missing_index,
            missing_columns=missing_columns,
            wrapper_kwargs=wrapper_kwargs,
            download_kwargs=kwargs
        )
Exemplo n.º 3
0
def split_ranges_into_sets(
        start_idxs: tp.ArrayLike,
        end_idxs: tp.ArrayLike,
        set_lens: tp.MaybeSequence[tp.Sequence[float]] = (),
        left_to_right: tp.MaybeSequence[bool] = True) -> RangesT:
    """Generate ranges between each in `start_idxs` and `end_idxs` and
    optionally split into one or more sets.

    Args:
        start_idxs (array_like): Start indices.
        end_idxs (array_like): End indices (inclusive).
        set_lens (list of float): Lengths of sets in each range.

            A length strictly between 0 and 1 is treated as a fraction of the
            range length and rounded down; any other length is used as is.

            The number of returned sets is the length of `set_lens` plus one,
            which stores the remaining elements.

            Can be passed per range.
        left_to_right (bool or list of bool): Whether to resolve `set_lens` from left to right.

            Makes the last set variable, otherwise makes the first set variable.

            Can be passed per range.

    Yields:
        One tuple of index arrays per range, one array per set. If `set_lens` is empty,
        the tuple holds a single array covering the whole range.

    Raises:
        ValueError: If a set length is negative or resolves to zero, or if the sets
            leave no element for the remaining (variable) set.

    ## Example

    * `set_lens=(0.5,)`: 50% in training set, the rest in test set
    * `set_lens=(0.5, 0.25)`: 50% in training set, 25% in validation set, the rest in test set
    * `set_lens=(50, 30)`: 50 in training set, 30 in validation set, the rest in test set
    * `set_lens=(50, 30)` and `left_to_right=False`: 30 in test set, 50 in validation set,
        the rest in training set
    """
    start_idxs = np.asarray(start_idxs)
    end_idxs = np.asarray(end_idxs)
    checks.assert_len_equal(start_idxs, end_idxs)

    # Use `len()` rather than truthiness since `set_lens` may be an array
    has_sets = len(set_lens) > 0
    # These do not depend on the range, so resolve them once
    set_lens_per_range = has_sets and checks.is_sequence(set_lens[0])
    left_to_right_per_range = checks.is_sequence(left_to_right)

    for i, (start_idx, end_idx) in enumerate(zip(start_idxs, end_idxs)):
        if not has_sets:
            # Nothing to split: the whole range is a single set
            yield (np.arange(start_idx, end_idx + 1), )
            continue

        range_len = end_idx - start_idx + 1
        _set_lens = set_lens[i] if set_lens_per_range else set_lens
        _left_to_right = left_to_right[i] if left_to_right_per_range else left_to_right

        new_set_lens = []
        for j, set_len in enumerate(_set_lens):
            if set_len < 0:
                # A negative length would yield an empty set and shift the
                # remaining set outside of the range
                raise ValueError(f"Set {j} in the range {i} has a negative length")
            if 0 < set_len < 1:
                # Fraction of the range, rounded down
                set_len = math.floor(set_len * range_len)
            if set_len == 0:
                raise ValueError(f"Set {j} in the range {i} is empty")
            new_set_lens.append(set_len)

        fixed_len = sum(new_set_lens)
        if fixed_len < range_len:
            # The variable set takes whatever the fixed sets leave over
            if _left_to_right:
                new_set_lens = new_set_lens + [range_len - fixed_len]
            else:
                new_set_lens = [range_len - fixed_len] + new_set_lens
        else:
            raise ValueError(
                f"Range of length {range_len} too short to split into {len(_set_lens) + 1} sets"
            )

        # Split each range into sets
        idx_offset = 0
        set_ranges = []
        for set_len in new_set_lens:
            new_idx_offset = idx_offset + set_len
            set_ranges.append(
                np.arange(start_idx + idx_offset,
                          start_idx + new_idx_offset))
            idx_offset = new_idx_offset

        yield tuple(set_ranges)