Exemplo n.º 1
0
def get_table_dyn(S: pd.Index, n_tot: int, max_length=100):
    """
    Build the dynamic-programming tables of window scores and cut points.

    Every window of ``k`` consecutive occurrences of ``S``, for
    ``3 <= k <= min(len(S), max_length)``, is scored either as a whole
    (through ``cycle_length``) or as the cheapest split into two adjacent
    parts, whichever is smaller.

    Parameters
    ----------
    S: pd.Index or np.ndarray
        a Series of occurrences
    n_tot: int
        total number of occurrences in the original events
    max_length: int, default=100
        maximum number of occurrences for a cycle to cover,
        capped at ``len(S)``

    Returns
    -------
    scores: dict
        maps ``(start, end)`` (inclusive positions in ``S``) to the best
        score found for that window
    cut_points: dict
        same keys as ``scores``; the value is the position after which the
        window is split, ``None`` when the window is kept whole, or ``-1``
        for a three-occurrence window whose score was capped at three times
        the single-occurrence score
    """
    gaps = np.diff(S)
    windows3 = sliding_window_view(S, 3)
    gap_pairs = sliding_window_view(gaps, 2)
    span = S.max() - S.min()

    unit_cost = residual_length(1, n_tot, span)  # 1 really ?
    triple_cap = 3 * unit_cost

    # Base case: windows of exactly three occurrences, capped at the cost of
    # three individually-scored occurrences.
    base = sum(cycle_length(windows3, gap_pairs, len(S), span))
    capped = base > triple_cap
    base[capped] = triple_cap  # inplace replacement
    base_cuts = np.array([-1] * len(base), dtype=object)
    base_cuts[~capped] = None

    scores = {(start, start + 2): cost for start, cost in enumerate(base)}
    cut_points = dict(zip(scores, base_cuts))

    def _part_cost(first, last):
        # Parts of fewer than three occurrences have no table entry; they are
        # charged the single-occurrence score once per occurrence.
        size = last - first + 1
        if size < 3:
            return unit_cost * size
        return scores[(first, last)]

    longest = min(len(S), max_length)
    for width in range(4, longest + 1):
        windows = sliding_window_view(S, width)
        window_gaps = sliding_window_view(gaps, width - 1)
        whole_costs = sum(cycle_length(windows, window_gaps, len(S), span))

        for start, best in enumerate(whole_costs):
            end = start + width - 1
            best_cut = None
            # Try every split [start, mid] + [mid + 1, end]; keep it only when
            # it is strictly cheaper than the best found so far.
            for mid in range(start, end):
                candidate = _part_cost(start, mid) + _part_cost(mid + 1, end)
                if candidate < best:
                    best, best_cut = candidate, mid
            scores[(start, end)] = best
            cut_points[(start, end)] = best_cut

    return scores, cut_points
Exemplo n.º 2
0
    def _split(self, y: pd.Index) -> SPLIT_GENERATOR_TYPE:
        """Yield one ``(training_window, test_window)`` pair per cutoff.

        Parameters
        ----------
        y : pd.Index
            Index to split.

        Yields
        ------
        training_window
            Result of ``self._get_train_window`` for the current cutoff.
        test_window
            ``cutoff + fh``; when the cutoffs are datetime-like and the
            horizon is timedelta-like, entries earlier than ``y.min()`` are
            dropped and the rest are converted with ``y.get_loc``.

        Raises
        ------
        TypeError
            If ``window_length`` and a cutoff are neither both integers nor
            a timedelta/date-offset paired with a datetime.
        """
        n_obs = y.shape[0]
        cutoffs = check_cutoffs(cutoffs=self.cutoffs)
        fh = _check_fh(fh=self.fh)
        window_length = check_window_length(
            window_length=self.window_length, n_timepoints=n_obs
        )
        _check_cutoffs_and_y(cutoffs=cutoffs, y=y)
        _check_cutoffs_fh_y(cutoffs=cutoffs, fh=fh, y=y)
        largest_fh = fh.max()
        largest_cutoff = np.max(cutoffs)

        for cutoff in cutoffs:
            # Locate the position just before the first training observation.
            if is_int(x=window_length) and is_int(x=cutoff):
                first = cutoff - window_length
            elif is_timedelta_or_date_offset(x=window_length) and is_datetime(x=cutoff):
                # Clamp to the first entry of y so the lookup stays inside y.
                earliest = max(y[0], cutoff - window_length)
                first = y.get_loc(earliest)
            else:
                message = (
                    f"Unsupported combination of types: "
                    f"`window_length`: {type(window_length)}, "
                    f"`cutoff`: {type(cutoff)}"
                )
                raise TypeError(message)

            # Integer cutoffs are used directly; otherwise take the location
            # of the last entry of y that is not after the cutoff.
            if is_int(x=cutoff):
                last = cutoff
            else:
                last = y.get_loc(y[y <= cutoff][-1])

            training_window = self._get_train_window(
                y=y, train_start=first + 1, split_point=last + 1
            )

            horizon = cutoff + fh.to_numpy()
            if is_datetime(x=largest_cutoff) and is_timedelta(x=largest_fh):
                in_range = horizon[horizon >= y.min()]
                horizon = np.array([y.get_loc(stamp) for stamp in in_range])
            yield training_window, horizon