Exemplo n.º 1
0
def test_preprocess():
    """Check that ``preprocess`` rejects a scalar and removes NaN/inf values."""
    # Only the call expected to raise belongs inside the ``with`` block:
    # statements placed after the raising call in the block are never
    # executed, so the message check must come after the block.
    with pytest.raises(ValueError) as excinfo:
        preprocess(1, window=4)
    assert "ts is not array like!" in str(excinfo.value)

    ts = np.array([np.nan, np.inf, np.inf, np.nan, np.inf, 2, 3, 2, 3, 1, 2, 3, 4, 2,
                   np.nan, np.inf, 4, 2, 3, 4, 5, 6, 7, 8, 3, 4, 2, 3, 4, 5, 6, 7, 6,
                   5, 4, 3, np.nan, np.nan, np.inf, np.nan, np.inf, np.nan])

    ts = preprocess(ts, window=4)

    # No missing or infinite samples may survive preprocessing.
    assert not np.any(np.isnan(ts))
    assert not np.any(np.isinf(ts))
Exemplo n.º 2
0
def analyze(ts,
            query=None,
            windows=None,
            sample_pct=1.0,
            threshold=0.98,
            n_jobs=1,
            preprocessing_kwargs=None):
    """
    Runs an appropriate workflow based on the parameters passed in. The goal
    of this function is to compute all fundamental algorithms on the provided
    time series data. For now the following is computed:

    1. Matrix Profile - exact or approximate based on sample_pct given that a
       window is provided. By default is the exact algorithm.
    2. Top Motifs - The top 3 motifs are found.
    3. Top Discords - The top 3 discords are found.
    4. Plot MP, Motifs and Discords

    When a window is not provided or more than a single window is provided,
    the PMP is computed:

    1. Compute UPPER window when no window(s) is provided
    2. Compute PMP for all windows
    3. Top Motifs
    4. Top Discords
    5. Plot PMP, motifs and discords.

    Parameters
    ----------
    ts : array_like
        The time series to analyze.
    query : array_like, Optional
        The query to analyze. Note that when computing the PMP the query is
        ignored!
    windows : int or array_like, Optional
        The window(s) to compute the MatrixProfile. Note that it may be an int
        for a single matrix profile computation or an array of ints for
        computing the pan matrix profile. An array-like holding exactly one
        window is treated like that single window.
    sample_pct : float, default = 1
        A float between 0 and 1 representing how many samples to compute for
        the MP or PMP. When it is 1 (or greater), the exact algorithm is used.
    threshold : float, Default 0.98
        The correlation coefficient used as the threshold. It should be between
        0 and 1. This is used to compute the upper window size when no
        window(s) is given.
    n_jobs : int, Default = 1
        Number of cpu cores to use.
    preprocessing_kwargs : dict, default = None
        A dictionary object to sets parameters for preprocess function.
        After validation the following keys are read by this function:

        * ``'window'``: The window size to compute the
          mean/median/minimum/maximum value.
        * ``'impute_method'``: A string indicating the data imputation
          method, which should be 'mean', 'median', 'min' or 'max'.
        * ``'impute_direction'``: A string indicating the data imputation
          direction, which should be 'forward', 'fwd', 'f', 'backward',
          'bwd', 'b'. If the direction is forward, we use previous data for
          imputation; if the direction is backward, we use subsequent data
          for imputation.
        * ``'add_noise'``: A boolean value indicating whether noise needs to
          be added into the time series.

        To disable preprocessing procedure, set the preprocessing_kwargs to
        None/False/""/{}.

    Returns
    -------
    tuple : (profile, figures)
        The appropriate PMP or MP profile object and associated figures.

    Raises
    ------
    RuntimeError
        When the parameters do not map onto a workflow, e.g. a single window
        combined with a ``sample_pct`` that is not greater than 0, or a
        ``windows`` value that is neither None, an int nor a non-empty
        array-like.

    """
    # preprocess the time series
    preprocessing_kwargs = validate_preprocess_kwargs(preprocessing_kwargs)
    if preprocessing_kwargs:
        ts = preprocess(
            ts,
            window=preprocessing_kwargs['window'],
            impute_method=preprocessing_kwargs['impute_method'],
            impute_direction=preprocessing_kwargs['impute_direction'],
            add_noise=preprocessing_kwargs['add_noise'])

    # determine proper number of jobs
    n_jobs = core.valid_n_jobs(n_jobs)

    # determine what algorithm to use based on params
    no_window = windows is None
    windows_is_array = core.is_array_like(windows)
    many_windows = windows_is_array and len(windows) > 1
    lone_array_window = windows_is_array and len(windows) == 1
    single_window = isinstance(windows, int) or lone_array_window
    is_exact = sample_pct >= 1
    is_approx = 0 < sample_pct < 1

    # Hand the single-window workflows a scalar window, the same way
    # compute() unwraps a one-element array-like before using it.
    if lone_array_window:
        windows = windows[0]

    # use PMP with no window provided
    if no_window or many_windows:
        return analyze_pmp(ts,
                           query,
                           sample_pct,
                           threshold,
                           windows=windows,
                           n_jobs=n_jobs)

    if single_window and is_exact:
        return analyze_mp_exact(ts, query, windows, n_jobs=n_jobs)

    if single_window and is_approx:
        return analyze_mp_approximate(ts,
                                      query,
                                      windows,
                                      sample_pct,
                                      n_jobs=n_jobs)

    raise RuntimeError('Param combination resulted in an uknown operation')
Exemplo n.º 3
0
def compute(ts,
            windows=None,
            query=None,
            sample_pct=1,
            threshold=0.98,
            n_jobs=1,
            preprocessing_kwargs=None):
    """
    Computes the exact or approximate MatrixProfile based on the sample percent
    specified. Currently, MPX and SCRIMP++ is used for the exact and
    approximate algorithms respectively. When multiple windows are passed, the
    Pan-MatrixProfile is computed and returned.

    By default, only passing in a time series (ts), the Pan-MatrixProfile is
    computed based on the maximum upper window algorithm with a correlation
    threshold of 0.98.

    Notes
    -----
    When multiple windows are passed and the Pan-MatrixProfile is computed, the
    query is ignored!

    Parameters
    ----------
    ts : array_like
        The time series to analyze.
    windows : int, array_like
        The window(s) to compute the MatrixProfile. Note that it may be an int
        for a single matrix profile computation or an array of ints for
        computing the pan matrix profile. An array-like holding exactly one
        window is unwrapped and treated like that single window.
    query : array_like, optional
        The query to analyze. Note that when computing the PMP the query is
        ignored!
    sample_pct : float, default 1
        A float between 0 and 1 representing how many samples to compute for
        the MP or PMP. When it is 1 (or greater), the exact algorithm is used.
    threshold : float, default 0.98
        The correlation coefficient used as the threshold. It should be between
        0 and 1. This is used to compute the upper window size when no
        window(s) is given.
    n_jobs : int, default = 1
        Number of cpu cores to use.
    preprocessing_kwargs : dict, default = None
        A dictionary object to sets parameters for preprocess function.
        After validation the following keys are read by this function:

        * ``'window'``: The window size to compute the
          mean/median/minimum/maximum value.
        * ``'impute_method'``: A string indicating the data imputation
          method, which should be 'mean', 'median', 'min' or 'max'.
        * ``'impute_direction'``: A string indicating the data imputation
          direction, which should be 'forward', 'fwd', 'f', 'backward',
          'bwd', 'b'. If the direction is forward, we use previous data for
          imputation; if the direction is backward, we use subsequent data
          for imputation.
        * ``'add_noise'``: A boolean value indicating whether noise needs to
          be added into the time series.

        To disable preprocessing procedure, set the preprocessing_kwargs to
        None/False/""/{}.

    Returns
    -------
    dict : profile
        The profile computed.

    Raises
    ------
    ValueError
        If no window is given and ``threshold`` is not a float.
        If any integer window size is smaller than 4.

    """
    no_windows = windows is None
    multiple_windows = core.is_array_like(windows) and len(windows) > 1
    has_threshold = isinstance(threshold, float)

    if no_windows and not has_threshold:
        raise ValueError(
            'compute requires a threshold or window(s) to be set!')

    # Unwrap a one-element collection *before* validating so that inputs
    # such as [2] cannot slip past the minimum window size check.
    if core.is_array_like(windows) and len(windows) == 1:
        windows = windows[0]

    # Check to make sure all window sizes are greater than 3, return a
    # ValueError if not. NumPy integer scalars (e.g. the element unwrapped
    # from an ndarray) are validated the same way as Python ints.
    single_window = isinstance(windows, (int, np.integer))
    if (single_window and windows < 4) or (
            multiple_windows and np.any(np.asarray(windows) < 4)):
        raise ValueError(
            'Compute requires all window sizes to be greater than 3!')

    # preprocess the time series
    preprocessing_kwargs = validate_preprocess_kwargs(preprocessing_kwargs)
    if preprocessing_kwargs:
        ts = preprocess(
            ts,
            window=preprocessing_kwargs['window'],
            impute_method=preprocessing_kwargs['impute_method'],
            impute_direction=preprocessing_kwargs['impute_direction'],
            add_noise=preprocessing_kwargs['add_noise'])

    # compute the upper window and pmp; has_threshold is guaranteed here
    # because the no-window/no-threshold combination was rejected above
    if no_windows:
        profile = maximum_subsequence(ts, threshold, include_pmp=True)

        # every window size from the smallest accepted one (4) up to and
        # including the computed upper window
        windows = range(4, profile['upper_window'] + 1)

        # compute the pmp, honouring the requested number of jobs
        result = skimp(ts,
                       windows=windows,
                       sample_pct=sample_pct,
                       pmp_obj=profile,
                       n_jobs=n_jobs)

    # compute the pmp
    elif multiple_windows:
        if core.is_array_like(query):
            # Logger.warn is a deprecated alias that newer Python versions
            # no longer provide; warning() is the supported spelling.
            logger.warning('Computing PMP - query is ignored!')

        # NOTE(review): sample_pct is pinned to 1 here, so the caller's
        # sample_pct is not used for explicit window lists - confirm intent.
        result = skimp(ts, windows=windows, sample_pct=1, n_jobs=n_jobs)

    # compute exact mp
    elif sample_pct >= 1:
        result = mpx(ts, windows, query=query, n_jobs=n_jobs)

    # compute approximate mp
    else:
        result = scrimp_plus_plus(ts,
                                  windows,
                                  query=query,
                                  n_jobs=n_jobs,
                                  sample_pct=sample_pct)

    return result