Exemplo n.º 1
0
def test_clean_nan_inf():
    """Check clean_nan_inf on NaN/inf input and on an unconvertible input."""
    a = np.array([np.nan, 1, np.inf, 2])
    desired = np.array([0, 1, 0, 2])
    actual = core.clean_nan_inf(a)

    np.testing.assert_equal(actual, desired)

    with pytest.raises(ValueError) as excinfo:
        core.clean_nan_inf(None)

    # The message check must live outside the ``with`` block: statements
    # placed after the raising call inside the block are never executed.
    assert 'Unable to convert to np.ndarray!' in str(excinfo.value)
Exemplo n.º 2
0
def stomp(ts, window_size, query=None, n_jobs=1):
    """
    Computes matrix profiles for a single dimensional time series using the
    parallelized STOMP algorithm (by default). Ray or Python's multiprocessing
    library may be used. When you have initialized Ray on your machine,
    it takes priority over using Python's multiprocessing.

    Parameters
    ----------
    ts : array_like
        The time series to compute the matrix profile for.
    window_size: int
        The size of the window to compute the matrix profile over.
    query : array_like
        Optionally, a query can be provided to perform a similarity join.
    n_jobs : int, Default = 1
        Number of cpu cores to use.

    Returns
    -------
    dict : profile
        A MatrixProfile data structure.

        >>> {
        >>>     'mp': The matrix profile,
        >>>     'pi': The matrix profile 1NN indices,
        >>>     'rmp': The right matrix profile,
        >>>     'rpi': The right matrix profile 1NN indices,
        >>>     'lmp': The left matrix profile,
        >>>     'lpi': The left matrix profile 1NN indices,
        >>>     'metric': The distance metric computed for the mp,
        >>>     'w': The window size used to compute the matrix profile,
        >>>     'ez': The exclusion zone used,
        >>>     'join': Flag indicating if a similarity join was computed,
        >>>     'sample_pct': Percentage of samples used in computing the MP,
        >>>     'data': {
        >>>         'ts': Time series data,
        >>>         'query': Query data if supplied
        >>>     },
        >>>     'class': "MatrixProfile",
        >>>     'algorithm': "stomp"
        >>> }

    Raises
    ------
    ValueError
        If window_size < 4.
        If window_size > query length / 2.
        If ts is not a list or np.array.
        If query is not a list or np.array.
        If ts or query is not one dimensional.

    """
    is_join = core.is_similarity_join(ts, query)
    if not is_join:
        # self-join: the series is compared against itself
        query = ts

    # data conversion to np.array
    ts = core.to_np_array(ts)
    query = core.to_np_array(query)

    if window_size < 4:
        error = "window size must be at least 4."
        raise ValueError(error)

    if window_size > len(query) / 2:
        error = "Time series is too short relative to desired window size"
        raise ValueError(error)

    # n_jobs only needs validating when more than one job was requested
    if n_jobs != 1:
        n_jobs = core.valid_n_jobs(n_jobs)

    # precompute some common values - profile length, query length etc.
    profile_length = core.get_profile_length(ts, query, window_size)
    data_length = len(ts)
    query_length = len(query)
    num_queries = query_length - window_size + 1
    exclusion_zone = int(np.ceil(window_size / 2.0))

    # do not use exclusion zone for join
    if is_join:
        exclusion_zone = 0

    # find skip locations, clean up nan and inf in the ts and query
    skip_locs = core.find_skip_locations(ts, profile_length, window_size)
    ts = core.clean_nan_inf(ts)
    query = core.clean_nan_inf(query)

    # initialize matrices
    matrix_profile = np.full(profile_length, np.inf)
    profile_index = np.full(profile_length, 0)

    # left and right profiles are only pre-allocated for a self-join
    left_matrix_profile = None
    right_matrix_profile = None
    left_profile_index = None
    right_profile_index = None

    if not is_join:
        left_matrix_profile = np.copy(matrix_profile)
        right_matrix_profile = np.copy(matrix_profile)
        left_profile_index = np.copy(profile_index)
        right_profile_index = np.copy(profile_index)

    # precompute some statistics on ts
    data_mu, data_sig = core.moving_avg_std(ts, window_size)
    first_window = query[0:window_size]
    first_product = core.fft_convolve(ts, first_window)

    # one argument tuple per batch of query windows
    args = []
    for start, end in core.generate_batch_jobs(num_queries, n_jobs):
        args.append((start, end, ts, query, window_size, data_length,
                     profile_length, exclusion_zone, is_join, data_mu,
                     data_sig, first_product, skip_locs))

    # we are running single threaded stomp - no need to initialize any
    # parallel environments.
    if n_jobs == 1 or len(args) == 1:
        results = [_batch_compute(args[0])]
    else:
        # parallelize
        with core.mp_pool()(n_jobs) as pool:
            results = pool.map(_batch_compute, args)

    # now we combine the batch results
    if len(results) == 1:
        # a single batch already holds the complete answer
        result = results[0]
        matrix_profile = result['mp']
        profile_index = result['pi']
        left_matrix_profile = result['lmp']
        left_profile_index = result['lpi']
        right_matrix_profile = result['rmp']
        right_profile_index = result['rpi']
    else:
        # keep the element-wise minimum distance (and its index) over batches
        for result in results:
            # update the matrix profile
            indices = result['mp'] < matrix_profile
            matrix_profile[indices] = result['mp'][indices]
            profile_index[indices] = result['pi'][indices]

            # update the left and right matrix profiles
            if not is_join:
                indices = result['lmp'] < left_matrix_profile
                left_matrix_profile[indices] = result['lmp'][indices]
                left_profile_index[indices] = result['lpi'][indices]

                indices = result['rmp'] < right_matrix_profile
                right_matrix_profile[indices] = result['rmp'][indices]
                right_profile_index[indices] = result['rpi'][indices]

    return {
        'mp': matrix_profile,
        'pi': profile_index,
        'rmp': right_matrix_profile,
        'rpi': right_profile_index,
        'lmp': left_matrix_profile,
        'lpi': left_profile_index,
        'metric': 'euclidean',
        'w': window_size,
        'ez': exclusion_zone,
        'join': is_join,
        'sample_pct': 1,
        'data': {
            'ts': ts,
            'query': query
        },
        'class': "MatrixProfile",
        'algorithm': "stomp"
    }
Exemplo n.º 3
0
def mstomp(ts, window_size, return_dimension=False, n_jobs=1):
    """
    Computes multidimensional matrix profile with mSTAMP (stomp based). Ray or
    Python's multiprocessing library may be used. When you have initialized
    Ray on your machine, it takes priority over using Python's
    multiprocessing.

    Parameters
    ----------
    ts : array_like, shape (n_dim, seq_len)
        The multidimensional time series to compute the multidimensional
        matrix profile for. A one dimensional input is treated as a single
        dimension.
    window_size: int
        The size of the window to compute the matrix profile over.
    return_dimension : bool
        if True, also return the matrix profile dimension. It takes O(d^2 n)
        to store and O(d^2 n^2) to compute. (default is False)
        When True, n_jobs is overridden to 1.
    n_jobs : int, Default = 1
        Number of cpu cores to use.

    Returns
    -------
    dict : profile
        A MatrixProfile data structure.

        >>> {
        >>>     'mp': The matrix profile,
        >>>     'pi': The matrix profile 1NN indices,
        >>>     'pd': The matrix profile dimension (see return_dimension),
        >>>     'rmp': The right matrix profile,
        >>>     'rpi': The right matrix profile 1NN indices,
        >>>     'lmp': The left matrix profile,
        >>>     'lpi': The left matrix profile 1NN indices,
        >>>     'metric': The distance metric computed for the mp,
        >>>     'w': The window size used to compute the matrix profile,
        >>>     'ez': The exclusion zone used,
        >>>     'sample_pct': Percentage of samples used in computing the MP,
        >>>     'data': {
        >>>         'ts': Time series data,
        >>>         'query': Query data (same series as ts)
        >>>     },
        >>>     'class': "MatrixProfile",
        >>>     'algorithm': "stomp_based_mstamp"
        >>> }

    Raises
    ------
    ValueError
        If window_size < 4.
        If window_size > time series length / 2.
        If ts is not a list or np.array.

    """

    # always a self-join: the query is the series itself
    query = ts

    # data conversion to np.array
    ts = core.to_np_array(ts)
    query = core.to_np_array(query)

    if window_size < 4:
        error = "window size must be at least 4."
        raise ValueError(error)

    # promote a 1-D series to shape (1, seq_len)
    if ts.ndim == 1:
        ts = np.expand_dims(ts, axis=0)
        query = np.expand_dims(query, axis=0)

    if window_size > query.shape[1] / 2:
        error = "Time series is too short relative to desired window size"
        raise ValueError(error)

    # n_jobs only needs validating when more than one job was requested
    if n_jobs != 1:
        n_jobs = core.valid_n_jobs(n_jobs)

    # precompute some common values - profile length, query length etc.
    profile_length = core.get_profile_length(ts, query, window_size)
    data_length = ts.shape[1]
    query_length = query.shape[1]
    num_queries = query_length - window_size + 1
    exclusion_zone = int(np.ceil(window_size / 2.0))
    num_dim = ts.shape[0]

    # find skip locations, clean up nan and inf in the ts and query
    skip_locs = core.find_multid_skip_locations(ts, profile_length, window_size)
    ts = core.clean_nan_inf(ts)
    query = core.clean_nan_inf(query)

    # initialize matrices
    matrix_profile = np.full((num_dim, profile_length), np.inf)
    profile_index = np.full((num_dim, profile_length), 0)

    # left and right profiles start out as copies of the initial state
    left_matrix_profile = np.copy(matrix_profile)
    right_matrix_profile = np.copy(matrix_profile)
    left_profile_index = np.copy(profile_index)
    right_profile_index = np.copy(profile_index)

    profile_dimension = []
    if return_dimension:
        # dimension tracking is always run with a single job
        n_jobs = 1
        for i in range(num_dim):
            profile_dimension.append(np.empty((i + 1, profile_length), dtype=int))

    # precompute some statistics on ts, one row per dimension
    data_mu = np.empty((num_dim, profile_length))
    data_sig = np.empty((num_dim, profile_length))
    first_product = np.empty((num_dim, profile_length))
    for i in range(num_dim):
        data_mu[i, :], data_sig[i, :] = core.moving_avg_std(ts[i, :], window_size)
        first_window = query[i, 0:window_size]
        first_product[i, :] = core.fft_convolve(ts[i, :], first_window)

    # one argument tuple per batch of query windows
    args = []
    for start, end in core.generate_batch_jobs(num_queries, n_jobs):
        args.append((num_dim, start, end, ts, query, window_size,
                     data_length, profile_length, exclusion_zone, data_mu,
                     data_sig, first_product, skip_locs, profile_dimension,
                     return_dimension))

    # we are running single threaded stomp - no need to initialize any
    # parallel environments.
    if n_jobs == 1 or len(args) == 1:
        results = [_batch_compute(args[0])]
    else:
        # parallelize
        with core.mp_pool()(n_jobs) as pool:
            results = pool.map(_batch_compute, args)

    # now we combine the batch results
    if len(results) == 1:
        # a single batch already holds the complete answer
        result = results[0]
        matrix_profile = result['mp']
        profile_index = result['pi']
        profile_dimension = result['pd']
        left_matrix_profile = result['lmp']
        left_profile_index = result['lpi']
        right_matrix_profile = result['rmp']
        right_profile_index = result['rpi']
    else:
        # keep the element-wise minimum distance (and its index) over batches;
        # 'pd' is not merged here and keeps its initial value
        for result in results:
            # update the matrix profile
            indices = result['mp'] < matrix_profile
            matrix_profile[indices] = result['mp'][indices]
            profile_index[indices] = result['pi'][indices]

            # update the left and right matrix profiles
            indices = result['lmp'] < left_matrix_profile
            left_matrix_profile[indices] = result['lmp'][indices]
            left_profile_index[indices] = result['lpi'][indices]

            indices = result['rmp'] < right_matrix_profile
            right_matrix_profile[indices] = result['rmp'][indices]
            right_profile_index[indices] = result['rpi'][indices]

    return {
        'mp': matrix_profile,
        'pi': profile_index,
        'pd': profile_dimension,
        'rmp': right_matrix_profile,
        'rpi': right_profile_index,
        'lmp': left_matrix_profile,
        'lpi': left_profile_index,
        'metric': 'euclidean',
        'w': window_size,
        'ez': exclusion_zone,
        'sample_pct': 1,
        'data': {
            'ts': ts,
            'query': query
        },
        'class': "MatrixProfile",
        'algorithm': "stomp_based_mstamp"
    }