Пример #1
0
def test_moving_avg_std():
    """Check core.moving_avg_std on the series 1..6 with a window of 3.

    The expected moving mean is [2, 3, 4, 5] and the expected moving
    standard deviation is 0.81649658 for each of the four windows.
    """
    series = np.array([1, 2, 3, 4, 5, 6])
    expected_mean = np.array([2., 3., 4., 5.])
    expected_std = np.full(4, 0.81649658)

    actual_mean, actual_std = core.moving_avg_std(series, 3)

    np.testing.assert_almost_equal(actual_mean, expected_mean)
    np.testing.assert_almost_equal(actual_std, expected_std)
Пример #2
0
def stomp(ts, window_size, query=None, n_jobs=1):
    """
    Computes matrix profiles for a single dimensional time series using the
    parallelized STOMP algorithm (by default). Ray or Python's multiprocessing
    library may be used. When you have initialized Ray on your machine,
    it takes priority over using Python's multiprocessing.

    Parameters
    ----------
    ts : array_like
        The time series to compute the matrix profile for.
    window_size: int
        The size of the window to compute the matrix profile over.
    query : array_like
        Optionally, a query can be provided to perform a similarity join.
    n_jobs : int, Default = 1
        Number of cpu cores to use.

    Returns
    -------
    dict : profile
        A MatrixProfile data structure.

        >>> {
        >>>     'mp': The matrix profile,
        >>>     'pi': The matrix profile 1NN indices,
        >>>     'rmp': The right matrix profile (None for a join),
        >>>     'rpi': The right matrix profile 1NN indices (None for a join),
        >>>     'lmp': The left matrix profile (None for a join),
        >>>     'lpi': The left matrix profile 1NN indices (None for a join),
        >>>     'metric': The distance metric computed for the mp,
        >>>     'w': The window size used to compute the matrix profile,
        >>>     'ez': The exclusion zone used,
        >>>     'join': Flag indicating if a similarity join was computed,
        >>>     'sample_pct': Percentage of samples used in computing the MP,
        >>>     'data': {
        >>>         'ts': Time series data,
        >>>         'query': Query data if supplied
        >>>     }
        >>>     'class': "MatrixProfile"
        >>>     'algorithm': "stomp"
        >>> }

    Raises
    ------
    ValueError
        If window_size < 4.
        If window_size > query length / 2.
        If ts is not a list or np.array.
        If query is not a list or np.array.
        If ts or query is not one dimensional.

    """
    is_join = core.is_similarity_join(ts, query)
    if not is_join:
        query = ts

    # data conversion to np.array
    ts = core.to_np_array(ts)
    query = core.to_np_array(query)

    if window_size < 4:
        raise ValueError("window size must be at least 4.")

    if window_size > len(query) / 2:
        raise ValueError(
            "Time series is too short relative to desired window size")

    # a single job needs no validation; anything else is checked by core
    if n_jobs != 1:
        n_jobs = core.valid_n_jobs(n_jobs)

    # precompute some common values - profile length, query length etc.
    profile_length = core.get_profile_length(ts, query, window_size)
    data_length = len(ts)
    num_queries = len(query) - window_size + 1

    # do not use exclusion zone for join
    exclusion_zone = 0 if is_join else int(np.ceil(window_size / 2.0))

    # find skip locations, clean up nan and inf in the ts and query
    skip_locs = core.find_skip_locations(ts, profile_length, window_size)
    ts = core.clean_nan_inf(ts)
    query = core.clean_nan_inf(query)

    # precompute some statistics on ts
    data_mu, data_sig = core.moving_avg_std(ts, window_size)
    first_product = core.fft_convolve(ts, query[0:window_size])

    # one argument tuple per batch; the layout must match what
    # _batch_compute unpacks
    args = [
        (start, end, ts, query, window_size, data_length, profile_length,
         exclusion_zone, is_join, data_mu, data_sig, first_product,
         skip_locs)
        for start, end in core.generate_batch_jobs(num_queries, n_jobs)
    ]

    # we are running single threaded stomp - no need to initialize any
    # parallel environments.
    if n_jobs == 1 or len(args) == 1:
        results = [_batch_compute(args[0])]
    else:
        # parallelize
        with core.mp_pool()(n_jobs) as pool:
            results = pool.map(_batch_compute, args)

    # now we combine the batch results
    if len(results) == 1:
        # a lone batch already holds the complete profile
        result = results[0]
        matrix_profile = result['mp']
        profile_index = result['pi']
        left_matrix_profile = result['lmp']
        left_profile_index = result['lpi']
        right_matrix_profile = result['rmp']
        right_profile_index = result['rpi']
    else:
        # start from "no neighbour found" and keep element-wise minimums
        matrix_profile = np.full(profile_length, np.inf)
        profile_index = np.full(profile_length, 0)

        # left and right profiles only exist for a self join
        left_matrix_profile = None
        right_matrix_profile = None
        left_profile_index = None
        right_profile_index = None

        if not is_join:
            left_matrix_profile = np.copy(matrix_profile)
            right_matrix_profile = np.copy(matrix_profile)
            left_profile_index = np.copy(profile_index)
            right_profile_index = np.copy(profile_index)

        for result in results:
            # update the matrix profile
            indices = result['mp'] < matrix_profile
            matrix_profile[indices] = result['mp'][indices]
            profile_index[indices] = result['pi'][indices]

            # update the left and right matrix profiles
            if not is_join:
                indices = result['lmp'] < left_matrix_profile
                left_matrix_profile[indices] = result['lmp'][indices]
                left_profile_index[indices] = result['lpi'][indices]

                indices = result['rmp'] < right_matrix_profile
                right_matrix_profile[indices] = result['rmp'][indices]
                right_profile_index[indices] = result['rpi'][indices]

    return {
        'mp': matrix_profile,
        'pi': profile_index,
        'rmp': right_matrix_profile,
        'rpi': right_profile_index,
        'lmp': left_matrix_profile,
        'lpi': left_profile_index,
        'metric': 'euclidean',
        'w': window_size,
        'ez': exclusion_zone,
        'join': is_join,
        'sample_pct': 1,
        'data': {
            'ts': ts,
            'query': query
        },
        'class': "MatrixProfile",
        'algorithm': "stomp"
    }
Пример #3
0
def mstomp(ts, window_size, return_dimension=False, n_jobs=1):
    """
    Computes multidimensional matrix profile with mSTAMP (stomp based). Ray
    or Python's multiprocessing library may be used. When you have
    initialized Ray on your machine, it takes priority over using Python's
    multiprocessing.

    Parameters
    ----------
    ts : array_like, shape (n_dim, seq_len)
        The multidimensional time series to compute the multidimensional
        matrix profile for. A one dimensional input is treated as a single
        dimension.
    window_size: int
        The size of the window to compute the matrix profile over.
    return_dimension : bool
        if True, also return the matrix profile dimension. It takes O(d^2 n)
        to store and O(d^2 n^2) to compute. (default is False)
        Enabling it forces the computation onto a single job.
    n_jobs : int, Default = 1
        Number of cpu cores to use.

    Returns
    -------
    dict : profile
        A MatrixProfile data structure.

        >>> {
        >>>     'mp': The matrix profile,
        >>>     'pi': The matrix profile 1NN indices,
        >>>     'pd': The matrix profile dimensions (empty list unless
        >>>           return_dimension is True),
        >>>     'rmp': The right matrix profile,
        >>>     'rpi': The right matrix profile 1NN indices,
        >>>     'lmp': The left matrix profile,
        >>>     'lpi': The left matrix profile 1NN indices,
        >>>     'metric': The distance metric computed for the mp,
        >>>     'w': The window size used to compute the matrix profile,
        >>>     'ez': The exclusion zone used,
        >>>     'sample_pct': Percentage of samples used in computing the MP,
        >>>     'data': {
        >>>         'ts': Time series data,
        >>>         'query': Query data if supplied
        >>>     }
        >>>     'class': "MatrixProfile"
        >>>     'algorithm': "stomp_based_mstamp"
        >>> }

    Raises
    ------
    ValueError
        If window_size < 4.
        If window_size > time series length / 2.
        If ts is not a list or np.array.

    """
    # mSTAMP is always a self join: the query is the series itself
    query = ts

    # data conversion to np.array
    ts = core.to_np_array(ts)
    query = core.to_np_array(query)

    if window_size < 4:
        raise ValueError("window size must be at least 4.")

    # promote a 1D series to shape (1, seq_len)
    if ts.ndim == 1:
        ts = np.expand_dims(ts, axis=0)
        query = np.expand_dims(query, axis=0)

    if window_size > query.shape[1] / 2:
        raise ValueError(
            "Time series is too short relative to desired window size")

    # a single job needs no validation; anything else is checked by core
    if n_jobs != 1:
        n_jobs = core.valid_n_jobs(n_jobs)

    # precompute some common values - profile length, query length etc.
    profile_length = core.get_profile_length(ts, query, window_size)
    num_dim, data_length = ts.shape[0], ts.shape[1]
    num_queries = query.shape[1] - window_size + 1
    exclusion_zone = int(np.ceil(window_size / 2.0))

    # find skip locations, clean up nan and inf in the ts and query
    skip_locs = core.find_multid_skip_locations(ts, profile_length,
                                                window_size)
    ts = core.clean_nan_inf(ts)
    query = core.clean_nan_inf(query)

    # tracking the contributing dimensions is only done single threaded
    profile_dimension = []
    if return_dimension:
        n_jobs = 1
        for i in range(num_dim):
            profile_dimension.append(
                np.empty((i + 1, profile_length), dtype=int))

    # precompute some statistics on ts, one row per dimension
    stats_shape = (num_dim, profile_length)
    data_mu = np.empty(stats_shape)
    data_sig = np.empty(stats_shape)
    first_product = np.empty(stats_shape)
    for i in range(num_dim):
        data_mu[i, :], data_sig[i, :] = core.moving_avg_std(ts[i, :],
                                                            window_size)
        first_window = query[i, 0:window_size]
        first_product[i, :] = core.fft_convolve(ts[i, :], first_window)

    # one argument tuple per batch; the layout must match what
    # _batch_compute unpacks
    args = [
        (num_dim, start, end, ts, query, window_size, data_length,
         profile_length, exclusion_zone, data_mu, data_sig, first_product,
         skip_locs, profile_dimension, return_dimension)
        for start, end in core.generate_batch_jobs(num_queries, n_jobs)
    ]

    # we are running single threaded stomp - no need to initialize any
    # parallel environments.
    if n_jobs == 1 or len(args) == 1:
        results = [_batch_compute(args[0])]
    else:
        # parallelize
        with core.mp_pool()(n_jobs) as pool:
            results = pool.map(_batch_compute, args)

    # now we combine the batch results
    if len(results) == 1:
        # a lone batch already holds the complete profile
        result = results[0]
        matrix_profile = result['mp']
        profile_index = result['pi']
        profile_dimension = result['pd']
        left_matrix_profile = result['lmp']
        left_profile_index = result['lpi']
        right_matrix_profile = result['rmp']
        right_profile_index = result['rpi']
    else:
        # start from "no neighbour found" and keep element-wise minimums
        matrix_profile = np.full((num_dim, profile_length), np.inf)
        profile_index = np.full((num_dim, profile_length), 0)
        left_matrix_profile = np.copy(matrix_profile)
        right_matrix_profile = np.copy(matrix_profile)
        left_profile_index = np.copy(profile_index)
        right_profile_index = np.copy(profile_index)

        for result in results:
            # update the matrix profile
            indices = result['mp'] < matrix_profile
            matrix_profile[indices] = result['mp'][indices]
            profile_index[indices] = result['pi'][indices]

            # update the left and right matrix profiles
            indices = result['lmp'] < left_matrix_profile
            left_matrix_profile[indices] = result['lmp'][indices]
            left_profile_index[indices] = result['lpi'][indices]

            indices = result['rmp'] < right_matrix_profile
            right_matrix_profile[indices] = result['rmp'][indices]
            right_profile_index[indices] = result['rpi'][indices]

    return {
        'mp': matrix_profile,
        'pi': profile_index,
        'pd': profile_dimension,
        'rmp': right_matrix_profile,
        'rpi': right_profile_index,
        'lmp': left_matrix_profile,
        'lpi': left_profile_index,
        'metric': 'euclidean',
        'w': window_size,
        'ez': exclusion_zone,
        'sample_pct': 1,
        'data': {
            'ts': ts,
            'query': query
        },
        'class': "MatrixProfile",
        'algorithm': "stomp_based_mstamp"
    }
Пример #4
0
def _batch_compute(args):
    """
    Internal function to compute a batch of the time series in parallel.

    Parameters
    ----------
    args : tuple
        Various attributes used for computing the batch.
        (
            batch_start : int
                The starting index for this batch.
            batch_end : int
                The ending index for this batch.
            ts : array_like
                The time series to compute the matrix profile for.
            query : array_like
                The query.
            window_size : int
                The size of the window to compute the profile over.
            data_length : int
                The number of elements in the time series.
            profile_length : int
                The number of elements that will be in the final matrix
                profile.
            exclusion_zone : int
                Used to exclude trivial matches.
            is_join : bool
                Flag to indicate if an AB join or self join is occurring.
            data_mu : array_like
                The moving average over the time series for the given window
                size.
            data_sig : array_like
                The moving standard deviation over the time series for the
                given window size.
            first_product : array_like
                The first sliding dot product for the time series over index
                0 to window_size.
            skip_locs : array_like
                Indices that should be skipped for distance profile calculation
                due to a nan or inf.
        )

    Returns
    -------
    dict : profile
        The matrix profile, left and right matrix profiles and their respective
        profile indices. The left and right entries are None when is_join is
        truthy.

        >>> {
        >>>     'mp': The matrix profile,
        >>>     'pi': The matrix profile 1NN indices,
        >>>     'rmp': The right matrix profile,
        >>>     'rpi': The right matrix profile 1NN indices,
        >>>     'lmp': The left matrix profile,
        >>>     'lpi': The left matrix profile 1NN indices,
        >>> }

    """
    (batch_start, batch_end, ts, query, window_size, data_length,
     profile_length, exclusion_zone, is_join, data_mu, data_sig,
     first_product, skip_locs) = args

    # initialize matrices
    matrix_profile = np.full(profile_length, np.inf)
    profile_index = np.full(profile_length, 0)

    left_matrix_profile = None
    right_matrix_profile = None
    left_profile_index = None
    right_profile_index = None

    if not is_join:
        left_matrix_profile = np.copy(matrix_profile)
        right_matrix_profile = np.copy(matrix_profile)
        left_profile_index = np.copy(profile_index)
        right_profile_index = np.copy(profile_index)

    # Compare by value: an identity test (`is 0`) is False for NumPy
    # integer zeros and raises a SyntaxWarning on modern Python.
    is_first_batch = batch_start == 0

    # with batch 0 we do not need to recompute the dot product
    # however with other batch windows, we need the previous iterations sliding
    # dot product
    if is_first_batch:
        first_window = query[batch_start:batch_start + window_size]
        last_product = np.copy(first_product)
    else:
        first_window = query[batch_start - 1:batch_start + window_size - 1]
        last_product = core.fft_convolve(ts, first_window)

    query_sum = np.sum(first_window)
    query_2sum = np.sum(first_window**2)
    query_mu, query_sig = core.moving_avg_std(first_window, window_size)

    drop_value = first_window[0]

    # only compute the distance profile for index 0 and update
    if is_first_batch:
        distance_profile = core.distance_profile(last_product, window_size,
                                                 data_mu, data_sig, query_mu,
                                                 query_sig)

        # apply exclusion zone
        distance_profile = core.apply_exclusion_zone(exclusion_zone, is_join,
                                                     window_size, data_length,
                                                     0, distance_profile)

        # update the matrix profile
        indices = (distance_profile < matrix_profile)
        matrix_profile[indices] = distance_profile[indices]
        profile_index[indices] = 0

        batch_start += 1

    # make sure to compute inclusively from batch start to batch end
    # otherwise there are gaps in the profile
    if batch_end < profile_length:
        batch_end += 1

    # iteratively compute distance profile and update with element-wise mins
    for i in range(batch_start, batch_end):

        # check for nan or inf and skip
        # NOTE(review): a skipped index leaves the rolling sums and
        # last_product untouched, so the next iteration rolls forward from
        # stale state - confirm this is intended.
        if skip_locs[i]:
            continue

        # roll the query statistics forward by one position
        query_window = query[i:i + window_size]
        incoming = query_window[-1]
        query_sum = query_sum - drop_value + incoming
        query_2sum = query_2sum - drop_value**2 + incoming**2
        query_mu = query_sum / window_size
        query_sig2 = query_2sum / window_size - query_mu**2
        query_sig = np.sqrt(query_sig2)

        # roll the sliding dot product forward by one position
        last_product[1:] = last_product[0:data_length - window_size] \
            - ts[0:data_length - window_size] * drop_value \
            + ts[window_size:] * incoming
        last_product[0] = first_product[i]
        drop_value = query_window[0]

        distance_profile = core.distance_profile(last_product, window_size,
                                                 data_mu, data_sig, query_mu,
                                                 query_sig)

        # apply the exclusion zone
        distance_profile = core.apply_exclusion_zone(exclusion_zone, is_join,
                                                     window_size, data_length,
                                                     i, distance_profile)

        # update the matrix profile
        indices = (distance_profile < matrix_profile)
        matrix_profile[indices] = distance_profile[indices]
        profile_index[indices] = i

        # update the left and right matrix profiles
        if not is_join:
            # Basic slices are views, so writing through them updates the
            # underlying profile arrays in place.

            # left profile: only positions at or after i may improve
            closer = distance_profile[i:] < left_matrix_profile[i:]
            left_matrix_profile[i:][closer] = distance_profile[i:][closer]
            left_profile_index[i:][closer] = i

            # right profile: only positions before i may improve
            closer = distance_profile[0:i] < right_matrix_profile[0:i]
            right_matrix_profile[0:i][closer] = distance_profile[0:i][closer]
            right_profile_index[0:i][closer] = i

    return {
        'mp': matrix_profile,
        'pi': profile_index,
        'rmp': right_matrix_profile,
        'rpi': right_profile_index,
        'lmp': left_matrix_profile,
        'lpi': left_profile_index,
    }
Пример #5
0
def _batch_compute(args):
    """
    Internal function to compute a batch of the multidimensional time series
    in parallel.

    Parameters
    ----------
    args : tuple
        Various attributes used for computing the batch.
        (
            num_dim : int
                The number of dimensions (rows) in the time series.
            batch_start : int
                The starting index for this batch.
            batch_end : int
                The ending index for this batch.
            ts : array_like
                The time series to compute the matrix profile for.
            query : array_like
                The query.
            window_size : int
                The size of the window to compute the profile over.
            data_length : int
                The number of elements in the time series.
            profile_length : int
                The number of elements that will be in the final matrix
                profile.
            exclusion_zone : int
                Used to exclude trivial matches.
            data_mu : array_like
                The moving average over the time series for the given window
                size.
            data_sig : array_like
                The moving standard deviation over the time series for the
                given window size.
            first_product : array_like
                The first sliding dot product for the time series over index
                0 to window_size.
            skip_locs : array_like
                Indices that should be skipped for distance profile calculation
                due to a nan or inf.
            profile_dimension : list
                Per-dimension arrays that receive the matrix profile
                dimensions; only written when return_dimension is truthy.
            return_dimension : bool
                Whether to record the matrix profile dimensions.
        )

    Returns
    -------
    dict : profile
        The matrix profile, left and right matrix profiles and their respective
        profile indices.

        >>> {
        >>>     'mp': The matrix profile,
        >>>     'pi': The matrix profile 1NN indices,
        >>>     'pd': The matrix profile dimensions,
        >>>     'rmp': The right matrix profile,
        >>>     'rpi': The right matrix profile 1NN indices,
        >>>     'lmp': The left matrix profile,
        >>>     'lpi': The left matrix profile 1NN indices,
        >>> }

    """
    (num_dim, batch_start, batch_end, ts, query, window_size, data_length,
     profile_length, exclusion_zone, data_mu, data_sig,
     first_product, skip_locs, profile_dimension, return_dimension) = args

    # initialize matrices
    matrix_profile = np.full((num_dim, profile_length), np.inf)
    profile_index = np.full((num_dim, profile_length), 0)

    left_matrix_profile = np.copy(matrix_profile)
    right_matrix_profile = np.copy(matrix_profile)
    left_profile_index = np.copy(profile_index)
    right_profile_index = np.copy(profile_index)

    # with batch 0 we do not need to recompute the dot product
    # however with other batch windows, we need the previous iterations sliding
    # dot product
    last_product = np.copy(first_product)
    # Compare by value: an identity test (`is 0`) is False for NumPy
    # integer zeros and raises a SyntaxWarning on modern Python.
    if batch_start == 0:
        first_window = query[:, batch_start:batch_start + window_size]
    else:
        first_window = query[:, batch_start - 1:batch_start + window_size - 1]
        for i in range(num_dim):
            last_product[i, :] = core.fft_convolve(ts[i, :],
                                                   first_window[i, :])

    query_sum = np.sum(first_window, axis=1)
    query_2sum = np.sum(first_window**2, axis=1)
    query_mu, query_sig = np.empty(num_dim), np.empty(num_dim)
    for i in range(num_dim):
        query_mu[i], query_sig[i] = core.moving_avg_std(first_window[i, :],
                                                        window_size)

    drop_value = np.empty(num_dim)
    for i in range(num_dim):
        drop_value[i] = first_window[i, 0]
    distance_profile = np.empty((num_dim, profile_length))

    # make sure to compute inclusively from batch start to batch end
    # otherwise there are gaps in the profile
    if batch_end < profile_length:
        batch_end += 1

    # iteratively compute distance profile and update with element-wise mins
    for i in range(batch_start, batch_end):
        # check for nan or inf and skip
        # NOTE(review): a skipped index leaves the rolling sums and
        # last_product untouched, so the next iteration rolls forward from
        # stale state - confirm this is intended.
        if skip_locs[i]:
            continue

        for j in range(num_dim):
            query_window = query[j, i:i + window_size]

            # index 0 uses the precomputed statistics and dot product;
            # every later index rolls them forward by one position
            if i != 0:
                incoming = query_window[-1]
                query_sum[j] = query_sum[j] - drop_value[j] + incoming
                query_2sum[j] = query_2sum[j] - drop_value[j]**2 \
                    + incoming**2
                query_mu[j] = query_sum[j] / window_size
                query_sig2 = query_2sum[j] / window_size - query_mu[j]**2
                # clamp so the square root never sees a value below _EPS
                if query_sig2 < _EPS:
                    query_sig2 = _EPS
                query_sig[j] = np.sqrt(query_sig2)
                last_product[j, 1:] = \
                    last_product[j, 0:data_length - window_size] \
                    - ts[j, 0:data_length - window_size] * drop_value[j] \
                    + ts[j, window_size:] * incoming
                last_product[j, 0] = first_product[j, i]

            distance_profile[j, :] = core.distance_profile(
                last_product[j, :], window_size, data_mu[j, :],
                data_sig[j, :], query_mu[j], query_sig[j])

            # apply the exclusion zone
            distance_profile[j, :] = core.apply_exclusion_zone(
                exclusion_zone, 0, window_size, data_length, i,
                distance_profile[j, :])

            # snap distances below _EPS to exactly zero
            distance_profile[j, distance_profile[j, :] < _EPS] = 0
            drop_value[j] = query_window[0]

        # skip this query index when any dimension has a standard deviation
        # below _EPS
        if np.any(query_sig < _EPS):
            continue
        distance_profile[:, skip_locs] = np.inf
        distance_profile[data_sig < np.sqrt(_EPS)] = np.inf

        # sort the per-dimension distances so that row j holds the
        # (j + 1)-th smallest distance at every position
        distance_profile_dim = np.argsort(distance_profile, axis=0)
        distance_profile_sort = np.sort(distance_profile, axis=0)
        distance_profile_cumsum = np.zeros(profile_length)
        for j in range(num_dim):
            # mean of the j + 1 smallest per-dimension distances
            distance_profile_cumsum += distance_profile_sort[j, :]
            distance_profile_mean = distance_profile_cumsum / (j + 1)

            # update the matrix profile
            indices = (distance_profile_mean < matrix_profile[j, :])
            matrix_profile[j, indices] = distance_profile_mean[indices]
            profile_index[j, indices] = i
            if return_dimension:
                profile_dimension[j][:, indices] = \
                    distance_profile_dim[:j + 1, indices]

            # update the left and right matrix profiles
            # Basic slices are views, so writing through them updates the
            # underlying profile arrays in place.

            # left profile: only positions at or after i may improve
            closer = distance_profile_mean[i:] < left_matrix_profile[j, i:]
            left_matrix_profile[j, i:][closer] = \
                distance_profile_mean[i:][closer]
            left_profile_index[j, i:][closer] = i

            # right profile: only positions before i may improve
            closer = distance_profile_mean[0:i] < right_matrix_profile[j, 0:i]
            right_matrix_profile[j, 0:i][closer] = \
                distance_profile_mean[0:i][closer]
            right_profile_index[j, 0:i][closer] = i

    return {
        'mp': matrix_profile,
        'pi': profile_index,
        'pd': profile_dimension,
        'rmp': right_matrix_profile,
        'rpi': right_profile_index,
        'lmp': left_matrix_profile,
        'lpi': left_profile_index,
    }
Пример #6
0
def statistics(ts, window_size):
    """
    Summarize a 1D time series with global and sliding-window statistics.

    The minimum, maximum, mean, standard deviation and median are computed
    once over the whole series and once per window of ``window_size``
    consecutive samples.

    Parameters
    ----------
    ts : array_like
        The time series.
    window_size : int
        The size of the window to compute moving statistics over.

    Returns
    -------
    dict :
        {
            ts: the original time series,
            min: the global minimum,
            max: the global maximum,
            mean: the global mean,
            std: the global standard deviation,
            median: the global median,
            moving_min: the moving minimum,
            moving_max: the moving maximum,
            moving_mean: the moving mean,
            moving_std: the moving standard deviation,
            moving_median: the moving median,
            window_size: the window size provided,
            class: Statistics
        }

    Raises
    ------
    ValueError
        If ts is not array like.
        If ts is not 1D.
        If window_size is not an int.
        If window_size > len(ts).
        If window_size < 3.
    """
    # validate the inputs; the order decides which error is reported first
    if not core.is_array_like(ts):
        raise ValueError('ts must be array like')
    if not core.is_one_dimensional(ts):
        raise ValueError('The time series must be 1D')
    if not isinstance(window_size, int):
        raise ValueError('Expecting int for window_size')
    if window_size > len(ts):
        raise ValueError('Window size cannot be greater than len(ts)')
    if window_size < 3:
        raise ValueError('Window size cannot be less than 3')

    # moving mean/std come from core; the other moving statistics are
    # reduced across each row of the rolling-window view
    mean_per_window, std_per_window = core.moving_avg_std(ts, window_size)
    windows = core.rolling_window(ts, window_size)

    summary = {'ts': ts}

    # statistics over the entire series
    summary['min'] = np.min(ts)
    summary['max'] = np.max(ts)
    summary['mean'] = np.mean(ts)
    summary['std'] = np.std(ts)
    summary['median'] = np.median(ts)

    # statistics per window
    summary['moving_min'] = np.min(windows, axis=1)
    summary['moving_max'] = np.max(windows, axis=1)
    summary['moving_mean'] = mean_per_window
    summary['moving_std'] = std_per_window
    summary['moving_median'] = np.median(windows, axis=1)

    summary['window_size'] = window_size
    summary['class'] = 'Statistics'
    return summary
Пример #7
0
def scrimp_plus_plus(ts, window_size, query=None, step_size=0.25, sample_pct=0.1,
                     random_state=None, n_jobs=1):
    """SCRIMP++ is an anytime algorithm that computes the matrix profile for a
    given time series (ts) over a given window size (m). Essentially, it allows
    for an approximate solution to be provided for quicker analysis. In the
    case of this implementation, sample percentage is used. An approximate
    solution is given based on a sample percentage from 0 to 1. The default
    sample percentage is currently 10%.

    The computation runs in two stages: ``prescrimp`` builds an initial
    approximate profile (and performs the input validation), then the SCRIMP
    stage below refines that profile along a random sample of diagonals.

    This algorithm was created at the University of California Riverside. For
    further academic understanding, please review this paper:

    Matrix Profile XI: SCRIMP++: Time Series Motif Discovery at Interactive
    Speed. Yan Zhu, Chin-Chia Michael Yeh, Zachary Zimmerman, Kaveh Kamgar
    Eamonn Keogh, ICDM 2018.

    https://www.cs.ucr.edu/~eamonn/SCRIMP_ICDM_camera_ready_updated.pdf

    Parameters
    ----------
    ts : np.ndarray
        The time series to compute the matrix profile for.
    window_size : int
        The window size.
    query : array_like
        Optionally, a query can be provided to perform a similarity join.
    step_size : float, default 0.25
        The sampling interval for the window. The paper suggests 0.25 is the
        most practical. It should be a float value between 0 and 1. It is
        only forwarded to ``prescrimp``; the refinement stage does not use it.
    sample_pct : float, default = 0.1 (10%)
        Fraction of the candidate diagonals (those beyond the exclusion zone)
        that the refinement stage evaluates.
    random_state : int, default None
        Set the random seed generator for reproducible results. Note that
        this seeds NumPy's global random generator.
    n_jobs : int, Default = 1
        Number of cpu cores to use. Forwarded to ``prescrimp``.

    Returns
    -------
    dict : profile
        A MatrixProfile data structure.

        >>> {
        >>>    'mp': The matrix profile,
        >>>    'pi': The matrix profile 1NN indices,
        >>>    'rmp': The right matrix profile,
        >>>    'rpi': The right matrix profile 1NN indices,
        >>>    'lmp': The left matrix profile,
        >>>    'lpi': The left matrix profile 1NN indices,
        >>>    'metric': The distance metric computed for the mp,
        >>>    'w': The window size used to compute the matrix profile,
        >>>    'ez': The exclusion zone used,
        >>>    'join': Flag indicating if a similarity join was computed,
        >>>    'sample_pct': Percentage of samples used in computing the MP,
        >>>    'data': {
        >>>        'ts': Time series data,
        >>>        'query': Query data if supplied
        >>>    }
        >>>    'class': "MatrixProfile"
        >>>    'algorithm': "scrimp++"
        >>> }

    Raises
    ------
    ValueError
        If random_state is not accepted by np.random.seed.
        If window_size < 4.
        If window_size > query length / 2.
        If ts is not a list or np.array.
        If query is not a list or np.array.
        If ts or query is not one dimensional.
        If sample_pct is not between 0 and 1.

    """
    # Validate random_state by trying to seed with it. Only ordinary errors
    # are translated; KeyboardInterrupt/SystemExit must keep propagating.
    if random_state is not None:
        try:
            np.random.seed(random_state)
        except Exception:
            raise ValueError('Invalid random_state value given.')

    ###########################
    # PreSCRIMP
    ###########################
    profile = prescrimp(ts, window_size, query=query, step_size=step_size,
        sample_pct=sample_pct, random_state=random_state, n_jobs=n_jobs)

    # Work on the converted arrays stored in the profile. For a self-join the
    # profile carries no query, so the series is compared against itself.
    ts = profile['data']['ts']
    query = profile['data']['query']
    if query is None:
        query = ts

    # precompute some common values - profile length, data length etc.
    profile_length = core.get_profile_length(ts, query, window_size)
    data_length = len(ts)
    exclusion_zone = profile['ez']
    window_size = profile['w']

    # precompute some statistics on ts
    data_mu, data_sig = core.moving_avg_std(ts, window_size)

    ###########################
    # SCRIMP
    ###########################

    # Candidate diagonals are the offsets beyond the exclusion zone; only a
    # random sample_pct share of them is refined.
    orig_index = np.arange(profile_length)
    candidates = np.copy(orig_index[orig_index > exclusion_zone])
    sample_size = int(np.ceil(len(candidates) * sample_pct))
    compute_order = np.random.choice(candidates, size=sample_size,
        replace=False)

    # scratch buffers reused across diagonals
    curlastz = np.zeros(profile_length)
    curdistance = np.zeros(profile_length)
    dist1 = np.full(profile_length, np.inf)
    dist2 = np.full(profile_length, np.inf)

    for idx in compute_order:
        # Sliding dot products along the diagonal at offset idx: the first
        # one is computed directly, the rest incrementally via a cumsum.
        curlastz[idx] = np.sum(ts[0:window_size] * ts[idx:idx + window_size])
        curlastz[idx+1:] = curlastz[idx] + np.cumsum(
            (ts[window_size:data_length - idx] * ts[idx + window_size:data_length]) -
            (ts[0:profile_length - idx - 1] * ts[idx:profile_length - 1])
        )

        # convert the dot products into distances
        curdistance[idx:] = np.sqrt(np.abs(
            2 * (window_size - (curlastz[idx:profile_length + 1] -
                window_size * (data_mu[idx:] * data_mu[0:profile_length - idx])) /
                (data_sig[idx:] * data_sig[0:profile_length - idx]))
        ))

        # dist1[j] pairs position j with j - idx; dist2[j] pairs position j
        # with j + idx. Both buffers are rewritten in full on every pass so
        # no value from a previously processed diagonal is left behind.
        dist1[:idx] = np.inf
        dist1[idx:] = curdistance[idx:]

        dist2[:profile_length - idx] = curdistance[idx:]
        dist2[profile_length - idx:] = np.inf

        loc1 = dist1 < profile['mp']
        if loc1.any():
            profile['mp'][loc1] = dist1[loc1]
            profile['pi'][loc1] = orig_index[loc1] - idx

        loc2 = dist2 < profile['mp']
        if loc2.any():
            profile['mp'][loc2] = dist2[loc2]
            profile['pi'][loc2] = orig_index[loc2] + idx

    profile['algorithm'] = 'scrimp++'
    profile['sample_pct'] = sample_pct

    return profile
Пример #8
0
def prescrimp(ts, window_size, query=None, step_size=0.25, sample_pct=0.1,
                     random_state=None, n_jobs=1):
    """
    This is the PreScrimp algorithm from the SCRIMP++ paper. It is primarily
    used to compute the approximate matrix profile. In this case we use
    a sample percentage to mock "the anytime/approximate nature".

    Parameters
    ----------
    ts : np.ndarray
        The time series to compute the matrix profile for.
    window_size : int
        The window size.
    query : array_like
        Optionally, a query can be provided to perform a similarity join.
    step_size : float, default 0.25
        The sampling interval for the window. The paper suggests 0.25 is the
        most practical. It should be a float value between 0 and 1. The
        stride actually used is floor(window_size * step_size) samples, but
        never less than 1.
    sample_pct : float, default = 0.1 (10%)
        Number of samples to compute distances for in the MP. In this
        function the value is validated and echoed back in the result; the
        indices visited are determined by step_size.
    random_state : int, default None
        Set the random seed generator for reproducible results. Note that
        this seeds NumPy's global random generator.
    n_jobs : int, Default = 1
        Number of cpu cores to use. Not referenced by this function's body.

    Note
    ----
    The matrix profiles computed from prescrimp will always be the approximate
    solution.

    Returns
    -------
    dict : profile
        A MatrixProfile data structure.

        >>> {
        >>>    'mp': The matrix profile,
        >>>    'pi': The matrix profile 1NN indices,
        >>>    'rmp': The right matrix profile,
        >>>    'rpi': The right matrix profile 1NN indices,
        >>>    'lmp': The left matrix profile,
        >>>    'lpi': The left matrix profile 1NN indices,
        >>>    'metric': The distance metric computed for the mp,
        >>>    'w': The window size used to compute the matrix profile,
        >>>    'ez': The exclusion zone used,
        >>>    'join': Flag indicating if a similarity join was computed,
        >>>    'sample_pct': Percentage of samples used in computing the MP,
        >>>    'data': {
        >>>        'ts': Time series data,
        >>>        'query': Query data if supplied
        >>>    }
        >>>    'class': "MatrixProfile"
        >>>    'algorithm': "prescrimp"
        >>> }

    Raises
    ------
    ValueError
        If window_size < 4.
        If window_size > query length / 2.
        If ts is not a list or np.array.
        If query is not a list or np.array.
        If ts or query is not one dimensional.
        If step_size is not a float between 0 and 1.
        If sample_pct is not between 0 and 1.
        If random_state is not accepted by np.random.seed.

    """
    is_join = core.is_similarity_join(ts, query)
    if not is_join:
        query = ts

    # data conversion to np.array
    ts = core.to_np_array(ts)
    query = core.to_np_array(query)

    # validate step_size
    if not isinstance(step_size, float) or step_size > 1 or step_size < 0:
        raise ValueError('step_size should be a float between 0 and 1.')

    # validate sample_pct
    if not isinstance(sample_pct, float) or sample_pct > 1 or sample_pct < 0:
        raise ValueError('sample_pct should be a float between 0 and 1.')

    # Validate random_state by trying to seed with it. Only ordinary errors
    # are translated; KeyboardInterrupt/SystemExit must keep propagating.
    if random_state is not None:
        try:
            np.random.seed(random_state)
        except Exception:
            raise ValueError('Invalid random_state value given.')

    if window_size < 4:
        error = "window size must be at least 4."
        raise ValueError(error)

    if window_size > len(query) / 2:
        error = "Time series is too short relative to desired window size"
        raise ValueError(error)

    # precompute some common values - profile length, query length etc.
    # The stride is clamped to one sample: a small fraction of a small window
    # (e.g. 4 * 0.2) floors to 0, which np.arange cannot use as a step.
    step_size = max(1, int(math.floor(window_size * step_size)))
    profile_length = core.get_profile_length(ts, query, window_size)
    data_length = len(ts)
    exclusion_zone = int(np.ceil(window_size / 4.0))

    matrix_profile = np.zeros(profile_length)
    mp_index = np.zeros(profile_length, dtype='int')

    X = np.fft.fft(ts)
    mux, sigx = core.moving_avg_std(ts, window_size)

    dotproduct = np.zeros(profile_length)
    refine_distance = np.full(profile_length, np.inf)
    orig_index = np.arange(profile_length)

    # visit every step_size-th subsequence and update the matrix profile
    compute_order = np.arange(0, profile_length, step=step_size)

    for iteration, idx in enumerate(compute_order):
        subsequence = ts[idx:idx + window_size]

        # compute distance profile
        distance_profile = calc_distance_profile(X, subsequence, data_length,
            window_size, mux, sigx)

        # apply exclusion zone
        distance_profile = core.apply_exclusion_zone(exclusion_zone, is_join,
            window_size, data_length, idx, distance_profile)

        # find and store nearest neighbor; the first distance profile seeds
        # the matrix profile, later ones only lower it element-wise
        if iteration == 0:
            matrix_profile = distance_profile
            mp_index[:] = idx
        else:
            update_pos = distance_profile < matrix_profile
            mp_index[update_pos] = idx
            matrix_profile[update_pos] = distance_profile[update_pos]

        idx_min = np.argmin(distance_profile)
        matrix_profile[idx] = distance_profile[idx_min]
        mp_index[idx] = idx_min
        idx_nn = mp_index[idx]

        # compute the target indices: the neighbourhood of idx (within one
        # stride on either side) whose entries get refined
        idx_diff = idx_nn - idx
        endidx = np.min([
            profile_length - 1,
            idx + step_size - 1,
            profile_length - idx_diff - 1
        ])
        beginidx = np.max([0, idx - step_size + 1, 2 - idx_diff])

        # compute dot product and refine distance for the idx, begin idx
        # and end idx
        dotproduct = calc_dotproduct_idx(dotproduct, window_size,
            matrix_profile, idx, sigx, idx_nn, mux)

        dotproduct = calc_dotproduct_end_idx(ts, dotproduct, idx, window_size,
                                             endidx, idx_nn, idx_diff)

        refine_distance = calc_refine_distance_end_idx(
            refine_distance, dotproduct, idx, endidx, mux, sigx, idx_nn,
            idx_diff, window_size)

        dotproduct = calc_dotproduct_begin_idx(
            ts, dotproduct, beginidx, idx, idx_diff, window_size, idx_nn)

        refine_distance = calc_refine_distance_begin_idx(
            refine_distance, dotproduct, beginidx, idx, idx_diff, idx_nn,
            sigx, mux, window_size)

        matrix_profile, mp_index = apply_update_positions(matrix_profile,
                                                          mp_index,
                                                          refine_distance,
                                                          beginidx,
                                                          endidx,
                                                          orig_index, idx_diff)

    return {
        'mp': matrix_profile,
        'pi': mp_index,
        'rmp': None,
        'rpi': None,
        'lmp': None,
        'lpi': None,
        'w': window_size,
        'ez': exclusion_zone,
        'join': is_join,
        'sample_pct': sample_pct,
        'metric': 'euclidean',
        'data': {
            'ts': ts,
            'query': query if is_join else None
        },
        'class': 'MatrixProfile',
        'algorithm': 'prescrimp',
    }
Пример #9
0
def mass2(ts, query, extras=False):
    """
    Compute the distance profile of ``query`` against every window of
    ``ts`` using an FFT-based sliding dot product.

    Parameters
    ----------
    ts : array_like
        The time series to search.
    query : array_like
        The query.
    extras : boolean, default False
        When truthy, return the intermediate values alongside the distance
        profile instead of the distance profile alone.

    Returns
    -------
    An array of distances np.array() or dict with extras. The distances are
    derived from the inverse FFT output and are not cast to real values here.

    Extras:
    {
        'distance_profile': The distance profile,
        'product': The FFT product between ts and query,
        'data_mean': The moving average of the ts over len(query),
        'query_mean': The mean of the query,
        'data_std': The moving std. of the ts over len(query),
        'query_std': The std. of the query
    }

    Raises
    ------
    ValueError
        If ts is not a list or np.array.
        If query is not a list or np.array.
        If ts or query is not one dimensional.
    """
    series, pattern = core.precheck_series_and_query_1d(ts, query)

    series_len = len(series)
    pattern_len = len(pattern)

    # statistics of the query itself
    pattern_mean = np.mean(pattern)
    pattern_std = np.std(pattern)

    # Rolling statistics of the series, left-padded (ones for the mean,
    # zeros for the std) so they line up with the series by index.
    rolling_mean, rolling_std = core.moving_avg_std(series, pattern_len)
    rolling_mean = np.append(
        np.ones([1, series_len - len(rolling_mean)]), rolling_mean)
    rolling_std = np.append(
        np.zeros([1, series_len - len(rolling_std)]), rolling_std)

    # reverse the query and zero-pad it to the series length
    padded_pattern = np.append(
        np.flip(pattern), np.zeros([1, series_len - pattern_len]))

    # multiply in the frequency domain, then transform back
    series_fft = np.fft.fft(series)
    pattern_fft = np.fft.fft(padded_pattern)
    pattern_fft.resize(series_fft.shape)
    product = np.fft.ifft(series_fft * pattern_fft)

    # only positions pattern_len - 1 .. series_len - 1 enter the distances
    valid = slice(pattern_len - 1, series_len)
    centered = product[valid] - pattern_len * rolling_mean[valid] * pattern_mean
    scale = rolling_std[valid] * pattern_std
    distances = np.sqrt(2 * (pattern_len - centered / scale))

    if not extras:
        return distances

    return {
        'distance_profile': distances,
        'product': product,
        'data_mean': rolling_mean,
        'query_mean': pattern_mean,
        'data_std': rolling_std,
        'query_std': pattern_std
    }