Example #1
def test_precheck_series_and_query_invalid():
    # a non array_like ts should raise a ValueError with a helpful message
    with pytest.raises(ValueError) as excinfo:
        mtscore.precheck_series_and_query('1', [1, 2, 3])
    assert 'Invalid ts value given. Must be array_like!' \
        in str(excinfo.value)

    # a non array_like query should raise as well
    with pytest.raises(ValueError) as excinfo:
        mtscore.precheck_series_and_query([1, 2, 3], '1')
    assert 'Invalid query value given. Must be array_like!' \
        in str(excinfo.value)
Example #2
def test_precheck_series_and_query_valid():
    ts = [1, 2, 3, 4, 5, 6, 7, 8]
    q = [1, 2, 3, 4]

    actual_ts, actual_q = mtscore.precheck_series_and_query(ts, q)
    np.testing.assert_equal(actual_ts, np.array(ts))
    np.testing.assert_equal(actual_q, np.array(q))
Example #3
def mass2(ts, query):
    """
    Compute the distance profile for the given query over the given time
    series.

    Parameters
    ----------
    ts : array_like
        The array to create a rolling window on.
    query : array_like
        The query.

    Returns
    -------
    An array of distances.

    Raises
    ------
    ValueError
        If ts is not a list or np.array.
        If query is not a list or np.array.
        If ts or query is not one dimensional.
    """
    ts, query = mtscore.precheck_series_and_query(ts, query)

    n = len(ts)
    m = len(query)
    x = ts
    y = query

    # query statistics
    meany = np.mean(y)
    sigmay = np.std(y)

    # rolling statistics of the series, left-padded back to length n
    meanx = mtscore.moving_average(x, m)
    meanx = np.append(np.ones([1, len(x) - len(meanx)]), meanx)
    sigmax = mtscore.moving_std(x, m)
    sigmax = np.append(np.zeros([1, len(x) - len(sigmax)]), sigmax)

    # reverse the query and zero-pad it to the length of the series
    y = np.append(np.flip(y), np.zeros([1, n - m]))

    # sliding dot products via FFT-based convolution
    X = np.fft.fft(x)
    Y = np.fft.fft(y)
    Y.resize(X.shape)
    Z = X * Y
    z = np.fft.ifft(Z)

    # z-normalized Euclidean distance profile
    dist = 2 * (m - (z[m - 1:n] - m * meanx[m - 1:n] * meany) /
                (sigmax[m - 1:n] * sigmay))
    dist = np.sqrt(dist)

    return dist
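
A minimal usage sketch for mass2 follows. It is illustrative only: it assumes the function above is exposed through the mass_ts package (imported here as mts), which is an assumption about packaging rather than something shown in the example, and the sample data is made up.

# Hypothetical usage of mass2; the mass_ts import path is an assumption.
import numpy as np
import mass_ts as mts

ts = np.array([1.0, 1.0, 1.0, 2.0, 1.0, 0.25, 3.0, 2.0, 5.0, 1.0, 1.0, 1.0])
query = np.array([0.0, 1.0, 1.0, 0.0])

# one distance per query-length window; values are complex after the FFTs,
# so compare magnitudes
distances = mts.mass2(ts, query)
best_idx = np.argmin(np.absolute(distances))
print(best_idx, np.absolute(distances[best_idx]))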
Example #4
def mass(ts, query, normalize_query=True, corr_coef=False):
    """
    Compute the distance profile for the given query over the given time 
    series. Optionally, the correlation coefficient can be returned and 
    the query can be normalized.

    Parameters
    ----------
    ts : array_like
        The array to create a rolling window on.
    query : array_like
        The query.
    normalize_query : bool, default True
        Optionally normalize the query.
    corr_coef : bool, default False
        Optionally return the correlation coefficients instead of the
        distances.

    Returns
    -------
    An array of distances.

    Raises
    ------
    ValueError
        If ts is not a list or np.array.
        If query is not a list or np.array.
        If ts or query is not one dimensional.
    """
    ts, query = mtscore.precheck_series_and_query(ts, query)

    if normalize_query:
        query = (query - np.mean(query)) / np.std(query)

    n = len(ts)
    m = len(query)
    # zero-pad the series to length 2n; reverse the query and pad it to match
    x = np.append(ts, np.zeros([1, n]))
    y = np.append(np.flipud(query), np.zeros([1, 2 * n - m]))

    # sliding dot products via FFT-based convolution
    X = np.fft.fft(x)
    Y = np.fft.fft(y)
    Y.resize(X.shape)
    Z = X * Y
    z = np.fft.ifft(Z)

    # query sums and rolling sums/statistics of the series via cumulative sums
    sumy = np.sum(y)
    sumy2 = np.sum(y**2)

    cum_sumx = np.cumsum(x)
    cum_sumx2 = np.cumsum(x**2)

    sumx2 = cum_sumx2[m:n] - cum_sumx2[0:n - m]
    sumx = cum_sumx[m:n] - cum_sumx[0:n - m]
    meanx = sumx / m
    sigmax2 = (sumx2 / m) - (meanx**2)
    sigmax = np.sqrt(sigmax2)

    # distance profile; the inverse FFT leaves small imaginary parts,
    # so take the magnitude of the result
    dist = (sumx2 - 2 * sumx * meanx + m * (meanx ** 2)) \
        / sigmax2 - 2 * (z[m:n] - sumy * meanx) \
        / sigmax + sumy2
    dist = np.absolute(np.sqrt(dist))

    if corr_coef:
        # convert the distances to correlation coefficients
        return 1 - np.absolute(dist) / (2 * m)

    return dist
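
A short, hedged usage sketch for mass: as above, the mass_ts import path is assumed, and the random data is only for illustration.

# Hypothetical usage of mass; the mass_ts import path is an assumption.
import numpy as np
import mass_ts as mts

ts = np.random.uniform(size=1024)
query = np.random.uniform(size=32)

# default behaviour: z-normalize the query and return a distance profile
dist = mts.mass(ts, query)

# ask for correlation coefficients instead of distances
corr = mts.mass(ts, query, corr_coef=True)
print(len(dist), len(corr))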
Example #5
def mass3(ts, query, pieces):
    """
    Compute the distance profile for the given query over the given time
    series. This version of MASS processes the series in fixed-size pieces,
    which makes it hardware efficient when the piece size is chosen well.

    Parameters
    ----------
    ts : array_like
        The array to create a rolling window on.
    query : array_like
        The query.
    pieces : int
        The size of each piece (segment) the series is processed in. It must
        be at least the length of the query and is best as a power of 2.

    Returns
    -------
    An array of distances.

    Raises
    ------
    ValueError
        If ts is not a list or np.array.
        If query is not a list or np.array.
        If ts or query is not one dimensional.
        If pieces is less than the length of the query.
    """
    ts, query = mtscore.precheck_series_and_query(ts, query)

    m = len(query)

    if pieces < m:
        raise ValueError(
            'pieces should be at least as large as the query length.')

    n = len(ts)
    k = pieces
    x = ts
    dist = np.array([])

    # compute stats in O(n)
    meany = np.mean(query)
    sigmay = np.std(query)

    meanx = mtscore.moving_average(x, m)
    meanx = np.append(np.ones([1, len(x) - len(meanx)]), meanx)
    sigmax = mtscore.moving_std(x, m)
    sigmax = np.append(np.zeros([1, len(x) - len(sigmax)]), sigmax)

    # reverse the query and append zeros
    y = np.append(np.flip(query), np.zeros(pieces - m))

    step_size = k - m + 1
    stop = n - k + 1

    for j in range(0, stop, step_size):
        # The main trick of getting dot products in O(n log n) time
        X = np.fft.fft(x[j:j + k])
        Y = np.fft.fft(y)

        Z = X * Y
        z = np.fft.ifft(Z)

        d = 2 * (m - (z[m - 1:k] - m * meanx[m + j - 1:j + k] * meany) /
                 (sigmax[m + j - 1:j + k] * sigmay))
        d = np.sqrt(d)
        dist = np.append(dist, d)

    # handle any remaining tail of the series not covered by the loop above
    j = j + k - m
    k = n - j - 1
    if k >= m:
        X = np.fft.fft(x[j:n - 1])
        y = y[0:k]

        Y = np.fft.fft(y)
        Z = X * Y
        z = np.fft.ifft(Z)

        d = 2 * (m - (z[m - 1:k] - m * meanx[j + m - 1:n - 1] * meany) /
                 (sigmax[j + m - 1:n - 1] * sigmay))

        d = np.sqrt(d)
        dist = np.append(dist, d)

    return np.array(dist)
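
A hedged usage sketch for mass3. The mass_ts import path is an assumption; pieces is the segment size used for the FFTs, must be at least len(query), and a power of 2 is recommended.

# Hypothetical usage of mass3; the mass_ts import path is an assumption.
import numpy as np
import mass_ts as mts

ts = np.random.uniform(size=10000)
query = np.random.uniform(size=100)

# pieces (here 256) must be >= len(query); powers of 2 work best
distances = mts.mass3(ts, query, 256)
best_idx = np.argmin(np.absolute(distances))
print(best_idx, np.absolute(distances[best_idx]))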
Example #6
def mass2_batch(ts, query, batch_size, top_matches=3, n_jobs=1):
    """
    MASS2 batch is a batch version of MASS2 that reduces overall memory usage,
    provides parallelization and enables you to find the top K matches within
    the time series. This implementation is intended for similarity search
    over very large time series. The returned results are not sorted by
    distance, so you will need to find the top match with np.argmin() or sort
    them yourself.

    Parameters
    ----------
    ts : array_like
        The time series.
    query : array_like
        The query to search for.
    batch_size : int
        The partitioning size of the time series into batches. For example,
        a time series of length 1,000 and a batch size of 100 would create
        10 jobs where the first subsequence is 0 to 100.
    top_matches : int, default 3
        The number of matches you would like to return.
    n_jobs : int, default 1
        By default the implementation runs in single-threaded mode. Setting
        n_jobs to a value < 1 uses the number of available CPU cores on the
        machine the code runs on.

    Note
    ----
    This implementation does not support returning the entire distance profile
    at this time; however, that is planned for the near future.

    The value selected for top_matches should be at most (n / batch_size).

    Returns
    -------
    Tuple (indices, distances) - a tuple of np.arrays where the first element
    holds the match indices and the second holds the corresponding distances.

    Raises
    ------
    ValueError
        If ts is not a list or np.array.
        If query is not a list or np.array.
        If ts or query is not one dimensional.
        If batch_size is not an integer.
        If top_matches is < 1 or is not an integer.
        If n_jobs is not an integer.
    """
    # parameter validation
    ts, query = mtscore.precheck_series_and_query(ts, query)

    if not isinstance(batch_size, int):
        raise ValueError('batch_size must be an integer.')

    if not isinstance(top_matches, int) or top_matches < 1:
        raise ValueError('top_matches must be an integer > 0.')

    if not isinstance(n_jobs, int):
        raise ValueError('n_jobs must be an integer.')

    # set the n_jobs appropriately
    if n_jobs < 1:
        n_jobs = cpu_count()

    if n_jobs > cpu_count():
        n_jobs = cpu_count()

    n = len(ts)
    matches = []

    # generate indices to process over given batch size
    indices = list(range(0, n - batch_size + 1, batch_size))

    # determine whether to multiprocess based on n_jobs
    if n_jobs > 1:
        with mtscore.mp_pool()(processes=n_jobs) as pool:
            matches = pool.map(
                _min_subsequence_distance,
                _batch_job_generator(ts, query, indices, batch_size))
    else:
        for values in _batch_job_generator(ts, query, indices, batch_size):
            matches.append(_min_subsequence_distance(values))

    # grab the indices and distances
    matches = np.array(matches)

    # find the best K number of matches
    # distance is in column 1
    top_indices = np.argpartition(matches[:, 1], top_matches)[0:top_matches]

    # ignore the warning when casting the index values back to ints;
    # the matches array holds complex values because the distances are
    # complex, so the indices were stored as complex too
    with np.warnings.catch_warnings():
        np.warnings.filterwarnings(
            'ignore',
            r'Casting complex values to real discards the imaginary part')
        best_indices = matches[:, 0][top_indices].astype('int64')

    best_dists = matches[:, 1][top_indices]

    return (best_indices, best_dists)
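
A hedged usage sketch for mass2_batch. The mass_ts import path is an assumption, the data is made up, and the returned matches are unsorted, so the example orders them by distance magnitude itself.

# Hypothetical usage of mass2_batch; the mass_ts import path is an assumption.
import numpy as np
import mass_ts as mts

ts = np.random.uniform(size=10000)
query = np.random.uniform(size=100)

# 10 batches of 1,000 points; top_matches must be at most n / batch_size.
# Set n_jobs > 1 (or < 1 for all cores) to enable multiprocessing.
indices, distances = mts.mass2_batch(
    ts, query, batch_size=1000, top_matches=5)

# results are not sorted by distance, so order them by magnitude
order = np.argsort(np.absolute(distances))
print(indices[order], np.absolute(distances)[order])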