def test_precheck_series_and_query_invalid():
    """
    A non array_like ts or query must be rejected with a ValueError whose
    message names the offending argument.
    """
    # (positional arguments, text expected inside the error message)
    invalid_cases = (
        (('1', [1, 2, 3]),
         'Invalid ts value given. Must be array_like!'),
        (([1, 2, 3], '1'),
         'Invalid query value given. Must be array_like!'),
    )

    for arguments, expected_message in invalid_cases:
        with pytest.raises(ValueError) as excinfo:
            mtscore.precheck_series_and_query(*arguments)

        assert expected_message in str(excinfo.value)
def test_precheck_series_and_query_valid():
    """
    Valid list inputs must come back from the precheck holding the same
    values as ``np.array`` built from those lists.
    """
    series = [1, 2, 3, 4, 5, 6, 7, 8]
    pattern = [1, 2, 3, 4]

    checked_series, checked_pattern = mtscore.precheck_series_and_query(
        series, pattern)

    np.testing.assert_equal(checked_series, np.array(series))
    np.testing.assert_equal(checked_pattern, np.array(pattern))
def mass2(ts, query):
    """
    Compute the distance profile for the given query over the given time
    series using FFT based sliding dot products.

    Parameters
    ----------
    ts : array_like
        The time series to search within.
    query : array_like
        The query.

    Returns
    -------
    An array of distances holding ``len(ts) - len(query) + 1`` values, one
    per subsequence of ``ts`` that has the length of the query. The values
    keep the complex dtype produced by the inverse FFT.

    Raises
    ------
    ValueError
        If ts is not a list or np.array.
        If query is not a list or np.array.
        If ts or query is not one dimensional.
    """
    ts, query = mtscore.precheck_series_and_query(ts, query)

    series_len = len(ts)
    query_len = len(query)

    # statistics of the query
    query_mean = np.mean(query)
    query_std = np.std(query)

    # rolling statistics of the series, padded at the front so that they
    # have the same length as the series itself
    rolling_mean = mtscore.moving_average(ts, query_len)
    rolling_mean = np.append(
        np.ones(series_len - len(rolling_mean)), rolling_mean)
    rolling_std = mtscore.moving_std(ts, query_len)
    rolling_std = np.append(
        np.zeros(series_len - len(rolling_std)), rolling_std)

    # reverse the query and zero pad it to the length of the series, so the
    # product of the two spectra yields the sliding dot products
    padded_query = np.append(
        np.flip(query), np.zeros(series_len - query_len))

    series_freq = np.fft.fft(ts)
    query_freq = np.fft.fft(padded_query)
    query_freq.resize(series_freq.shape)
    dot_products = np.fft.ifft(series_freq * query_freq)

    # only positions query_len - 1 .. series_len - 1 hold complete windows
    complete = slice(query_len - 1, series_len)
    centered = (dot_products[complete]
                - query_len * rolling_mean[complete] * query_mean)
    scale = rolling_std[complete] * query_std

    return np.sqrt(2 * (query_len - centered / scale))
def mass(ts, query, normalize_query=True, corr_coef=False):
    """
    Compute the distance profile for the given query over the given time
    series. Optionally, the correlation coefficient can be returned and the
    query can be normalized.

    Parameters
    ----------
    ts : array_like
        The time series to search within.
    query : array_like
        The query.
    normalize_query : bool, default True
        Optionally z-normalize the query before searching.
    corr_coef : bool, default False
        Optionally return the correlation coefficients instead of the
        distances.

    Returns
    -------
    An array of distances or, when corr_coef is True, of correlation
    coefficients. The array holds ``len(ts) - len(query)`` values; value
    ``i`` describes the subsequence of ``ts`` starting at index ``i + 1``
    (the subsequence starting at index 0 is not part of the result).

    Raises
    ------
    ValueError
        If ts is not a list or np.array.
        If query is not a list or np.array.
        If ts or query is not one dimensional.
    """
    ts, query = mtscore.precheck_series_and_query(ts, query)

    if normalize_query:
        query = (query - np.mean(query)) / np.std(query)

    n = len(ts)
    m = len(query)

    # zero pad the series and the reversed query to length 2n; the product
    # of their spectra then yields the sliding dot products
    x = np.append(ts, np.zeros(n))
    y = np.append(np.flipud(query), np.zeros(2 * n - m))
    z = np.fft.ifft(np.fft.fft(x) * np.fft.fft(y))

    # query statistics (the zero padding does not affect the sums)
    sumy = np.sum(y)
    sumy2 = np.sum(y ** 2)

    # statistics of every length m subsequence via cumulative sums
    cum_sumx = np.cumsum(x)
    cum_sumx2 = np.cumsum(x ** 2)
    sumx2 = cum_sumx2[m:n] - cum_sumx2[0:n - m]
    sumx = cum_sumx[m:n] - cum_sumx[0:n - m]
    meanx = sumx / m
    sigmax2 = (sumx2 / m) - (meanx ** 2)
    sigmax = np.sqrt(sigmax2)

    dist = ((sumx2 - 2 * sumx * meanx + m * (meanx ** 2)) / sigmax2
            - 2 * (z[m:n] - sumy * meanx) / sigmax
            + sumy2)
    dist = np.absolute(np.sqrt(dist))

    if corr_coef:
        # For z-normalized data dist**2 == 2 * m * (1 - corr), so the
        # correlation is recovered from the squared distance. Using the
        # plain distance here would never produce a negative correlation.
        return 1 - dist ** 2 / (2 * m)

    return dist
def mass3(ts, query, pieces):
    """
    Compute the distance profile for the given query over the given time
    series. This version of MASS is hardware efficient given the right
    number of pieces.

    The series is processed in overlapping windows of ``pieces`` points;
    whatever is left over after the last full window is processed as one
    shorter window.

    Parameters
    ----------
    ts : array_like
        The time series to search within.
    query : array_like
        The query.
    pieces : int
        Number of points to process per window. This is best as a power
        of 2.

    Returns
    -------
    An array of distances, one per subsequence of ``ts`` that has the
    length of the query. The values keep the complex dtype produced by the
    inverse FFT.

    Raises
    ------
    ValueError
        If ts is not a list or np.array.
        If query is not a list or np.array.
        If ts or query is not one dimensional.
        If pieces is less than the length of the query.
    """
    ts, query = mtscore.precheck_series_and_query(ts, query)

    m = len(query)
    if pieces < m:
        raise ValueError('pieces should be larger than the query length.')

    n = len(ts)
    k = pieces
    x = ts

    # compute stats in O(n)
    meany = np.mean(query)
    sigmay = np.std(query)

    # rolling statistics, padded at the front to the length of the series
    meanx = mtscore.moving_average(x, m)
    meanx = np.append(np.ones(len(x) - len(meanx)), meanx)
    sigmax = mtscore.moving_std(x, m)
    sigmax = np.append(np.zeros(len(x) - len(sigmax)), sigmax)

    # reverse the query and append zeros
    y = np.append(np.flip(query), np.zeros(pieces - m))
    # the padded query does not change between windows, transform it once
    Y = np.fft.fft(y)

    # every full window yields k - m + 1 distances
    step_size = k - m + 1
    stop = n - k + 1

    chunks = []
    # first subsequence not yet covered by a processed window; stays 0 when
    # pieces is larger than the series and no full window fits
    start = 0

    for j in range(0, stop, step_size):
        # The main trick of getting dot products in O(n log n) time
        z = np.fft.ifft(np.fft.fft(x[j:j + k]) * Y)

        d = 2 * (m - (z[m - 1:k] - m * meanx[m + j - 1:j + k] * meany) /
                 (sigmax[m + j - 1:j + k] * sigmay))
        chunks.append(np.sqrt(d))
        start = j + step_size

    # process the remainder of the series, through its last point, as one
    # shorter window beginning at the first uncovered subsequence
    remaining = n - start
    if remaining >= m:
        X = np.fft.fft(x[start:n])
        z = np.fft.ifft(X * np.fft.fft(y[0:remaining]))

        d = 2 * (m - (z[m - 1:remaining]
                      - m * meanx[start + m - 1:n] * meany) /
                 (sigmax[start + m - 1:n] * sigmay))
        chunks.append(np.sqrt(d))

    if not chunks:
        return np.array([])

    return np.concatenate(chunks)
def mass2_batch(ts, query, batch_size, top_matches=3, n_jobs=1):
    """
    MASS2 batch is a batch version of MASS2 that reduces overall memory
    usage, provides parallelization and enables you to find top K number of
    matches within the time series. The goal of using this implementation
    is for very large time series similarity search. The returned results
    are not sorted by distance. So you will need to find the top match with
    np.argmin() or sort them yourself.

    Parameters
    ----------
    ts : array_like
        The time series.
    query : array_like
        The query to search for.
    batch_size : int
        The partitioning size of the time series into batches. For example,
        a time series of length 1,000 and a batch size of 100 would create
        10 jobs where the first batch covers 0 to 100.
    top_matches : int, Default 3
        The number of matches you would like to return. At most one match
        is produced per batch, so this may not exceed the number of
        batches.
    n_jobs : int, Default 1
        By default the implementation runs in single-threaded mode. Setting
        n_jobs to < 1, or to more than the value reported by cpu_count(),
        uses the value reported by cpu_count().

    Note
    ----
    This implementation does not support returning the entire distance
    profile at this time.

    The value selected for top matches should be at max (n / batch_size).

    Returns
    -------
    Tuple (indices, distances) - a tuple of np.arrays where the first
    element holds the indices and the second holds the distances.

    Raises
    ------
    ValueError
        If ts is not a list or np.array.
        If query is not a list or np.array.
        If ts or query is not one dimensional.
        If batch_size is not an integer or produces no batches.
        If top_matches is < 1, is not an integer or is larger than the
        number of batches.
        If n_jobs is not an integer.
    """
    # np.warnings was removed from NumPy (1.24), use the standard library
    import warnings

    # parameter validation
    ts, query = mtscore.precheck_series_and_query(ts, query)

    if not isinstance(batch_size, int):
        raise ValueError('batch_size must be an integer.')

    if not isinstance(top_matches, int) or top_matches < 1:
        raise ValueError('top_matches must be an integer > 0.')

    if not isinstance(n_jobs, int):
        raise ValueError('n_jobs must be an integer.')

    # set the n_jobs appropriately
    available = cpu_count()
    if n_jobs < 1 or n_jobs > available:
        n_jobs = available

    n = len(ts)
    matches = []

    # generate indices to process over given batch size
    indices = list(range(0, n - batch_size + 1, batch_size))

    if not indices:
        raise ValueError(
            'batch_size must be a positive integer no larger than the '
            'length of ts.')

    # determine if we are multiprocessing or not based on cpu_count
    if n_jobs > 1:
        with mtscore.mp_pool()(processes=n_jobs) as pool:
            matches = pool.map(
                _min_subsequence_distance,
                _batch_job_generator(ts, query, indices, batch_size))
    else:
        for values in _batch_job_generator(ts, query, indices, batch_size):
            matches.append(_min_subsequence_distance(values))

    # grab the indices and distances
    matches = np.array(matches)
    num_matches = len(matches)

    if top_matches > num_matches:
        raise ValueError(
            'top_matches must not exceed the number of batches '
            '({}).'.format(num_matches))

    # find the best K number of matches
    # distance is in column 1
    if top_matches == num_matches:
        # argpartition needs kth < len(array); every batch is wanted here
        top_indices = np.arange(num_matches)
    else:
        top_indices = np.argpartition(
            matches[:, 1], top_matches)[0:top_matches]

    # ignore the warning when casting the index values back to ints
    # to store all values it had to choose complex types to handle the
    # distances
    with warnings.catch_warnings():
        warnings.filterwarnings(
            'ignore',
            r'Casting complex values to real discards the imaginary part')
        best_indices = matches[:, 0][top_indices].astype('int64')

    best_dists = matches[:, 1][top_indices]

    return (best_indices, best_dists)