def mp_top_k_motifs(profile, exclusion_zone=None, k=3, max_neighbors=10, radius=3):
    """
    Find the top K number of motifs (patterns) given a matrix profile. By
    default the algorithm will find up to 3 motifs (k) and up to 10 of their
    neighbors with a radius of 3 * min_dist.

    Parameters
    ----------
    profile : dict
        The output from one of the matrix profile algorithms. It must contain
        the keys 'w', 'mp', 'pi' and the time series under ['data']['ts'].
    exclusion_zone : int, Default to algorithm ez
        Desired number of values to exclude on both sides of the motif. This
        avoids trivial matches. When None, the 'ez' value stored in the
        profile is used; if the profile has none, half of the window size is
        used. The value is forwarded unchanged to core.apply_exclusion_zone.
    k : int, Default = 3
        Desired number of motifs to find.
    max_neighbors : int, Default = 10
        The maximum number of neighbors to include for a given motif.
    radius : int, Default = 3
        The radius is used to associate a neighbor by checking if the
        neighbor's distance is less than or equal to dist * radius

    Returns
    -------
    The original input obj with the addition of the "motifs" key (the input
    dict is modified in place and returned). The motifs key consists of the
    following structure.

    A list of dicts containing motif indices and their corresponding neighbor
    indices.

    [
        {
            'motifs': [first_index, second_index],
            'neighbors': [index, index, index ...max_neighbors]
        }
    ]

    Raises
    ------
    ValueError
        If profile is not an MP data structure.
        If the profile does not carry the time series in ['data']['ts'].
    """
    if not core.is_mp_obj(profile):
        raise ValueError('Expecting MP data structure!')

    window_size = profile['w']

    # the raw time series is needed to recompute distance profiles with mass2;
    # fail with a clear message instead of a NameError when it is missing
    data = profile.get('data', None)
    ts = data.get('ts', None) if data else None
    if ts is None:
        raise ValueError('Expecting time series data in the MP data structure!')
    data_len = len(ts)

    motifs = []
    # operate on a copy; exclusion zones are written into this array
    mp = np.copy(profile['mp'])
    mpi = profile['pi']

    # TODO: this is based on STOMP standards when this motif finding algorithm
    # originally came out. Should we default this to 4.0 instead? That seems
    # to be the common value now per new research.
    if exclusion_zone is None:
        exclusion_zone = profile.get('ez', None)

    # last resort when the profile carries no exclusion zone either
    if exclusion_zone is None:
        exclusion_zone = int(np.floor(window_size / 2))

    for _ in range(k):
        min_idx = np.argmin(mp)
        min_dist = mp[min_idx]

        # we no longer have any motifs to find as all values are nan/inf
        if core.is_nan_inf(min_dist):
            break

        # create a motif pair corresponding to the first appearance and
        # second appearance
        first_idx = np.min([min_idx, mpi[min_idx]])
        second_idx = np.max([min_idx, mpi[min_idx]])

        # compute distance profile using mass2 for first appearance
        query = ts[first_idx:first_idx + window_size]
        distance_profile = mass2(ts, query)

        # exclude already picked motifs and neighbors
        mask = core.nan_inf_indices(mp)
        distance_profile[mask] = np.inf

        # apply exclusion zone for motif pair
        for pair_idx in (first_idx, second_idx):
            distance_profile = core.apply_exclusion_zone(
                exclusion_zone, False, window_size, data_len,
                pair_idx, distance_profile)
            mp = core.apply_exclusion_zone(
                exclusion_zone, False, window_size, data_len, pair_idx, mp)

        # find up to max_neighbors
        neighbors = []
        for _ in range(max_neighbors):
            neighbor_idx = np.argmin(distance_profile)
            neighbor_dist = distance_profile[neighbor_idx]
            not_in_radius = not ((radius * min_dist) >= neighbor_dist)

            # no more neighbors exist based on radius
            if core.is_nan_inf(neighbor_dist) or not_in_radius:
                break

            # add neighbor and apply exclusion zone
            neighbors.append(neighbor_idx)
            distance_profile = core.apply_exclusion_zone(
                exclusion_zone, False, window_size, data_len,
                neighbor_idx, distance_profile)
            mp = core.apply_exclusion_zone(
                exclusion_zone, False, window_size, data_len,
                neighbor_idx, mp)

        # add motifs and neighbors to results
        motifs.append({
            'motifs': [first_idx, second_idx],
            'neighbors': neighbors
        })

    profile['motifs'] = motifs

    return profile
def pmp_top_k_motifs(profile, exclusion_zone=None, k=3, max_neighbors=10, radius=3):
    """
    Find the top K number of motifs (patterns) given a pan matrix profile. By
    default the algorithm will find up to 3 motifs (k) and up to 10 of their
    neighbors with a radius of 3 * min_dist.

    Parameters
    ----------
    profile : dict
        The output from one of the pan matrix profile algorithms. It must
        contain the keys 'pmp', 'pmpi', 'windows' and the time series under
        ['data']['ts'].
    exclusion_zone : int, Default to algorithm ez
        Desired number of values to exclude on both sides of the motif. This
        avoids trivial matches. A falsy value (None or 0) selects half of the
        window size belonging to the row in which each motif is found, so the
        default adapts per motif.
    k : int, Default = 3
        Desired number of motifs to find.
    max_neighbors : int, Default = 10
        The maximum number of neighbors to include for a given motif.
    radius : int, Default = 3
        The radius is used to associate a neighbor by checking if the
        neighbor's distance is less than or equal to dist * radius

    Returns
    -------
    The original input obj with the addition of the "motifs" key (the input
    dict is modified in place and returned). The motifs key consists of the
    following structure.

    A list of dicts containing motif indices and their corresponding neighbor
    indices. Note that each index is a (row, col) index corresponding to the
    pan matrix profile.

    [
        {
            'motifs': [first_index, second_index],
            'neighbors': [index, index, index ...max_neighbors]
        }
    ]

    Raises
    ------
    ValueError
        If profile is not a PMP data structure.
        If the profile does not carry the time series in ['data']['ts'].
    """
    if not core.is_pmp_obj(profile):
        raise ValueError('Expecting PMP data structure!')

    # the raw time series is needed to recompute distance profiles with mass2
    data = profile.get('data', None)
    ts = data.get('ts', None) if data else None
    if ts is None:
        raise ValueError('Expecting time series data in the PMP data structure!')
    data_len = len(ts)

    pmp = profile.get('pmp', None)
    profile_len = pmp.shape[1]
    pmpi = profile.get('pmpi', None)
    windows = profile.get('windows', None)

    # make sure we are working with Euclidean distances; tmp is the working
    # copy that exclusion zones get written into
    if core.is_pearson_array(pmp):
        tmp = core.pearson_to_euclidean(pmp, windows)
    else:
        tmp = np.copy(pmp).astype('d')

    # replace nan and infs with infinity
    tmp[core.nan_inf_indices(tmp)] = np.inf

    motifs = []
    for _ in range(k):
        min_idx = np.unravel_index(np.argmin(tmp), tmp.shape)
        min_dist = tmp[min_idx]

        # nothing else to find...
        if core.is_nan_inf(min_dist):
            break

        # create the motif pair
        min_row_idx = min_idx[0]
        min_col_idx = min_idx[1]

        # motif pairs are respective to the column of the matching row
        first_idx = np.min([min_col_idx, pmpi[min_row_idx][min_col_idx]])
        second_idx = np.max([min_col_idx, pmpi[min_row_idx][min_col_idx]])

        # compute distance profile for first appearance
        window_size = windows[min_row_idx]
        query = ts[first_idx:first_idx + window_size]
        distance_profile = mass2(ts, query)

        # extend the distance profile to be as long as the original
        infs = np.full(profile_len - len(distance_profile), np.inf)
        distance_profile = np.append(distance_profile, infs)

        # exclude already picked motifs and neighbors; the working copy is
        # used because the untouched input never records earlier picks
        mask = core.nan_inf_indices(tmp[min_row_idx])
        distance_profile[mask] = np.inf

        # determine the exclusion zone for this motif; kept in a local so a
        # default derived from one window size never leaks into the next
        # motif, which may live in a row with a different window size
        if exclusion_zone:
            ez = exclusion_zone
        else:
            ez = int(np.floor(window_size / 2))

        # apply exclusion zone for motif pair
        for pair_idx in (first_idx, second_idx):
            distance_profile = core.apply_exclusion_zone(
                ez, False, window_size, data_len, pair_idx, distance_profile)
            tmp[min_row_idx] = core.apply_exclusion_zone(
                ez, False, window_size, data_len, pair_idx, tmp[min_row_idx])

        # find up to max_neighbors
        neighbors = []
        for _ in range(max_neighbors):
            neighbor_idx = np.argmin(distance_profile)
            neighbor_dist = np.real(distance_profile[neighbor_idx])
            not_in_radius = not ((radius * min_dist) >= neighbor_dist)

            # no more neighbors exist based on radius
            if core.is_nan_inf(neighbor_dist) or not_in_radius:
                break

            # add neighbor and apply exclusion zone
            neighbors.append((min_row_idx, neighbor_idx))
            distance_profile = core.apply_exclusion_zone(
                ez, False, window_size, data_len,
                neighbor_idx, distance_profile)
            tmp[min_row_idx] = core.apply_exclusion_zone(
                ez, False, window_size, data_len,
                neighbor_idx, tmp[min_row_idx])

        # add the motifs and neighbors
        # note that they are (row, col) indices
        motifs.append({
            'motifs': [(min_row_idx, first_idx), (min_row_idx, second_idx)],
            'neighbors': neighbors
        })

    profile['motifs'] = motifs

    return profile
def _batch_compute(args):
    """
    Internal function to compute a batch of the multi-dimensional time series
    in parallel.

    Parameters
    ----------
    args : tuple
        Various attributes used for computing the batch, in this order.
        (
            num_dim : int
                The number of dimensions (rows of ts / query).
            batch_start : int
                The starting index for this batch.
            batch_end : int
                The ending index for this batch.
            ts : array_like
                The time series to compute the matrix profile for. Indexed
                as ts[dimension, :].
            query : array_like
                The query. Indexed as query[dimension, :].
            window_size : int
                The size of the window to compute the profile over.
            data_length : int
                The number of elements in the time series.
            profile_length : int
                The number of elements that will be in the final matrix
                profile.
            exclusion_zone : int
                Used to exclude trivial matches.
            data_mu : array_like
                The moving average over the time series for the given window
                size, one row per dimension.
            data_sig : array_like
                The moving standard deviation over the time series for the
                given window size, one row per dimension.
            first_product : array_like
                The first sliding dot product for the time series over index
                0 to window_size, one row per dimension.
            skip_locs : array_like
                Indices that should be skipped for distance profile
                calculation due to a nan or inf.
            profile_dimension : sequence of array_like
                Per-dimension-count storage that receives the sorted
                dimension indices; only written when return_dimension is
                truthy. It is updated in place.
            return_dimension : bool
                Whether profile_dimension should be filled in.
        )

    Returns
    -------
    dict : profile
        The matrix profile, left and right matrix profiles and their respective
        profile indices. Every array has one row per number of dimensions.

        >>> {
        >>>     'mp': The matrix profile,
        >>>     'pi': The matrix profile 1NN indices,
        >>>     'pd': The profile_dimension object passed in,
        >>>     'rmp': The right matrix profile,
        >>>     'rpi': The right matrix profile 1NN indices,
        >>>     'lmp': The left matrix profile,
        >>>     'lpi': The left matrix profile 1NN indices,
        >>> }
    """
    num_dim, batch_start, batch_end, ts, query, window_size, data_length, \
        profile_length, exclusion_zone, data_mu, data_sig, \
        first_product, skip_locs, profile_dimension, return_dimension = args

    # initialize matrices
    matrix_profile = np.full((num_dim, profile_length), np.inf)
    profile_index = np.full((num_dim, profile_length), 0)

    left_matrix_profile = np.copy(matrix_profile)
    right_matrix_profile = np.copy(matrix_profile)
    left_profile_index = np.copy(profile_index)
    right_profile_index = np.copy(profile_index)

    # with batch 0 we do not need to recompute the dot product
    # however with other batch windows, we need the previous iterations sliding
    # dot product
    # NOTE: compare by value; an identity test against the literal 0 is false
    # for numpy integers and would pick the wrong first window
    last_product = np.copy(first_product)
    if batch_start == 0:
        first_window = query[:, batch_start:batch_start + window_size]
    else:
        first_window = query[:, batch_start - 1:batch_start + window_size - 1]
        for dim in range(num_dim):
            last_product[dim, :] = core.fft_convolve(ts[dim, :],
                                                     first_window[dim, :])

    # running statistics of the query window, maintained per dimension
    query_sum = np.sum(first_window, axis=1)
    query_2sum = np.sum(first_window**2, axis=1)
    query_mu, query_sig = np.empty(num_dim), np.empty(num_dim)
    drop_value = np.empty(num_dim)
    for dim in range(num_dim):
        query_mu[dim], query_sig[dim] = core.moving_avg_std(
            first_window[dim, :], window_size)
        drop_value[dim] = first_window[dim, 0]

    distance_profile = np.empty((num_dim, profile_length))

    # make sure to compute inclusively from batch start to batch end
    # otherwise there are gaps in the profile
    if batch_end < profile_length:
        batch_end += 1

    # iteratively compute distance profile and update with element-wise mins
    for i in range(batch_start, batch_end):

        # check for nan or inf and skip
        if skip_locs[i]:
            continue

        for j in range(num_dim):
            query_window = query[j, i:i + window_size]

            # index 0 uses the precomputed statistics and dot product as-is;
            # every other index rolls them forward by one position
            if i != 0:
                query_sum[j] = query_sum[j] - drop_value[j] + query_window[-1]
                query_2sum[j] = query_2sum[j] - drop_value[j]**2 \
                    + query_window[-1]**2
                query_mu[j] = query_sum[j] / window_size
                query_sig2 = query_2sum[j] / window_size - query_mu[j]**2
                # guard against zero/negative variance before the sqrt
                if query_sig2 < _EPS:
                    query_sig2 = _EPS
                query_sig[j] = np.sqrt(query_sig2)
                last_product[j, 1:] = last_product[j, 0:data_length - window_size] \
                    - ts[j, 0:data_length - window_size] * drop_value[j] \
                    + ts[j, window_size:] * query_window[-1]
                last_product[j, 0] = first_product[j, i]

            distance_profile[j, :] = core.distance_profile(
                last_product[j, :], window_size, data_mu[j, :],
                data_sig[j, :], query_mu[j], query_sig[j])

            # apply the exclusion zone (the 0 is the is_join argument)
            distance_profile[j, :] = core.apply_exclusion_zone(
                exclusion_zone, 0, window_size, data_length, i,
                distance_profile[j, :])

            distance_profile[j, distance_profile[j, :] < _EPS] = 0
            drop_value[j] = query_window[0]

        # a (near) constant query window in any dimension: skip the update
        if np.any(query_sig < _EPS):
            continue

        distance_profile[:, skip_locs] = np.inf
        distance_profile[data_sig < np.sqrt(_EPS)] = np.inf

        # sort distances across dimensions so row j of the running mean is
        # the average of the j + 1 smallest per-dimension distances
        distance_profile_dim = np.argsort(distance_profile, axis=0)
        distance_profile_sort = np.sort(distance_profile, axis=0)
        distance_profile_cumsum = np.zeros(profile_length)
        for j in range(num_dim):
            distance_profile_cumsum += distance_profile_sort[j, :]
            distance_profile_mean = distance_profile_cumsum / (j + 1)

            # update the matrix profile
            indices = (distance_profile_mean < matrix_profile[j, :])
            matrix_profile[j, indices] = distance_profile_mean[indices]
            profile_index[j, indices] = i
            if return_dimension:
                profile_dimension[j][:, indices] = \
                    distance_profile_dim[:j + 1, indices]

            # update the left and right matrix profiles
            # find differences, shift left and update
            indices = distance_profile_mean[i:] < left_matrix_profile[j, i:]
            falses = np.zeros(i).astype('bool')
            indices = np.append(falses, indices)
            left_matrix_profile[j, indices] = distance_profile_mean[indices]
            left_profile_index[j, np.argwhere(indices)] = i

            # find differences, shift right and update
            indices = distance_profile_mean[0:i] < right_matrix_profile[j, 0:i]
            falses = np.zeros(profile_length - i).astype('bool')
            indices = np.append(indices, falses)
            right_matrix_profile[j, indices] = distance_profile_mean[indices]
            right_profile_index[j, np.argwhere(indices)] = i

    return {
        'mp': matrix_profile,
        'pi': profile_index,
        'pd': profile_dimension,
        'rmp': right_matrix_profile,
        'rpi': right_profile_index,
        'lmp': left_matrix_profile,
        'lpi': left_profile_index,
    }
def _batch_compute(args):
    """
    Internal function to compute a batch of the time series in parallel.

    Parameters
    ----------
    args : tuple
        Various attributes used for computing the batch, in this order.
        (
            batch_start : int
                The starting index for this batch.
            batch_end : int
                The ending index for this batch.
            ts : array_like
                The time series to compute the matrix profile for.
            query : array_like
                The query.
            window_size : int
                The size of the window to compute the profile over.
            data_length : int
                The number of elements in the time series.
            profile_length : int
                The number of elements that will be in the final matrix
                profile.
            exclusion_zone : int
                Used to exclude trivial matches.
            is_join : bool
                Flag to indicate if an AB join or self join is occurring.
            data_mu : array_like
                The moving average over the time series for the given window
                size.
            data_sig : array_like
                The moving standard deviation over the time series for the
                given window size.
            first_product : array_like
                The first sliding dot product for the time series over index
                0 to window_size.
            skip_locs : array_like
                Indices that should be skipped for distance profile
                calculation due to a nan or inf.
        )

    Returns
    -------
    dict : profile
        The matrix profile, left and right matrix profiles and their respective
        profile indices. The left/right entries are None when is_join is
        truthy.

        >>> {
        >>>     'mp': The matrix profile,
        >>>     'pi': The matrix profile 1NN indices,
        >>>     'rmp': The right matrix profile,
        >>>     'rpi': The right matrix profile 1NN indices,
        >>>     'lmp': The left matrix profile,
        >>>     'lpi': The left matrix profile 1NN indices,
        >>> }
    """
    batch_start, batch_end, ts, query, window_size, data_length, \
        profile_length, exclusion_zone, is_join, data_mu, data_sig, \
        first_product, skip_locs = args

    # initialize matrices
    matrix_profile = np.full(profile_length, np.inf)
    profile_index = np.full(profile_length, 0)

    # left/right profiles are only tracked for self joins
    left_matrix_profile = None
    right_matrix_profile = None
    left_profile_index = None
    right_profile_index = None

    if not is_join:
        left_matrix_profile = np.copy(matrix_profile)
        right_matrix_profile = np.copy(matrix_profile)
        left_profile_index = np.copy(profile_index)
        right_profile_index = np.copy(profile_index)

    # with batch 0 we do not need to recompute the dot product
    # however with other batch windows, we need the previous iterations sliding
    # dot product
    # NOTE: compare by value; an identity test against the literal 0 is false
    # for numpy integers and would pick the wrong first window
    is_first_batch = batch_start == 0
    if is_first_batch:
        first_window = query[batch_start:batch_start + window_size]
        last_product = np.copy(first_product)
    else:
        first_window = query[batch_start - 1:batch_start + window_size - 1]
        last_product = core.fft_convolve(ts, first_window)

    # running statistics of the query window
    query_sum = np.sum(first_window)
    query_2sum = np.sum(first_window**2)
    query_mu, query_sig = core.moving_avg_std(first_window, window_size)

    drop_value = first_window[0]

    # only compute the distance profile for index 0 and update
    if is_first_batch:
        distance_profile = core.distance_profile(
            last_product, window_size, data_mu, data_sig, query_mu, query_sig)

        # apply exclusion zone
        distance_profile = core.apply_exclusion_zone(
            exclusion_zone, is_join, window_size, data_length, 0,
            distance_profile)

        # update the matrix profile
        indices = (distance_profile < matrix_profile)
        matrix_profile[indices] = distance_profile[indices]
        profile_index[indices] = 0

        # index 0 is done; the rolling loop starts at index 1
        batch_start += 1

    # make sure to compute inclusively from batch start to batch end
    # otherwise there are gaps in the profile
    if batch_end < profile_length:
        batch_end += 1

    # iteratively compute distance profile and update with element-wise mins
    for i in range(batch_start, batch_end):

        # check for nan or inf and skip
        if skip_locs[i]:
            continue

        # roll the query statistics and the sliding dot product forward by
        # one position instead of recomputing them from scratch
        query_window = query[i:i + window_size]
        query_sum = query_sum - drop_value + query_window[-1]
        query_2sum = query_2sum - drop_value**2 + query_window[-1]**2
        query_mu = query_sum / window_size
        query_sig2 = query_2sum / window_size - query_mu**2
        query_sig = np.sqrt(query_sig2)
        last_product[1:] = last_product[0:data_length - window_size] \
            - ts[0:data_length - window_size] * drop_value \
            + ts[window_size:] * query_window[-1]
        last_product[0] = first_product[i]
        drop_value = query_window[0]

        distance_profile = core.distance_profile(
            last_product, window_size, data_mu, data_sig, query_mu, query_sig)

        # apply the exclusion zone
        distance_profile = core.apply_exclusion_zone(
            exclusion_zone, is_join, window_size, data_length, i,
            distance_profile)

        # update the matrix profile
        indices = (distance_profile < matrix_profile)
        matrix_profile[indices] = distance_profile[indices]
        profile_index[indices] = i

        # update the left and right matrix profiles
        if not is_join:
            # find differences, shift left and update
            indices = distance_profile[i:] < left_matrix_profile[i:]
            falses = np.zeros(i).astype('bool')
            indices = np.append(falses, indices)
            left_matrix_profile[indices] = distance_profile[indices]
            left_profile_index[np.argwhere(indices)] = i

            # find differences, shift right and update
            indices = distance_profile[0:i] < right_matrix_profile[0:i]
            falses = np.zeros(profile_length - i).astype('bool')
            indices = np.append(indices, falses)
            right_matrix_profile[indices] = distance_profile[indices]
            right_profile_index[np.argwhere(indices)] = i

    return {
        'mp': matrix_profile,
        'pi': profile_index,
        'rmp': right_matrix_profile,
        'rpi': right_profile_index,
        'lmp': left_matrix_profile,
        'lpi': left_profile_index,
    }
def prescrimp(ts, window_size, query=None, step_size=0.25, sample_pct=0.1,
              random_state=None, n_jobs=1):
    """
    This is the PreScrimp algorithm from the SCRIMP++ paper. It is primarily
    used to compute the approximate matrix profile. In this case we use a
    sample percentage to mock "the anytime/approximate nature".

    Parameters
    ----------
    ts : np.ndarray
        The time series to compute the matrix profile for.
    window_size : int
        The window size.
    query : array_like
        Optionally, a query can be provided to perform a similarity join.
    step_size : float, default 0.25
        The sampling interval for the window. The paper suggest 0.25 is the
        most practical. It should be a float value between 0 and 1. The
        effective stride is floor(window_size * step_size), but never less
        than 1 sample.
    sample_pct : float, default = 0.1 (10%)
        Number of samples to compute distances for in the MP. It is validated
        and echoed in the result; the computation in this function currently
        visits every step_size-th index regardless of its value.
    random_state : int, default None
        Set the random seed generator for reproducible results. When given,
        it seeds numpy's global random generator.
    n_jobs : int, Default = 1
        Number of cpu cores to use. Not used by this function's own body.

    Note
    ----
    The matrix profiles computed from prescrimp will always be the approximate
    solution.

    Returns
    -------
    dict : profile
        A MatrixProfile data structure.

        >>> {
        >>>    'mp': The matrix profile,
        >>>    'pi': The matrix profile 1NN indices,
        >>>    'rmp': The right matrix profile (always None here),
        >>>    'rpi': The right matrix profile 1NN indices (always None here),
        >>>    'lmp': The left matrix profile (always None here),
        >>>    'lpi': The left matrix profile 1NN indices (always None here),
        >>>    'metric': The distance metric computed for the mp,
        >>>    'w': The window size used to compute the matrix profile,
        >>>    'ez': The exclusion zone used,
        >>>    'join': Flag indicating if a similarity join was computed,
        >>>    'sample_pct': Percentage of samples used in computing the MP,
        >>>    'data': {
        >>>        'ts': Time series data,
        >>>        'query': Query data if supplied
        >>>    }
        >>>    'class': "MatrixProfile"
        >>>    'algorithm': "prescrimp"
        >>>}

    Raises
    ------
    ValueError
        If window_size < 4.
        If window_size > query length / 2.
        If ts is not a list or np.array.
        If query is not a list or np.array.
        If ts or query is not one dimensional.
        If step_size is not a float between 0 and 1.
        If sample_pct is not a float between 0 and 1.
        If random_state is not accepted by np.random.seed.
    """
    is_join = core.is_similarity_join(ts, query)
    if not is_join:
        query = ts

    # data conversion to np.array
    ts = core.to_np_array(ts)
    query = core.to_np_array(query)

    # validate step_size
    if not isinstance(step_size, float) or step_size > 1 or step_size < 0:
        raise ValueError('step_size should be a float between 0 and 1.')

    # validate sample_pct
    if not isinstance(sample_pct, float) or sample_pct > 1 or sample_pct < 0:
        raise ValueError('sample_pct should be a float between 0 and 1.')

    # validate random_state
    if random_state is not None:
        # catch Exception rather than everything so that KeyboardInterrupt
        # and SystemExit are never turned into a ValueError
        try:
            np.random.seed(random_state)
        except Exception:
            raise ValueError('Invalid random_state value given.')

    if window_size < 4:
        error = "window size must be at least 4."
        raise ValueError(error)

    if window_size > len(query) / 2:
        error = "Time series is too short relative to desired window size"
        raise ValueError(error)

    # precompute some common values - profile length, query length etc.
    # the stride is clamped to 1 because a small fraction would otherwise
    # floor to 0, which is not a valid step for np.arange
    step_size = max(1, int(math.floor(window_size * step_size)))
    profile_length = core.get_profile_length(ts, query, window_size)
    data_length = len(ts)
    exclusion_zone = int(np.ceil(window_size / 4.0))

    matrix_profile = np.zeros(profile_length)
    mp_index = np.zeros(profile_length, dtype='int')

    X = np.fft.fft(ts)
    mux, sigx = core.moving_avg_std(ts, window_size)

    dotproduct = np.zeros(profile_length)
    refine_distance = np.full(profile_length, np.inf)
    orig_index = np.arange(profile_length)

    # iterate over sampled indices and update the matrix profile
    # compute_order = compute_indices(profile_length, step_size, sample_pct)
    compute_order = np.arange(0, profile_length, step=step_size)

    for iteration, idx in enumerate(compute_order):
        subsequence = ts[idx:idx + window_size]

        # compute distance profile
        distance_profile = calc_distance_profile(X, subsequence, data_length,
                                                 window_size, mux, sigx)

        # apply exclusion zone
        distance_profile = core.apply_exclusion_zone(
            exclusion_zone, is_join, window_size, data_length, idx,
            distance_profile)

        # find and store nearest neighbor
        if iteration == 0:
            matrix_profile = distance_profile
            mp_index[:] = idx
        else:
            update_pos = distance_profile < matrix_profile
            mp_index[update_pos] = idx
            matrix_profile[update_pos] = distance_profile[update_pos]

        idx_min = np.argmin(distance_profile)
        matrix_profile[idx] = distance_profile[idx_min]
        mp_index[idx] = idx_min
        idx_nn = mp_index[idx]

        # compute the target indices
        idx_diff = idx_nn - idx
        endidx = np.min([
            profile_length - 1,
            idx + step_size - 1,
            profile_length - idx_diff - 1
        ])

        beginidx = np.max([0, idx - step_size + 1, 2 - idx_diff])

        # compute dot product and refine distance for the idx, begin idx
        # and end idx
        dotproduct = calc_dotproduct_idx(dotproduct, window_size,
                                         matrix_profile, idx, sigx, idx_nn,
                                         mux)

        dotproduct = calc_dotproduct_end_idx(ts, dotproduct, idx, window_size,
                                             endidx, idx_nn, idx_diff)

        refine_distance = calc_refine_distance_end_idx(
            refine_distance, dotproduct, idx, endidx, mux, sigx, idx_nn,
            idx_diff, window_size)

        dotproduct = calc_dotproduct_begin_idx(
            ts, dotproduct, beginidx, idx, idx_diff, window_size, idx_nn)

        refine_distance = calc_refine_distance_begin_idx(
            refine_distance, dotproduct, beginidx, idx, idx_diff, idx_nn,
            sigx, mux, window_size)

        matrix_profile, mp_index = apply_update_positions(
            matrix_profile, mp_index, refine_distance, beginidx, endidx,
            orig_index, idx_diff)

    return {
        'mp': matrix_profile,
        'pi': mp_index,
        'rmp': None,
        'rpi': None,
        'lmp': None,
        'lpi': None,
        'w': window_size,
        'ez': exclusion_zone,
        'join': is_join,
        'sample_pct': sample_pct,
        'metric': 'euclidean',
        'data': {
            'ts': ts,
            'query': query if is_join else None
        },
        'class': 'MatrixProfile',
        'algorithm': 'prescrimp',
    }