Example #1
    def update_features(self, indices):
        self.minimalp = 0
        self.minimaln = 0
        activation1, activation2 = self.get_activations()
        print("activation1")
        print(activation1)
        print("activation2")
        print(activation2)
        dat_znorm_p = znorm(activation1[indices])
        dat_znorm_n = znorm(activation2[indices])
        symRep_p = ts_to_string(dat_znorm_p, cuts_for_asize(self.symbols))
        symRep_n = ts_to_string(dat_znorm_n, cuts_for_asize(self.symbols))
        # convert the symbolic representation to a feature (string to tuple)
        feature_p = tuple(symRep_p)
        feature_n = tuple(symRep_n)
        print("found symbolic feature for SQ-P:", symRep_p)
        if feature_p in self.testObjective.feature_p:
            self.minimalp = 1
            self.testObjective.feature_p.remove(feature_p)
        self.coverage_p = (1 - len(self.testObjective.feature_p)
                           / self.testObjective.originalNumOfFeature)
        self.displayCoverage1()

        print("found symbolic feature for SQ-N:", symRep_n)
        if feature_n in self.testObjective.feature_n:
            self.minimaln = 1
            self.testObjective.feature_n.remove(feature_n)
        self.coverage_n = (1 - len(self.testObjective.feature_n)
                           / self.testObjective.originalNumOfFeature)
        self.displayCoverage2()

        return self.coverage_p, self.coverage_n
Example #2
 def maximize_level_node(self, max_level):
     """
     Try to maximize the level value up to max_level
     :param max_level:
     :return:
     """
     values_group = list(self.group.values())
     original_level = self.level
     equal = True
     while equal and self.level < max_level:
         temp_level = self.level + 1
         data = np.array(values_group[0])
         data_znorm = znorm(data)
         data_paa = paa(data_znorm, self.paa_value)
         pr = ts_to_string(data_paa, cuts_for_asize(temp_level))
         for index in range(1, len(values_group)):
             data = np.array(values_group[index])
             data_znorm = znorm(data)
             data_paa = paa(data_znorm, self.paa_value)
             pr_2 = ts_to_string(data_paa, cuts_for_asize(temp_level))
             if pr_2 != pr:
                 equal = False
         if equal:
             self.level = temp_level
     if original_level != self.level:
         logger.info("New level for node: {}".format(self.level))
         data = np.array(values_group[0])
         data_znorm = znorm(data)
         data_paa = paa(data_znorm, self.paa_value)
         self.pattern_representation = ts_to_string(data_paa, cuts_for_asize(self.level))
     else:
         logger.info("Can't split again, max level already reached")
Example #3
def row_pattern_loss(row: np.ndarray, pr: Tuple[str, int]):
    pattern = []
    cuts = cuts_for_asize(pr[1] + 1)[1:]
    for c in pr[0]:
        n = ord(c) - 97  # map letter to alphabet index ('a' -> 0)
        pattern.append(cuts[n])
    if len(pattern) != len(row):
        normalized_row = paa(znorm(row), len(pattern))
    else:
        normalized_row = znorm(row)
    return distance(normalized_row, pattern)
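
A quick usage sketch for row_pattern_loss. The example never shows its imports, so this assumes numpy, saxpy's znorm/paa/cuts_for_asize, and that distance is saxpy's Euclidean distance (an assumption); the row and the pattern ("ab", 2) are invented:

import numpy as np
from saxpy.znorm import znorm
from saxpy.paa import paa
from saxpy.alphabet import cuts_for_asize
from saxpy.distance import euclidean as distance  # assumed binding for `distance`

row = np.array([1.0, 2.0, 3.0, 4.0])
# pattern "ab" at level 2: the row is PAA-reduced to len("ab") == 2 before comparison
print(row_pattern_loss(row, ("ab", 2)))  # a small positive loss, ~0.66 here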
Example #4
def sax_via_window(series,
                   win_size,
                   paa_size,
                   alphabet_size=3,
                   nr_strategy='exact',
                   z_threshold=0.01):
    """Simple via window conversion implementation."""
    cuts = cuts_for_asize(alphabet_size)
    sax = defaultdict(list)

    prev_word = ''

    for i in range(0, len(series) - win_size):

        sub_section = series[i:(i + win_size)]

        zn = znorm(sub_section, z_threshold)

        paa_rep = paa(zn, paa_size)

        curr_word = ts_to_string(paa_rep, cuts)

        if '' != prev_word:
            if 'exact' == nr_strategy and prev_word == curr_word:
                continue
            elif 'mindist' == nr_strategy and\
                    is_mindist_zero(prev_word, curr_word):
                continue

        prev_word = curr_word

        sax[curr_word].append(i)

    return sax
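
A short usage sketch, assuming sax_via_window above is in scope together with the saxpy helpers it calls (znorm, paa, ts_to_string, cuts_for_asize, is_mindist_zero); the toy series is invented:

import numpy as np

series = np.array([0., 0., 1., 2., 3., 3., 2., 1., 0., 0.])
word_index = sax_via_window(series, win_size=4, paa_size=3, alphabet_size=3)
for word, starts in word_index.items():
    print(word, starts)  # each SAX word maps to the window start offsets that produced it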
Example #5
def apply_adaptive_sax(ts, win_size, paa_size, alphabet_size, z_threshold):
    """
    This function applies the sax transformation to a 1-dim time series using adaptive break-points

    :param ts: 1-dim time series
    :type ts: 1D array
    :param win_size: size of the sliding window that generates each sax word
    :type win_size: int
    :param paa_size: number of characters in a single sax word
    :type paa_size: int
    :param alphabet_size: number of unique characters to use in the sax representation
    :type alphabet_size: int
    :param z_threshold: z_threshold for the znorm method from saxpy
    :type z_threshold: float
    :return: the sax sequence, a list of strings, where each string represents a single sax word
    :rtype: list of str
    """
    sax_sequence = []
    cuts = cuts_for_asize(alphabet_size)
    for t in range(0, len(ts) - win_size + 1):
        ts_win = ts[t:(t + win_size)]
        ts_win_znormed = znorm(ts_win, z_threshold)
        paa_rep = paa(ts_win_znormed, paa_size)
        sax_word = ts_to_string(paa_rep, cuts)
        sax_sequence.append(sax_word)
    return sax_sequence
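
Despite the name, this variant uses saxpy's fixed Gaussian breakpoints via cuts_for_asize rather than data-driven ones. A usage sketch with an invented signal:

import numpy as np

ts = np.sin(np.linspace(0, 4 * np.pi, 60))
sax_words = apply_adaptive_sax(ts, win_size=10, paa_size=4, alphabet_size=3, z_threshold=0.01)
print(len(sax_words))  # len(ts) - win_size + 1 == 51 words, one per window
print(sax_words[0])    # a 4-character word, e.g. 'abcc'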
Example #6
def find_discords_brute_force(series,
                              win_size,
                              num_discords=2,
                              znorm_threshold=0.01):
    """Early-abandoned distance-based discord discovery."""
    discords = list()

    globalRegistry = VisitRegistry(len(series) - win_size + 1)
    znorms = np.array([
        znorm(series[pos:pos + win_size], znorm_threshold)
        for pos in range(len(series) - win_size + 1)
    ])

    while len(discords) < num_discords:

        bestDiscord = find_best_discord_brute_force(series, win_size,
                                                    globalRegistry, znorms)

        if -1 == bestDiscord[0]:
            break

        discords.append(bestDiscord)

        mark_start = max(0, bestDiscord[0] - win_size + 1)
        mark_end = bestDiscord[0] + win_size

        globalRegistry.mark_visited_range(mark_start, mark_end)

    return discords
Example #7
def test_znorm():
    """Test the znorm implementation."""
    # test std is 1 and mean is 0
    ts = array([-1., -2., -1., 0., 2., 1., 1., 0.])
    z_thrsh = 0.001
    x_scaled = [x / 100.0 for x in ts]
    assert pytest.approx(1.0, 0.000001) == std(znorm.znorm(x_scaled, z_thrsh))
    assert pytest.approx(0.0, 0.000001) == mean(znorm.znorm(x_scaled, z_thrsh))

    # test that std and mean don't change when the threshold is high
    ts = array([-0.1, -0.2, 0.2, 0.1])
    z_thrsh = 0.5
    ts_mean = mean(ts)
    ts_sd = std(ts)
    assert ts_mean == mean(znorm.znorm(ts, z_thrsh))
    assert ts_sd == std(znorm.znorm(ts, z_thrsh))
Example #8
def find_best_discord_brute_force(series,
                                  win_size,
                                  global_registry,
                                  z_threshold=0.01):
    """Early-abandoned distance-based discord discovery."""
    best_so_far_distance = -1.0
    best_so_far_index = -1

    outerRegistry = global_registry.clone()

    outer_idx = outerRegistry.get_next_unvisited()

    while ~np.isnan(outer_idx):

        outerRegistry.mark_visited(outer_idx)

        candidate_seq = znorm(series[outer_idx:(outer_idx + win_size)],
                              z_threshold)

        nnDistance = np.inf
        innerRegistry = VisitRegistry(len(series) - win_size)

        inner_idx = innerRegistry.get_next_unvisited()

        while ~np.isnan(inner_idx):
            innerRegistry.mark_visited(inner_idx)

            if abs(inner_idx - outer_idx) > win_size:

                curr_seq = znorm(series[inner_idx:(inner_idx + win_size)],
                                 z_threshold)
                dist = early_abandoned_dist(candidate_seq, curr_seq,
                                            nnDistance)

                if (~np.isnan(dist)) and (dist < nnDistance):
                    nnDistance = dist

            inner_idx = innerRegistry.get_next_unvisited()

        if ~(np.inf == nnDistance) and (nnDistance > best_so_far_distance):
            best_so_far_distance = nnDistance
            best_so_far_index = outer_idx

        outer_idx = outerRegistry.get_next_unvisited()

    return (best_so_far_index, best_so_far_distance)
Example #9
def SAX(sequence: np.ndarray, alphabet_size: int, length: int = 0) -> str:
    """
    Computes SAX string of a sequence of numbers with specified alphabet size.
    Length of the output string may be specified; length 0 will generate a string as long as the sequence.
    """
    debug("Calculating SAX of {}, with alphabet of size {}".format(
        sequence, alphabet_size))
    if alphabet_size == 1:
        if length == 0:
            return "a" * len(sequence)
        else:
            return "a" * length
    else:
        if length == 0 or length == len(sequence):
            return ts_to_string(znorm(sequence), cuts_for_asize(alphabet_size))
        else:
            return ts_to_string(paa(znorm(sequence), length),
                                cuts_for_asize(alphabet_size))
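
A hedged usage sketch for SAX. The fragment assumes a debug logger (e.g. logging.debug) and the saxpy imports are in scope; the input sequence is invented:

import numpy as np

seq = np.array([1., 2., 3., 4., 5., 6.])
print(SAX(seq, alphabet_size=3))            # 'aabbcc': one letter per point
print(SAX(seq, alphabet_size=3, length=3))  # 'abc': PAA-compressed to 3 letters
print(SAX(seq, alphabet_size=1, length=4))  # 'aaaa': single-letter alphabet shortcut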
Example #10
def _preprocess_ts(time_series_df):
    """
    z-normalizes the time series' numeric values
    """
    del time_series_df["time_delta_in_days"]
    time_series_df["value_representation"] = znorm(
        time_series_df.numeric_value)
    del time_series_df["numeric_value"]
    return time_series_df
Example #11
def saxrepresentation(matrix):
    result = []
    for ts in matrix.T:
        # z-normalize each column, reduce to 3 PAA segments, then map to a 3-letter word
        dat_znorm = znorm(ts)
        dat_paa_3 = paa(dat_znorm, 3)
        a = ts_to_string(dat_paa_3, cuts_for_asize(3))
        result.append(a)

    return result
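
A small usage sketch; the columns of the input matrix are treated as individual time series (the random data is purely illustrative):

import numpy as np

matrix = np.random.rand(60, 2)    # 60 observations of 2 series, one per column
print(saxrepresentation(matrix))  # one 3-letter word per column, e.g. ['acb', 'bac']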
Example #12
    def update_features(self, data):
        self.hidden = self.state_manager.get_hidden_state(data)
        activation = self.get_activation()
        dat_znorm = znorm(activation[self.indices])
        sym_rep = ts_to_string(dat_znorm, cuts_for_asize(self.symbols))
        feature = tuple(sym_rep)

        if feature in self.feature:
            index = self.feature.index(feature)
            self.covered_dict[index] = True
Example #13
    def update_features(self, data):
        self.hidden = self.state_manager.get_hidden_state(data)
        activation_p, activation_n = self.get_activation()
        dat_znorm_p = znorm(activation_p[self.indices])
        dat_znorm_n = znorm(activation_n[self.indices])
        sym_rep_p = ts_to_string(dat_znorm_p, cuts_for_asize(self.symbols))
        sym_rep_n = ts_to_string(dat_znorm_n, cuts_for_asize(self.symbols))
        feature_p = tuple(sym_rep_p)
        feature_n = tuple(sym_rep_n)

        if feature_p in self.feature_p:
            index = self.feature_p.index(feature_p)
            self.covered_dict_p[index] = True
            self.frequency_dict_p[index] += 1

        if feature_n in self.feature_n:
            index = self.feature_n.index(feature_n)
            self.covered_dict_n[index] = True
            self.frequency_dict_n[index] += 1
Example #14
def ppa_representation(data, seq_len):
    data_reduced = np.zeros(shape=(int(data.shape[0] / seq_len), data.shape[1]))

    paa_segment = int(data.shape[0] / seq_len)

    for i in tqdm(range(data.shape[1])):
        dat_znorm = znorm(data[:, i])

        data_reduced[:, i] = paa(dat_znorm, paa_segment)

    return data_reduced
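
A usage sketch showing the shape contract, with invented data; every column is z-normalized and PAA-reduced to data.shape[0] // seq_len segments (numpy and tqdm imports as in the example):

import numpy as np

data = np.random.rand(100, 3)  # 100 time steps, 3 series
reduced = ppa_representation(data, seq_len=10)
print(reduced.shape)           # (10, 3): each column compressed to 10 PAA segments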
Example #15
def find_discords_hotsax(series,
                         win_size=100,
                         num_discords=2,
                         alphabet_size=3,
                         paa_size=3,
                         znorm_threshold=0.01,
                         sax_type='unidim'):
    """HOT-SAX-driven discords discovery."""
    discords = list()

    global_registry = set()

    # Z-normalized versions for every subsequence.
    znorms = np.array([
        znorm(series[pos:pos + win_size], znorm_threshold)
        for pos in range(len(series) - win_size + 1)
    ])

    # SAX words for every subsequence.
    sax_data = sax_via_window(series,
                              win_size=win_size,
                              paa_size=paa_size,
                              alphabet_size=alphabet_size,
                              nr_strategy=None,
                              znorm_threshold=znorm_threshold,
                              sax_type=sax_type)
    """[2.0] build the 'magic' array"""
    magic_array = list()
    for k, v in sax_data.items():
        magic_array.append((k, len(v)))
    """[2.1] sort it ascending by the number of occurrences"""
    magic_array = sorted(magic_array, key=lambda tup: tup[1])

    while len(discords) < num_discords:

        best_discord = find_best_discord_hotsax(series, win_size,
                                                global_registry, sax_data,
                                                magic_array, znorms)

        if -1 == best_discord[0]:
            break

        discords.append(best_discord)

        mark_start = max(0, best_discord[0] - win_size + 1)
        mark_end = best_discord[0] + win_size

        for i in range(mark_start, mark_end):
            global_registry.add(i)

    return discords
Example #16
def discretize_data(data, w, features, start_date, end_date, dataset, plotting):
    """
    Function that performs SAX discretization on the signals
    :param data: the signals to be discretized
    :param w: the number of PAA segments to represent the initial time series
    :param features: the name of the signals to be discretized
    :param start_date: date used mostly for visualization reasons
    :param end_date: date used mostly for visualization reasons
    :param dataset: the type of the dataset (used for saving reasons)
    :param plotting: if True then the discretized signals are plotted
    :return: the discretized series with the indices of each PAA segment
    """
    alphabet = 5  # the length of the alphabet to be used
    # dictionaries used for plotting reasons
    symbol_to_number = {'a': -1.5, 'b': -0.75, 'c': 0, 'd': 0.75, 'e': 1.5}  # just for visualization purposes
    number_to_symbol = {'0': 'a', '1': 'b', '2': 'c', '3': 'd', '4': 'e'}
    sax_seqs = {}
    sax_indices = {}
    for feature in features:
        print('------------------------------------- Discretizing %s -------------------------------------' % feature)
        sax_str, real_indices = sax.to_letter_rep(np.array(data[feature]), w, alphabet)  # SAX discretization
        sax_seqs[feature] = sax_str  # store the discretized series
        sax_indices[feature] = real_indices  # store the indices of the real blocks

        # plotting part
        if plotting:
            normalized_signal = znorm(np.array(data[feature]))
            discrete = uncompress_labels(sax_str, real_indices)
            discrete = [symbol_to_number[number_to_symbol[d]] for d in discrete]

            fig = plt.figure()
            ax = fig.add_subplot(111)
            ax.plot(pd.DataFrame(normalized_signal, index=data.index).loc[start_date:end_date],
                    label='normalized signal')
            ax.plot(pd.DataFrame(discrete, index=data.index).loc[start_date:end_date], label='discretized signal')
            ax.xaxis.set_major_locator(mdates.DayLocator([5, 10, 15, 20, 25, 30]))
            ax.xaxis.set_major_formatter(mdates.DateFormatter('%d/%m'))
            plt.xlabel("time")
            plt.ylabel(feature)
            plt.xticks(rotation=45)
            plt.yticks(np.array(list(symbol_to_number.values())), tuple(symbol_to_number.keys()))
            plt.grid()
            plt.legend(loc='lower right')
            plt.savefig('plots/sax/%s_discretization_%s.png' % (dataset, feature), bbox_inches='tight')
    return sax_seqs, sax_indices
Example #17
def extract_features(
        song_name):  # returns mfcc and chroma features in SAX representation
    try:
        x, fs = librosa.load(song_name)
    except Exception:
        return None
    mfccs = librosa.feature.mfcc(x, sr=fs, n_mfcc=39)
    chroma = librosa.feature.chroma_stft(x, sr=fs)
    feature_matrix = np.concatenate((mfccs, chroma))

    sax_rep = [
        ts_to_string(paa(znorm(feat), SAX_VOCAB_LENGTH),
                     cuts_for_asize(SAX_VOCAB_LENGTH))
        for feat in feature_matrix
    ]
    return sax_rep
Example #18
def sax_via_window(series,
                   win_size,
                   paa_size,
                   alphabet_size=3,
                   nr_strategy='exact',
                   z_threshold=0.01):
    """Simple via window conversion implementation."""

    # generate alphabet cuts for the specified size
    cuts = cuts_for_asize(alphabet_size)
    # initialize the SAX word -> index-list mapping
    sax = defaultdict(list)

    prev_word = ''

    for i in range(0, len(series) - win_size):

        # subsection of the series covered by the current window
        sub_section = series[i:(i + win_size)]

        # z-normalize
        zn = znorm(sub_section, z_threshold)

        # piecewise aggregate approximation: reduce the subsection to paa_size points
        paa_rep = paa(zn, paa_size)

        # convert the PAA sequence to a string
        curr_word = ts_to_string(paa_rep, cuts)

        # numerosity reduction: optionally skip repeated words
        if '' != prev_word:
            if 'exact' == nr_strategy and prev_word == curr_word:
                continue
            elif 'mindist' == nr_strategy and\
                    is_mindist_zero(prev_word, curr_word):
                continue

        prev_word = curr_word

        sax[curr_word].append(i)

    return sax
Example #19
def data_creator(ticker):
  import fix_yahoo_finance as yf
  from saxpy.znorm import znorm
  hist = yf.download(tickers = ticker, period = 'max')
  hist = hist["Close"]
  # daily percentage changes
  pc = [(hist[i + 1] - hist[i])/hist[i] for i in range(len(hist) - 1)]
  pc2 = znorm(np.array(pc))
  # embed the normalized changes into overlapping windows of length 21
  X = ent.util_pattern_space(pc2, lag = 1, dim = 21)
  trainY = X[:, -1]
  trainX = X[:, :-1]
  # a window counts as a "drop" when the next change sits 4+ standard deviations below the mean
  trainY = np.where(trainY <= -4, True, False)
  drops = np.where(trainY == True)
  return trainX, trainY, drops


tick_list = ["VTI","VOO","VEA","VWO","VTV","VUG",
             "VO","VB","VEU","VIG","VHT","VFH","VPL",
             "VPU","VSS","VGK","VOT","VSS","VAS","VGT",
             "EFA","EWA","EWH","EWG","EWU","EWQ","EWL","EWP",
             "EWD","EWN","EWI","ERUS","UAE","EIS","INDA"]
Example #20
def znorm_paa_sax(time_series, alpha, w=3, missing='z'):
    """Takes an array containing real values, z-normalizes, reduces
    dimensionality to w, and finally returns a sax representation of length alpha
    
    time series:    array holding a time series of one measurement for one patient
    w:              the dimensionality to reduce to using PAA, set to len(time_series) in plain
    alpha:          alpha is the number of discretized segments that the SAX rep will reflect, set to 2, 3 or 5 in plain using RDS algo
    """

    # If time_series is a string, convert it to a list, e.g. 'abc' -> ['a', 'b', 'c'],
    # because that is the structure required below
    if (isinstance(time_series, str)):
        time_series = list(time_series)

    if (len(time_series) > 0):
        # normalizing one time series, time series as numpy array (np.array([]))
        normalized_time_series = znorm(np.array(time_series))
        # dimensionality reduction of time series according to w
        paa_norm_time_series = paa(normalized_time_series, w)
        # turning a discretized and reduced time series into a sequence of characters
        return ts_to_string(paa_norm_time_series, cuts_for_asize(alpha))
    else:
        return missing
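
A brief usage sketch with invented values, assuming the example's numpy and saxpy imports; the empty-input branch falls back to the missing marker:

print(znorm_paa_sax([1.0, 2.0, 3.0, 6.0, 5.0, 4.0], alpha=3))  # 'acc': z-norm, PAA to w=3, 3-letter alphabet
print(znorm_paa_sax([], alpha=3))                              # 'z', the missing marker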
Example #21
def PAA_aggregation(v):
    dat_znorm = znorm(v)
    r = paa(dat_znorm, len(v))
    #r = znorm(v)
    print("MIN MAX",min(r),max(r))
    return r
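
Note that saxpy's paa appears to return the series unchanged when the segment count equals the series length (the trivial case), so this function effectively just z-normalizes v. A quick check with invented data:

import numpy as np

v = np.array([3., 1., 4., 1., 5., 9.])
r = PAA_aggregation(v)
print(np.allclose(r, znorm(v)))  # expected True if paa's trivial case applies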
Example #22
def sax_via_window(series, win_size, paa_size, alphabet_size=3,
                   nr_strategy='exact', znorm_threshold=0.01, sax_type='unidim'):
    """Simple via window conversion implementation.

    # SAX-ENERGY
    >>> sax_via_window([[1, 2, 3], [4, 5, 6]], win_size=1, paa_size=3, sax_type='energy', nr_strategy=None)['abc']
    [0, 1]

    >>> sax_via_window([[1, 2, 3, 4], [4, 5, 6, 7]], win_size=1, paa_size=4, sax_type='energy', nr_strategy=None)['aacc']
    [0, 1]

    >>> sax_via_window([[1, 2, 3, 4], [4, 5, 6, 7]], win_size=2, paa_size=4, sax_type='energy', nr_strategy=None)['aaccaacc']
    [0]

    # SAX-REPEAT
    >>> sax_via_window([[1, 2, 3], [4, 5, 6], [7, 8, 9]], win_size=2, paa_size=2, sax_type='repeat', nr_strategy=None)['ab']
    [0, 1]

    >>> sax_via_window([[1, 2, 3], [4, 5, 6], [7, 8, 9]], win_size=1, paa_size=1, sax_type='repeat', nr_strategy=None)['a']
    [0, 1, 2]

    # SAX-INDEPENDENT
    >>> sax_via_window([[1, 2, 3, 4], [4, 5, 6, 7]], win_size=2, paa_size=2, sax_type='independent', nr_strategy=None)['acacacac']
    [0]

    >>> sax_via_window([[1, 2], [4, 5], [7, 8]], win_size=2, paa_size=2, sax_type='independent', nr_strategy=None)['acac']
    [0, 1]

    >>> sax_via_window([[1, 2], [4, 8], [7, 5]], win_size=2, paa_size=2, sax_type='independent', nr_strategy=None)['acac']
    [0]

    >>> sax_via_window([[1, 2], [4, 8], [7, 5]], win_size=2, paa_size=2, sax_type='independent', nr_strategy=None)['acca']
    [1]

    """

    # Convert to numpy array.
    series = np.array(series)

    # Check on dimensions.
    if len(series.shape) > 2:
        raise ValueError('Please reshape time-series to stack dimensions along the 2nd dimension, so that the array shape is a 2-tuple.')

    # PAA size is the length of the PAA sequence.
    if sax_type != 'energy' and paa_size > win_size:
        raise ValueError('PAA size cannot be greater than the window size.')

    if sax_type == 'energy' and len(series.shape) == 1:
        raise ValueError('Must pass a multidimensional time-series to SAX-ENERGY.')

    # Breakpoints.
    cuts = cuts_for_asize(alphabet_size)

    # Dictionary mapping SAX words to indices.
    sax = defaultdict(list)

    if sax_type == 'repeat':
        # Maps indices to multi-dimensional SAX words.
        multidim_sax_dict = []

        # List of all the multi-dimensional SAX words.
        multidim_sax_list = []

        # Sliding window across time dimension.
        for i in range(series.shape[0] - win_size + 1):

            # Subsection starting at this index.
            sub_section = series[i: i + win_size]

            # Z-normalized subsection.
            if win_size == 1:
                zn = sub_section
            else:
                zn = znorm(sub_section, znorm_threshold)

            # PAA representation of subsection.
            paa_rep = paa(zn, paa_size, 'repeat')

            # SAX representation of subsection, but in terms of multi-dimensional vectors.
            multidim_sax = get_sax_list(paa_rep, cuts)

            # Update data-structures.
            multidim_sax_dict.append(multidim_sax)
            multidim_sax_list.extend(multidim_sax)

        # Cluster with k-means++.
        kmeans = KMeans(n_clusters=alphabet_size, random_state=0).fit(multidim_sax_list)

        # Cluster indices in sorted order.
        order = np.lexsort(np.rot90(kmeans.cluster_centers_))

        # Sliding window across time dimension.
        prev_word = ''
        for i in range(series.shape[0] - win_size + 1):

            # Map cluster indices to new SAX letters.
            curr_word_list = map(lambda cluster_index: idx2letter(order[cluster_index]), kmeans.predict(multidim_sax_dict[i]))
            curr_word = ''.join(curr_word_list)

            if '' != prev_word:
                if 'exact' == nr_strategy and prev_word == curr_word:
                    continue
                elif 'mindist' == nr_strategy and is_mindist_zero(prev_word, curr_word):
                    continue

            prev_word = curr_word

            sax[curr_word].append(i)

    else:
        # Sliding window across time dimension.
        prev_word = ''
        for i in range(series.shape[0] - win_size + 1):

            # Subsection starting at this index.
            sub_section = series[i: i + win_size]

            if sax_type == 'energy':
                curr_word = ''
                for energy_dist in sub_section:
                    # Normalize energy distribution.
                    energy_zn = znorm(energy_dist, znorm_threshold)

                    # PAA representation of energy distribution.
                    paa_rep = paa(energy_zn, paa_size, 'unidim')
                    # paa_rep = energy_zn

                    # SAX representation of the energy distribution.
                    energy_word = ts_to_string(paa_rep, cuts)

                    # Add to current word.
                    curr_word += energy_word

            elif sax_type == 'independent':
                curr_word = ''
                for dim in range(sub_section.shape[1]):
                    # Obtain the subsequence restricted to one dimension.
                    one_dimension_sub_section = sub_section[:, dim]

                    # Z-normalized subsection.
                    zn = znorm(one_dimension_sub_section, znorm_threshold)

                    # PAA representation of subsection.
                    paa_rep = paa(zn, paa_size, 'unidim')

                    # Get the SAX word - just a unidimensional SAX.
                    one_dim_word = ts_to_string(paa_rep, cuts)

                    # Add this dimension's representation to the overall SAX word.
                    curr_word += one_dim_word

            else:
                # Z-normalized subsection.
                zn = znorm(sub_section, znorm_threshold)

                # PAA representation of subsection.
                paa_rep = paa(zn, paa_size, sax_type)

                # SAX representation of subsection.
                curr_word = ts_to_string(paa_rep, cuts)

            if '' != prev_word:
                if 'exact' == nr_strategy and prev_word == curr_word:
                    continue
                elif 'mindist' == nr_strategy and is_mindist_zero(prev_word, curr_word):
                    continue

            prev_word = curr_word

            sax[curr_word].append(i)

    return sax
Example #23
import sys
import numpy as np
import matplotlib.pyplot as plt
from saxpy.znorm import znorm

print(sys.argv[1].strip().split(','))
flag = ''
if sys.argv[1] != '' and sys.argv[1] != 'a':
    index_to_plot = np.array(sys.argv[1].strip().split(','))
    index_to_plot = index_to_plot.astype(int)
else:
    flag = 'a'
series = np.genfromtxt('open_prices',
                       delimiter='\n',
                       missing_values='null',
                       filling_values=0)
all_series = np.asfarray(np.split(series, 57), float)
all_series = all_series[:, :-1]  # remove the trailing 'null' element from every series (n = 255 - 1 = 254)
for i in range(0, 57):
    all_series[i] = znorm(all_series[i])
print(all_series)
with open('open_prices', 'r') as f:
    cur_index = 0
    # print(index_to_plot)
    counter = 0  # Used to iterate through the multiple TS indices that need to be plotted, passed as arguments
    # print(all_series)
    print(all_series.shape)
    if flag == 'a':
        for series in all_series:
            plt.plot(series, label=str(cur_index))
    else:
        for series in all_series:
            if counter < len(
                    index_to_plot) and cur_index == index_to_plot[counter]:
                print(znorm(series))
Example #24
plotcounter = 1
to_plot = []
to_label_legend = []
fig = plt.figure(figsize=(25, 10))  # single figure; panels are added below with fig.add_subplot(5, 3, ...)

for i in range(0,len(index_to_plot)):
	if index_to_plot[i] != -100: #-100 because after adjustment to make 0-indexed, -99 becomes -100
		to_plot.append(index_to_plot[i])
		# print(index_to_plot[i])
		to_label_legend.append(labels[index_to_plot[i]])
	else:
		fig.add_subplot(5,3,plotcounter)
		for series in all_series:
			if counter < len(to_plot) and cur_index == to_plot[counter]:
				# print(series)
				# print(znorm(series))
				plt.plot(znorm(series[:-1]))
				counter+=1
			cur_index+=1
		cur_index = 0
		plt.legend(to_label_legend, fontsize=5)
		# plt.title('Plot '+str(plotcounter))
		counter=0
		plotcounter+=1
		to_plot=[]
		to_label_legend=[]

plt.savefig('Plots/diff_znorm_comparison_nested_'+str(P)+'_'+linkage_method+'_w='+str(w)+'_a='+str(a)+'_lsh_limit='+str(lsh_limit))
plt.show()
Example #25
import collections

import numpy as np
import fix_yahoo_finance as yf
from keras.layers import Dense, Bidirectional, LSTM, TimeDistributed
from keras.optimizers import Adam
# imports assumed from the surrounding script: saxpy primitives and pyentrp's entropy module
from saxpy.znorm import znorm
from saxpy.paa import paa
from saxpy.sax import ts_to_string
from saxpy.alphabet import cuts_for_asize
from pyentrp import entropy as ent

##########

hist = yf.download(tickers = "DJI", period = 'max')



words = []
# hist_sma (a smoothed version of hist) is assumed to be computed earlier in the original script
dow_df = ent.util_pattern_space(hist_sma, lag = 1, dim = 50)
dow_df = dow_df[:]
for i in range(len(dow_df)):
    dat_znorm = znorm(dow_df[i,:])
    dat_paa= paa(dat_znorm, 3)
    word = ts_to_string(dat_paa, cuts_for_asize(2))
    words.append(word)
print(words)


print(collections.Counter(words))

from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
sqn = le.fit_transform(words)

nb_classes = len(np.unique(sqn))

from keras.utils import to_categorical 
Example #26
def discretise(data, number_of_bins):
    return ts_to_string(znorm(data), cuts_for_asize(number_of_bins))
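
A one-line discretiser like this is easy to exercise; assuming the usual saxpy imports, with an invented series:

import numpy as np

print(discretise(np.array([1., 2., 3., 4., 5., 6.]), 3))  # 'aabbcc': one letter per point, 3 bins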
Example #27
def find_best_discord_hotsax(series, win_size, a_size, paa_size,
                             znorm_threshold, globalRegistry):  # noqa: C901
    """Find the best discord with hotsax."""
    """
    [1.0] get the sax data first
        将一个 time series 转化为 SAX字典 (key: 字符串, value: 窗口索引组成的列表)
    """
    sax_none = sax_via_window(series, win_size, a_size, paa_size, "none", 0.01)
    """
    [2.0] build the 'magic' array
        magic_array: a list of tuples
        (字符串, 窗口索引个数)
    """
    magic_array = list()
    for k, v in sax_none.items():
        magic_array.append((k, len(v)))
    """
    [2.1] sort it desc by the key
        按照 窗口索引个数降序 对 tuple 排序
    """
    m_arr = sorted(magic_array, key=lambda tup: tup[1])
    """
    [3.0] define the key vars

    bestSoFarPosition
        bestSoFarDistance对应的窗口开始索引
        这个窗口是该时间序列的异常子序列

    bestSoFarDistance
        max(min(distance))
        对于每一个窗口, 我们求出它与其他窗口的最小距离
        对所有的最小距离取一个最大值
    """
    bestSoFarPosition = -1
    bestSoFarDistance = 0.

    distanceCalls = 0

    visit_array = np.zeros(len(series), dtype=int)  # np.int is removed in NumPy >= 1.24
    """[4.0] and we are off iterating over the magic array entries"""
    for entry in m_arr:
        """[5.0] some moar of teh vars"""
        curr_word = entry[0]
        # occurrences: list of window start indices for the current word
        occurrences = sax_none[curr_word]
        """[6.0] jumping around by the same word occurrences makes it easier to
        nail down the possibly small distance value 通过在相同的单词之间 跳转, 使得更容易确定可能的小距离值
         -- so we can be efficient and all that..."""

        # curr_pos: start index of the current window
        for curr_pos in occurrences:

            # skip this start position if it is already in globalRegistry
            if curr_pos in globalRegistry:
                continue
            """[7.0] we don't want an overlapping subsequence"""
            # avoid overlapping subsequences
            mark_start = curr_pos - win_size
            mark_end = curr_pos + win_size

            # we look for the window most similar to (at the smallest distance from) the current one;
            # visit_set holds the window start indices already examined
            visit_set = set(range(mark_start, mark_end))
            """[8.0] here is our subsequence in question"""
            # cur_seq: the z-normalized subsequence
            cur_seq = znorm(series[curr_pos:(curr_pos + win_size)],
                            znorm_threshold)
            """[9.0] let's see what is NN distance"""
            # nn_dist: the minimum distance between the current window and any other window
            # (the two windows must not overlap or be adjacent)
            nn_dist = np.inf
            # flag deciding whether to fall back to random search
            do_random_search = 1
            """[10.0] ordered by occurrences search first"""
            # jumping between occurrences of the same word makes it easier to pin down a small distance early
            for next_pos in occurrences:
                """[11.0] skip bad pos"""
                # skip overlapping subsequences
                if next_pos in visit_set:
                    continue
                else:
                    visit_set.add(next_pos)
                """[12.0] distance we compute"""
                dist = euclidean(
                    cur_seq,
                    znorm(series[next_pos:(next_pos + win_size)],
                          znorm_threshold))
                distanceCalls += 1
                """[13.0] keep the books up-to-date"""
                # update nn_dist
                if dist < nn_dist:
                    nn_dist = dist
                if dist < bestSoFarDistance:
                    do_random_search = 0
                    break
            """[13.0] if not broken above,
            we shall proceed with random search"""
            # the loop above completed without an early break, so we proceed with the random search

            if do_random_search:
                """[14.0] build that random visit order array"""
                curr_idx = 0
                for i in range(0,
                               (len(series) -
                                win_size)):  # why not len(series) - win_size + 1?

                    # window start index not yet examined above
                    if not (i in visit_set):

                        # add it to visit_array
                        visit_array[curr_idx] = i
                        curr_idx += 1

                # curr_idx now equals the number of window start indices not examined above

                # shuffle the visit order
                it_order = np.random.permutation(visit_array[0:curr_idx])
                curr_idx -= 1
                """[15.0] and go random"""

                while curr_idx >= 0:

                    # randomly chosen window start index it_order[curr_idx]
                    rand_pos = it_order[curr_idx]

                    curr_idx -= 1

                    dist = euclidean(
                        cur_seq,
                        znorm(series[rand_pos:(rand_pos + win_size)],
                              znorm_threshold))
                    distanceCalls += 1
                    """[16.0] keep the books up-to-date again"""

                    # update nn_dist
                    if dist < nn_dist:
                        nn_dist = dist

                    if dist < bestSoFarDistance:
                        nn_dist = dist
                        break
            """[17.0] and BIGGER books"""

            # update bestSoFarDistance and bestSoFarPosition
            if (nn_dist > bestSoFarDistance) and (nn_dist < np.inf):
                bestSoFarDistance = nn_dist
                bestSoFarPosition = curr_pos

    return (bestSoFarPosition, bestSoFarDistance)
Example #28
    def start_splitting(self, p_value: int, max_level: int, good_leaf_nodes: list, bad_leaf_nodes: list):
        """
        Splitting Node Naive algorithm (k, P) Anonymity
        :param p_value:
        :param max_level:
        :param good_leaf_nodes:
        :param bad_leaf_nodes:
        :return:
        """
        # logger.info("good_leaf_nodes: {}, bad_leaf_nodes: {}".format(len(good_leaf_nodes), len(bad_leaf_nodes)))
        if self.size < p_value:
            logger.info("size:{}, p_value:{} == bad-leaf".format(self.size, p_value))
            self.label = "bad-leaf"
            bad_leaf_nodes.append(self)
            return

        if self.level == max_level:
            logger.info("size:{}, p_value:{} == good-leaf".format(self.size, p_value))
            self.label = "good-leaf"
            good_leaf_nodes.append(self)
            return

        if p_value <= self.size < 2*p_value:
            logger.info("Maximize-level, size:{}, p_value:{} == good-leaf".format(self.size, p_value))
            self.maximize_level_node(max_level)
            self.label = "good-leaf"
            good_leaf_nodes.append(self)
            return
        """
        Otherwise, we need to check if node N has to be split. The checking relies on a tentative split performed on N. 
        Suppose that, by increasing the level of N, N is tentatively split into a number of child nodes. 
        If all these child nodes contain fewer than P time series, no real split is performed and the original node N is
        labeled as good-leaf and the recursion terminates on N. Otherwise, there must exist tentative child node(s) 
        whose size >= P, also called TG-node(s) (Tentative Good Nodes). 
        The rest children whose size < P are called TB-nodes (Tentative Bad Nodes), if any. 
        If the total number of records in all TB-nodes under N is no less than P, we merge them into a single tentative
        node, denoted by childmerge, at the level of N.level. If the above tentative process produces nc tentative 
        child nodes (including TB and TG) and nc >= 2, N will really be split into nc children and then the node 
        splitting procedure will be recursively invoked on each of them 
        """
        tentative_child_node = dict()
        temp_level = self.level + 1
        for key, value in self.group.items():
            # to reduce dimensionality
            data = np.array(value)
            data_znorm = znorm(data)
            data_paa = paa(data_znorm, self.paa_value)
            pr = ts_to_string(data_paa, cuts_for_asize(temp_level))
            if pr in tentative_child_node.keys():
                tentative_child_node[pr].append(key)
            else:
                tentative_child_node[pr] = [key]
        length_all_tentative_child = [len(x) for x in list(tentative_child_node.values())]
        good_leaf = np.all(np.array(length_all_tentative_child) < p_value)

        if good_leaf:
            logger.info("Good-leaf, all_tentative_child are < {}".format(p_value))
            self.label = "good-leaf"
            good_leaf_nodes.append(self)
            return
        else:
            logger.info("N can be split")
            logger.info("Compute tentative good nodes and tentative bad nodes")
            # tentative good nodes
            # indices of the nodes in tentative_child_node whose size is >= p_value
            pr_keys = list(tentative_child_node.keys())
            # get index tentative good node
            pattern_representation_tg = list()
            tg_nodes_index = list(np.where(np.array(length_all_tentative_child) >= p_value)[0])
            # logger.info(pr_keys)
            tg_nodes = list()
            for index in tg_nodes_index:
                keys_elements = tentative_child_node[pr_keys[index]]
                dict_temp = dict()
                for key in keys_elements:
                    dict_temp[key] = self.group[key]
                tg_nodes.append(dict_temp)
                pattern_representation_tg.append(pr_keys[index])

            # tentative bad nodes
            tb_nodes_index = list(np.where(np.array(length_all_tentative_child) < p_value)[0])
            tb_nodes = list()
            pattern_representation_tb = list()

            for index in tb_nodes_index:
                keys_elements = tentative_child_node[pr_keys[index]]
                dict_temp = dict()
                for key in keys_elements:
                    dict_temp[key] = self.group[key]
                tb_nodes.append(dict_temp)
                pattern_representation_tb.append(pr_keys[index])

            total_size_tb_nodes = 0
            for tb_node in tb_nodes:
                total_size_tb_nodes += len(tb_node)

            if total_size_tb_nodes >= p_value:
                logger.info("Merge all bad nodes in a single node, and label it as good-leaf")
                child_merge_node_group = dict()
                for tb_node in tb_nodes:
                    for key, value in tb_node.items():
                        child_merge_node_group[key] = value
                node_merge = Node(level=self.level, pattern_representation=self.pattern_representation,
                                  label="good-leaf", group=child_merge_node_group, parent=self)
                self.child_node.append(node_merge)
                good_leaf_nodes.append(node_merge)

                nc = len(tg_nodes) + len(tb_nodes)  # tb_nodes: I'm a bit puzzled by this tb_nodes
                logger.info("Split only tg_nodes {0}".format(len(tg_nodes)))
                if nc >= 2:
                    for index in range(0, len(tg_nodes)):
                        node = Node(level=self.level, pattern_representation=pattern_representation_tg[index],
                                    label="intermediate", group=tg_nodes[index], parent=self)
                        self.child_node.append(node)
                        node.start_splitting(p_value, max_level, good_leaf_nodes, bad_leaf_nodes)
                else:
                    for index in range(0, len(tg_nodes)):
                        node = Node(level=self.level, pattern_representation=pattern_representation_tg[index],
                                    label="good-leaf", group=tg_nodes[index], parent=self)
                        self.child_node.append(node)
                        good_leaf_nodes.append(node)

            else:
                nc = len(tg_nodes) + len(tb_nodes)  # tb_nodes: I'm a bit puzzled by this tb_nodes
                logger.info("Label all tb_node {0} as bad-leaf and split only tg_nodes {1}".format(len(tb_nodes),len(tg_nodes)))
                for index in range(0, len(tb_nodes)):
                    node = Node(level=self.level, pattern_representation=pattern_representation_tb[index], label="bad-leaf",
                                group=tb_nodes[index], parent=self)
                    self.child_node.append(node)
                    bad_leaf_nodes.append(node)
                if nc >= 2:
                    for index in range(0, len(tg_nodes)):
                        node = Node(level=self.level, pattern_representation=pattern_representation_tg[index],
                                    label="intermediate", group=tg_nodes[index], parent=self)
                        self.child_node.append(node)
                        node.start_splitting(p_value, max_level, good_leaf_nodes, bad_leaf_nodes)
                else:
                    for index in range(0, len(tg_nodes)):
                        node = Node(level=self.level, pattern_representation=pattern_representation_tg[index],
                                    label="good-leaf", group=tg_nodes[index], parent=self)
                        self.child_node.append(node)
                        good_leaf_nodes.append(node)
Example #29
def sax_by_chunking(series, paa_size, alphabet_size=3, z_threshold=0.01):
    """Simple chunking conversion implementation."""
    paa_rep = paa(znorm(series, z_threshold), paa_size)
    cuts = cuts_for_asize(alphabet_size)
    return ts_to_string(paa_rep, cuts)
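
A usage sketch for sax_by_chunking with an invented series; the whole series is z-normalized once and compressed to paa_size letters:

import numpy as np

print(sax_by_chunking(np.array([1., 2., 3., 4., 5., 6.]), paa_size=3, alphabet_size=3))  # 'abc'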
Example #30
def find_best_discord_brute_force(series,
                                  win_size,
                                  global_registry,
                                  z_threshold=0.01):
    """Early-abandoned distance-based discord discovery."""
    best_so_far_distance = -1.0
    best_so_far_index = -1

    outerRegistry = global_registry.clone()

    # get a random unvisited index
    outer_idx = outerRegistry.get_next_unvisited()

    # loop while outer_idx is not NaN (~ is bitwise negation)
    while ~np.isnan(outer_idx):

        # mark outer_idx as visited
        outerRegistry.mark_visited(outer_idx)

        # z-normalized candidate subsequence, from outer_idx through outer_idx + win_size - 1
        candidate_seq = znorm(series[outer_idx:(outer_idx + win_size)],
                              z_threshold)

        # minimum distance from candidate_seq to any subsequence whose start index
        # is at least win_size away (a small distance means a similarly shaped neighbour)
        nnDistance = np.inf

        # why not len(series) - win_size + 1 ???
        innerRegistry = VisitRegistry(len(series) - win_size)

        inner_idx = innerRegistry.get_next_unvisited()

        # iterate over all start indices; among subsequences more than win_size away,
        # find the nearest-neighbour distance nnDistance to candidate_seq
        while ~np.isnan(inner_idx):
            innerRegistry.mark_visited(inner_idx)

            # only when inner_idx and outer_idx are more than win_size apart,
            # i.e. the two subsequences neither overlap nor are adjacent
            if abs(inner_idx - outer_idx) > win_size:

                curr_seq = znorm(series[inner_idx:(inner_idx + win_size)],
                                 z_threshold)

                # early-abandoned Euclidean distance between the two normalized sequences
                dist = early_abandoned_dist(candidate_seq, curr_seq,
                                            nnDistance)

                # update nnDistance so it shrinks over time
                if (~np.isnan(dist)) and (dist < nnDistance):
                    nnDistance = dist

            inner_idx = innerRegistry.get_next_unvisited()

        # update best_so_far_distance and best_so_far_index
        """
        best_so_far_distance
            max(min(distance)):
            the nearest-neighbour distance of the subsequence least similar to all others

        best_so_far_index
            start index of the discord subsequence of the series,
            the subsequence least similar to every other subsequence
        """
        if ~(np.inf == nnDistance) and (nnDistance > best_so_far_distance):
            best_so_far_distance = nnDistance
            best_so_far_index = outer_idx

        outer_idx = outerRegistry.get_next_unvisited()

    return (best_so_far_index, best_so_far_distance)