Example #1
 def maximize_level_node(self, max_level):
     """
     Try to maximaxe the level value
     :param p_value:
     :return:
     """
     values_group = list(self.group.values())
     original_level = self.level
     equal = True
     while equal and self.level < max_level:
         temp_level = self.level + 1
         data = np.array(values_group[0])
         data_znorm = znorm(data)
         data_paa = paa(data_znorm, self.paa_value)
         pr = ts_to_string(data_paa, cuts_for_asize(temp_level))
         for index in range(1, len(values_group)):
             data = np.array(values_group[index])
             data_znorm = znorm(data)
             data_paa = paa(data_znorm, self.paa_value)
             pr_2 = ts_to_string(data_paa, cuts_for_asize(temp_level))
             if pr_2 != pr:
                 equal = False
         if equal:
             self.level = temp_level
     if original_level != self.level:
         logger.info("New level for node: {}".format(self.level))
         data = np.array(values_group[0])
         data_znorm = znorm(data)
         data_paa = paa(data_znorm, self.paa_value)
         self.pattern_representation = ts_to_string(data_paa, cuts_for_asize(self.level))
     else:
         logger.info("Can't split again, max level already reached")
Example #2
    def update_features(self, indices):
        self.minimalp = 0
        self.minimaln = 0
        activation1, activation2 = self.get_activations()
        print("activation1")
        print(activation1)
        print("activation2")
        print(activation2)
        dat_znorm_p = znorm(activation1[indices])
        dat_znorm_n = znorm(activation2[indices])
        symRep_p = ts_to_string(dat_znorm_p, cuts_for_asize(self.symbols))
        symRep_n = ts_to_string(dat_znorm_n, cuts_for_asize(self.symbols))
        # convert the symbolic strings to tuple features
        feature_p = tuple(symRep_p)
        feature_n = tuple(symRep_n)
        print("found symbolic feature for SQ-P:", symRep_p)
        if feature_p in self.testObjective.feature_p:
            self.minimalp = 1
            self.testObjective.feature_p.remove(feature_p)
        self.coverage_p = (1 - len(self.testObjective.feature_p) /
                           self.testObjective.originalNumOfFeature)
        self.displayCoverage1()

        print("found symbolic feature for SQ-N:", symRep_n)
        if feature_n in self.testObjective.feature_n:
            self.minimaln = 1
            self.testObjective.feature_n.remove(feature_n)
        self.coverage_n = (1 - len(self.testObjective.feature_n) /
                           self.testObjective.originalNumOfFeature)
        self.displayCoverage2()

        return self.coverage_p, self.coverage_n
Example #3
def test_stringing():
    """Test string conversion."""
    # 11: np.array([-np.inf, -1.33517773611894, -0.908457868537385,
    #             -0.604585346583237, -0.348755695517045,
    #             -0.114185294321428, 0.114185294321428, 0.348755695517045,
    #             0.604585346583237, 0.908457868537385, 1.33517773611894]),
    ab = sax.ts_to_string(np.array([-1.33517773611895, -1.33517773611894]),
                          alphabet.cuts_for_asize(11))
    assert 'ab' == ab

    kj = sax.ts_to_string(np.array([1.33517773611895, 1.33517773611894]),
                          alphabet.cuts_for_asize(11))
    assert 'kj' == kj
Example #4
def sax_via_window(series,
                   win_size,
                   paa_size,
                   alphabet_size=3,
                   nr_strategy='exact',
                   z_threshold=0.01):
    """Simple via window conversion implementation."""
    cuts = cuts_for_asize(alphabet_size)
    sax = defaultdict(list)

    prev_word = ''

    # note: the window ending at the last sample is not emitted by this range
    for i in range(0, len(series) - win_size):

        sub_section = series[i:(i + win_size)]

        zn = znorm(sub_section, z_threshold)

        paa_rep = paa(zn, paa_size)

        curr_word = ts_to_string(paa_rep, cuts)

        if '' != prev_word:
            if 'exact' == nr_strategy and prev_word == curr_word:
                continue
            elif 'mindist' == nr_strategy and\
                    is_mindist_zero(prev_word, curr_word):
                continue

        prev_word = curr_word

        sax[curr_word].append(i)

    return sax
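
A minimal usage sketch for the function above; the saxpy import paths follow the layout seen in Example #3 and are an assumption here, and the series values are illustrative:

import numpy as np
from collections import defaultdict
from saxpy.znorm import znorm
from saxpy.paa import paa
from saxpy.sax import ts_to_string, is_mindist_zero
from saxpy.alphabet import cuts_for_asize

series = np.array([0., 1., 2., 3., 2., 1., 0., 1., 2., 3.])
# maps each SAX word to the window start indices where it occurred
word_index = sax_via_window(series, win_size=4, paa_size=3, alphabet_size=3)
for word, starts in word_index.items():
    print(word, starts)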
Example #5
def apply_adaptive_sax(ts, win_size, paa_size, alphabet_size, z_threshold):
    """
    This function applies the sax transformation to a 1-dim time series using adaptive break-points

    :param ts: 1-dim time series
    :type ts: 1D array
    :param win_size: size of the sliding window that generates each sax word
    :type win_size: int
    :param paa_size: number of characters in a single sax word
    :type paa_size: int
    :param alphabet_size: number of unique characters to use in the sax representation
    :type alphabet_size: int
    :param z_threshold: z_threshold for the znorm method from saxpy
    :type z_threshold: float
    :return: the sax sequence, a list of strings, where each string represents a single sax word
    :rtype: list of str
    """
    sax_sequence = []
    cuts = cuts_for_asize(alphabet_size)
    for t in range(0, len(ts) - win_size + 1):
        ts_win = ts[t:(t + win_size)]
        ts_win_znormed = znorm(ts_win, z_threshold)
        paa_rep = paa(ts_win_znormed, paa_size)
        sax_word = ts_to_string(paa_rep, cuts)
        sax_sequence.append(sax_word)
    return sax_sequence
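
A hedged usage sketch for apply_adaptive_sax, assuming numpy and the saxpy helpers used above are in scope; the window and alphabet values are illustrative:

import numpy as np

ts = np.sin(np.linspace(0, 6.28, 100))
# one SAX word per sliding-window position
sax_words = apply_adaptive_sax(ts, win_size=20, paa_size=5, alphabet_size=4, z_threshold=0.01)
print(sax_words[:3])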
Example #6
def SAXtransform(df, nbCuts, segmentSize):
    ndf = pd.DataFrame()
    df = AvgDiscretisation(df, segmentSize)
    cuts = cuts_for_asize(nbCuts)
    for c in df.columns:
        ndf[c] = list(ts_to_string(df[c].values, cuts))
    return ndf
Example #7
def discretizar(v):
    alphabet_values = {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5}
    abc = ts_to_string(v, cuts_for_asize(5))  # abc: the SAX string
    r = []
    for letter in abc:
        r.append(alphabet_values[letter])
    return r
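
Note that discretizar does not z-normalize its input, so v is assumed to be already on the normalized scale. A usage sketch with illustrative values (cuts_for_asize(5) places its finite cuts near -0.84, -0.25, 0.25, 0.84):

import numpy as np

print(discretizar(np.array([-2.0, 0.0, 2.0])))  # expected [1, 3, 5]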
Example #8
def SAX(sequence: np.ndarray, alphabet_size: int, length: int = 0) -> str:
    """
    Computes SAX string of a sequence of numbers with specified alphabet size.
    Length of the output string may be specified; length 0 will generate a string as long as the sequence.
    """
    debug("Calculating SAX of {}, with alphabet of size {}".format(
        sequence, alphabet_size))
    if alphabet_size == 1:
        if length == 0:
            return "a" * len(sequence)
        else:
            return "a" * length
    else:
        if length == 0 or length == len(sequence):
            return ts_to_string(znorm(sequence), cuts_for_asize(alphabet_size))
        else:
            return ts_to_string(paa(znorm(sequence), length),
                                cuts_for_asize(alphabet_size))
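
A usage sketch for the SAX helper above, assuming znorm, paa, ts_to_string, cuts_for_asize and a debug logger are in scope:

import numpy as np

seq = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
print(SAX(seq, alphabet_size=3))            # full-length word (length == len(seq))
print(SAX(seq, alphabet_size=3, length=2))  # PAA-compressed word of length 2
print(SAX(seq, alphabet_size=1, length=4))  # degenerate alphabet: 'aaaa'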
Example #9
    def update_features(self, data):
        self.hidden = self.state_manager.get_hidden_state(data)
        activation_p, activation_n = self.get_activation()
        dat_znorm_p = znorm(activation_p[self.indices])
        dat_znorm_n = znorm(activation_n[self.indices])
        sym_rep_p = ts_to_string(dat_znorm_p, cuts_for_asize(self.symbols))
        sym_rep_n = ts_to_string(dat_znorm_n, cuts_for_asize(self.symbols))
        feature_p = tuple(sym_rep_p)
        feature_n = tuple(sym_rep_n)

        if feature_p in self.feature_p:
            index = self.feature_p.index(feature_p)
            self.covered_dict_p[index] = True
            self.frequency_dict_p[index] += 1

        if feature_n in self.feature_n:
            index = self.feature_n.index(feature_n)
            self.covered_dict_n[index] = True
            self.frequency_dict_n[index] += 1
Example #10
    def update_features(self, data):
        self.hidden = self.state_manager.get_hidden_state(data)
        activation = self.get_activation()
        dat_znorm = znorm(activation[self.indices])
        sym_rep = ts_to_string(dat_znorm, cuts_for_asize(self.symbols))
        feature = tuple(sym_rep)

        if feature in self.feature:
            index = self.feature.index(feature)
            self.covered_dict[index] = True
Example #11
def saxrepresentation(matrix):
    result = []
    for ts in matrix.T:
        ts_znorm = znorm(ts)
        dat_paa_3 = paa(ts_znorm, 3)
        word = ts_to_string(dat_paa_3, cuts_for_asize(3))
        result.append(word)

    return result
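
A usage sketch: saxrepresentation emits one 3-letter SAX word per column of the input matrix (values are illustrative):

import numpy as np

m = np.array([[1.0, 6.0], [2.0, 5.0], [3.0, 4.0],
              [4.0, 3.0], [5.0, 2.0], [6.0, 1.0]])
print(saxrepresentation(m))  # e.g. ['abc', 'cba']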
Example #12
def row_pattern_loss(row: np.ndarray, pr: Tuple[str, int]):
    # pr is a (pattern, level) pair; rebuild the breakpoint value for each letter
    pattern = []
    cuts = cuts_for_asize(pr[1] + 1)[1:]
    for c in pr[0]:
        n = ord(c) - 97  # letter index: 'a' -> 0, 'b' -> 1, ...
        pattern.append(cuts[n])
    # PAA-reduce the row when its length differs from the pattern length
    if len(pattern) != len(row):
        normalized_row = paa(znorm(row), len(pattern))
    else:
        normalized_row = znorm(row)
    return distance(normalized_row, pattern)
Example #13
def sax_transform(time_series_df, num_cuts):
    """
    Applies SAX transformation for a number of cuts, e.g. 3, 5, ...
    to have symbolic representation.
    """
    time_series_df = _preprocess_ts(time_series_df)
    cuts = cuts_for_asize(num_cuts)
    return (time_series_df.groupby(OCCURRENCE_INDEX).agg({
        'value_representation':
        lambda x: ts_to_string(x.to_numpy(), cuts)
    }).reset_index())
Example #14
    def sax(self, cardinality):
        '''
        Creates SAX representation of the time series
        :param cardinality: number of symbols to use in SAX representation
        '''
        self.cardinality = cardinality
        self.cuts = cuts_for_asize(self.cardinality)
        self.string = ts_to_string(self.norm_values, self.cuts)

        # denormalize cuts for correct visualization
        self.cuts_den = self.cuts * self.std + self.mean
        self.data['symbol'] = list(self.string)
        self.sax_freq = self.generate_freq()
Example #15
    def update_features(self, data):
        self.hidden = self.state_manager.get_hidden_state([data])
        activation = self.get_activation()
        dat_znorm = (activation[:, self.indices] - self.mean) / self.std
        dat_znorm = [paa(item, self.seq_len) for item in dat_znorm]

        features = [
            tuple(ts_to_string(item, cuts_for_asize(self.symbols)))
            for item in dat_znorm
        ]

        for feature in features:
            if feature in self.feature:
                index = self.feature.index(feature)
                self.covered_dict[index] = True
Example #16
 def fitness(self, hidden, sym):
     activation = self.get_activations(hidden)
     dat_znorm = Z_ScoreNormalization(
         activation[:, self.testObjective.indices], self.testObjective.mean,
         self.testObjective.std)
     dat_znorm = [
         paa(item, self.testObjective.seq_len) for item in dat_znorm
     ]
     cuts = cuts_for_asize(self.testObjective.symbols)
     cuts = np.append(cuts, np.array([np.inf]))
     sym_size = len(sym)
     out = np.array([
         self.cal_fittness_seq(cuts, sym_size, sym, series)
         for series in dat_znorm
     ])
     return out
Example #17
def extract_features(
        song_name):  # returns mfcc and chroma features in SAX representation
    try:
        x, fs = librosa.load(song_name)
    except Exception:
        return None
    mfccs = librosa.feature.mfcc(x, sr=fs, n_mfcc=39)
    chroma = librosa.feature.chroma_stft(x, sr=fs)
    feature_matrix = np.concatenate((mfccs, chroma))

    sax_rep = [
        ts_to_string(paa(znorm(feat), SAX_VOCAB_LENGTH),
                     cuts_for_asize(SAX_VOCAB_LENGTH))
        for feat in feature_matrix
    ]
    return sax_rep
Example #18
def sax_via_window(series,
                   win_size,
                   paa_size,
                   alphabet_size=3,
                   nr_strategy='exact',
                   z_threshold=0.01):
    """Simple via window conversion implementation."""

    # generate the alphabet cuts for the given size
    cuts = cuts_for_asize(alphabet_size)
    # initialize the sax dictionary
    sax = defaultdict(list)

    prev_word = ''

    for i in range(0, len(series) - win_size):

        # the sub-section of series covered by the current window
        sub_section = series[i:(i + win_size)]

        # z-normalize
        zn = znorm(sub_section, z_threshold)

        # PAA: reduce the sub-section to paa_size dimensions
        paa_rep = paa(zn, paa_size)

        # convert the PAA sequence to a string
        curr_word = ts_to_string(paa_rep, cuts)

        if '' != prev_word:
            if 'exact' == nr_strategy and prev_word == curr_word:
                continue
            elif 'mindist' == nr_strategy and\
                    is_mindist_zero(prev_word, curr_word):
                continue

        prev_word = curr_word

        sax[curr_word].append(i)

    return sax
Example #19
    def update_features(self, hidden, test_num):
        activation = self.get_activations(hidden)
        dat_znorm = Z_ScoreNormalization(
            activation[:, self.testObjective.indices], self.testObjective.mean,
            self.testObjective.std)
        dat_znorm = [
            paa(item, self.testObjective.seq_len) for item in dat_znorm
        ]

        features = [
            tuple(
                ts_to_string(item, cuts_for_asize(self.testObjective.symbols)))
            for item in dat_znorm
        ]
        self.cov_count += 1
        for feature in features:
            if feature in self.testObjective.feature:
                self.cov_count = 0
                self.testObjective.feature.remove(feature)
                self.testObjective.covered_feature.append(feature)
                del self.testObjective.test_record[feature]

        self.coverage = 1 - len(self.testObjective.feature
                                ) / self.testObjective.originalNumOfFeature

        cov_fitness = np.array([
            self.fitness(hidden, listElem)
            for listElem in self.testObjective.feature
        ])
        cov_index = np.min(cov_fitness, axis=1)
        cov_fitness = np.argmin(cov_fitness, axis=1)

        for idx, feature in enumerate(self.testObjective.feature):
            test_record = self.testObjective.test_record[feature]
            if test_record is None or test_record[1] > cov_fitness[idx]:
                self.testObjective.test_record[feature] = list(
                    [test_num + cov_index[idx], cov_fitness[idx]])

        self.displayCoverage()
Example #20
def znorm_paa_sax(time_series, alpha, w=3, missing='z'):
    """Takes an array containing real values, z-normalizes, reduces
    dimensionality to w, and finally returns a sax representation of length alpha
    
    time series:    array holding a time series of one measurement for one patient
    w:              the dimensionality to reduce to using PAA, set to len(time_series) in plain
    alpha:          alpha is the number of discretized segments that the SAX rep will reflect, set to 2, 3 or 5 in plain using RDS algo
    """

    # If time_series is a string, make it into list format e.g. 'abc' -> ['a', 'b', 'c']
    # why? because it's the structure we require for below and i CBA to change it
    if (isinstance(time_series, str)):
        time_series = list(time_series)

    if (len(time_series) > 0):
        # normalizing one time series, time series as numpy array (np.array([]))
        normalized_time_series = znorm(np.array(time_series))
        # dimensionality reduction of time series according to w
        paa_norm_time_series = paa(normalized_time_series, w)
        # turning a discretized and reduced time series into a sequence of characters
        return ts_to_string(paa_norm_time_series, cuts_for_asize(alpha))
    else:
        return missing
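
A usage sketch, assuming the saxpy helpers are in scope; for a monotonically rising series the three PAA segments fall in the low, middle and high bands:

print(znorm_paa_sax([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], alpha=3))  # e.g. 'abc'
print(znorm_paa_sax([], alpha=3))  # empty input returns the 'z' placeholder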
Example #21
    def recycle_bad_leaves(p_value, good_leaf_nodes, bad_leaf_nodes,
                           suppressed_nodes, paa_value):
        """
        Recycle bad-leaves phase
        :param bad_leaf_nodes: [description]
        """

        bad_leaf_nodes_dict = dict()
        for node in bad_leaf_nodes:
            if node.level in bad_leaf_nodes_dict.keys():
                bad_leaf_nodes_dict[node.level].append(node)
            else:
                bad_leaf_nodes_dict[node.level] = [node]

        bad_leaf_nodes_size = sum([node.size for node in bad_leaf_nodes])

        if bad_leaf_nodes_size >= p_value:

            # max bad level
            current_level = max(bad_leaf_nodes_dict.keys())

            while bad_leaf_nodes_size >= p_value:

                if current_level in bad_leaf_nodes_dict.keys():
                    merge_dict = dict()
                    keys_to_be_removed = list()
                    merge = False
                    for current_level_node in bad_leaf_nodes_dict[
                            current_level]:
                        pr_node = current_level_node.pattern_representation
                        if pr_node in merge_dict.keys():
                            merge = True
                            merge_dict[pr_node].append(current_level_node)
                            if pr_node in keys_to_be_removed:
                                keys_to_be_removed.remove(pr_node)
                        else:
                            merge_dict[pr_node] = [current_level_node]
                            keys_to_be_removed.append(pr_node)

                    if merge:
                        for k in keys_to_be_removed:
                            del merge_dict[k]

                        for pr, node_list in merge_dict.items():
                            group = dict()
                            for node in node_list:
                                bad_leaf_nodes_dict[current_level].remove(node)
                                group.update(node.group)
                            if current_level > 1:
                                level = current_level
                            else:
                                level = 1
                            leaf_merge = Node(level=level,
                                              pattern_representation=pr,
                                              group=group,
                                              paa_value=paa_value)

                            if leaf_merge.size >= p_value:
                                leaf_merge.label = "good-leaf"
                                good_leaf_nodes.append(leaf_merge)
                                bad_leaf_nodes_size -= leaf_merge.size
                            else:
                                leaf_merge.label = "bad-leaf"
                                bad_leaf_nodes_dict[current_level].append(
                                    leaf_merge)

                temp_level = current_level - 1
                for node in bad_leaf_nodes_dict[current_level]:
                    if temp_level > 1:
                        values_group = list(node.group.values())
                        data = np.array(values_group[0])
                        data_znorm = znorm(data)
                        data_paa = paa(data_znorm, paa_value)
                        pr = ts_to_string(data_paa, cuts_for_asize(temp_level))
                    else:
                        pr = "a" * paa_value
                    node.level = temp_level
                    node.pattern_representation = pr

                if current_level > 0:
                    if temp_level not in bad_leaf_nodes_dict.keys():
                        bad_leaf_nodes_dict[
                            temp_level] = bad_leaf_nodes_dict.pop(
                                current_level)
                    else:
                        bad_leaf_nodes_dict[temp_level] = bad_leaf_nodes_dict[
                            temp_level] + bad_leaf_nodes_dict.pop(
                                current_level)
                    current_level -= 1
                else:
                    break

        #print("sopprimo le serie rimanenti")
        remaining_bad_leaf_nodes = list(bad_leaf_nodes_dict.values())[0]
        for node in remaining_bad_leaf_nodes:
            suppressed_nodes.append(node)
Example #22
# assumes w (PAA size), a (alphabet size) and sax_words = [] are defined earlier in the script
with open(sys.argv[1], 'r') as h:
    lines = h.readlines()
    DATA = []
    time_series = []
    for line in lines:
        line = line.strip()
        if line != 'null' and line != '\n':
            time_series.append(float(line))
        else:
            DATA.append(time_series)
            time_series = []
    for data in DATA:
        data = np.asfarray(data, float)
        data = np.diff(data)
        data_znorm = znorm(data)
        data_paa = paa(data_znorm, w)
        sax_words.append(ts_to_string(data_paa, cuts_for_asize(a)))

# sax_words_ri = []
# i = 0
# for word in sax_words:
# 	perms = set([''.join(p) for p in permutations(word)])
# 	sax_words_ri.append(perms)
# 	i+=1

# Write all SAX words to file only once instead of generating again and again
with open('sax_words_ri_norot_w=' + str(w) + '_a=' + str(a),
          'w+') as sax_words_file:
    for l in sax_words:
        sax_words_file.write(l + '\n')
    # sax_words_file.write('\n')
Example #23
def main(argv):

    #load configuration
    parameters = load_configuration()

    #load parameters

    #dataset
    path_to_dataset = parameters['path_to_dataset']
    load_size = parameters['load_size']

    #SAX
    alphabet_size = parameters['alphabet_size']
    paa_size = parameters['paa_size']
    window_size = parameters['window_size']
    step = parameters['step']
    substring_size = parameters['substring_size']

    #smoothing
    threshold_freq = parameters['threshold_freq']

    #projections
    prj_size = parameters['prj_size']
    prj_iterations = parameters['prj_iterations']
    anomaly_threshold = parameters['anomaly_threshold']

    #loading data
    loader = DataLoader.DataLoader(path_to_dataset)
    data = DataTypes.Data()

    #loader.load_all(data,200)
    loader.load_subset(data, load_size, 100)

    # period from which to extract anomalies
    begin_date = datetime.datetime.fromtimestamp(data.index_to_time[0])
    end_date = datetime.datetime.fromtimestamp(data.index_to_time[load_size -
                                                                  1])

    if parameters['power_type'] == -1:
        tank = parameters['tank']
        sensor_type = parameters['sensor_type']
        #print(data.measures[0])
        print("Loading of %i tank %i  data from %s to %s " %
              (sensor_type, tank, begin_date, end_date))
        s_values = [
            data.measures[i][0][tank][sensor_type]
            for i in range(0, len(data.measures))
        ]
    else:
        power_type = parameters['power_type']
        print("Loading measures of power %i from %s to %s " %
              (power_type, begin_date, end_date))
        s_values = [
            data.measures[i][1][power_type]
            for i in range(0, len(data.measures))
        ]

    len_serie = len(s_values)
    hash_table_substrings = {}

    #getting first n alphabet letters
    alphabet = get_alphabet_letters(alphabet_size)
    #creating hash table indexed by all of substrings of length k
    hash_table_substrings = get_hash_table(alphabet, prj_size)

    # list containing the score for each window
    anomalies_score = []

    for index in range(0, len_serie, step):
        begin = index
        end = begin + window_size

        if end < len_serie:
            window_values = s_values[begin:end]
            window_znorm = znorm(window_values)
            window_paa = paa(window_znorm, paa_size)
            window_string = ts_to_string(window_paa,
                                         cuts_for_asize(alphabet_size))

            #each character of the string corresponds to k values of the series
            k = window_size // paa_size

            #get smoothed string
            window_smoothed = smoothing(window_string, threshold_freq)

            #fill hash table by applying random projection
            hash_table_substrings = put_in_bucket(hash_table_substrings,
                                                  window_smoothed, begin,
                                                  prj_iterations, prj_size,
                                                  substring_size, k)

            total = 0
            for key, values in hash_table_substrings.items():
                total = total + len(values)

            buckets_with_anomalies, bucket_freq = analyzed_bucket(
                hash_table_substrings, total, anomaly_threshold)
            #number of bucket with anomalies
            n_buckets_anomalies = len(buckets_with_anomalies.keys())

            #getting score for current window
            avg_window_score = getting_score(hash_table_substrings,
                                             buckets_with_anomalies,
                                             n_buckets_anomalies)
            anomalies_score.append(avg_window_score)

            #reset table
            hash_table_substrings = get_hash_table(alphabet, prj_size)

        else:
            break

    print(anomalies_score)
Example #24


hist = yf.download(tickers="DJI", period='max')

words = []
# note: hist_sma (a smoothed series derived from hist) is assumed to be defined earlier
dow_df = ent.util_pattern_space(hist_sma, lag=1, dim=50)
dow_df = dow_df[:]
for i in range(len(dow_df)):
    dat_znorm = znorm(dow_df[i, :])
    dat_paa = paa(dat_znorm, 3)
    word = ts_to_string(dat_paa, cuts_for_asize(2))
    words.append(word)
print(words)


print(collections.Counter(words))

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
sqn = le.fit_transform(words)

nb_classes = len(np.unique(sqn))

from keras.utils import to_categorical 
onehot = to_categorical(sqn)
Example #25
def discretise(data, number_of_bins):
    return ts_to_string(znorm(data), cuts_for_asize(number_of_bins))
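
A usage sketch (assuming numpy and the saxpy helpers are in scope): a strictly increasing triple z-normalizes to one value per alphabet region, so three bins should yield 'abc':

import numpy as np

print(discretise(np.array([1.0, 2.0, 3.0]), 3))  # expected 'abc'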
Example #26
def test_sizing():
    """Test alphabet sizes."""
    for s in range(2, 20):
        assert len(alphabet.cuts_for_asize(s)) == s
Example #27
def sax_via_window(series, win_size, paa_size, alphabet_size=3,
                   nr_strategy='exact', znorm_threshold=0.01, sax_type='unidim'):
    """Simple via window conversion implementation.

    # SAX-ENERGY
    >>> sax_via_window([[1, 2, 3], [4, 5, 6]], win_size=1, paa_size=3, sax_type='energy', nr_strategy=None)['abc']
    [0, 1]

    >>> sax_via_window([[1, 2, 3, 4], [4, 5, 6, 7]], win_size=1, paa_size=4, sax_type='energy', nr_strategy=None)['aacc']
    [0, 1]

    >>> sax_via_window([[1, 2, 3, 4], [4, 5, 6, 7]], win_size=2, paa_size=4, sax_type='energy', nr_strategy=None)['aaccaacc']
    [0]

    # SAX-REPEAT
    >>> sax_via_window([[1, 2, 3], [4, 5, 6], [7, 8, 9]], win_size=2, paa_size=2, sax_type='repeat', nr_strategy=None)['ab']
    [0, 1]

    >>> sax_via_window([[1, 2, 3], [4, 5, 6], [7, 8, 9]], win_size=1, paa_size=1, sax_type='repeat', nr_strategy=None)['a']
    [0, 1, 2]

    # SAX-INDEPENDENT
    >>> sax_via_window([[1, 2, 3, 4], [4, 5, 6, 7]], win_size=2, paa_size=2, sax_type='independent', nr_strategy=None)['acacacac']
    [0]

    >>> sax_via_window([[1, 2], [4, 5], [7, 8]], win_size=2, paa_size=2, sax_type='independent', nr_strategy=None)['acac']
    [0, 1]

    >>> sax_via_window([[1, 2], [4, 8], [7, 5]], win_size=2, paa_size=2, sax_type='independent', nr_strategy=None)['acac']
    [0]

    >>> sax_via_window([[1, 2], [4, 8], [7, 5]], win_size=2, paa_size=2, sax_type='independent', nr_strategy=None)['acca']
    [1]

    """

    # Convert to numpy array.
    series = np.array(series)

    # Check on dimensions.
    if len(series.shape) > 2:
        raise ValueError('Please reshape time-series to stack dimensions along the 2nd dimension, so that the array shape is a 2-tuple.')

    # PAA size is the length of the PAA sequence.
    if sax_type != 'energy' and paa_size > win_size:
        raise ValueError('PAA size cannot be greater than the window size.')

    if sax_type == 'energy' and len(series.shape) == 1:
        raise ValueError('Must pass a multidimensional time-series to SAX-ENERGY.')

    # Breakpoints.
    cuts = cuts_for_asize(alphabet_size)

    # Dictionary mapping SAX words to indices.
    sax = defaultdict(list)

    if sax_type == 'repeat':
        # Maps indices to multi-dimensional SAX words.
        multidim_sax_dict = []

        # List of all the multi-dimensional SAX words.
        multidim_sax_list = []

        # Sliding window across time dimension.
        for i in range(series.shape[0] - win_size + 1):

            # Subsection starting at this index.
            sub_section = series[i: i + win_size]

            # Z-normalized subsection.
            if win_size == 1:
                zn = sub_section
            else:
                zn = znorm(sub_section, znorm_threshold)

            # PAA representation of subsection.
            paa_rep = paa(zn, paa_size, 'repeat')

            # SAX representation of subsection, but in terms of multi-dimensional vectors.
            multidim_sax = get_sax_list(paa_rep, cuts)

            # Update data-structures.
            multidim_sax_dict.append(multidim_sax)
            multidim_sax_list.extend(multidim_sax)

        # Cluster with k-means++.
        kmeans = KMeans(n_clusters=alphabet_size, random_state=0).fit(multidim_sax_list)

        # Cluster indices in sorted order.
        order = np.lexsort(np.rot90(kmeans.cluster_centers_))

        # Sliding window across time dimension.
        prev_word = ''
        for i in range(series.shape[0] - win_size + 1):

            # Map cluster indices to new SAX letters.
            curr_word_list = map(lambda cluster_index: idx2letter(order[cluster_index]), kmeans.predict(multidim_sax_dict[i]))
            curr_word = ''.join(curr_word_list)

            if '' != prev_word:
                if 'exact' == nr_strategy and prev_word == curr_word:
                    continue
                elif 'mindist' == nr_strategy and is_mindist_zero(prev_word, curr_word):
                    continue

            prev_word = curr_word

            sax[curr_word].append(i)

    else:
        # Sliding window across time dimension.
        prev_word = ''
        for i in range(series.shape[0] - win_size + 1):

            # Subsection starting at this index.
            sub_section = series[i: i + win_size]

            if sax_type == 'energy':
                curr_word = ''
                for energy_dist in sub_section:
                    # Normalize energy distribution.
                    energy_zn = znorm(energy_dist, znorm_threshold)

                    # PAA representation of energy distribution.
                    paa_rep = paa(energy_zn, paa_size, 'unidim')
                    # paa_rep = energy_zn

                    # SAX representation of the energy distribution.
                    energy_word = ts_to_string(paa_rep, cuts)

                    # Add to current word.
                    curr_word += energy_word

            elif sax_type == 'independent':
                curr_word = ''
                for dim in range(sub_section.shape[1]):
                    # Obtain the subsequence restricted to one dimension.
                    one_dimension_sub_section = sub_section[:, dim]

                    # Z-normalized subsection.
                    zn = znorm(one_dimension_sub_section, znorm_threshold)

                    # PAA representation of subsection.
                    paa_rep = paa(zn, paa_size, 'unidim')

                    # Get the SAX word - just a unidimensional SAX.
                    one_dim_word = ts_to_string(paa_rep, cuts)

                    # Add this dimension's representation to the overall SAX word.
                    curr_word += one_dim_word

            else:
                # Z-normalized subsection.
                zn = znorm(sub_section, znorm_threshold)

                # PAA representation of subsection.
                paa_rep = paa(zn, paa_size, sax_type)

                # SAX representation of subsection.
                curr_word = ts_to_string(paa_rep, cuts)

            if '' != prev_word:
                if 'exact' == nr_strategy and prev_word == curr_word:
                    continue
                elif 'mindist' == nr_strategy and is_mindist_zero(prev_word, curr_word):
                    continue

            prev_word = curr_word

            sax[curr_word].append(i)

    return sax
Example #28
def sax_by_chunking(series, paa_size, alphabet_size=3, z_threshold=0.01):
    """Simple chunking conversion implementation."""
    paa_rep = paa(znorm(series, z_threshold), paa_size)
    cuts = cuts_for_asize(alphabet_size)
    return ts_to_string(paa_rep, cuts)
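
A usage sketch for chunking, assuming the same saxpy helpers as above; the whole series is normalized once, PAA-reduced, then symbolized:

import numpy as np

print(sax_by_chunking(np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0]), paa_size=3))  # e.g. 'abc'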
Example #29
    def start_splitting(self, p_value: int, max_level: int, good_leaf_nodes: list, bad_leaf_nodes: list):
        """
        Splitting Node Naive algorithm (k, P) Anonymity
        :param p_value:
        :param max_level:
        :param good_leaf_nodes:
        :param bad_leaf_nodes:
        :return:
        """
        # logger.info("good_leaf_nodes: {}, bad_leaf_nodes: {}".format(len(good_leaf_nodes), len(bad_leaf_nodes)))
        if self.size < p_value:
            logger.info("size:{}, p_value:{} == bad-leaf".format(self.size, p_value))
            self.label = "bad-leaf"
            bad_leaf_nodes.append(self)
            return

        if self.level == max_level:
            logger.info("size:{}, p_value:{} == good-leaf".format(self.size, p_value))
            self.label = "good-leaf"
            good_leaf_nodes.append(self)
            return

        if p_value <= self.size < 2*p_value:
            logger.info("Maximize-level, size:{}, p_value:{} == good-leaf".format(self.size, p_value))
            self.maximize_level_node(max_level)
            self.label = "good-leaf"
            good_leaf_nodes.append(self)
            return
        """
        Otherwise, we need to check if node N has to be split. The checking relies on a tentative split performed on N. 
        Suppose that, by increasing the level of N, N is tentatively split into a number of child nodes. 
        If all these child nodes contain fewer than P time series, no real split is performed and the original node N is
        labeled as good-leaf and the recursion terminates on N. Otherwise, there must exist tentative child node(s) 
        whose size >= P, also called TG-node(s) (Tentative Good Nodes). 
        The rest children whose size < P are called TB-nodes (Tentative Bad Nodes), if any. 
        If the total number of records in all TB-nodes under N is no less than P, we merge them into a single tentative
        node, denoted by childmerge, at the level of N.level. If the above tentative process produces nc tentative 
        child nodes (including TB and TG) and nc >= 2, N will really be split into nc children and then the node 
        splitting procedure will be recursively invoked on each of them 
        """
        tentative_child_node = dict()
        temp_level = self.level + 1
        for key, value in self.group.items():
            # to reduce dimensionality
            data = np.array(value)
            data_znorm = znorm(data)
            data_paa = paa(data_znorm, self.paa_value)
            pr = ts_to_string(data_paa, cuts_for_asize(temp_level))
            if pr in tentative_child_node.keys():
                tentative_child_node[pr].append(key)
            else:
                tentative_child_node[pr] = [key]
        length_all_tentative_child = [len(x) for x in list(tentative_child_node.values())]
        good_leaf = np.all(np.array(length_all_tentative_child) < p_value)

        if good_leaf:
            logger.info("Good-leaf, all_tentative_child are < {}".format(p_value))
            self.label = "good-leaf"
            good_leaf_nodes.append(self)
            return
        else:
            logger.info("N can be split")
            logger.info("Compute tentative good nodes and tentative bad nodes")
            # tentative good nodes
            # index of nodes in tentative_child_node with more p_value
            pr_keys = list(tentative_child_node.keys())
            # get index tentative good node
            pattern_representation_tg = list()
            tg_nodes_index = list(np.where(np.array(length_all_tentative_child) >= p_value)[0])
            # logger.info(pr_keys)
            tg_nodes = list()
            for index in tg_nodes_index:
                keys_elements = tentative_child_node[pr_keys[index]]
                dict_temp = dict()
                for key in keys_elements:
                    dict_temp[key] = self.group[key]
                tg_nodes.append(dict_temp)
                pattern_representation_tg.append(pr_keys[index])

            # tentative bad nodes
            tb_nodes_index = list(np.where(np.array(length_all_tentative_child) < p_value)[0])
            tb_nodes = list()
            pattern_representation_tb = list()

            for index in tb_nodes_index:
                keys_elements = tentative_child_node[pr_keys[index]]
                dict_temp = dict()
                for key in keys_elements:
                    dict_temp[key] = self.group[key]
                tb_nodes.append(dict_temp)
                pattern_representation_tb.append(pr_keys[index])

            total_size_tb_nodes = 0
            for tb_node in tb_nodes:
                total_size_tb_nodes += len(tb_node)

            if total_size_tb_nodes >= p_value:
                logger.info("Merge all bad nodes in a single node, and label it as good-leaf")
                child_merge_node_group = dict()
                for tb_node in tb_nodes:
                    for key, value in tb_node.items():
                        child_merge_node_group[key] = value
                node_merge = Node(level=self.level, pattern_representation=self.pattern_representation,
                                  label="good-leaf", group=child_merge_node_group, parent=self)
                self.child_node.append(node_merge)
                good_leaf_nodes.append(node_merge)

                nc = len(tg_nodes) + len(tb_nodes)  # I'm a bit puzzled by this tb_nodes
                logger.info("Split only tg_nodes {0}".format(len(tg_nodes)))
                if nc >= 2:
                    for index in range(0, len(tg_nodes)):
                        node = Node(level=self.level, pattern_representation=pattern_representation_tg[index],
                                    label="intermediate", group=tg_nodes[index], parent=self)
                        self.child_node.append(node)
                        node.start_splitting(p_value, max_level, good_leaf_nodes, bad_leaf_nodes)
                else:
                    for index in range(0, len(tg_nodes)):
                        node = Node(level=self.level, pattern_representation=pattern_representation_tg[index],
                                    label="good-leaf", group=tg_nodes[index], parent=self)
                        self.child_node.append(node)
                        good_leaf_nodes.append(node)

            else:
                nc = len(tg_nodes) + len(tb_nodes)  # I'm a bit puzzled by this tb_nodes
                logger.info("Label all tb_node {0} as bad-leaf and split only tg_nodes {1}".format(len(tb_nodes),len(tg_nodes)))
                for index in range(0, len(tb_nodes)):
                    node = Node(level=self.level, pattern_representation=pattern_representation_tb[index], label="bad-leaf",
                                group=tb_nodes[index], parent=self)
                    self.child_node.append(node)
                    bad_leaf_nodes.append(node)
                if nc >= 2:
                    for index in range(0, len(tg_nodes)):
                        node = Node(level=self.level, pattern_representation=pattern_representation_tg[index],
                                    label="intermediate", group=tg_nodes[index], parent=self)
                        self.child_node.append(node)
                        node.start_splitting(p_value, max_level, good_leaf_nodes, bad_leaf_nodes)
                else:
                    for index in range(0, len(tg_nodes)):
                        node = Node(level=self.level, pattern_representation=pattern_representation_tg[index],
                                    label="good-leaf", group=tg_nodes[index], parent=self)
                        self.child_node.append(node)
                        good_leaf_nodes.append(node)