Example #1
 def maximize_level_node(self, max_level):
     """
     Try to maximize the level value
     :param max_level:
     :return:
     """
     values_group = list(self.group.values())
     original_level = self.level
     equal = True
     while equal and self.level < max_level:
         temp_level = self.level + 1
         data = np.array(values_group[0])
         data_znorm = znorm(data)
         data_paa = paa(data_znorm, self.paa_value)
         pr = ts_to_string(data_paa, cuts_for_asize(temp_level))
         for index in range(1, len(values_group)):
             data = np.array(values_group[index])
             data_znorm = znorm(data)
             data_paa = paa(data_znorm, self.paa_value)
             pr_2 = ts_to_string(data_paa, cuts_for_asize(temp_level))
             if pr_2 != pr:
                 equal = False
         if equal:
             self.level = temp_level
     if original_level != self.level:
         logger.info("New level for node: {}".format(self.level))
         data = np.array(values_group[0])
         data_znorm = znorm(data)
         data_paa = paa(data_znorm, self.paa_value)
         self.pattern_representation = ts_to_string(data_paa, cuts_for_asize(self.level))
     else:
         logger.info("Can't split again, max level already reached")
Example #2
def sax_via_window(series,
                   win_size,
                   paa_size,
                   alphabet_size=3,
                   nr_strategy='exact',
                   z_threshold=0.01):
    """Simple via window conversion implementation."""
    cuts = cuts_for_asize(alphabet_size)
    sax = defaultdict(list)

    prev_word = ''

    for i in range(0, len(series) - win_size):

        sub_section = series[i:(i + win_size)]

        zn = znorm(sub_section, z_threshold)

        paa_rep = paa(zn, paa_size)

        curr_word = ts_to_string(paa_rep, cuts)

        if '' != prev_word:
            if 'exact' == nr_strategy and prev_word == curr_word:
                continue
            elif 'mindist' == nr_strategy and\
                    is_mindist_zero(prev_word, curr_word):
                continue

        prev_word = curr_word

        sax[curr_word].append(i)

    return sax
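A minimal usage sketch for the function above; the toy series and parameters are assumptions, and the saxpy helpers it relies on (znorm, paa, ts_to_string, cuts_for_asize, is_mindist_zero) plus collections.defaultdict must already be in scope.

import numpy as np

# each SAX word maps to the list of window start indices that produced it
series = np.array([0.0, 1.0, 2.0, 3.0, 2.0, 1.0, 0.0, -1.0, -2.0, -1.0])
word_index = sax_via_window(series, win_size=6, paa_size=3, alphabet_size=3)
for word, positions in word_index.items():
    print(word, positions)
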
Example #3
def apply_adaptive_sax(ts, win_size, paa_size, alphabet_size, z_threshold):
    """
    This function applies the sax transformation to a 1-dim time series using adaptive break-points

    :param ts: 1-dim time series
    :type ts: 1D array
    :param win_size: size of the sliding window that generates each sax word
    :type win_size: int
    :param paa_size: number of characters in a single sax word
    :type paa_size: int
    :param alphabet_size: number of unique characters to use in the sax representation
    :type alphabet_size: int
    :param z_threshold: z_threshold for the znorm method from saxpy
    :type z_threshold: float
    :return: the sax sequence, a list of strings, where each string represents a single sax word
    :rtype: list of str
    """
    sax_sequence = []
    cuts = cuts_for_asize(alphabet_size)
    for t in range(0, len(ts) - win_size + 1):
        ts_win = ts[t:(t + win_size)]
        ts_win_znormed = znorm(ts_win, z_threshold)
        paa_rep = paa(ts_win_znormed, paa_size)
        sax_word = ts_to_string(paa_rep, cuts)
        sax_sequence.append(sax_word)
    return sax_sequence
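A hedged usage sketch; the imports, the synthetic sine wave, and the parameter values are assumptions rather than part of the original snippet.

import numpy as np
from saxpy.alphabet import cuts_for_asize
from saxpy.paa import paa
from saxpy.sax import ts_to_string
from saxpy.znorm import znorm

ts = np.sin(np.linspace(0, 4 * np.pi, 100))   # synthetic 1-dim time series
words = apply_adaptive_sax(ts, win_size=20, paa_size=5, alphabet_size=4, z_threshold=0.01)
print(len(words))    # 81 words, one per sliding-window position
print(words[:3])
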
Example #4
def saxrepresentation(matrix):
    result = []
    for ts in matrix.T:
        # z-normalize each column, reduce it to 3 PAA segments, then map it to a 3-letter SAX word
        ts_znorm = znorm(ts)
        dat_paa_3 = paa(ts_znorm, 3)
        a = ts_to_string(dat_paa_3, cuts_for_asize(3))
        result.append(a)

    return result
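A minimal usage sketch; the numpy import and the random matrix are assumptions, and the saxpy helpers used by the function must be in scope.

import numpy as np

matrix = np.random.randn(50, 4)      # each column is treated as one time series
print(saxrepresentation(matrix))     # one 3-letter SAX word per column
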
Example #5
def ppa_representation(data, seq_len):
    data_reduced = np.zeros(shape=(int(data.shape[0] / seq_len), data.shape[1]))

    paa_segment = int(data.shape[0] / seq_len)

    for i in tqdm(range(data.shape[1])):
        dat_znorm = znorm(data[:, i])

        data_reduced[:, i] = paa(dat_znorm, paa_segment)

    return data_reduced
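A hedged usage sketch; the random data and the seq_len value are assumptions, and tqdm plus the saxpy znorm/paa helpers must be importable.

import numpy as np

data = np.random.randn(100, 3)                # 100 time steps, 3 series (one per column)
reduced = ppa_representation(data, seq_len=10)
print(reduced.shape)                          # (10, 3): each column compressed to 10 PAA segments
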
Example #6
def row_pattern_loss(row: np.ndarray, pr: Tuple[str, int]):
    pattern = []
    cuts = cuts_for_asize(pr[1] + 1)[1:]
    for c in pr[0]:
        n = ord(c) - 97
        pattern.append(cuts[n])
    if len(pattern) != len(row):
        normalized_row = paa(znorm(row), len(pattern))
    else:
        normalized_row = znorm(row)
    return distance(normalized_row, pattern)
Example #7
    def update_features(self, data):
        self.hidden = self.state_manager.get_hidden_state([data])
        activation = self.get_activation()
        dat_znorm = (activation[:, self.indices] - self.mean) / self.std
        dat_znorm = [paa(item, self.seq_len) for item in dat_znorm]

        features = [
            tuple(ts_to_string(item, cuts_for_asize(self.symbols)))
            for item in dat_znorm
        ]

        for feature in features:
            if feature in self.feature:
                index = self.feature.index(feature)
                self.covered_dict[index] = True
Example #8
 def fitness(self, hidden, sym):
     activation = self.get_activations(hidden)
     dat_znorm = Z_ScoreNormalization(
         activation[:, self.testObjective.indices], self.testObjective.mean,
         self.testObjective.std)
     dat_znorm = [
         paa(item, self.testObjective.seq_len) for item in dat_znorm
     ]
     cuts = cuts_for_asize(self.testObjective.symbols)
     cuts = np.append(cuts, np.array([np.inf]))
     sym_size = len(sym)
     out = np.array([
         self.cal_fittness_seq(cuts, sym_size, sym, series)
         for series in dat_znorm
     ])
     return out
Example #9
def SAX(sequence: np.ndarray, alphabet_size: int, length: int = 0) -> str:
    """
    Computes the SAX string of a sequence of numbers with the specified alphabet size.
    The length of the output string may be specified; length 0 generates a string as long as the sequence.
    """
    debug("Calculating SAX of {}, with alphabet of size {}".format(
        sequence, alphabet_size))
    if alphabet_size == 1:
        if length == 0:
            return "a" * len(sequence)
        else:
            return "a" * length
    else:
        if length == 0 or length == len(sequence):
            return ts_to_string(znorm(sequence), cuts_for_asize(alphabet_size))
        else:
            return ts_to_string(paa(znorm(sequence), length),
                                cuts_for_asize(alphabet_size))
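A hedged usage sketch; the example sequence is made up, and the debug logger plus the saxpy helpers used by SAX must already be in scope.

import numpy as np

seq = np.array([1.0, 1.5, 2.0, 5.0, 6.0, 2.0, 1.0, 0.5])
print(SAX(seq, alphabet_size=4))              # SAX word as long as the sequence (8 letters)
print(SAX(seq, alphabet_size=4, length=4))    # PAA-compressed to a 4-letter word
print(SAX(seq, alphabet_size=1))              # degenerate alphabet -> 'aaaaaaaa'
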
Example #10
def extract_features(
        song_name):  # returns mfcc and chroma features in SAX representation
    try:
        x, fs = librosa.load(song_name)
    except Exception:
        return None
    mfccs = librosa.feature.mfcc(x, sr=fs, n_mfcc=39)
    chroma = librosa.feature.chroma_stft(x, sr=fs)
    feature_matrix = np.concatenate((mfccs, chroma))

    sax_rep = [
        ts_to_string(paa(znorm(feat), SAX_VOCAB_LENGTH),
                     cuts_for_asize(SAX_VOCAB_LENGTH))
        for feat in feature_matrix
    ]
    return sax_rep
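A hedged usage sketch; the audio file path and the SAX_VOCAB_LENGTH value are hypothetical, and librosa plus the saxpy helpers must be importable.

SAX_VOCAB_LENGTH = 10                     # hypothetical word/alphabet length used by the snippet above
sax_rep = extract_features("song.wav")    # hypothetical audio file path
if sax_rep is not None:
    print(len(sax_rep))                   # one SAX word per feature row (MFCC + chroma)
    print(sax_rep[0])
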
Example #11
def sax_via_window(series,
                   win_size,
                   paa_size,
                   alphabet_size=3,
                   nr_strategy='exact',
                   z_threshold=0.01):
    """Simple via window conversion implementation."""

    # generate alphabet cuts for the given alphabet size
    cuts = cuts_for_asize(alphabet_size)
    # initialize the SAX word -> window indices mapping
    sax = defaultdict(list)

    prev_word = ''

    for i in range(0, len(series) - win_size):

        # sub-section of the series covered by the current window
        sub_section = series[i:(i + win_size)]

        # z-normalization
        zn = znorm(sub_section, z_threshold)

        # PAA (piecewise aggregate approximation): reduce the sub-section to paa_size segments
        paa_rep = paa(zn, paa_size)

        # convert the PAA sequence into a SAX string
        curr_word = ts_to_string(paa_rep, cuts)

        if '' != prev_word:
            if 'exact' == nr_strategy and prev_word == curr_word:
                continue
            elif 'mindist' == nr_strategy and\
                    is_mindist_zero(prev_word, curr_word):
                continue

        prev_word = curr_word

        sax[curr_word].append(i)

    return sax
Example #12
    def update_features(self, hidden, test_num):
        activation = self.get_activations(hidden)
        dat_znorm = Z_ScoreNormalization(
            activation[:, self.testObjective.indices], self.testObjective.mean,
            self.testObjective.std)
        dat_znorm = [
            paa(item, self.testObjective.seq_len) for item in dat_znorm
        ]

        features = [
            tuple(
                ts_to_string(item, cuts_for_asize(self.testObjective.symbols)))
            for item in dat_znorm
        ]
        self.cov_count += 1
        for feature in features:
            if feature in self.testObjective.feature:
                self.cov_count = 0
                self.testObjective.feature.remove(feature)
                self.testObjective.covered_feature.append(feature)
                del self.testObjective.test_record[feature]

        self.coverage = 1 - len(self.testObjective.feature
                                ) / self.testObjective.originalNumOfFeature

        cov_fitness = np.array([
            self.fitness(hidden, listElem)
            for listElem in self.testObjective.feature
        ])
        cov_index = np.min(cov_fitness, axis=1)
        cov_fitness = np.argmin(cov_fitness, axis=1)

        for idx, feature in enumerate(self.testObjective.feature):
            test_record = self.testObjective.test_record[feature]
            if test_record is None or test_record[1] > cov_fitness[idx]:
                self.testObjective.test_record[feature] = list(
                    [test_num + cov_index[idx], cov_fitness[idx]])

        self.displayCoverage()
Example #13
def znorm_paa_sax(time_series, alpha, w=3, missing='z'):
    """Takes an array containing real values, z-normalizes, reduces
    dimensionality to w, and finally returns a sax representation of length alpha
    
    time series:    array holding a time series of one measurement for one patient
    w:              the dimensionality to reduce to using PAA, set to len(time_series) in plain
    alpha:          the alphabet size (number of discretized symbols) the SAX rep will use, set to 2, 3 or 5 in plain using the RDS algo
    """

    # If time_series is a string, convert it to a list, e.g. 'abc' -> ['a', 'b', 'c'],
    # since that is the structure required by the code below
    if isinstance(time_series, str):
        time_series = list(time_series)

    if len(time_series) > 0:
        # normalizing one time series, time series as numpy array (np.array([]))
        normalized_time_series = znorm(np.array(time_series))
        # dimensionality reduction of time series according to w
        paa_norm_time_series = paa(normalized_time_series, w)
        # turning a discretized and reduced time series into a sequence of characters
        return ts_to_string(paa_norm_time_series, cuts_for_asize(alpha))
    else:
        return missing
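A hedged usage sketch; the readings and parameter values are made up, and the saxpy helpers used above must be importable.

readings = [36.5, 36.7, 37.1, 38.2, 38.9, 38.0, 37.2]   # one measurement series for one patient
print(znorm_paa_sax(readings, alpha=3, w=3))             # a 3-letter SAX word
print(znorm_paa_sax([], alpha=3))                        # empty series -> the missing marker 'z'
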
Example #14
def process_cell(matrix, cell, progress_indicator):
    alphabet_size = cell[0]
    paa_division_integer = cell[1]

    progression = 0 if progress_indicator == -1 else progress_indicator

    print(progression)

    # Download or use downloaded weather data files
    time_avg_temp_arr = pm.get_weather_data_from_files()

    # get ground truth for the slope computed by the lin. regression model for all weather data files.
    ground_truth_arr = pm.get_ground_truth_values(time_avg_temp_arr)

    # mean array of periods
    mean_period_arr = []

    # get time, average temperature and ground truth slope for the i'th tuple
    ground_truth_slope = ground_truth_arr[progression]
    time_avg_temp_tuple = time_avg_temp_arr[progression]
    time = time_avg_temp_tuple[0]
    norm_avg_temp = time_avg_temp_tuple[1]
    dat_size = norm_avg_temp.size

    # PAA transformation procedure
    # Determine the number of PAA points: the data size divided by paa_division_integer (number of points per segment)
    paa_points = int(dat_size / paa_division_integer)

    ## PAA transformation of data
    PAA_array = paa(norm_avg_temp, paa_points)
    PAA_array = np.asarray(PAA_array, dtype=np.float32)

    # SAX conversion
    # Get breakpoints to convert segments into SAX string
    breakPointsArray = pm.getBreakPointsArray(PAA_array, alphabet_size)
    sax_output = ts_to_string(PAA_array, breakPointsArray)

    # Convert to numeric SAX representation
    numericSaxConversionArray = pm.getNumericSaxArray(breakPointsArray)
    numeric_SAX_temp = []

    for symbol_index in range(len(sax_output)):
        letter_represented_as_int = pm.getAlfabetToNumericConverter(
            sax_output[symbol_index], numericSaxConversionArray)
        numeric_SAX_temp.append(letter_represented_as_int)

    numeric_SAX_temp = np.asarray(numeric_SAX_temp, dtype=np.float32)
    numeric_SAX_time = time

    # Repeat each element in array x times, where x is the number of PAA points
    repeated_x_array = np.repeat(numeric_SAX_time, paa_points)

    # How many elements each list should have
    n = int(len(repeated_x_array) / paa_points)
    final_x_array = []
    lists = list(pm.divide_array_in_chunks(repeated_x_array, n))

    # take mean of all chunks
    for l in lists:
        final_x_array.append(np.mean(l))
    numeric_SAX_time = final_x_array

    # Compute linear regression model
    slope, intercept, r_value, p_value, std_err = stats.linregress(
        numeric_SAX_time, numeric_SAX_temp)

    # Compute the error in percent between the estimated slope and the ground truth slope
    ground_truth_error = (abs(slope - ground_truth_slope) /
                          ground_truth_slope) * 100

    # Update the error matrix
    if progression == 0:
        matrix[alphabet_size - MIN_SAX][paa_division_integer -
                                        MIN_PAA] = ground_truth_error
    else:
        # Update the running mean for this particular parameter combination
        current_value = matrix[alphabet_size - MIN_SAX][paa_division_integer -
                                                        MIN_PAA]
        matrix[alphabet_size -
               MIN_SAX][paa_division_integer -
                        MIN_PAA] = (current_value * progression +
                                    ground_truth_error) / (progression + 1)
Example #15
def sax_via_window(series, win_size, paa_size, alphabet_size=3,
                   nr_strategy='exact', znorm_threshold=0.01, sax_type='unidim'):
    """Simple via window conversion implementation.

    # SAX-ENERGY
    >>> sax_via_window([[1, 2, 3], [4, 5, 6]], win_size=1, paa_size=3, sax_type='energy', nr_strategy=None)['abc']
    [0, 1]

    >>> sax_via_window([[1, 2, 3, 4], [4, 5, 6, 7]], win_size=1, paa_size=4, sax_type='energy', nr_strategy=None)['aacc']
    [0, 1]

    >>> sax_via_window([[1, 2, 3, 4], [4, 5, 6, 7]], win_size=2, paa_size=4, sax_type='energy', nr_strategy=None)['aaccaacc']
    [0]

    # SAX-REPEAT
    >>> sax_via_window([[1, 2, 3], [4, 5, 6], [7, 8, 9]], win_size=2, paa_size=2, sax_type='repeat', nr_strategy=None)['ab']
    [0, 1]

    >>> sax_via_window([[1, 2, 3], [4, 5, 6], [7, 8, 9]], win_size=1, paa_size=1, sax_type='repeat', nr_strategy=None)['a']
    [0, 1, 2]

    # SAX-INDEPENDENT
    >>> sax_via_window([[1, 2, 3, 4], [4, 5, 6, 7]], win_size=2, paa_size=2, sax_type='independent', nr_strategy=None)['acacacac']
    [0]

    >>> sax_via_window([[1, 2], [4, 5], [7, 8]], win_size=2, paa_size=2, sax_type='independent', nr_strategy=None)['acac']
    [0, 1]

    >>> sax_via_window([[1, 2], [4, 8], [7, 5]], win_size=2, paa_size=2, sax_type='independent', nr_strategy=None)['acac']
    [0]

    >>> sax_via_window([[1, 2], [4, 8], [7, 5]], win_size=2, paa_size=2, sax_type='independent', nr_strategy=None)['acca']
    [1]

    """

    # Convert to numpy array.
    series = np.array(series)

    # Check on dimensions.
    if len(series.shape) > 2:
        raise ValueError('Please reshape time-series to stack dimensions along the 2nd dimension, so that the array shape is a 2-tuple.')

    # PAA size is the length of the PAA sequence.
    if sax_type != 'energy' and paa_size > win_size:
        raise ValueError('PAA size cannot be greater than the window size.')

    if sax_type == 'energy' and len(series.shape) == 1:
        raise ValueError('Must pass a multidimensional time-series to SAX-ENERGY.')

    # Breakpoints.
    cuts = cuts_for_asize(alphabet_size)

    # Dictionary mapping SAX words to indices.
    sax = defaultdict(list)

    if sax_type == 'repeat':
        # Maps indices to multi-dimensional SAX words.
        multidim_sax_dict = []

        # List of all the multi-dimensional SAX words.
        multidim_sax_list = []

        # Sliding window across time dimension.
        for i in range(series.shape[0] - win_size + 1):

            # Subsection starting at this index.
            sub_section = series[i: i + win_size]

            # Z-normalized subsection.
            if win_size == 1:
                zn = sub_section
            else:
                zn = znorm(sub_section, znorm_threshold)

            # PAA representation of subsection.
            paa_rep = paa(zn, paa_size, 'repeat')

            # SAX representation of subsection, but in terms of multi-dimensional vectors.
            multidim_sax = get_sax_list(paa_rep, cuts)

            # Update data-structures.
            multidim_sax_dict.append(multidim_sax)
            multidim_sax_list.extend(multidim_sax)

        # Cluster with k-means++.
        kmeans = KMeans(n_clusters=alphabet_size, random_state=0).fit(multidim_sax_list)

        # Cluster indices in sorted order.
        order = np.lexsort(np.rot90(kmeans.cluster_centers_))

        # Sliding window across time dimension.
        prev_word = ''
        for i in range(series.shape[0] - win_size + 1):

            # Map cluster indices to new SAX letters.
            curr_word_list = map(lambda cluster_index: idx2letter(order[cluster_index]), kmeans.predict(multidim_sax_dict[i]))
            curr_word = ''.join(curr_word_list)

            if '' != prev_word:
                if 'exact' == nr_strategy and prev_word == curr_word:
                    continue
                elif 'mindist' == nr_strategy and is_mindist_zero(prev_word, curr_word):
                    continue

            prev_word = curr_word

            sax[curr_word].append(i)

    else:
        # Sliding window across time dimension.
        prev_word = ''
        for i in range(series.shape[0] - win_size + 1):

            # Subsection starting at this index.
            sub_section = series[i: i + win_size]

            if sax_type == 'energy':
                curr_word = ''
                for energy_dist in sub_section:
                    # Normalize energy distribution.
                    energy_zn = znorm(energy_dist, znorm_threshold)

                    # PAA representation of energy distribution.
                    paa_rep = paa(energy_zn, paa_size, 'unidim')
                    # paa_rep = energy_zn

                    # SAX representation of the energy distribution.
                    energy_word = ts_to_string(paa_rep, cuts)

                    # Add to current word.
                    curr_word += energy_word

            elif sax_type == 'independent':
                curr_word = ''
                for dim in range(sub_section.shape[1]):
                    # Obtain the subsequence restricted to one dimension.
                    one_dimension_sub_section = sub_section[:, dim]

                    # Z-normalized subsection.
                    zn = znorm(one_dimension_sub_section, znorm_threshold)

                    # PAA representation of subsection.
                    paa_rep = paa(zn, paa_size, 'unidim')

                    # Get the SAX word - just a unidimensional SAX.
                    one_dim_word = ts_to_string(paa_rep, cuts)

                    # Add this dimensions' representation to the overall SAX word.
                    curr_word += one_dim_word

            else:
                # Z-normalized subsection.
                zn = znorm(sub_section, znorm_threshold)

                # PAA representation of subsection.
                paa_rep = paa(zn, paa_size, sax_type)

                # SAX representation of subsection.
                curr_word = ts_to_string(paa_rep, cuts)

            if '' != prev_word:
                if 'exact' == nr_strategy and prev_word == curr_word:
                    continue
                elif 'mindist' == nr_strategy and is_mindist_zero(prev_word, curr_word):
                    continue

            prev_word = curr_word

            sax[curr_word].append(i)

    return sax
Example #16
 def _compute_1_dim_str_rep(self, segment_ts, dim):
     cuts = self._cuts[dim]
     paa_rep = paa(segment_ts, paa_segments=1)
     letter = ts_to_string(paa_rep, cuts)
     return letter
Example #17
import collections

import numpy as np
import yfinance as yf
from keras.optimizers import Adam
# the saxpy import paths below are assumptions based on the saxpy package layout
from saxpy.alphabet import cuts_for_asize
from saxpy.paa import paa
from saxpy.sax import ts_to_string
from saxpy.znorm import znorm
# util_pattern_space is assumed to come from pyentrp's entropy module
from pyentrp import entropy as ent


##########

hist = yf.download(tickers = "DJI", period = 'max')



words = []
# hist_sma (a smoothed version of the downloaded series) is assumed to be computed elsewhere
dow_df = ent.util_pattern_space(hist_sma, lag=1, dim=50)
dow_df = dow_df[:]
for i in range(len(dow_df)):
    dat_znorm = znorm(dow_df[i,:])
    dat_paa= paa(dat_znorm, 3)
    word = ts_to_string(dat_paa, cuts_for_asize(2))
    words.append(word)
print(words)


print(collections.Counter(words))

from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
sqn = le.fit_transform(words)

nb_classes = len(np.unique(sqn))

from keras.utils import to_categorical 
onehot = to_categorical(sqn)
Example #18
    def start_splitting(self, p_value: int, max_level: int, good_leaf_nodes: list, bad_leaf_nodes: list):
        """
        Splitting Node Naive algorithm (k, P) Anonymity
        :param p_value:
        :param max_level:
        :param paa_value
        :return:
        """
        # logger.info("good_leaf_nodes: {}, bad_leaf_nodes: {}".format(len(good_leaf_nodes), len(bad_leaf_nodes)))
        if self.size < p_value:
            logger.info("size:{}, p_value:{} == bad-leaf".format(self.size, p_value))
            self.label = "bad-leaf"
            bad_leaf_nodes.append(self)
            return

        if self.level == max_level:
            logger.info("size:{}, p_value:{} == good-leaf".format(self.size, p_value))
            self.label = "good-leaf"
            good_leaf_nodes.append(self)
            return

        if p_value <= self.size < 2*p_value:
            logger.info("Maximize-level, size:{}, p_value:{} == good-leaf".format(self.size, p_value))
            self.maximize_level_node(max_level)
            self.label = "good-leaf"
            good_leaf_nodes.append(self)
            return
        """
        Otherwise, we need to check if node N has to be split. The checking relies on a tentative split performed on N. 
        Suppose that, by increasing the level of N, N is tentatively split into a number of child nodes. 
        If all these child nodes contain fewer than P time series, no real split is performed and the original node N is
        labeled as good-leaf and the recursion terminates on N. Otherwise, there must exist tentative child node(s) 
        whose size >= P, also called TG-node(s) (Tentative Good Nodes). 
        The rest children whose size < P are called TB-nodes (Tentative Bad Nodes), if any. 
        If the total number of records in all TB-nodes under N is no less than P, we merge them into a single tentative
        node, denoted by childmerge, at the level of N.level. If the above tentative process produces nc tentative 
        child nodes (including TB and TG) and nc >= 2, N will really be split into nc children and then the node 
        splitting procedure will be recursively invoked on each of them 
        """
        tentative_child_node = dict()
        temp_level = self.level + 1
        for key, value in self.group.items():
            # to reduce dimensionality
            data = np.array(value)
            data_znorm = znorm(data)
            data_paa = paa(data_znorm, self.paa_value)
            pr = ts_to_string(data_paa, cuts_for_asize(temp_level))
            if pr in tentative_child_node.keys():
                tentative_child_node[pr].append(key)
            else:
                tentative_child_node[pr] = [key]
        length_all_tentative_child = [len(x) for x in list(tentative_child_node.values())]
        good_leaf = np.all(np.array(length_all_tentative_child) < p_value)

        if good_leaf:
            logger.info("Good-leaf, all_tentative_child are < {}".format(p_value))
            self.label = "good-leaf"
            good_leaf_nodes.append(self)
            return
        else:
            logger.info("N can be split")
            logger.info("Compute tentative good nodes and tentative bad nodes")
            # tentative good nodes
            # indices of nodes in tentative_child_node whose size is >= p_value
            pr_keys = list(tentative_child_node.keys())
            # get index tentative good node
            pattern_representation_tg = list()
            tg_nodes_index = list(np.where(np.array(length_all_tentative_child) >= p_value)[0])
            # logger.info(pr_keys)
            tg_nodes = list()
            for index in tg_nodes_index:
                keys_elements = tentative_child_node[pr_keys[index]]
                dict_temp = dict()
                for key in keys_elements:
                    dict_temp[key] = self.group[key]
                tg_nodes.append(dict_temp)
                pattern_representation_tg.append(pr_keys[index])

            # tentative bad nodes
            tb_nodes_index = list(np.where(np.array(length_all_tentative_child) < p_value)[0])
            tb_nodes = list()
            pattern_representation_tb = list()

            for index in tb_nodes_index:
                keys_elements = tentative_child_node[pr_keys[index]]
                dict_temp = dict()
                for key in keys_elements:
                    dict_temp[key] = self.group[key]
                tb_nodes.append(dict_temp)
                pattern_representation_tb.append(pr_keys[index])

            total_size_tb_nodes = 0
            for tb_node in tb_nodes:
                total_size_tb_nodes += len(tb_node)

            if total_size_tb_nodes >= p_value:
                logger.info("Merge all bad nodes in a single node, and label it as good-leaf")
                child_merge_node_group = dict()
                for tb_node in tb_nodes:
                    for key, value in tb_node.items():
                        child_merge_node_group[key] = value
                node_merge = Node(level=self.level, pattern_representation=self.pattern_representation,
                                  label="good-leaf", group=child_merge_node_group, parent=self)
                self.child_node.append(node_merge)
                good_leaf_nodes.append(node_merge)

                nc = len(tg_nodes) + len(tb_nodes)  # I'm a bit puzzled about this tb_nodes term
                logger.info("Split only tg_nodes {0}".format(len(tg_nodes)))
                if nc >= 2:
                    for index in range(0, len(tg_nodes)):
                        node = Node(level=self.level, pattern_representation=pattern_representation_tg[index],
                                    label="intermediate", group=tg_nodes[index], parent=self)
                        self.child_node.append(node)
                        node.start_splitting(p_value, max_level, good_leaf_nodes, bad_leaf_nodes)
                else:
                    for index in range(0, len(tg_nodes)):
                        node = Node(level=self.level, pattern_representation=pattern_representation_tg[index],
                                    label="good-leaf", group=tg_nodes[index], parent=self)
                        self.child_node.append(node)
                        good_leaf_nodes.append(node)

            else:
                nc = len(tg_nodes) + len(tb_nodes)  # I'm a bit puzzled about this tb_nodes term
                logger.info("Label all tb_node {0} as bad-leaf and split only tg_nodes {1}".format(len(tb_nodes),len(tg_nodes)))
                for index in range(0, len(tb_nodes)):
                    node = Node(level=self.level, pattern_representation=pattern_representation_tb[index], label="bad-leaf",
                                group=tb_nodes[index], parent=self)
                    self.child_node.append(node)
                    bad_leaf_nodes.append(node)
                if nc >= 2:
                    for index in range(0, len(tg_nodes)):
                        node = Node(level=self.level, pattern_representation=pattern_representation_tg[index],
                                    label="intermediate", group=tg_nodes[index], parent=self)
                        self.child_node.append(node)
                        node.start_splitting(p_value, max_level, good_leaf_nodes, bad_leaf_nodes)
                else:
                    for index in range(0, len(tg_nodes)):
                        node = Node(level=self.level, pattern_representation=pattern_representation_tg[index],
                                    label="good-leaf", group=tg_nodes[index], parent=self)
                        self.child_node.append(node)
                        good_leaf_nodes.append(node)
Example #19
def main(argv):

    #load configuration
    parameters = load_configuration()

    #load parameters

    #dataset
    path_to_dataset = parameters['path_to_dataset']
    load_size = parameters['load_size']

    #SAX
    alphabet_size = parameters['alphabet_size']
    paa_size = parameters['paa_size']
    window_size = parameters['window_size']
    step = parameters['step']
    substring_size = parameters['substring_size']

    #smoothing
    threshold_freq = parameters['threshold_freq']

    #projections
    prj_size = parameters['prj_size']
    prj_iterations = parameters['prj_iterations']
    anomaly_threshold = parameters['anomaly_threshold']

    #loading data
    loader = DataLoader.DataLoader(path_to_dataset)
    data = DataTypes.Data()

    #loader.load_all(data,200)
    loader.load_subset(data, load_size, 100)

    #period from which to extract anomalies
    begin_date = datetime.datetime.fromtimestamp(data.index_to_time[0])
    end_date = datetime.datetime.fromtimestamp(data.index_to_time[load_size -
                                                                  1])

    if parameters['power_type'] == -1:
        tank = parameters['tank']
        sensor_type = parameters['sensor_type']
        #print(data.measures[0])
        print("Loading of %i tank %i  data from %s to %s " %
              (sensor_type, tank, begin_date, end_date))
        s_values = [
            data.measures[i][0][tank][sensor_type]
            for i in range(0, len(data.measures))
        ]
    else:
        power_type = parameters['power_type']
        print("Loading measures of power %i from %s to %s " %
              (power_type, begin_date, end_date))
        s_values = [
            data.measures[i][1][power_type]
            for i in range(0, len(data.measures))
        ]

    len_serie = len(s_values)
    hash_table_substrings = {}

    #getting first n alphabet letters
    alphabet = get_alphabet_letters(alphabet_size)
    #creating hash table indexed by all substrings of length prj_size
    hash_table_substrings = get_hash_table(alphabet, prj_size)

    #list containing the score for each window
    anomalies_score = []

    for index in range(0, len_serie, step):
        begin = index
        end = begin + window_size

        if end < len_serie:
            window_values = s_values[begin:end]
            window_znorm = znorm(window_values)
            window_paa = paa(window_znorm, paa_size)
            window_string = ts_to_string(window_paa,
                                         cuts_for_asize(alphabet_size))

            #each character of the string corresponds to k values of the series
            k = window_size // paa_size

            #get smoothed string
            window_smoothed = smoothing(window_string, threshold_freq)

            #fill hash table by applying random projection
            hash_table_substrings = put_in_bucket(hash_table_substrings,
                                                  window_smoothed, begin,
                                                  prj_iterations, prj_size,
                                                  substring_size, k)

            total = 0
            for key, values in hash_table_substrings.items():
                total = total + len(values)

            buckets_with_anomalies, bucket_freq = analyzed_bucket(
                hash_table_substrings, total, anomaly_threshold)
            #number of bucket with anomalies
            n_buckets_anomalies = len(buckets_with_anomalies.keys())

            #getting score for current window
            avg_window_score = getting_score(hash_table_substrings,
                                             buckets_with_anomalies,
                                             n_buckets_anomalies)
            anomalies_score.append(avg_window_score)

            #reset table
            hash_table_substrings = get_hash_table(alphabet, prj_size)

        else:
            break

    print(anomalies_score)
Example #20
with open(sys.argv[1], 'r') as h:
    lines = h.readlines()
    DATA = []
    time_series = []
    for line in lines:
        line = line.strip()
        if line != 'null' and line != '\n':
            time_series.append(float(line))
        else:
            DATA.append(time_series)
            time_series = []
    for data in DATA:
        data = np.asfarray(data, float)
        data = np.diff(data)
        data_znorm = znorm(data)
        data_paa = paa(data_znorm, w)
        sax_words.append(ts_to_string(data_paa, cuts_for_asize(a)))

# sax_words_ri = []
# i = 0
# for word in sax_words:
# 	perms = set([''.join(p) for p in permutations(word)])
# 	sax_words_ri.append(perms)
# 	i+=1

# Write all SAX words to file only once instead of generating again and again
with open('sax_words_ri_norot_w=' + str(w) + '_a=' + str(a),
          'w+') as sax_words_file:
    for l in sax_words:
        sax_words_file.write(l + '\n')
    # sax_words_file.write('\n')
Example #21
def sax_by_chunking(series, paa_size, alphabet_size=3, z_threshold=0.01):
    """Simple chunking conversion implementation."""
    paa_rep = paa(znorm(series, z_threshold), paa_size)
    cuts = cuts_for_asize(alphabet_size)
    return ts_to_string(paa_rep, cuts)
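A minimal usage sketch; the imports mirror the saxpy helpers used above, and the toy series is made up.

import numpy as np
from saxpy.alphabet import cuts_for_asize
from saxpy.paa import paa
from saxpy.sax import ts_to_string
from saxpy.znorm import znorm

series = np.array([-2.0, -1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 1.0])
print(sax_by_chunking(series, paa_size=4, alphabet_size=3))   # a single 4-letter SAX word
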
Example #22
def visualize_p_anonymized_nodes(nodes_list: List[Node]):
    # https://stackoverflow.com/questions/50161140/how-to-plot-a-time-series-array-with-confidence-intervals-displayed-in-python
    # https://stackoverflow.com/questions/14720331/how-to-generate-random-colors-in-matplotlib

    pr_dict: Dict[str, int] = {}
    number_of_pr: int = 0
    for node in nodes_list:
        if node.pr not in pr_dict and node.pr != "a" * node.pr_len():
            pr_dict[node.pr] = number_of_pr
            number_of_pr += 1

    pr_cmap = get_cmap(len(pr_dict) + 1)

    n = len(nodes_list[0].table[0])
    if nodes_list[0].pr_len() != n:
        paa_linespace = np.linspace(0, n - 1, (2 * nodes_list[0].pr_len() + 1))
        paa_positions = paa_linespace[1::2]
        for node in nodes_list:
            if node.pr != "a" * node.pr_len():
                node_color = pr_cmap(pr_dict[node.pr])
                marker_alpha = 1
                line_alpha = 0.4
            else:
                node_color = "grey"
                marker_alpha = 0.5
                line_alpha = 0.2
            for row in node.table:
                plt.plot(range(n),
                         row,
                         color=node_color,
                         label=node.pr,
                         alpha=line_alpha)
                plt.plot(paa_positions,
                         paa(row, node.pr_len()),
                         color=node_color,
                         label="",
                         linestyle='',
                         marker="_",
                         markeredgewidth=2,
                         markersize=10,
                         alpha=marker_alpha)
                plt.plot(paa_positions,
                         paa(row, node.pr_len()),
                         color=node_color,
                         label="",
                         linestyle=':',
                         alpha=0.5)
    else:
        for node in nodes_list:
            if node.pr != "a" * node.pr_len():
                node_color = pr_cmap(pr_dict[node.pr])
                line_alpha = 1
            else:
                node_color = "grey"
                line_alpha = 0.5
            for row in node.table:
                plt.plot(range(n),
                         row,
                         color=node_color,
                         label=node.pr,
                         alpha=line_alpha)

    fontP = FontProperties()
    fontP.set_size('xx-small')

    # https://stackoverflow.com/questions/13588920/stop-matplotlib-repeating-labels-in-legend
    handles, labels = plt.gca().get_legend_handles_labels()
    by_label = dict(zip(labels, handles))
    plt.legend(by_label.values(),
               by_label.keys(),
               loc='upper left',
               ncol=3,
               prop=fontP)

    plt.title("p-anonymization")
    plt.yscale("symlog")
    plt.show()
    return
Example #23
def PAA_aggregation(v):
    dat_znorm = znorm(v)
    r = paa(dat_znorm, len(v))
    #r = znorm(v)
    print("MIN MAX",min(r),max(r))
    return r
Example #24
    def recycle_bad_leaves(p_value, good_leaf_nodes, bad_leaf_nodes,
                           suppressed_nodes, paa_value):
        """
        Recycle bad-leaves phase
        :param p_value:
        :param good_leaf_nodes:
        :param bad_leaf_nodes:
        :param suppressed_nodes:
        :param paa_value:
        """

        bad_leaf_nodes_dict = dict()
        for node in bad_leaf_nodes:
            if node.level in bad_leaf_nodes_dict.keys():
                bad_leaf_nodes_dict[node.level].append(node)
            else:
                bad_leaf_nodes_dict[node.level] = [node]

        bad_leaf_nodes_size = sum([node.size for node in bad_leaf_nodes])

        if bad_leaf_nodes_size >= p_value:

            # max bad level
            current_level = max(bad_leaf_nodes_dict.keys())

            while bad_leaf_nodes_size >= p_value:

                if current_level in bad_leaf_nodes_dict.keys():
                    merge_dict = dict()
                    keys_to_be_removed = list()
                    merge = False
                    for current_level_node in bad_leaf_nodes_dict[
                            current_level]:
                        pr_node = current_level_node.pattern_representation
                        if pr_node in merge_dict.keys():
                            merge = True
                            merge_dict[pr_node].append(current_level_node)
                            if pr_node in keys_to_be_removed:
                                keys_to_be_removed.remove(pr_node)
                        else:
                            merge_dict[pr_node] = [current_level_node]
                            keys_to_be_removed.append(pr_node)

                    if merge:
                        for k in keys_to_be_removed:
                            del merge_dict[k]

                        for pr, node_list in merge_dict.items():
                            group = dict()
                            for node in node_list:
                                bad_leaf_nodes_dict[current_level].remove(node)
                                group.update(node.group)
                            if current_level > 1:
                                level = current_level
                            else:
                                level = 1
                            leaf_merge = Node(level=level,
                                              pattern_representation=pr,
                                              group=group,
                                              paa_value=paa_value)

                            if leaf_merge.size >= p_value:
                                leaf_merge.label = "good-leaf"
                                good_leaf_nodes.append(leaf_merge)
                                bad_leaf_nodes_size -= leaf_merge.size
                            else:
                                leaf_merge.label = "bad-leaf"
                                bad_leaf_nodes_dict[current_level].append(
                                    leaf_merge)

                temp_level = current_level - 1
                for node in bad_leaf_nodes_dict[current_level]:
                    if temp_level > 1:
                        values_group = list(node.group.values())
                        data = np.array(values_group[0])
                        data_znorm = znorm(data)
                        data_paa = paa(data_znorm, paa_value)
                        pr = ts_to_string(data_paa, cuts_for_asize(temp_level))
                    else:
                        pr = "a" * paa_value
                    node.level = temp_level
                    node.pattern_representation = pr

                if current_level > 0:
                    if temp_level not in bad_leaf_nodes_dict.keys():
                        bad_leaf_nodes_dict[
                            temp_level] = bad_leaf_nodes_dict.pop(
                                current_level)
                    else:
                        bad_leaf_nodes_dict[temp_level] = bad_leaf_nodes_dict[
                            temp_level] + bad_leaf_nodes_dict.pop(
                                current_level)
                    current_level -= 1
                else:
                    break

        #print("sopprimo le serie rimanenti")
        remaining_bad_leaf_nodes = list(bad_leaf_nodes_dict.values())[0]
        for node in remaining_bad_leaf_nodes:
            suppressed_nodes.append(node)
Example #25
def process_cell(matrix, cell, progress_indicator):
    alphabet_size = cell[0]
    paa_division_integer = cell[1]

    progression = 0 if progress_indicator == -1 else progress_indicator

    # Download or use downloaded lightcurve files
    time_flux_tuple_arr = pm.get_lightcurve_data()

    # get ground truth values for all lc's with autocorrelation
    ground_truth_arr = pm.get_ground_truth_values(time_flux_tuple_arr)

    # transform durations from the exoplanet archive from hours to days
    actual_duration_arr = [
        3.88216 / 24, 2.36386 / 24, 3.98235 / 24, 4.56904 / 24, 3.60111 / 24,
        5.16165 / 24, 3.19843 / 24
    ]  ##kepler-2,3,4,5,6,7,8 https://exoplanetarchive.ipac.caltech.edu/cgi-bin/TblView/nph-tblView?app=ExoTbls&config=cumulative

    #mean array of periods
    mean_period_arr = []

    # Calculate matrix values for all lightcurves
    # get flux, time, duration and ground truth for the i'th tuple
    ground_truth_period = ground_truth_arr[progression]
    actual_duration = actual_duration_arr[progression]

    time_flux_tuple = time_flux_tuple_arr[progression]
    time = time_flux_tuple[0]
    norm_fluxes = time_flux_tuple[1]

    dat_size = norm_fluxes.size

    # Find the period for each parameter combination alphabet_size/paa_division_integer of SAX

    # PAA transformation procedure
    # Determine the number of PAA points: the data size divided by paa_division_integer (number of points per segment)
    paa_points = int(dat_size / paa_division_integer)

    # PAA transformation of data
    PAA_array = paa(norm_fluxes, paa_points)
    PAA_array = np.asarray(PAA_array, dtype=np.float32)

    # SAX conversion
    # Get breakpoints to convert segments into SAX string
    breakPointsArray = pm.getBreakPointsArray(PAA_array, alphabet_size)
    sax_output = ts_to_string(PAA_array, breakPointsArray)

    # Convert to numeric SAX representation
    numericSaxConversionArray = pm.getNumericSaxArray(breakPointsArray)
    numeric_SAX_flux = []

    for symbol_index in range(len(sax_output)):
        letter_represented_as_int = pm.getAlfabetToNumericConverter(
            sax_output[symbol_index], numericSaxConversionArray)
        numeric_SAX_flux.append(letter_represented_as_int)

    numeric_SAX_flux = np.asarray(numeric_SAX_flux, dtype=np.float32)
    numeric_SAX_time = time

    # Repeat each element in array x times, where x is the number of PAA points
    repeated_x_array = np.repeat(numeric_SAX_time, paa_points)

    # How many elements each list should have
    n = int(len(repeated_x_array) / paa_points)
    final_x_array = []
    lists = list(pm.divide_array_in_chunks(repeated_x_array, n))

    # take mean of all chunks
    for l in lists:
        final_x_array.append(np.mean(l))
    numeric_SAX_time = final_x_array

    # BoxLeastSquares applied to numeric SAX representation
    BLS = BoxLeastSquares(numeric_SAX_time, numeric_SAX_flux)
    periodogram = BLS.autopower(actual_duration)

    # Find period with highest power in periodogram
    best_period = np.argmax(periodogram.power)
    period = periodogram.period[best_period]

    # Compute the error in percent between the best period and the ground truth period
    ground_truth_error = (abs(period - ground_truth_period) /
                          ground_truth_period) * 100

    # Update matrix
    if progression == 0:
        matrix[alphabet_size - MIN_SAX][paa_division_integer -
                                        MIN_PAA] = ground_truth_error
    else:
        # Update the running mean for this particular parameter combination
        current_value = matrix[alphabet_size - MIN_SAX][paa_division_integer -
                                                        MIN_PAA]
        matrix[alphabet_size -
               MIN_SAX][paa_division_integer -
                        MIN_PAA] = (current_value * progression +
                                    ground_truth_error) / (progression + 1)
Example #26
import numpy as np
import pandas as pd
import yfinance as yf
from saxpy.alphabet import cuts_for_asize
# the remaining saxpy import paths are assumptions based on the package layout
from saxpy.paa import paa
from saxpy.sax import ts_to_string
from saxpy.znorm import znorm
from tqdm import tqdm

df = []
for year in range(1994, 2019):
    hist = yf.download(tickers="SPY",
                       start="{}-01-01".format(year),
                       end="{}-12-31".format(year))
    close = hist["Close"]
    df.append(close)

words = []
for year in tqdm(df):

    #dat = ent.util_granulate_time_series(year, scale=3)
    dat_znorm = znorm(year)
    dat_paa = paa(dat_znorm, 10)
    word = ts_to_string(dat_paa, cuts_for_asize(5))
    words.append(word)
print(words)

from collections import Counter
Counter(words)

years = np.arange(1994, 2019)
frame = pd.DataFrame()
frame["Year"] = years
frame["Word"] = words
print(frame)

from fuzzywuzzy import fuzz