def entropy(seq, legacy=False):
    """
    Compute the Shannon entropy, in bits, of a sequence of symbols.

    Parameters
    ----------
    seq : list or tuple
        Sequence of integers.
    legacy : bool, optional
        When true, the pure-Python computation is always used, even if seq is
        an ``array`` with typecode "I". The default is False.

    Returns
    -------
    float
        Shannon entropy of sequence (the integer 0 for an empty sequence).

    """
    # Arrays of typecode "I" are delegated to estimates.entropy unless the
    # caller explicitly requested the legacy path
    is_uint_array = isinstance(seq, array) and seq.typecode == "I"
    if is_uint_array and not legacy:
        return estimates.entropy(seq)

    # Relative frequency of each distinct symbol, in first-seen order
    occurrences = Counter(seq).values()
    probabilities = [count / len(seq) for count in occurrences]

    # Shannon entropy: accumulate -p * log2(p) over all symbol probabilities
    return sum(-prob * log2(prob) for prob in probabilities)
def _compute_verbose_truncated(seq_x, seq_y, order=2):
    """
    This function runs the NSRWS algorithm on a pair of sequences for
    estimation of ETC and extracts additional metrics at each step of the
    algorithm. These include:
        - length of sequence (measured on seq_x)
        - entropy of each sequence
        - window substituted in each sequence
        - count reported for that window
        - time reported for the step

    The NSRWS algorithm is run iteratively until _onestep raises its signal
    flag, the pair passes cc.check_equality, or seq_x has been reduced to a
    size smaller than the size of the window being substituted (specified by
    order).

    This is the "truncated" variant: if iteration stopped because the signal
    flag was raised, at most 5 further steps are executed and logged, and the
    remaining steps are added to ETC arithmetically instead of being executed.
    Those extrapolated steps therefore have no entries in the returned log.

    Parameters
    ----------
    seq_x : list or tuple
        First sequence of integers.
    seq_y : list or tuple
        Second sequence of integers.
    order : int, optional
        Number of elements in window for substitution. The default is 2 for
        pairs.

    Returns
    -------
    etc : int
        Effort-To-Compress estimate for the given pair of sequences and order.
    output : list
        List of dictionaries corresponding to each executed step of NSRWS,
        starting with an entry for the original sequences (step 0).

    """
    # Initialize ETC to 0
    etc = 0

    # Initialize an aggregator for collecting dictionaries of estimates
    output = list()

    # Flag returned by _onestep; starts cleared so the main loop can run
    # NOTE(review): presumably set once no window repeats any more -- confirm
    # against _onestep
    signal = False

    # Append estimates for original sequences (no window substituted yet)
    output.append({
        "step": etc,
        "length": len(seq_x),
        "entropy_x": ce.entropy(seq_x),
        "entropy_y": ce.entropy(seq_y),
        "window_x": None,
        "window_y": None,
        "count": None,
        "time": None,
    })

    # Nothing to compress if the equality check already passes
    if cc.check_equality(seq_x, seq_y):
        return etc, output

    # Execute iteration loop until the signal flag is raised, the equality
    # check passes, or seq_x is reduced to less than size of the window being
    # substituted (order)
    while not signal and len(seq_x) >= order and not cc.check_equality(
            seq_x, seq_y):

        # Run one step of NSRWS in verbose mode (returns windows, count, time)
        seq_x, seq_y, signal, pair_x, pair_y, count, time = _onestep(
            seq_x, seq_y, order, verbose=True)

        # Increment ETC
        etc += 1

        # Compute estimates and append to aggregator
        output.append({
            "step": etc,
            "length": len(seq_x),
            "entropy_x": ce.entropy(seq_x),
            "entropy_y": ce.entropy(seq_y),
            "window_x": pair_x,
            "window_y": pair_y,
            "count": count,
            "time": time,
        })

    # Counter for the extra steps executed after the signal flag was raised
    n = 0

    # Truncated tail: only entered when the main loop stopped on the signal
    if signal and not cc.check_equality(seq_x, seq_y):

        # Execute and log at most 5 more steps
        while len(seq_x) >= order and n < 5:

            # Run one step of NSRWS in verbose mode (returns windows, count,
            # time)
            seq_x, seq_y, signal, pair_x, pair_y, count, time = _onestep(
                seq_x, seq_y, order, verbose=True)

            # Increment ETC
            etc += 1

            # Compute estimates and append to aggregator
            output.append({
                "step": etc,
                "length": len(seq_x),
                "entropy_x": ce.entropy(seq_x),
                "entropy_y": ce.entropy(seq_y),
                "window_x": pair_x,
                "window_y": pair_y,
                "count": count,
                "time": time,
            })
            n += 1

        # Add the remaining steps without executing them; both branches
        # together amount to ceil(len(seq_x) / (order - 1)) - 1
        # NOTE(review): this extrapolation is taken to belong to the signal
        # branch -- confirm the intended nesting
        # NOTE(review): order == 1 makes (order - 1) zero and raises
        # ZeroDivisionError here -- confirm callers never pass order < 2
        if len(seq_x) % (order - 1) == 0:
            etc += len(seq_x) // (order - 1) - 1
        else:
            etc += len(seq_x) // (order - 1)

    # Display ETC and return it with aggregator
    # print(f"ETC={etc}")
    return etc, output