def test_get_codes(self, d):
    """The total encoding cost must not depend on dictionary ordering.

    The sum of ``len(code) * frequency`` over all symbols is optimal for a
    Huffman code, so rebuilding the tree from a shuffled copy of ``d`` has
    to yield exactly the same total.
    """
    # NB: this also tests huffman_tree indirectly

    def weighted_length(freqs, codes):
        # Frequency-weighted sum of the code lengths.
        return sum(freqs[symbol] * len(codes[symbol]) for symbol in freqs)

    original_codes = get_codes(huffman_tree(d))

    # Rebuild the same frequencies with a randomly permuted insertion order.
    pairs = list(d.items())
    shuffle(pairs)
    permuted = dict(pairs)
    permuted_codes = get_codes(huffman_tree(permuted))

    self.assertEqual(weighted_length(d, original_codes),
                     weighted_length(permuted, permuted_codes))
def recover_bits(self, token_inds, remaining_bits):
    """Recover the bits hidden in a sequence of token indices.

    The language model is replayed from the SOS index. At every step a
    Huffman tree is built for the next-token distribution; when its total
    variation is below ``self.tv_threshold`` the step is treated as
    controlled and the Huffman code of the observed token is read out as
    bits. Other steps only advance the model.

    Args:
        token_inds: token indices to decode. Entries are removed from the
            front with ``pop(0)``, so the caller's list is consumed.
        remaining_bits: number of bits still to extract; bits beyond this
            count (trailing padding) are discarded.

    Returns:
        The recovered bits as a list of 0/1 integers.
    """
    history = [self.lm.SOS_ind]
    p = self.lm.p_next_token(history)
    recovered = []

    # Stop once every index has been consumed or every bit extracted.
    while len(token_inds) > 0 and remaining_bits > 0:
        # Build Huffman codes for the conditional distribution.
        hc = huffman_tree(build_min_heap(p))

        if tv_huffman(hc, p)[0] < self.tv_threshold:
            # Controlled step: the token's Huffman code carries hidden bits.
            code = invert_code_tree(hc)
            token = token_inds.pop(0)
            # left => 0, right => 1
            fragment = [0 if step == 'l' else 1 for step in code[token]]
            # Truncate possible trailing padding.
            recovered.extend(fragment[:remaining_bits])
            remaining_bits -= len(fragment)
        else:
            # Uncontrolled step: nothing is hidden here, just move on.
            token = token_inds.pop(0)

        history.append(token)
        p = self.lm.p_next_token(history)

    return recovered
def test_avg_length(self, d):
    """avg_length should return a float in the interval [0, 8]."""
    tree = huffman_tree(d)
    result = avg_length(tree, d)
    # Dedicated assertions report the offending value when they fail,
    # which a bare assertTrue(<expression>) cannot do.
    self.assertIsInstance(result, float)
    self.assertGreaterEqual(result, 0)
    self.assertLessEqual(result, 8.0)
def test_round_trip(self, b):
    """Compressing and then uncompressing must reproduce the input.

    Exercises generate_compressed and generate_uncompressed as inverses
    of each other.
    """
    freq = make_freq_dict(b)
    # Only inputs whose frequency dictionary has more than one entry are
    # exercised.
    assume(len(freq) > 1)
    tree = huffman_tree(freq)
    codes = get_codes(tree)
    compressed = generate_compressed(b, codes)
    uncompressed = generate_uncompressed(tree, compressed, len(b))
    # assertEqual still runs under ``python -O`` (a bare assert would be
    # stripped) and reports both values on mismatch.
    self.assertEqual(b, uncompressed)
def test_round_trip(self, b):
    """Compressing and then uncompressing must reproduce the input.

    Exercises generate_compressed and generate_uncompressed as inverses
    of each other.
    """
    freq = make_freq_dict(b)
    # Only inputs whose frequency dictionary has more than one entry are
    # exercised.
    assume(len(freq) > 1)
    tree = huffman_tree(freq)
    codes = get_codes(tree)
    compressed = generate_compressed(b, codes)
    uncompressed = generate_uncompressed(tree, compressed, len(b))
    # assertEqual prints the original and the round-tripped value when they
    # differ, so no hand-built failure message is needed, and unlike a bare
    # assert it is not stripped under ``python -O``.
    self.assertEqual(b, uncompressed)
def test_number_nodes(self, d):
    """An interior root must be numbered two less than the symbol count.

    A complete tree has one fewer interior node than it has leaves, and
    numbering starts from 0, hence ``root.number == len(d) - 2``.
    """
    # NB: this also tests huffman_tree indirectly
    root = huffman_tree(d)
    # The property only makes sense when the root is an interior node.
    assume(not root.is_leaf())
    symbol_count = len(d)
    number_nodes(root)
    self.assertEqual(symbol_count, root.number + 2)
def test_num_nodes_to_bytes(self, b):
    """num_nodes_to_bytes returns a bytes object of length 1.

    One byte is enough since the number of internal nodes cannot exceed
    256.
    """
    # NB: also indirectly tests make_freq_dict and huffman_tree
    freq = make_freq_dict(b)
    assume(len(freq) > 1)
    tree = huffman_tree(freq)
    number_nodes(tree)
    encoded = num_nodes_to_bytes(tree)
    # assertIsInstance names the actual type in its failure message.
    self.assertIsInstance(encoded, bytes)
    self.assertEqual(len(encoded), 1)
def test_generate_compressed(self, b):
    """generate_compressed returns bytes no longer than its input.

    The size of the compressed object must also be invariant under
    permuting the input bytes.
    """
    # NB: this also indirectly tests make_freq_dict, huffman_tree,
    # and get_codes

    def compress(data):
        # Full pipeline: frequencies -> tree -> codes -> compressed bytes.
        freq = make_freq_dict(data)
        tree = huffman_tree(freq)
        codes = get_codes(tree)
        return generate_compressed(data, codes)

    compressed = compress(b)
    # Dedicated assertions show the offending values on failure.
    self.assertIsInstance(compressed, bytes)
    self.assertLessEqual(len(compressed), len(b))

    # Same multiset of bytes in a random order.
    symbols = list(b)
    shuffle(symbols)
    permuted = bytes(symbols)

    self.assertEqual(len(compress(permuted)), len(compressed))
def test_tree_to_bytes(self, b):
    """tree_to_bytes generates a bytes representation of a post-order
    traversal of a tree's internal nodes.

    Each internal node requires 4 bytes, and there is one fewer internal
    node than there are distinct symbols, so the output must be
    ``4 * (len(freq) - 1)`` bytes long.
    """
    # NB: also indirectly tests make_freq_dict, huffman_tree, and
    # number_nodes
    freq = make_freq_dict(b)
    assume(len(freq) > 1)
    root = huffman_tree(freq)
    number_nodes(root)
    serialized = tree_to_bytes(root)
    # One leaf per entry of the frequency dictionary.
    expected_length = 4 * (len(freq) - 1)
    self.assertEqual(expected_length, len(serialized))
def embed_bits(self, coin_flips):
    '''Generate token indices from the language model, steering the
    choice of tokens with a sequence of coin flips.

    While coin flips remain and the Huffman approximation of the
    next-token distribution is close enough (total variation below
    ``self.tv_threshold``), the flips are Huffman-decoded into the next
    token. Otherwise the token is sampled from the language model as
    usual. Generation stops after the EOS token is produced or once
    ``self.max_sequence_length`` is reached.

    ``coin_flips`` is consumed from the front with ``pop(0)``.

    Returns _a sequence_ as defined by the language model, e.g. sentence,
    paragraph: the generated indices without the leading SOS index.'''
    ind = self.lm.SOS_ind
    prefix = [ind]
    p = self.lm.p_next_token(prefix)

    # Always generate at least one token, then continue until EOS or the
    # length limit.
    while len(prefix) == 1 or (
            len(prefix) < self.max_sequence_length
            and ind != self.lm.EOS_ind):
        use_huffman = False
        if len(coin_flips) > 0:
            # There is still some cipher text to hide: build Huffman codes
            # for the conditional distribution and check that the total
            # variation is low enough.
            hc = huffman_tree(build_min_heap(p))
            use_huffman = tv_huffman(hc, p)[0] < self.tv_threshold

        if use_huffman:
            # Huffman-decode the cipher text into a token, consuming flips
            # until the walk settles in a leaf. Interior nodes are exactly
            # the objects whose type is tuple.
            node = hc
            while type(node) is tuple:
                left, right = node
                try:
                    bit = coin_flips.pop(0)
                except IndexError:
                    # No more cipher text. Pad with random bits.
                    bit = self.random.choice(2)
                # 0 => left, 1 => right
                node = left if bit == 0 else right
            ind = node
        else:
            # Forward sample according to the LM normally.
            ind = self.random.choice(self.lm.vocabulary_size, p=p)

        prefix.append(ind)
        p = self.lm.p_next_token(prefix)

    # Drop the leading SOS index.
    return prefix[1:]
def test_huffman_tree(self, d):
    """huffman_tree returns a non-leaf HuffmanNode."""
    tree = huffman_tree(d)
    # Dedicated assertions give a descriptive message on failure.
    self.assertIsInstance(tree, HuffmanNode)
    self.assertFalse(tree.is_leaf())
# Easily create a text with the desired distribution by adding as many items
# to an array as the frequency of the item.
freqtable = []
for symbol, f in freq:
    freqtable.extend([symbol] * f)

# Draw a random symbol from the freq table 50000 times; over many iterations
# this converges to the distribution given by the frequency table. The draws
# are joined once instead of growing the string with += on every iteration.
text = "".join(rand.choice(freqtable) for _ in range(50000))

# Generate the huffman tree and a corresponding translation table.
tree = huff.huffman_tree(freq)
codes = huff.huffman_codes(tree)

# Each print call takes a single pre-formatted string, so the output is the
# same under the Python 2 print statement and the Python 3 print function.
print("symbol bits:")
for s, bits in codes.items():
    print("\t%s \t%s" % (s, bits))

# Entropy and mean bits per symbol:
ent = entropy(freq)
mbs = mean_bits(freq, codes)

print("entropy  %s" % (ent,))
print("mean bits %s" % (mbs,))

# Compression ratio:
cr = CHARSIZE / mbs
# Disabled experiment: sorted() by key test
# l = [[2, 3], [6, 7], [3, 34], [24, 64], [1, 43]]
# l = sorted(l, key=getkey)
# print(l)

# Disabled experiment: test using multiple arguments in a for loop
# d = make_freq_dict(bytes([65, 66, 67, 66]))
# for k, v in d.items():
#     print(k, 'corresponds to', v)

# Testing get_codes() on a small frequency table.
freq = {2: 6, 3: 4, 4: 13, 5: 17}
t = huffman_tree(freq)
d = get_codes(t)
print(t)
print(d)

# get_codes() on a default-constructed node.
# NOTE(review): freqs is not referenced by any statement visible here.
freqs = {'a': 2}
d = HuffmanNode()
print(d)
print(get_codes(d))

print(bytes([65, 66, 67, 66]))

# Hand-built tree: a root with two interior children, each holding two
# leaves.
left = HuffmanNode(None, HuffmanNode(3), HuffmanNode(2))
right = HuffmanNode(None, HuffmanNode(9), HuffmanNode(10))
tree = HuffmanNode(None, left, right)