Example #1
class HuffmanDecoder:
    # Assuming content comes as a huge string.
    def __init__(self, content):
        self.header = content[:4096]
        self.content = content[4096:]
        self.get_frequencies()
        self.tree = HuffmanTree(self.frequencies)

    def get_frequencies(self):
        gap = len(self.header) // 128
        cnts = [
            int(self.header[i:i + gap], 2)
            for i in range(0, len(self.header), gap)
        ]
        self.frequencies = {chr(x): cnts[x] for x in range(128) if cnts[x] > 0}

    def get_decoding(self):
        cur = ''
        decoded_list = []
        for c in self.content:
            cur += c
            if self.tree.isDecodable(cur):
                decoded_list.append(self.tree.getChar(cur))
                cur = ''

        return ''.join(decoded_list)
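Huffman codes are prefix-free, so the greedy loop in `get_decoding` is safe: the first code that matches `cur` is the only one that ever will. The `HuffmanTree` class itself is not part of this snippet; the following dict-backed stand-in is a minimal sketch of the interface the decoder relies on (the `isDecodable`/`getChar` names come from the snippet; the constructor here is an assumption made purely for illustration).

# Minimal stand-in for the HuffmanTree interface used by the decoder above.
# The code table is passed in directly; the real class derives it from the
# frequency header.
class DictHuffmanTree:
    def __init__(self, codes):
        # codes: char -> bit string, e.g. {'a': '0', 'b': '10', 'c': '11'}
        self._by_code = {code: char for char, code in codes.items()}

    def isDecodable(self, bits):
        return bits in self._by_code  # prefix-free: at most one match

    def getChar(self, bits):
        return self._by_code[bits]

With `DictHuffmanTree({'a': '0', 'b': '10', 'c': '11'})`, feeding the bit string '01011' through the decoding loop yields 'abc'.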
Example #2
def create_huffman_tree(occurrences):
    '''
    Return a Huffman tree of the symbols given in `occurrences`.
    
    :param occurrences: Number of occurrences of each symbol.
    :type occurrences: dict
    :return: Return a single Huffman tree (obtained with Huffman algorithm)\
    of the symbols in `occurrences`.
    :rtype: huffman_tree.HuffmanTree
    :Examples:
    
    >>> create_huffman_tree({'a': 4, 'b': 1, 'c': 2})
    |bca:7|_<|bc:3|_<|b:1|, |c:2|>, |a:4|>
    >>> create_huffman_tree({'a': 1, 'b': 1, 'c': 2})
    |cab:4|_<|c:2|, |ab:2|_<|a:1|, |b:1|>>
    '''
    symbol_list = create_forest(occurrences)
    tree_list = []

    while len(tree_list) + len(symbol_list) != 1:
        (elem1, elem2) = (pop_least_element(symbol_list, tree_list),
                          pop_least_element(symbol_list, tree_list))
        new_tree = HuffmanTree(left=elem1, right=elem2)
        tree_list.append(new_tree)

    if len(tree_list) == 1:
        return tree_list[0]
    return symbol_list[0]
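`create_forest` and `pop_least_element` are defined elsewhere in the module (variants of `create_forest` appear in Examples #9 and #10). A minimal sketch of `pop_least_element`, assuming both lists stay sorted in ascending weight order and that a tree exposes its total weight (the `.weight` attribute name is an assumption):

def pop_least_element(symbol_list, tree_list):
    # Pop the lighter of the two list heads. Because merged trees are
    # appended in non-decreasing weight order, both lists stay sorted,
    # so comparing the heads suffices: the classic two-queue trick that
    # avoids a heap.
    if not tree_list:
        return symbol_list.pop(0)
    if not symbol_list:
        return tree_list.pop(0)
    if symbol_list[0].weight <= tree_list[0].weight:
        return symbol_list.pop(0)
    return tree_list.pop(0)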
Example #3
    def train_model(self, text_list):
        if self.huffman is None:
            if self.word_dict is None:
                wc = WordCounter(text_list)
                self.__generate_word_dict(wc.count_res.larger_than(5))
                self.cutted_text_list = wc.text_list
            self.huffman = HuffmanTree(self.word_dict, vec_len=self.vec_len)
        print('word_dict and huffman tree ready')

        before = (self.win_len - 1) >> 1
        after = self.win_len - 1 - before

        if self.model == 'cbow':
            method = self.__deal_gram_cbow
        else:
            method = self.__deal_gram_skipgram

        if self.cutted_text_list:
            total = len(self.cutted_text_list)
            count = 0

            for line in self.cutted_text_list:
                line_len = len(line)
                for i in range(line_len):
                    method(line[i], line[max(0, i-before):i] +
                           line[i+1: min(line_len, i+after+1)])
        else:
            for line in text_list:
                line = list(jieba.cut(line, cut_all=False))
                line_len = len(line)
                for i in range(line_len):
                    method(line[i], line[max(0, i-before): i] +
                           line[i+1: min(line_len, i+after+1)])
        print('word vector has been generated')
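The `before`/`after` split above divides the remaining `win_len - 1` window slots around the center word (for an even `win_len` the extra slot goes to `after`). A standalone check of what the context slicing produces:

win_len = 5
before = (win_len - 1) >> 1   # 2 words to the left
after = win_len - 1 - before  # 2 words to the right

line = ['w0', 'w1', 'w2', 'w3', 'w4', 'w5']
i = 2  # center position
context = line[max(0, i - before):i] + line[i + 1:min(len(line), i + after + 1)]
print(context)  # ['w0', 'w1', 'w3', 'w4']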
Example #4
    def __init__(self, input_file_name, min_count):
        self.input_file_name = input_file_name
        self.input_file = open(self.input_file_name, encoding='utf-8')  # data file
        self.min_count = min_count  # words below this frequency are discarded
        self.wordId_frequency_dict = dict()  # word id -> occurrence count
        self.word_count = 0  # number of distinct words
        self.word_count_sum = 0  # total word count (repeats counted each time)
        self.id2word_dict = dict()  # word id -> word
        self.word2id_dict = dict()  # word -> word id
        self.word_sequence = list()
        self._init_dict()  # initialize the dictionaries
        self.huffman_tree = HuffmanTree(self.wordId_frequency_dict)  # Huffman tree
        self.huffman_pos_path, self.huffman_neg_path = self.huffman_tree.get_all_pos_and_neg_path()
        self.word_pairs_queue = deque()
        # report the results
        print('Word Count is:', self.word_count)
        print('Word Count Sum is', self.word_count_sum)
Example #5
    def generate_codes(self) -> Dict[int, Deque[bool]]:
        """
        Reads the whole input file and generates coding_table by the Huffman tree

        :return: coding table where key is byte and value is deque of bits
        """
        self.input_file.seek(0)
        freq_table = {}
        while True:
            input_buffer = self.input_file.read(INPUT_BUFFER_SIZE)
            if not input_buffer:
                break
            for byte in input_buffer:
                if byte in freq_table:
                    freq_table[byte] += 1
                else:
                    freq_table[byte] = 1
        tree = HuffmanTree(freq_table)
        return tree.generate_codes()
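The counting loop can also be written with `collections.Counter`; a standalone sketch of the same buffered pass (`buffer_size` stands in for the module's `INPUT_BUFFER_SIZE` constant):

from collections import Counter

def count_frequencies(input_file, buffer_size=64 * 1024):
    # Same buffered pass as above: Counter.update counts every item of
    # each chunk, so no explicit membership test is needed. For a file
    # opened in binary mode the keys are byte values (ints), matching
    # the loop above.
    input_file.seek(0)
    freq_table = Counter()
    while True:
        input_buffer = input_file.read(buffer_size)
        if not input_buffer:
            break
        freq_table.update(input_buffer)
    return dict(freq_table)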
Example #6
class HuffmanEncoder:
    def __init__(self, content_source, type='PATH'):
        self.get_content(content_source, type)
        self.build_frequencies()
        self.build_tree()
        self.encode()
        self.build_header()

    def get_content(self, content_source, type):
        if type == 'PATH':
            with open(content_source, 'r') as f:
                self.content = f.read()
        else:
            self.content = content_source

        # TODO: Figure out what to do with characters outside of 128 ASCII
        self.content = ''.join(
            filter(lambda x: ord(x) < 128, self.content))

    def build_frequencies(self):
        self.frequencies = {}
        for c in self.content:
            if c not in self.frequencies:
                self.frequencies[c] = 0
            self.frequencies[c] += 1

    def build_tree(self):
        self.tree = HuffmanTree(self.frequencies)

    def encode(self):
        enc_list = [self.tree.getCode(c) for c in self.content]
        self.encoding = ''.join(enc_list)

    def build_header(self):
        header_list = []
        for i in range(128):
            v = self.frequencies.get(chr(i), 0)
            to_append = format(v, '032b')
            header_list.append(to_append)

            if chr(i) in self.frequencies:
                assert int(to_append, 2) == self.frequencies[chr(i)]

        self.header = ''.join(header_list)

    # Prints the encoding as a binary string.
    def get_encoding(self, header=True):
        if header:
            return self.header + self.encoding
        return self.encoding
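A hypothetical round trip pairing this encoder with the `HuffmanDecoder` of Example #1, assuming both are importable and share the same `HuffmanTree` class: the 128 x 32-bit frequency header written here is exactly the 4096-bit prefix the decoder slices off.

encoder = HuffmanEncoder('hello huffman', type='RAW')  # any non-'PATH' type
bits = encoder.get_encoding()  # 4096 header bits followed by the payload
decoder = HuffmanDecoder(bits)
assert decoder.get_decoding() == 'hello huffman'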
Example #7
    def generate_codes(self) -> Dict[int, Deque[bool]]:
        """
        Reads the whole input file and generates coding_table by the Huffman tree

        :return: coding table where key is byte and value is deque of bits
        """
        self.input_file.seek(0)  # rewind to the first byte of the input file
        freq_table = {}  # empty frequency map that we fill in below
        while True:
            # read a chunk of the input file into an in-memory buffer
            input_buffer = self.input_file.read(INPUT_BUFFER_SIZE)
            if not input_buffer:  # nothing left to read - stop
                break
            for byte in input_buffer:  # walk over every byte of the chunk
                if byte in freq_table:  # already seen: increment its count
                    freq_table[byte] += 1
                else:  # first occurrence: start its count at 1
                    freq_table[byte] = 1
        # build the tree from the finished frequency table; internally it
        # constructs the tree and yields a code for every symbol
        tree = HuffmanTree(freq_table)
        return tree.generate_codes()  # return the generated codes
Example #8
def create_huffman_tree(occurrences):
    '''
    Return a Huffman tree of the symbols given in `occurrences`.
    
    :param occurrences: Number of occurrences of each symbol.
    :type occurrences: dict
    :return: Return a single Huffman tree (obtained with Huffman algorithm)\
    of the symbols in `occurrences`.
    :rtype: huffman_tree.HuffmanTree
    :Examples:
    
    >>> create_huffman_tree({'a': 1, 'b': 1, 'c': 2}) # doctest: +NORMALIZE_WHITESPACE
       /--▮ 'b':1
    /--◯ 'a, b':2
    |  \--▮ 'a':1
    ◯ 'c, a, b':4
    \--▮ 'c':2
    <BLANKLINE>
    >>> create_huffman_tree({'a': 4, 'b': 1, 'c': 2}) # doctest: +NORMALIZE_WHITESPACE
    /--▮ 'a':4
    ◯ 'b, c, a':7
    |  /--▮ 'c':2
    \--◯ 'b, c':3
       \--▮ 'b':1
    <BLANKLINE>
    >>> create_huffman_tree({97: 4, 98: 1, 99: 2}) # doctest: +NORMALIZE_WHITESPACE
    /--▮ 97:4
    ◯ '98, 99, 97':7
    |  /--▮ 99:2
    \--◯ '98, 99':3
       \--▮ 98:1
    <BLANKLINE>
    '''
    symbol_list = create_forest(occurrences)
    tree_list = []

    while len(tree_list) + len(symbol_list) != 1:
        (elem1, elem2) = (pop_least_element(symbol_list, tree_list),
                          pop_least_element(symbol_list, tree_list))
        new_tree = HuffmanTree(left=elem1, right=elem2)
        tree_list.append(new_tree)

    if len(tree_list) == 1:
        return tree_list[0]
    return symbol_list[0]
Example #9
def create_forest(occurrences):
    '''
    Create the initial list of Huffman trees based on the dictionary of
    symbols given in parameter.
    
    :param occurrences: Number of occurrences of each symbol.
    :type occurrences: dict
    :return: A list sorted in ascending order on the number of occurrences\
    and on the symbols of Huffman trees of all symbols provided in\
    `occurrences`.
    :Examples: 

    >>> create_forest({'a': 4, 'c': 2, 'b': 1})
    [|b:1|, |c:2|, |a:4|]
    >>> create_forest({'e': 1, 'f': 1, 'g': 1, 'h': 1, 'a':2})
    [|e:1|, |f:1|, |g:1|, |h:1|, |a:2|]
    '''
    sorted_occs = sorted(occurrences.items(), key=lambda item: (item[1], item[0]))
    forest = [HuffmanTree(leaf[0], leaf[1]) for leaf in sorted_occs]
    return forest
Example #10
def create_forest(occurrences):
    '''
    Create the initial list of Huffman trees based on the dictionary of
    symbols given in parameter.
    
    :param occurrences: Number of occurrences of each symbol.
    :type occurrences: dict
    :return: A list sorted in ascending order on the number of occurrences\
    and on the symbols of Huffman trees of all symbols provided in\
    `occurrences`.
    :Examples: 

    >>> create_forest({'a': 4, 'c': 2, 'b': 1})
    [|b:1|, |c:2|, |a:4|]
    >>> create_forest({'e': 1, 'f': 1, 'g': 1, 'h': 1, 'a':2})
    [|e:1|, |f:1|, |g:1|, |h:1|, |a:2|]
    '''
    # key=lambda item: (item[1], item[0]) sorts on the occurrence count first;
    # otherwise sorting would be lexicographic on the symbols (a, b, c, ..., z)
    sorted_occs = sorted(occurrences.items(),
                         key=lambda item: (item[1], item[0]))
    forest = [HuffmanTree(leaf[0], leaf[1]) for leaf in sorted_occs]
    return forest
Example #11
class InputData:
    def __init__(self, input_file_name, min_count):
        self.input_file_name = input_file_name
        self.input_file = open(self.input_file_name)  # data file
        self.index = 0
        self.min_count = min_count  # words below this frequency are discarded
        self.wordId_frequency_dict = dict()  # word id -> occurrence count
        self.word_count = 0  # number of distinct words
        self.word_count_sum = 0  # total word count (repeats counted each time)
        self.sentence_count = 0  # number of sentences
        self.id2word_dict = dict()  # word id -> word
        self.word2id_dict = dict()  # word -> word id
        self._init_dict()  # initialize the dictionaries
        self.get_wordId_list()
        self.huffman_tree = HuffmanTree(self.wordId_frequency_dict)  # Huffman tree
        self.huffman_pos_path, self.huffman_neg_path = self.huffman_tree.get_all_pos_and_neg_path()
        self.word_pairs_queue = deque()
        # report the results
        print('Word Count is:', self.word_count)
        print('Word Count Sum is', self.word_count_sum)
        print('Sentence Count is:', self.sentence_count)
        print('Tree Node is:', len(self.huffman_tree.huffman))

    def _init_dict(self):
        word_freq = dict()
        # count word frequencies
        for line in self.input_file:
            line = line.strip().split(' ')  # strip surrounding whitespace and split
            self.word_count_sum += len(line)
            self.sentence_count += 1
            for word in line:
                try:
                    word_freq[word] += 1
                except KeyError:
                    word_freq[word] = 1
        word_id = 0
        # initialize word2id_dict, id2word_dict and wordId_frequency_dict
        for per_word, per_count in word_freq.items():
            if per_count < self.min_count:  # drop low-frequency words
                self.word_count_sum -= per_count
                continue
            self.id2word_dict[word_id] = per_word
            self.word2id_dict[per_word] = word_id
            self.wordId_frequency_dict[word_id] = per_count
            word_id += 1
        self.word_count = len(self.word2id_dict)

    # Fetch a mini-batch of positive sample pairs (Xw, w): Xw is the array of
    # context word ids and w is the target word id. The context reaches
    # window_size words on each side, i.e. 2c = 2 * window_size.
    def get_wordId_list(self):
        self.input_file = open(self.input_file_name, encoding="utf-8")
        sentence = self.input_file.readline()
        wordId_list = []  # ids of every word in the sentence
        sentence = sentence.strip().split(' ')
        for i, word in enumerate(sentence):
            if i % 1000000 == 0:
                print(i, len(sentence))
            try:
                word_id = self.word2id_dict[word]
                wordId_list.append(word_id)
            except KeyError:  # word was pruned as low-frequency
                continue
        self.wordId_list = wordId_list

    def get_batch_pairs(self, batch_size, window_size):
        while len(self.word_pairs_queue) < batch_size:
            for _ in range(1000):
                if self.index == len(self.wordId_list):
                    self.index = 0
                wordId_w = self.wordId_list[self.index]
                context_ids = []
                for i in range(max(self.index - window_size, 0),
                               min(self.index + window_size + 1, len(self.wordId_list))):

                    if self.index == i:  # skip when the context position is the center word
                        continue
                    context_ids.append(self.wordId_list[i])
                self.word_pairs_queue.append((context_ids, wordId_w))
                self.index += 1
        result_pairs = []  # collect batch_size positive sample pairs
        for _ in range(batch_size):
            result_pairs.append(self.word_pairs_queue.popleft())
        return result_pairs

    def get_pairs(self, pos_pairs):
        neg_word_pair = []
        pos_word_pair = []
        for pair in pos_pairs:
            pos_word_pair += zip([pair[0]] * len(self.huffman_pos_path[pair[1]]), self.huffman_pos_path[pair[1]])
            neg_word_pair += zip([pair[0]] * len(self.huffman_neg_path[pair[1]]), self.huffman_neg_path[pair[1]])
        return pos_word_pair, neg_word_pair

    # Estimate the number of positive sample pairs in the data, used to size batches
    def evaluate_pairs_count(self, window_size):
        return (self.word_count_sum * (2 * window_size - 1)
                - (self.sentence_count - 1) * (1 + window_size) * window_size)
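What the window scan in get_batch_pairs yields for a toy id sequence, shown standalone with window_size = 2 (each pair is (context ids, center id)):

wordId_list = [0, 1, 2, 3]
window_size = 2
for index in range(len(wordId_list)):
    context_ids = [wordId_list[i]
                   for i in range(max(index - window_size, 0),
                                  min(index + window_size + 1, len(wordId_list)))
                   if i != index]
    print(context_ids, wordId_list[index])
# [1, 2] 0
# [0, 2, 3] 1
# [0, 1, 3] 2
# [1, 2] 3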
Example #12
    def __init__(self, content):
        self.header = content[:4096]
        self.content = content[4096:]
        self.get_frequencies()
        self.tree = HuffmanTree(self.frequencies)
Example #13
def construct_probability(input_string):
    # Count character frequencies (the name is kept from the call site
    # below, though the function returns raw counts).
    freq = {}
    for char in input_string:
        if char not in freq:
            freq[char] = 1
        else:
            freq[char] += 1
    return freq


with open("input.txt", "r") as input_file:
    input_string = input_file.read()
probability = construct_probability(input_string)
encoding_array = {}
if len(probability) == 1:
    encoding_array.update({list(probability)[0]: '0'})
else:
    tree = HuffmanTree()
    tree.get_nodes_heap(probability)
    tree.construct_tree()
    encoding_array = tree.get_codes()

# Encode
output_string = ""
for char in input_string:
    output_string += encoding_array[char]
with open('binary_output.bin', 'w') as output:
    output.write(output_string)
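Note that the file written above stores one ASCII '0'/'1' character per bit, so it is eight times larger than a true binary encoding. A minimal packing sketch (the pad length would also have to be stored somewhere for decoding; that bookkeeping is omitted here, and 'packed_output.bin' is just an illustrative filename):

def pack_bits(bit_string):
    # Pad to a whole number of bytes, then turn each 8-bit slice into a byte.
    padding = (8 - len(bit_string) % 8) % 8
    bit_string += '0' * padding
    return bytes(int(bit_string[i:i + 8], 2)
                 for i in range(0, len(bit_string), 8))

with open('packed_output.bin', 'wb') as packed_output:
    packed_output.write(pack_bits(output_string))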

# Decode
decoder = {}
for key in encoding_array:
    decoder[encoding_array[key]] = key  # invert the code table for decoding
Example #14
    def build_tree(self):
        self.tree = HuffmanTree(self.frequencies)
Example #15
class InputData:
    def __init__(self, input_file_name, min_count):
        self.input_file_name = input_file_name
        self.input_file = open(self.input_file_name, encoding='utf-8')  # data file
        self.min_count = min_count  # words below this frequency are discarded
        self.wordId_frequency_dict = dict()  # word id -> occurrence count
        self.word_count = 0  # number of distinct words
        self.word_count_sum = 0  # total word count (repeats counted each time)
        self.id2word_dict = dict()  # word id -> word
        self.word2id_dict = dict()  # word -> word id
        self.word_sequence = list()
        self._init_dict()  # initialize the dictionaries
        self.huffman_tree = HuffmanTree(self.wordId_frequency_dict)  # Huffman tree
        self.huffman_pos_path, self.huffman_neg_path = self.huffman_tree.get_all_pos_and_neg_path()
        self.word_pairs_queue = deque()
        # report the results
        print('Word Count is:', self.word_count)
        print('Word Count Sum is', self.word_count_sum)
        #print('Tree Node is:', len(self.huffman_tree.huffman))

    def _init_dict(self):
        word_freq = dict()
        # count word frequencies
        for line in self.input_file:
            line = " ".join(line.split('\n'))
            line = re.sub(",", "", line)
            line = re.split(r'\.', line)
            line = " ".join(line).split()
            self.word_count_sum += len(line)
            self.word_sequence += line
            for word in line:
                try:
                    word_freq[word] += 1
                except KeyError:
                    word_freq[word] = 1
        word_id = 0
        # initialize word2id_dict, id2word_dict and wordId_frequency_dict
        for per_word, per_count in word_freq.items():
            if per_count < self.min_count:  # drop low-frequency words
                self.word_count_sum -= per_count
                continue
            self.id2word_dict[word_id] = per_word
            self.word2id_dict[per_word] = word_id
            self.wordId_frequency_dict[word_id] = per_count
            word_id += 1
        self.word_count = len(self.word2id_dict)

    # Fetch a mini-batch of positive sample pairs (Xw, w): Xw is the array of
    # context word ids and w is the target word id. The context reaches
    # window_size words on each side, i.e. 2c = 2 * window_size.
    def get_batch_pairs(self, batch_size, window_size):

        wordId_list = []  # ids of every word in the sequence
        for word in self.word_sequence:
            try:
                word_id = self.word2id_dict[word]
                wordId_list.append(word_id)
            except KeyError:  # word was pruned as low-frequency
                continue
        # collect positive pairs (context(w), w) into the sample queue
        for i, wordId_w in enumerate(wordId_list):
            context_ids = []
            for j, wordId_u in enumerate(
                    wordId_list[max(i - window_size, 0):i + window_size + 1]):
                j += max(i - window_size, 0)
                assert wordId_w < self.word_count
                assert wordId_u < self.word_count
                if i == j:  # skip when the context position is the center word
                    continue
                elif max(0, i - window_size + 1) <= j <= min(
                        len(wordId_list), i + window_size - 1):
                    context_ids.append(wordId_u)
            if len(context_ids) == 0:
                continue
            self.word_pairs_queue.append((context_ids, wordId_w))
        result_pairs = []  # collect batch_size positive sample pairs
        for _ in range(batch_size):
            a = random.choice(self.word_pairs_queue)
            result_pairs.append(a)
        return result_pairs

    def get_pairs(self, pos_pairs):
        neg_word_pair = []
        pos_word_pair = []
        for pair in pos_pairs:
            pos_word_pair += zip([pair[0]] *
                                 len(self.huffman_pos_path[pair[1]]),
                                 self.huffman_pos_path[pair[1]])
            neg_word_pair += zip([pair[0]] *
                                 len(self.huffman_neg_path[pair[1]]),
                                 self.huffman_neg_path[pair[1]])
        return pos_word_pair, neg_word_pair
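How get_pairs fans one positive sample out along the Huffman paths, shown standalone with hand-made toy paths (assuming get_all_pos_and_neg_path returns, per word id, the ids of the inner nodes whose code bit is 1 and 0 respectively; the concrete ids here are made up for illustration):

huffman_pos_path = {7: [0, 3]}  # inner nodes where the code bit is 1
huffman_neg_path = {7: [1]}     # inner nodes where the code bit is 0
pair = ([2, 5], 7)              # (context ids, center word id 7)

pos_word_pair = list(zip([pair[0]] * len(huffman_pos_path[pair[1]]),
                         huffman_pos_path[pair[1]]))
neg_word_pair = list(zip([pair[0]] * len(huffman_neg_path[pair[1]]),
                         huffman_neg_path[pair[1]]))
print(pos_word_pair)  # [([2, 5], 0), ([2, 5], 3)]
print(neg_word_pair)  # [([2, 5], 1)]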