Example #2
    def _HuffmanCode(self, words):
        # Tally every character of every word, plus a '\n' terminator per
        # word, then compile a single character-level Huffman code.
        charHuff = HuffmanEncoder()
        for w in words:
            for c in w:
                charHuff.Tally(c)
            charHuff.Tally('\n')
        return charHuff.Compile()
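The snippets on this page assume a HuffmanEncoder with a Tally/Compile interface that is never shown. Below is a minimal self-contained sketch of such a class, assuming Compile() returns a {symbol: bitstring} table; the heap-based construction and the name TallyHuffman are illustrations, not the original implementation.

import heapq
from itertools import count

class TallyHuffman:
    """Minimal sketch of a Tally/Compile-style Huffman encoder (assumed API)."""

    def __init__(self):
        self.counts = {}

    def Tally(self, symbol):
        # Record one more occurrence of `symbol`.
        self.counts[symbol] = self.counts.get(symbol, 0) + 1

    def Compile(self):
        # Build a Huffman code and return it as {symbol: bitstring}.
        # A single-symbol alphabet gets the empty code ''.
        tie = count()  # tiebreaker keeps equal-weight heap entries comparable
        heap = [(n, next(tie), {s: ''}) for s, n in self.counts.items()]
        if not heap:
            return {}
        heapq.heapify(heap)
        while len(heap) > 1:
            w0, _, c0 = heapq.heappop(heap)  # lightest subtree gets prefix '0'
            w1, _, c1 = heapq.heappop(heap)  # next lightest gets prefix '1'
            merged = {s: '0' + code for s, code in c0.items()}
            merged.update({s: '1' + code for s, code in c1.items()})
            heapq.heappush(heap, (w0 + w1, next(tie), merged))
        return heap[0][2]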
Example #3
    def _HuffmanCodes(self, words):
        repeatHuff = HuffmanEncoder()
        tailHuff = HuffmanEncoder()
        charHuffTable = [HuffmanEncoder() for _ in range(27)]
        pw = ''
        for w in words:
            prefix = self._LettersInCommon(pw, w)
            repeatHuff.Tally(prefix)
            tailHuff.Tally(len(w) - prefix)
            if prefix == 0:
                # Use index 0 to represent that there is no previous character.
                i = 0
            else:
                # We take advantage of the fact that the backquote character '`'
                # comes right before 'a' in the ASCII table.
                i = ord(w[prefix - 1]) - ord('`')

            for c in w[prefix:]:
                charHuffTable[i].Tally(c)
                i = ord(c) - ord('`')

            pw = w

        charRootTable = [h.Compile() for h in charHuffTable]
        return repeatHuff.Compile(), tailHuff.Compile(), charRootTable
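Both _HuffmanCodes variants on this page rely on a _LettersInCommon helper that is not included; it presumably returns the length of the common prefix of two words. A plausible sketch under that assumption:

    def _LettersInCommon(self, a, b):
        # Length of the longest common prefix of a and b (assumed behavior).
        n = 0
        for x, y in zip(a, b):
            if x != y:
                break
            n += 1
        return n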
Example #5
    def test_something_2(self):
        frequencies = {'C': 2,
                       'B': 6,
                       'E': 7,
                       '_': 10,
                       'D': 10,
                       'A': 11}

        huffman_encoder = HuffmanEncoder(frequencies)
        huffman_encoding, huffman_decoding = huffman_encoder.generate_coding()

        print(huffman_encoding)
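test_something_2 only prints the code; the exact bitstrings depend on how ties are broken during tree construction, which is probably why there is no assertion. The code lengths, however, are fully determined by these frequencies (A, D and _ get 2 bits, E gets 3, B and C get 4), so a hedged extension of the test could assert lengths instead:

        expected_lengths = {'A': 2, 'D': 2, '_': 2, 'E': 3, 'B': 4, 'C': 4}
        actual_lengths = {s: len(code) for s, code in huffman_encoding.items()}
        self.assertEqual(expected_lengths, actual_lengths)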
Example #6
class encode_thread(QThread):  # worker thread class
    update_progress = pyqtSignal(int)  # reports current progress to the main thread
    finished = pyqtSignal()  # tells the main thread the task is done

    # Parameters: inputfile: path of the file to load; outputfile: path where
    # the processed file is saved; ena_decode: set True to decode instead of encode.
    def __init__(self, inputfile='.', outputfile='.', ena_decode=False):
        super(encode_thread, self).__init__()
        self.inputfile = inputfile
        self.outputfile = outputfile
        self.ena_decode = ena_decode

    def run(self):  # only this method executes on the worker thread
        self.encoder = HuffmanEncoder()
        # Forward the encoder's progress signal on to the main thread.
        self.encoder.progress.connect(lambda x: self.update_progress.emit(x))
        if self.ena_decode:
            self.encoder.decodefile(inputfile=self.inputfile,
                                    outputfile=self.outputfile)
        else:
            self.encoder.encodefile(inputfile=self.inputfile,
                                    outputfile=self.outputfile)
        self.finished.emit()
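A minimal sketch of driving this worker from a GUI, assuming PyQt5 and this project's HuffmanEncoder module are importable; the file names and the bare progress bar are illustrative:

import sys
from PyQt5.QtWidgets import QApplication, QProgressBar

app = QApplication(sys.argv)
bar = QProgressBar()
bar.show()

thread = encode_thread(inputfile='data.txt', outputfile='data.huff')
thread.update_progress.connect(bar.setValue)  # progress signal -> progress bar
thread.finished.connect(app.quit)             # quit once the file is written
thread.start()  # QThread.start() runs encode_thread.run() on the worker thread
sys.exit(app.exec_())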
Example #7
File: configuration.py  Project: ekiwi/awap
def generate_huffman_code(env, symbols, symbol_files):
    from huffman import HuffmanCode, HuffmanEncoder
    hc = HuffmanCode()
    for symbol in symbols:
        hc.add_symbol(symbol)
    # this is somewhat inefficient, as files are loaded every time;
    # one could move this into a scons builder, but I really need to
    # get done with my thesis
    for src in symbol_files:
        hc.count_symbols_in_file(src)
    hc.generate()
    env['AWAP_HUFFMAN_ENCODER'] = HuffmanEncoder(hc)
    env['AWAP_AGENT_FORMAT_COMPRESSSION_CODE'] = hc.to_dict()
Example #8
    def test_something(self):
        frequencies = {'A': 10,
                       'E': 15,
                       'I': 12,
                       'S': 3,
                       'T': 4,
                       'P': 13,
                       '\n': 1}

        expected_encoding = {'I': '00',
                             'P': '01',
                             'E': '10',
                             'A': '110',
                             'T': '1110',
                             '\n': '11110',
                             'S': '11111'}

        huffman_encoder = HuffmanEncoder(frequencies)
        huffman_encoding, huffman_decoding = huffman_encoder.generate_coding()

        print(huffman_encoding)

        self.assertEqual(expected_encoding, huffman_encoding)
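Assuming generate_coding's second return value maps bitstrings back to symbols, a round-trip fragment that could be appended to this test; the prefix property of Huffman codes guarantees the greedy scan below is unambiguous:

        message = 'PIE'
        bits = ''.join(huffman_encoding[s] for s in message)  # '01' '00' '10'
        decoded, buf = [], ''
        for bit in bits:
            buf += bit
            if buf in huffman_decoding:  # first match is the symbol
                decoded.append(huffman_decoding[buf])
                buf = ''
        self.assertEqual(message, ''.join(decoded))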
Example #9
class HierarchicalSoftmaxCBOW(object):
    def __init__(self, sentences, context=1, hidden=5, concat=False):
        logging.info(msg="starting CBOW training..")
        self.context = context
        self.encoder = Encoder(sentences=sentences)
        self.huffman_encoder = HuffmanEncoder(self.encoder.counter)
        self.encoding_length = self.encoder.encoding_length
        self.hidden_units = hidden
        self.output_units = 1
        self.input_units = context if concat else 1
        self.input2hidden = np.random.rand(
            self.hidden_units, self.input_units * self.encoding_length) * 0.1
        self.hidden2output = np.random.rand(
            self.output_units * self.encoding_length - 1,
            self.hidden_units) * 0.1

        # train model
        word_count = 0
        last_time = time.time()
        for sentence in sentences:
            context_pairs = sentence2contexts(sentence, self.context)
            for w, c in context_pairs:
                self._train(w, c)
                word_count += 1
                if word_count % 100 == 0:
                    now = time.time()
                    words_per_sec = 100.0 / (now - last_time)
                    logging.info(msg="trained on %s words. %s words/sec" %
                                 (word_count, words_per_sec))
                    last_time = time.time()

    def _train(self, word, context):
        onehot_context = [self.encoder.word2onehot(w) for w in context]
        onehot_word = self.encoder.word2onehot(word)
        # sum the one-hot context vectors into a single bag-of-words input
        x = np.zeros_like(onehot_word)
        for c in onehot_context:
            x += c
        # forward pass: hidden layer is the averaged projection of the context
        h = (1.0 / self.context) * self.input2hidden * x
        # probability of target word
        node_ids = self.huffman_encoder.get_internal_node_ids(word)
        huffman_code = self.huffman_encoder.get_code(word)
        indicator_vec = np.matrix(
            [1.0 if e == "1" else 0.0 for e in huffman_code])
        alpha = 0.1  # learning rate
        # dE/dw'h
        dEdw_prime_h = np.empty_like(indicator_vec)
        # dE/dh
        dEdh = np.zeros_like(h)
        for j, idx in enumerate(node_ids):
            # dE/d(w'h) = sigmoid(w'^T h) - t_j; `logit` is assumed to be the
            # logistic sigmoid here, not its inverse
            dEdw_prime_h[:, j] = (logit(self.hidden2output[idx].T * h) -
                                  indicator_vec[:, j])
            # (equation 52 - 54)
            dEdh_component = np.multiply(dEdw_prime_h[:, j],
                                         self.hidden2output[idx]).T
            dEdh = dEdh + dEdh_component

            # update w_j_prime (Equation 51)
            self.hidden2output[idx] = self.hidden2output[idx] - np.asarray(
                alpha * dEdw_prime_h[:, j] * h.T)

        dEdw = dEdh * x.T

        # update W (the rows of W' were already updated inside the loop above)
        self.input2hidden -= 1.0 / self.context * alpha * dEdw

    def predict(self, context):
        onehot_context = [self.encoder.word2onehot(w) for w in context]
        x = np.zeros_like(onehot_context[0])
        for c in onehot_context:
            x += c
        _, y = self._forward_pass(x)
        return y

    def __getitem__(self, word):
        onehot_word = self.encoder.word2onehot(word)
        return self.input2hidden * onehot_word
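The predict method calls a _forward_pass helper that did not make it into the snippet. Under hierarchical softmax, the probability of a word is the product of sigmoid branch decisions along its Huffman path; here is a standalone sketch of that computation, with hypothetical names and the branch convention assumed from the indicator vector in _train:

import numpy as np

def sigmoid(z):
    # logistic function; the `logit` call in _train is assumed to be this
    return 1.0 / (1.0 + np.exp(-z))

def huffman_word_probability(h, hidden2output, node_ids, code):
    # h: hidden vector; node_ids: internal tree nodes on the word's path;
    # code: the word's Huffman code as a string of '0'/'1'.
    p = 1.0
    for idx, bit in zip(node_ids, code):
        s = sigmoid((hidden2output[idx] @ h).item())  # P(take the '1' branch)
        p *= s if bit == '1' else 1.0 - s
    return p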
Example #10
    def _HuffmanCodes(self, words):
        # Three separate codes: one for shared-prefix lengths, one for tail
        # lengths, and one for the characters of each word's new tail.
        repeatHuff = HuffmanEncoder()
        tailHuff = HuffmanEncoder()
        charHuff = HuffmanEncoder()
        pw = ''
        for w in words:
            prefix = self._LettersInCommon(pw, w)
            repeatHuff.Tally(prefix)
            tailHuff.Tally(len(w) - prefix)
            for c in w[prefix:]:
                charHuff.Tally(c)
            pw = w
        return repeatHuff.Compile(), tailHuff.Compile(), charHuff.Compile()
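The shared-prefix trick here is classic front coding over a sorted word list: only each word's new tail needs per-character codes. A tiny self-contained illustration of what gets tallied:

def front_code(words):
    # Yield (shared_prefix_len, new_tail) for each word vs. its predecessor.
    pw = ''
    for w in words:
        n = 0
        while n < min(len(pw), len(w)) and pw[n] == w[n]:
            n += 1
        yield n, w[n:]
        pw = w

print(list(front_code(['banana', 'band', 'bandit'])))
# [(0, 'banana'), (3, 'd'), (4, 'it')]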
Example #11
def compress(input_file_path, output_file_path):
    """ Compress with huffman encoding """

    encoder = HuffmanEncoder(input_file_path, output_file_path)
    encoder.compress()
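For completeness, a hypothetical command-line wrapper around compress(); the script name and paths are illustrative:

if __name__ == '__main__':
    import sys
    if len(sys.argv) != 3:
        sys.exit('usage: compress.py <input_file> <output_file>')
    compress(sys.argv[1], sys.argv[2])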