def __init__(self, sentences, context=1, hidden=5, concat=False):
    """Build and immediately train a CBOW model with hierarchical softmax.

    sentences: iterable of tokenised sentences; used both to build the
        vocabulary encoder and, in a second pass, as the training corpus.
    context: number of context words taken around each target word.
    hidden: width of the hidden (embedding) layer.
    concat: if True the input layer is sized to hold the context one-hot
        vectors side by side instead of summed.
    """
    logging.info(msg="starting CBOW training..")
    self.context = context
    self.encoder = Encoder(sentences=sentences)
    self.huffman_encoder = HuffmanEncoder(self.encoder.counter)
    self.encoding_length = self.encoder.encoding_length
    self.hidden_units = hidden
    self.output_units = 1
    self.input_units = context if concat else 1
    # Both weight matrices start as small uniform values in [0, 0.1).
    self.input2hidden = 0.1 * np.random.rand(
        self.hidden_units, self.input_units * self.encoding_length)
    self.hidden2output = 0.1 * np.random.rand(
        self.output_units * self.encoding_length - 1, self.hidden_units)

    # Single pass over the corpus, logging throughput every 100 words.
    trained = 0
    tick = time.time()
    for sentence in sentences:
        for target, ctx in sentence2contexts(sentence, self.context):
            self._train(target, ctx)
            trained += 1
            if trained % 100 == 0:
                now = time.time()
                rate = 1.0 / (now - tick) * 100
                logging.info(msg="trained on %s words. %s words/sec"
                             % (trained, rate))
                tick = time.time()
def _HuffmanCode(self, words):
    """Build a single Huffman tree over every character of every word.

    Each word additionally tallies a newline character as a terminator,
    so a decoder can tell where one word ends.  Returns the compiled
    Huffman tree.
    """
    freq = HuffmanEncoder()
    for word in words:
        for ch in word:
            freq.Tally(ch)
        freq.Tally('\n')
    return freq.Compile()
def _HuffmanCodes(self, words):
    """Build the Huffman trees used for front-coded word storage.

    For each word (assumed ordered so consecutive entries share prefixes)
    we tally how many leading letters repeat from the previous word, the
    length of the remaining tail, and the tail characters.  Tail
    characters are coded conditioned on the preceding character, using 27
    tables: index 0 means "no preceding character", indices 1-26 map to
    'a'-'z'.

    Returns (repeat tree, tail-length tree, list of 27 character trees).
    """
    repeat_freq = HuffmanEncoder()
    tail_freq = HuffmanEncoder()
    char_freq = [HuffmanEncoder() for _ in range(27)]
    # The backquote '`' sits immediately before 'a' in ASCII, so
    # ord(c) - base maps 'a'..'z' onto 1..26.
    base = ord('`')
    previous = ''
    for word in words:
        shared = self._LettersInCommon(previous, word)
        repeat_freq.Tally(shared)
        tail_freq.Tally(len(word) - shared)
        # Table 0 when nothing is shared; otherwise condition on the last
        # shared letter.
        table = 0 if shared == 0 else ord(word[shared - 1]) - base
        for ch in word[shared:]:
            char_freq[table].Tally(ch)
            table = ord(ch) - base
        previous = word
    roots = [freq.Compile() for freq in char_freq]
    return repeat_freq.Compile(), tail_freq.Compile(), roots
def run(self):
    """QThread entry point: encode or decode the configured file.

    Only this method executes in the worker thread.  Progress reported by
    the HuffmanEncoder is forwarded to the GUI thread through the
    update_progress signal; finished is emitted when the job completes.
    """
    self.encoder = HuffmanEncoder()
    # Relay the encoder's progress signal out through our own signal.
    self.encoder.progress.connect(lambda pct: self.update_progress.emit(pct))
    job = self.encoder.decodefile if self.ena_decode else self.encoder.encodefile
    job(inputfile=self.inputfile, outputfile=self.outputfile)
    self.finished.emit()
def test_something_2(self):
    """Smoke test: generate and print the coding for a small frequency table."""
    freq_table = {'C': 2, 'B': 6, 'E': 7, '_': 10, 'D': 10, 'A': 11}
    encoder = HuffmanEncoder(freq_table)
    encoding, decoding = encoder.generate_coding()
    print(encoding)
class encode_thread(QThread):
    """Worker thread that Huffman-encodes or -decodes a file off the GUI thread."""

    # Emitted with the current progress value for the GUI to display.
    update_progress = pyqtSignal(int)
    # Emitted once the encode/decode job has completed.
    finished = pyqtSignal()

    def __init__(self, inputfile='.', outputfile='.', ena_decode=False):
        """
        inputfile: path of the file to load.
        outputfile: path where the (de)compressed result is saved.
        ena_decode: True to decode the input, False to encode it.
        """
        # BUG FIX: the original called super(QThread, self).__init__(),
        # which starts MRO lookup *after* QThread and therefore skips
        # QThread's own initialiser.  Name our own class so QThread is
        # initialised properly.
        super(encode_thread, self).__init__()
        self.inputfile = inputfile
        self.outputfile = outputfile
        self.ena_decode = ena_decode

    def run(self):
        """Thread entry point; only this method runs in the worker thread."""
        self.encoder = HuffmanEncoder()
        # Forward the encoder's progress signal to the GUI thread.
        self.encoder.progress.connect(lambda x: self.update_progress.emit(x))
        if self.ena_decode:
            self.encoder.decodefile(inputfile=self.inputfile,
                                    outputfile=self.outputfile)
        else:
            self.encoder.encodefile(inputfile=self.inputfile,
                                    outputfile=self.outputfile)
        self.finished.emit()
def generate_huffman_code(env, symbols, symbol_files):
    """Build a Huffman code over *symbols*, weighted by their occurrence
    counts in *symbol_files*, and publish the encoder and code table in
    the SCons environment.
    """
    from huffman import HuffmanCode, HuffmanEncoder
    code = HuffmanCode()
    for sym in symbols:
        code.add_symbol(sym)
    # Counting re-reads every file on each invocation; this could be
    # cached behind a SCons builder, but is acceptable for now.
    for path in symbol_files:
        code.count_symbols_in_file(path)
    code.generate()
    env['AWAP_HUFFMAN_ENCODER'] = HuffmanEncoder(code)
    # NOTE(review): 'COMPRESSSION' (triple S) is kept as-is -- consumers
    # look the value up under this exact key.
    env['AWAP_AGENT_FORMAT_COMPRESSSION_CODE'] = code.to_dict()
def test_something(self):
    """The coding generated for a known frequency table must match the
    expected Huffman encoding exactly."""
    frequencies = {'A': 10, 'E': 15, 'I': 12, 'S': 3, 'T': 4, 'P': 13,
                   '\\n': 1}
    expected_encoding = {'I': '00', 'P': '01', 'E': '10', 'A': '110',
                         'T': '1110', '\\n': '11110', 'S': '11111'}
    encoder = HuffmanEncoder(frequencies)
    encoding, decoding = encoder.generate_coding()
    print(encoding)
    self.assertEqual(expected_encoding, encoding)
class HierarchicalSoftmaxCBOW(object):
    """Continuous-bag-of-words word2vec trained with hierarchical softmax.

    Equation numbers in the comments refer to Rong,
    "word2vec Parameter Learning Explained".
    """

    def __init__(self, sentences, context=1, hidden=5, concat=False):
        # sentences: iterable of tokenised sentences; used to build the
        #   vocabulary encoder and then iterated again as training data.
        # context: context window size.
        # hidden: hidden/embedding layer width.
        # concat: if True the input layer is widened to hold the context
        #   one-hot vectors side by side instead of summed.
        logging.info(msg="starting CBOW training..")
        self.context = context
        self.encoder = Encoder(sentences=sentences)
        self.huffman_encoder = HuffmanEncoder(self.encoder.counter)
        self.encoding_length = self.encoder.encoding_length
        self.hidden_units = hidden
        self.output_units = 1
        self.input_units = context if concat else 1
        # Weight matrices initialised to small uniform values in [0, 0.1).
        self.input2hidden = np.random.rand(
            self.hidden_units, self.input_units * self.encoding_length) * 0.1
        # One row per internal node of the Huffman tree -- presumably
        # encoding_length equals the vocabulary size V, giving V - 1 rows;
        # TODO confirm against Encoder.
        self.hidden2output = np.random.rand(
            self.output_units * self.encoding_length - 1,
            self.hidden_units) * 0.1
        # train model
        word_count = 0
        last_time = time.time()
        for sentence in sentences:
            context_pairs = sentence2contexts(sentence, self.context)
            for w, c in context_pairs:
                self._train(w, c)
                # break
                word_count += 1
                if word_count % 100 == 0:
                    now = time.time()
                    # words/sec over the last 100 words
                    time_spent = 1.0 / (now - last_time) * 100
                    logging.info(msg="trained on %s words. %s words/sec"
                                 % (word_count, time_spent))
                    last_time = time.time()

    def _train(self, word, context):
        """One SGD step for a single (target word, context words) pair."""
        # One-hot vectors -- assumed to be column vectors compatible with
        # the np.matrix products below; TODO confirm Encoder's output shape.
        onehot_context = [self.encoder.word2onehot(w) for w in context]
        onehot_word = self.encoder.word2onehot(word)
        t = onehot_word  # NOTE(review): `t` is never used below
        x = np.zeros_like(onehot_word)
        for c in onehot_context:
            x += c  # bag-of-words sum of the context one-hots
        # forward pass
        h = (1.0 / self.context) * self.input2hidden * x
        # probability of target word
        node_ids = self.huffman_encoder.get_internal_node_ids(word)
        huffman_code = self.huffman_encoder.get_code(word)
        # 1.0/0.0 indicator per bit of the target word's Huffman code.
        indicator_vec = np.matrix(
            [1.0 if e == "1" else 0.0 for e in huffman_code])
        alpha = 0.1  # fixed learning rate
        # dE/dw'h
        dEdw_prime_h = np.empty_like(indicator_vec)
        # dE/dh
        dEdh = np.zeros_like(h)
        # Walk the target word's path through the Huffman tree, updating
        # each internal node's output vector as we go.
        for j, idx in enumerate(node_ids):
            # NOTE(review): hierarchical softmax needs the *sigmoid* here;
            # scipy.special.logit is the inverse sigmoid -- confirm the
            # imported `logit` is actually expit/sigmoid.
            dEdw_prime_h[:, j] = (logit(self.hidden2output[idx].T * h) -
                                  indicator_vec[:, j])
            # (equation 52 - 54)
            dEdh_component = np.multiply(dEdw_prime_h[:, j],
                                         self.hidden2output[idx]).T
            dEdh = dEdh + dEdh_component
            # update w_j_prime (Equation 51)
            self.hidden2output[idx] = self.hidden2output[idx] - np.asarray(
                alpha * dEdw_prime_h[:, j] * h.T)
        dEdw = dEdh * x.T
        # update W' and W
        # self.hidden2output -= alpha*dEdw_prime
        self.input2hidden -= 1.0 / self.context * alpha * dEdw

    def predict(self, context):
        """Return the model output for a bag-of-words context.

        NOTE(review): delegates to self._forward_pass, which is not
        defined in this chunk -- verify it exists elsewhere in the file.
        """
        onehot_context = [self.encoder.word2onehot(w) for w in context]
        x = np.zeros_like(onehot_context[0])
        for c in onehot_context:
            x += c
        _, y = self._forward_pass(x)
        return y

    def __getitem__(self, word):
        """Embedding lookup: the hidden-layer projection of a single word."""
        onehot_word = self.encoder.word2onehot(word)
        return self.input2hidden * onehot_word
def _HuffmanCodes(self, words):
    """Build the three Huffman trees used for front-coded word storage:
    one for shared-prefix lengths, one for tail lengths, and one for the
    tail characters.  Assumes *words* is ordered so consecutive entries
    share prefixes.
    """
    repeat_freq = HuffmanEncoder()
    tail_freq = HuffmanEncoder()
    char_freq = HuffmanEncoder()
    previous = ''
    for word in words:
        shared = self._LettersInCommon(previous, word)
        repeat_freq.Tally(shared)
        tail_freq.Tally(len(word) - shared)
        for ch in word[shared:]:
            char_freq.Tally(ch)
        previous = word
    return repeat_freq.Compile(), tail_freq.Compile(), char_freq.Compile()
def compress(input_file_path, output_file_path):
    """Huffman-compress the file at *input_file_path*, writing the
    compressed result to *output_file_path*."""
    HuffmanEncoder(input_file_path, output_file_path).compress()