def decode(self):
    output = np.zeros(self.img.shape)
    cnum = self.img.shape[1] + self.cpad
    horizontal_block_count = cnum // self.block_size
    h_tmp = HuffmanCoding([])
    for channel in range(self.img.shape[2]):
        for index, (encoded, rev_map) in enumerate(self.encoded_blocks[channel]):
            i = index // horizontal_block_count
            j = index % horizontal_block_count
            r_min = i * self.block_size
            r_max = min((i + 1) * self.block_size, self.img.shape[0])
            row_diff = r_max - r_min
            c_min = j * self.block_size
            c_max = min((j + 1) * self.block_size, self.img.shape[1])
            col_diff = c_max - c_min
            h_tmp.reverse_mapping = rev_map
            zigzag = h_tmp.decompress(encoded)
            coeffs = fillDiagonal(zigzag, self.block_size)
            is_Y = (channel == 0)
            block = inv_dct(coeffs, mult=self.mult, is_Y=is_Y)
            output[r_min:r_max, c_min:c_max, channel] = block[:row_diff, :col_diff]
    return output
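# decode() above leans on a fillDiagonal() helper that rebuilds a square
# coefficient block from the zigzag-ordered list produced by the Huffman
# decompression. A minimal sketch of such a helper, assuming the standard
# JPEG-style zigzag traversal (the name and signature come from the call
# above; the exact traversal order is an assumption):
import numpy as np

def fillDiagonal(zigzag, block_size):
    block = np.zeros((block_size, block_size))
    idx = 0
    for s in range(2 * block_size - 1):
        # cells on anti-diagonal s, walked in alternating direction
        coords = [(i, s - i) for i in range(block_size) if 0 <= s - i < block_size]
        if s % 2 == 0:
            coords.reverse()
        for r, c in coords:
            if idx < len(zigzag):
                block[r, c] = zigzag[idx]
                idx += 1
    return block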
def compressFunc(self):
    path = self.name.get()
    global h
    h = HuffmanCoding(path)
    global output_path
    output_path = h.compress()
    print("Compressed file path: " + output_path)
def main(): """ In this function we read the folder healthy_cow where are store the csv files that contains the matrix of pixels of the images and we selected an specific file. At first, we applied a compression with FFT and it returns a csv file, which will be compress again using the huffman method and will return a binary file. Followed by this, we decompress the binary file with the huffman method and it returns a csv file that will be decompress with FFT. Finally, using the library PIL, we show the original imagen and the decompress image. :raises: file not found :rtype: Image in PNG format """ directory_sick_cow = "/Users/isabella/Documents/Segundo Semestre/Estructura de Datos /Proyecto/ganado_enfermo_csv" directory_healthy_cow = "/Users/isabella/Documents/Segundo Semestre/Estructura de Datos /Proyecto/Entrega 3/Codigo/huffman-coding-master/Vacas_Enferma" directory = directory_healthy_cow cont = os.listdir(directory) matriz_csv_var = load_img(directory+'/'+cont[0]) fft_compression(matriz_csv_var, 0.05) h = HuffmanCoding("compressFFT.csv") output_path = h.compress() print("Compressed file path: " + output_path) decom_path = h.decompress(output_path) print("Decompressed file path: " + decom_path) img_fft_descompress = fft_descompression(decom_path) show_img(matriz_csv_var) show_img(img_fft_descompress) savetxt('dataff.csv', matriz_csv_var, delimiter=',')
def main():
    from huffman import HuffmanCoding
    import sys

    inputFilePath = "sample.txt"
    handle = HuffmanCoding(inputFilePath)
    output_path = handle.compress()
    print("Compressed file path: " + output_path)
    decom_path = handle.decompress(output_path)
    print("Decompressed file path: " + decom_path)
def test_decoded_msg_given_the_original_msg(self):
    txt = "The bird is the word"
    huffman_coding = HuffmanCoding(txt)
    print("The size of the data is: {}\n".format(sys.getsizeof(txt)))
    print("The content of the data is: {}\n".format(txt))
    encoded_data, tree = huffman_coding.huffman_encoding()
    print("The size of the encoded data is: {}\n".format(
        sys.getsizeof(int(encoded_data, base=2))))
    print("The content of the encoded data is: {}\n".format(encoded_data))
    # assertEqual compares the decoded text to the original;
    # assertTrue(a, b) would treat b as a failure message and always pass.
    self.assertEqual(txt, huffman_coding.huffman_decoding(encoded_data, tree))
def algorithm(path):
    h = HuffmanCoding(path)
    first1 = time.time()
    output_path = h.compress()
    second1 = time.time()
    delta_time1 = second1 - first1
    first2 = time.time()
    decom_path = h.decompress(output_path)
    second2 = time.time()
    delta_time2 = second2 - first2
    return output_path, delta_time1, delta_time2, decom_path
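# Example use of the timing harness above (the file name is hypothetical):
out_path, t_compress, t_decompress, dec_path = algorithm("sample.txt")
print(f"compress: {t_compress:.3f}s, decompress: {t_decompress:.3f}s")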
def test_when_msg_is_single_char(self):
    txt = "T"
    huffman_coding = HuffmanCoding(txt)
    print("The size of the data is: {}\n".format(sys.getsizeof(txt)))
    print("The content of the data is: {}\n".format(txt))
    encoded_data, tree = huffman_coding.huffman_encoding()
    print(encoded_data)
    print("The size of the encoded data is: {}\n".format(
        sys.getsizeof(int(encoded_data, base=2))))
    print("The content of the encoded data is: {}\n".format(encoded_data))
    decoded_data = huffman_coding.huffman_decoding(encoded_data, tree)
    print(decoded_data)
    self.assertEqual(txt, decoded_data)
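# Both tests above are ordinary unittest methods, so they can be run with the
# standard runner, e.g. (assuming they live in a unittest.TestCase subclass
# on the import path):
#   python -m unittest discover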
def main(argv):
    filepath = argv[1]
    read_bit_size = 8
    if len(argv) > 2:
        read_bit_size = int(argv[2])  # argv entries are strings
    print(read_bit_size)
    h = HuffmanCoding(filepath, read_bit_size)
    output_path = h.compress()
    print("Compressed file path: " + output_path)
    decom_path = h.decompress(output_path)
    print("Decompressed file path: " + decom_path)
def __init__(self, parent):
    Frame.__init__(self, parent)
    self.parent = parent
    self.elemental_coding = ElementalCoding()
    self.huffman_coding = HuffmanCoding()
    self.dictionary_coding = DictionaryCoding()
    self.init_ui()
    self.current_encoding = 5
def mask_compression(mask):
    # Run-length encode the bitmap (runs alternate, starting from a run of 1s),
    # then Huffman-code the run lengths to estimate the compressed size in bits.
    prev = 1
    rl = 0
    result = []
    for e in mask:
        if e == prev:
            rl += 1
        else:
            result += [rl]
            rl = 1  # count the element that started the new run
            prev = e
    if rl > 0:
        result += [rl]
    huffman = HuffmanCoding()
    size = len(huffman.compress(result)) * 4
    return size
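# Quick sanity check of the run-length step above (values are illustrative;
# this assumes the HuffmanCoding() variant that compresses a list of ints):
#   mask = [1, 1, 0, 0, 0, 1]  ->  runs [2, 3, 1]
#   mask_compression(mask)     ->  Huffman-coded size of [2, 3, 1], in bits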
def choice(self):
    filename = askopenfilename()
    label = Label(self, text="", relief=RAISED)
    label.configure(text=filename)
    label.grid(column=0, row=2)
    label1 = Label(self, text="")
    # label text translated from Vietnamese ("Dung lượng" = "Size")
    label1.configure(text="Size: " + str(os.path.getsize(filename)) + " bytes")
    label1.grid(column=0, row=3)
    self.path = filename
    self.h = HuffmanCoding(filename)
def main():
    parser = argparse.ArgumentParser(description='Word2vec')
    parser.add_argument('mode', metavar='mode', type=str,
                        help='"SG" for skipgram, "CBOW" for CBOW')
    parser.add_argument('ns', metavar='negative_samples', type=int,
                        help='0 for hierarchical softmax, the other numbers would be the number of negative samples')
    parser.add_argument('part', metavar='partition', type=str,
                        help='"part" if you want to train on a part of corpus, "full" if you want to train on full corpus')
    args = parser.parse_args()
    mode = args.mode
    part = args.part
    ns = args.ns

    # Load and preprocess corpus
    print("loading...")
    if part == "part":
        text = open('text8.txt', mode='r').readlines()[0][:1000000]  # Load a part of corpus for debugging
    elif part == "full":
        text = open('text8.txt', mode='r').readlines()[0]  # Load full corpus for submission
    else:
        print("Unknown argument : " + part)
        exit()

    print("preprocessing...")
    corpus = text.split()
    stats = Counter(corpus)
    words = []
    # Discard rare words
    for word in corpus:
        if stats[word] > 4:
            words.append(word)
    vocab = set(words)

    # Give an index number to a word
    w2i = {}
    w2i[" "] = 0
    i = 1
    for word in vocab:
        w2i[word] = i
        i += 1
    i2w = {}
    for k, v in w2i.items():
        i2w[v] = k

    # Code dict for hierarchical softmax
    freqdict = {}
    freqdict[0] = 10
    for word in vocab:
        freqdict[w2i[word]] = stats[word]
    codedict, full_tree = HuffmanCoding().build(freqdict)
    subsampled_dic = subsampling_table(freqdict)

    # Frequency table for negative sampling
    freqtable = [0, 0, 0]
    for k, v in stats.items():
        f = int(v ** 0.75)
        for _ in range(f):
            if k in w2i.keys():
                freqtable.append(w2i[k])

    # Make training set
    print("build training set...")
    train_set = []
    input_set = []
    target_set = []
    window_size = 5
    if mode == "CBOW":
        for j in range(len(words)):
            # sampling_index = random.choice(subsampled_dic[w2i[words[j]]])
            sampling_index = 1
            if sampling_index == 1:
                if j < window_size:
                    input_set.append([0 for _ in range(window_size - j)] +
                                     [w2i[words[k]] for k in range(j)] +
                                     [w2i[words[j + k + 1]] for k in range(window_size)])
                    target_set.append(w2i[words[j]])
                elif j >= len(words) - window_size:
                    input_set.append([w2i[words[j - k - 1]] for k in range(window_size)] +
                                     [w2i[words[len(words) - k - 1]] for k in range(len(words) - j - 1)] +
                                     [0 for _ in range(j + window_size - len(words) + 1)])
                    target_set.append(w2i[words[j]])
                else:
                    input_set.append([w2i[words[j - k - 1]] for k in range(window_size)] +
                                     [w2i[words[j + k + 1]] for k in range(window_size)])
                    target_set.append(w2i[words[j]])
    if mode == "SG":
        for j in range(len(words)):
            # sampling_index = random.choice(subsampled_dic[w2i[words[j]]])
            sampling_index = 1
            if sampling_index == 1:
                if j < window_size:
                    input_set += [w2i[words[j]] for _ in range(window_size * 2)]
                    target_set += [0 for _ in range(window_size - j)] + \
                                  [w2i[words[k]] for k in range(j)] + \
                                  [w2i[words[j + k + 1]] for k in range(window_size)]
                elif j >= len(words) - window_size:
                    input_set += [w2i[words[j]] for _ in range(window_size * 2)]
                    target_set += [w2i[words[j - k - 1]] for k in range(window_size)] + \
                                  [w2i[words[len(words) - k - 1]] for k in range(len(words) - j - 1)] + \
                                  [0 for _ in range(j + window_size - len(words) + 1)]
                else:
                    input_set += [w2i[words[j]] for _ in range(window_size * 2)]
                    target_set += [w2i[words[j - k - 1]] for k in range(window_size)] + \
                                  [w2i[words[j + k + 1]] for k in range(window_size)]

    print("Vocabulary size")
    print(len(w2i))
    print()
    print(input_set[:100])

    # Training section
    emb, _ = word2vec_trainer(input_set, target_set, len(w2i), codedict, full_tree, freqtable,
                              mode=mode, NS=ns, dimension=64, epoch=1, learning_rate=0.01)
    Analogical_Reasoning_Task(emb, w2i, i2w)

    def save(file_name):
        f = open(file_name, 'w')
        for word in list(w2i.keys()):
            word_index = w2i[word]
            vector_str = ' '.join([str(s.item()) for s in emb[word_index]])
            f.write('%s %s\n' % (word, vector_str))
        f.close()
        print("Save complete!!!")  # translated from Korean: "저장 완료!!!"

    if mode == 'SG':
        name = 'skip-gram'
    else:
        name = 'CBOW'
    if ns == 0:
        name += '_hierarchical-softmax'
    else:
        name += '_negative-sampling'
    if part == 'part':
        name += '_part'
    else:
        name += "_full"
    save(name + '_subsampling')
def main():
    h = HuffmanCoding(sample_data)
    output_path = h.compress()
    h.decompress(output_path)
from huffman import HuffmanCoding

# input file path on your PC where the files are stored
path = "C:/Users/Panchal/Desktop/huffman/sample.txt"

h = HuffmanCoding(path)
h.compress()  # calling the compress function
h.decompress("C:/Users/Panchal/Desktop/huffman/sample.bin")  # calling the decompress function
import base64
import json
from os import path as path_  # assumed import: the snippet below uses path_.exists()

from huffman import HuffmanCoding  # assumed, matching the other snippets here

# Take a JSON file from the user
path = input("Give json file...\n")
while not path_.exists(path):
    print("No such file exists")
    path = input("Give json file...\n")

deb_input = input("Run on Debug ? \n")
debug = True if deb_input == '1' else False

with open(path) as json_file:
    data = json.load(json_file)

h = HuffmanCoding(path, debug)
h.set_reverse_mapping(data["reverse_mapping"])  # the statistics from the Huffman compression
received_message = data["Messagebase64"]
received_message = bytes(received_message[2:-1], 'utf-8')  # make it back into base64 bytes
income_message = str(base64.b64decode(received_message))
income_message = income_message[2:-1]  # take only what we need

dec_out = ''
i = 0
k = 7
while k <= len(income_message):
    input = income_message[i:k]  # do hamilton for input = income_message[i:k]
    lista = []  # make a list so we can build a numpy array
import os
import pickle
import socket
from datetime import datetime
from time import sleep

from huffman import HuffmanCoding  # assumed, matching the other snippets here

clientSocket = socket.socket()
host = socket.gethostname()
port = 9001
clientSocket.connect((host, port))

filePath = input("Enter the path of the file: ")
fileName = os.path.basename(filePath)

start = datetime.now()
clientSocket.send(bytes(fileName, "utf-8"))

huffman = HuffmanCoding(filePath)
compressedFilePath = huffman.compress()
sleep(1)
clientSocket.send(pickle.dumps(huffman))

with open(compressedFilePath, "rb") as fp:
    data = fp.read(1024)
    while data:
        clientSocket.send(data)
        data = fp.read(1024)

end = datetime.now()
duration = end - start
# The original line was truncated after os.path.getsize(; the compressed
# file's size is the assumed denominator.
compressionRatio = os.path.getsize(filePath) / os.path.getsize(compressedFilePath)
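# For context, a minimal sketch of the receiving side of this ad-hoc protocol
# (the actual server is not part of this snippet, so everything below is an
# assumption mirroring the sends above: file name, then the pickled
# HuffmanCoding object, then the compressed bytes). Note that TCP does not
# preserve message boundaries; the client's sleep(1) is what lets this naive
# framing work in practice.
import pickle
import socket

serverSocket = socket.socket()
serverSocket.bind((socket.gethostname(), 9001))
serverSocket.listen(1)
conn, addr = serverSocket.accept()

fileName = conn.recv(1024).decode("utf-8")  # 1st message: the file name
huffman = pickle.loads(conn.recv(65536))    # 2nd message: the pickled codec
with open("received_" + fileName + ".bin", "wb") as fp:
    chunk = conn.recv(1024)                 # remaining messages: compressed data
    while chunk:
        fp.write(chunk)
        chunk = conn.recv(1024)
conn.close()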
def testing(text, test_number, path, test_name):
    ratio = []
    timing = []
    print(f"test number: {test_number}")
    output = open(path + f"/test_{test_number}.txt", 'w')
    output.write(text)
    output.close()  # flush to disk before measuring the size
    original_size = os.path.getsize(path + f"/test_{test_number}.txt")

    # Huffman
    print("Compressing with Huffman...")
    h = HuffmanCoding(output.name)
    start = time.time()
    compressed = h.compress()
    timing.append((time.time() - start) * 1000)
    h.decompress(compressed)
    ratio.append(os.path.getsize(compressed) / original_size * 100)
    print("Compressing with Huffman finished")

    # RLE
    print("Compressing with RLE...")
    rle = RLE()
    output = open(path + f"/test_{test_number}_rle.rle", 'w')
    start = time.time()
    output.write(rle.encode(text))
    timing.append((time.time() - start) * 1000)
    output.close()
    ratio.append(os.path.getsize(path + f"/test_{test_number}_rle.rle") / original_size * 100)
    print("Compressing with RLE finished")

    # LZW
    print("Compressing with LZW...")
    start = time.time()
    lzw3Compressor.LZWCompressor().compress(
        path + f"/test_{test_number}.txt",
        path + f"/test_{test_number}_lzw.lzw")
    timing.append((time.time() - start) * 1000)
    # lzw3Decompressor.LZWDecompressor().decompress(path + f"/test_{test_number}_lzw.lzw", path + f"/test_{test_number}_lzw_decompressed.txt")
    ratio.append(os.path.getsize(path + f"/test_{test_number}_lzw.lzw") / original_size * 100)
    print("Compressing with LZW finished")

    # LZ78
    print("Compressing with LZ78...")
    output = open(path + f"/test_{test_number}_lz78.lz78", 'w')
    start = time.time()
    output.write(lz78_compress(text))
    timing.append((time.time() - start) * 1000)
    output.close()
    ratio.append(os.path.getsize(path + f"/test_{test_number}_lz78.lz78") / original_size * 100)
    print("Compressing with LZ78 finished")

    # PPM
    print("Compressing with PPM...")
    start = time.time()
    ppm_compression(path + f"/test_{test_number}.txt",
                    path + f"/test_{test_number}_ppm.ppm")
    timing.append((time.time() - start) * 1000)
    # ppm_decompression(path + f"/test_{test_number}_ppm.ppm", path + f"/test_{test_number}_ppm_decompressed.txt")
    ratio.append(os.path.getsize(path + f"/test_{test_number}_ppm.ppm") / original_size * 100)
    print("Compressing with PPM finished")

    save_bar_graph(
        ratio, timing,
        f"{test_name} N°{test_number}\nOriginal Size: {original_size} bytes",
        f"graphs/{test_name} {test_number}.svg")

    tick_label = ['Huffman', 'RLE', 'LZW', 'LZ78', 'PPM']
    with open(os.getcwd() + f"/data.txt", 'a') as records:
        records.write(f"\nOriginal Size: {original_size} bytes\n")
        records.write(f"\t\t\tSize\t\tCompression Ratio\t\t\tTime\n")
        for i in range(5):
            spacing = [
                "\t" if i != 0 else "",
                "\t" if int(ratio[i]) < 100 else "",
                "\t" if int(ratio[i] / 100 * original_size) < 100000 else ""
            ]
            records.write(
                f"{tick_label[i]}:\t{spacing[0]}{int(ratio[i]/100*original_size)} bytes{spacing[2]}\t{ratio[i]}%\t\t{timing[i]} ms\n"
            )
    return ratio, timing
def encode_huffman(model, enc, message, context, bits_per_word, finish_sent=False, device='cpu'):
    length = len(message)

    context = torch.tensor(context[-1022:], device=device, dtype=torch.long)

    prev = context
    output = context
    past = None

    total_num = 0
    total_num_for_stats = 0
    total_log_probs = 0
    total_kl = 0  # in bits
    total_num_sents = 0

    with torch.no_grad():
        i = 0
        sent_finish = False
        while i < length or (finish_sent and not sent_finish):
            logits, past = model(prev.unsqueeze(0), past=past)
            past = limit_past(past)
            logits[0, -1, -1] = -1e10  # endoftext can't happen
            logits[0, -1, 628] = -1e10  # 2 newlines can't happen
            logits, indices = logits[0, -1, :].sort(descending=True)

            # Get the top 2**bits options
            indices = indices[:2**bits_per_word]
            log_probs = F.log_softmax(logits, dim=-1)[:2**bits_per_word]
            probs = torch.exp(log_probs)

            if i >= length:
                selection = 0
                sent_finish = is_sent_finish(indices[0].item(), enc)
            else:
                probs_array = probs.cpu().numpy()
                coding = HuffmanCoding()
                coding.make_heap_from_array(probs_array)
                coding.merge_nodes()
                root = coding.make_codes()

                # print(message[i:i+10])
                while root.token is None:
                    if i >= length or message[i] == 0:
                        root = root.left
                    else:
                        root = root.right
                    i += 1
                selection = root.token

                logq = torch.tensor([-len(coding.codes[idx]) for idx in range(len(probs_array))],
                                    dtype=torch.float, device=device)  # in bits
                logq = logq * 0.69315  # in nats
                q = torch.exp(logq)
                total_kl += kl(q, logq, log_probs)
                total_log_probs += log_probs[selection].item()
                total_num_for_stats += 1

            total_num += 1

            prev = indices[selection].view(1)
            output = torch.cat((output, prev))

    avg_NLL = -total_log_probs / total_num_for_stats
    avg_KL = total_kl / total_num_for_stats
    words_per_bit = total_num_for_stats / i

    return output[len(context):].tolist(), avg_NLL, avg_KL, words_per_bit
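# How the bit consumption above plays out on a toy example: if the top-4 token
# probabilities were [0.5, 0.25, 0.15, 0.10], a Huffman tree over them yields
# codes like 0, 10, 110, 111. Hiding the message bits "110" then walks
# root -> right -> right -> left, emits the third most likely token, and
# consumes three message bits, while the most likely token would consume only
# one. More probable tokens eat fewer hidden bits, which is the point of using
# Huffman coding here. (The probabilities are illustrative, not model output.)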
def main():
    parser = argparse.ArgumentParser(description='Word2vec')
    parser.add_argument('mode', metavar='mode', type=str,
                        help='"SG" for skipgram, "CBOW" for CBOW')
    parser.add_argument('ns', metavar='negative_samples', type=int,
                        help='0 for hierarchical softmax, the other numbers would be the number of negative samples')
    parser.add_argument('part', metavar='partition', type=str,
                        help='"part" if you want to train on a part of corpus, "full" if you want to train on full corpus')
    args = parser.parse_args()
    mode = args.mode
    part = args.part
    ns = args.ns

    print("loading...")
    if part == "part":
        # amount of text to load (comment translated from Korean)
        text = open('text8', mode='r').readlines()[0][:1000000]
    elif part == "full":
        text = open('text8', mode='r').readlines()[0]
    else:
        print("Unknown argument : " + part)
        exit()

    print("preprocessing...")
    corpus = text.split()
    stats = Counter(corpus)
    words = []
    for word in corpus:
        if stats[word] > 4:
            words.append(word)
    vocab = set(words)

    w2i = {}
    w2i[" "] = 0
    i = 1
    for word in vocab:
        w2i[word] = i
        i += 1
    i2w = {}
    for k, v in w2i.items():
        i2w[v] = k

    freqdict = {}
    freqdict[0] = 10
    for word in vocab:
        freqdict[w2i[word]] = stats[word]
    codedict = HuffmanCoding().build(freqdict)

    freqtable = [0, 0, 0]
    for k, v in stats.items():
        f = int(v ** 0.75)
        for _ in range(f):
            if k in w2i.keys():
                freqtable.append(w2i[k])

    print("build training set...")
    input_set = []
    target_set = []
    window_size = 5
    if mode == "CBOW":
        for j in range(len(words)):
            if j < window_size:
                input_set.append([0 for _ in range(window_size - j)] +
                                 [w2i[words[k]] for k in range(j)] +
                                 [w2i[words[j + k + 1]] for k in range(window_size)])
                target_set.append(w2i[words[j]])
            elif j >= len(words) - window_size:
                input_set.append([w2i[words[j - k - 1]] for k in range(window_size)] +
                                 [w2i[words[len(words) - k - 1]] for k in range(len(words) - j - 1)] +
                                 [0 for _ in range(j + window_size - len(words) + 1)])
                target_set.append(w2i[words[j]])
            else:
                input_set.append([w2i[words[j - k - 1]] for k in range(window_size)] +
                                 [w2i[words[j + k + 1]] for k in range(window_size)])
                target_set.append(w2i[words[j]])
    if mode == "SG":
        for j in range(len(words)):
            if j < window_size:
                input_set += [w2i[words[j]] for _ in range(window_size * 2)]
                target_set += [0 for _ in range(window_size - j)] + \
                              [w2i[words[k]] for k in range(j)] + \
                              [w2i[words[j + k + 1]] for k in range(window_size)]
            elif j >= len(words) - window_size:
                input_set += [w2i[words[j]] for _ in range(window_size * 2)]
                target_set += [w2i[words[j - k - 1]] for k in range(window_size)] + \
                              [w2i[words[len(words) - k - 1]] for k in range(len(words) - j - 1)] + \
                              [0 for _ in range(j + window_size - len(words) + 1)]
            else:
                input_set += [w2i[words[j]] for _ in range(window_size * 2)]
                target_set += [w2i[words[j - k - 1]] for k in range(window_size)] + \
                              [w2i[words[j + k + 1]] for k in range(window_size)]

    print("Vocabulary size")
    print(len(w2i))
    print()

    numwords = len(w2i)
    use_subsample = True
    W_in, _ = word2vec_trainer(input_set, target_set, numwords, codedict, freqtable,
                               mode=mode, NS=ns, dimension=64, epoch=1, learning_rate=0.01,
                               do_subsampling=use_subsample)
    emb = {}
    for index in range(numwords):
        emb[i2w[index]] = W_in[index]
    Analogical_Reasoning_Task(emb, output_name=f"{mode} {ns} {use_subsample} dim=64.txt")
class ElementalCodingGUI(Frame):
    def __init__(self, parent):
        Frame.__init__(self, parent)
        self.parent = parent
        self.elemental_coding = ElementalCoding()
        self.huffman_coding = HuffmanCoding()
        self.dictionary_coding = DictionaryCoding()
        self.init_ui()
        self.current_encoding = 5

    def init_ui(self):
        self.parent.title("Information Theory")
        Style().configure("TButton", padding=(0, 5, 0, 5), font='Verdana 10')
        for col in range(5):
            self.columnconfigure(col, pad=3)
        for row in range(6):
            self.rowconfigure(row, pad=3)

        string_to_search_label = Label(self, text="Search a string: ")
        string_to_search_label.grid(row=0, column=0, rowspan=2)
        self.string_to_search_textfield = Entry(self)
        self.string_to_search_textfield.grid(row=0, column=1, rowspan=2, columnspan=2, sticky=W)
        self.string_to_search_textfield.bind('<Return>', self.get_string_from_textfield)

        self.compression_ratio_text = StringVar()
        self.compression_ratio_text.set('Compression Ratio: ')
        compression_ratio_label = Label(self, textvariable=self.compression_ratio_text).grid(row=0, column=2, columnspan=4)
        Separator(self, orient=HORIZONTAL).grid(row=1)

        string_to_encode_label = Label(self, text="Encode a string: ")
        string_to_encode_label.grid(row=2, column=0, rowspan=2)
        self.string_to_encode_textfield = Entry(self)
        self.string_to_encode_textfield.grid(row=2, column=1, rowspan=2, columnspan=2, sticky=W)
        self.string_to_encode_textfield.bind('<Return>', self.get_string_from_textfield_to_encode)
        Separator(self, orient=HORIZONTAL).grid(row=3)

        self.area = Text(self)
        self.area.grid(row=4, column=0, columnspan=3, rowspan=1, padx=5, sticky=E + W)
        self.area.config(width=10, height=15)

        self.possible_options_text = StringVar()
        self.possible_options_text.set("Possible Options: ")
        self.possible_options_label = Label(self, textvariable=self.possible_options_text).grid(row=4, column=3, sticky=N)

        huffman_coding_button = Button(self, text="Huffman", command=self.huffman_coding_callback).grid(row=5, column=0)
        arithmetic_coding_button = Button(self, text="Arithmetic Coding", command=self.arithmetic_coding_callback).grid(row=5, column=1)
        dictionary_coding_button = Button(self, text="Dictionary", command=self.dictionary_coding_callback).grid(row=5, column=2)
        elias_coding_button = Button(self, text="Elias", command=self.elias_coding_callback).grid(row=5, column=3)
        our_coding_button = Button(self, text="Elemental Coding", command=self.elemental_coding_callback).grid(row=5, column=4)

        self.pack()
        self.elemental_coding_callback()

    def get_string_from_textfield_to_encode(self, event):
        text_to_encode = self.string_to_encode_textfield.get()
        if text_to_encode == '':
            text_to_encode = 'a'
        if self.current_encoding == 1:
            output = self.huffman_coding.encode(text_to_encode)
            self.set_text_in_text_area(output)
            compression_ratio = self.huffman_coding.compression_ratio
            self.compression_ratio_text.set('Compression Ratio: ' + str(compression_ratio))
        if self.current_encoding == 2:
            pass
        if self.current_encoding == 3:
            pass
        if self.current_encoding == 4:
            pass
        if self.current_encoding == 5:
            self.elemental_coding.getElementList()
            self.elemental_coding.codeElemental(text_to_encode)
            self.elemental_coding.encodeText()
            output = self.elemental_coding.printCodedText()
            compression_ratio = self.elemental_coding.get_compression_ratio()
            self.compression_ratio_text.set('Compression Ratio: ' + str(compression_ratio))
            # self.set_text_in_text_area(output)

    def get_string_from_textfield(self, event):
        text_to_encode = self.string_to_search_textfield.get()
        possible_options = self.elemental_coding.lookForString(text_to_encode)
        self.possible_options_text.set('Possible Options: ' + possible_options)
        self.string_to_search_textfield.delete(0, END)

    def huffman_coding_callback(self):
        self.current_encoding = 1
        output = self.huffman_coding.encode_default_file()
        self.set_text_in_text_area(output)
        compression_ratio = self.huffman_coding.compression_ratio
        self.compression_ratio_text.set('Compression Ratio: ' + str(compression_ratio))
        print("Huffman!")

    def arithmetic_coding_callback(self):
        self.current_encoding = 2
        text_to_encode = self.string_to_encode_textfield.get()
        if text_to_encode == '':
            text_to_encode = ' '
        for char in text_to_encode:
            if char not in arithmetic_intervals:
                can_encode = False
            else:
                can_encode = True
        if can_encode:
            codificacion = interval_coding(text_to_encode, arithmetic_intervals)
            self.compression_ratio_text.set(str(codificacion[2]))
        else:
            # message translated from Spanish
            self.compression_ratio_text.set("Error: not in the intervals\n"
                                            "If you want to test this method, "
                                            "enter a string in the\n"
                                            "\"Encode text\" box")
        print("Arithmetic!")

    def dictionary_coding_callback(self):
        self.current_encoding = 3
        contents = get_file_contents(os.getcwd() + '/pagina.txt')
        compress_text = self.dictionary_coding.compress(contents)
        compression_ratio = len(compress_text) / float(len(contents))
        self.compression_ratio_text.set('Compression Ratio: ' + str(compression_ratio))
        compress_text = [str(item) for item in compress_text]
        self.set_text_in_text_area(''.join(compress_text))
        print("Dictionary!")

    def elias_coding_callback(self):
        self.current_encoding = 4
        text_to_encode = self.string_to_encode_textfield.get()
        if text_to_encode == '':
            text_to_encode = ' '
        for char in text_to_encode:
            if char not in elias_intervals:
                can_encode = False
            else:
                can_encode = True
        if can_encode:
            codificacion = interval_coding(text_to_encode, elias_intervals)
            self.compression_ratio_text.set(str(codificacion[2]) + "%")
        else:
            # message translated from Spanish
            self.compression_ratio_text.set("Error: not in the intervals\n"
                                            "If you want to test this method, "
                                            "enter a string in the\n"
                                            "\"Encode text\" box")

    def set_text_in_text_area(self, output):
        self.area.config(state=NORMAL)
        self.area.delete("1.0", END)
        self.area.insert(INSERT, output)
        self.area.config(state=DISABLED)

    def elemental_coding_callback(self):
        self.current_encoding = 5
        self.elemental_coding.getElementList()
        self.elemental_coding.processFile('pagina.txt')
        self.elemental_coding.encodeText()
        output = self.elemental_coding.printCodedText()
        self.set_text_in_text_area(output)
        compression_ratio = self.elemental_coding.get_compression_ratio()
        self.compression_ratio_text.set('Compression Ratio: ' + str(compression_ratio))
        print("Our Coding!")
def forward(self, x, ss_map=None):
    # sample from input
    if self.use_subsampling:
        x, thresh = x
        self.sizes[0] += x.view(-1).size(0) * 8
        # feature
        feat_1 = self.ctx(x)
        feat_1_ = self.unpool(feat_1)
    else:
        self.sizes[0] += x.view(-1).size(0) * 8
    x = self.sample(x)
    # after CNN
    self.sizes[1] += x.view(-1).size(0) * 8
    if ss_map is not None:
        ss_map = self.unpool(ss_map) > 0.5
        unpooled = self.unpool(self.pool(x))
        x = torch.where(ss_map, unpooled, x)

    # subsampling
    # data to be sent: mask + actual data
    B, C, H, W = x.size()
    if self.use_subsampling:
        th_1 = thresh
        # sub-sample
        ss_1 = self.unpool(self.pool1(x))
        # conditions
        cond_1 = feat_1_ < th_1
        mask_1 = feat_1 < th_1
        # subsampled data in different areas
        data_1 = self.pool1(x)[mask_1]
        cond_0 = torch.logical_not(cond_1)
        data_0 = x[cond_0]
        comp_data = torch.cat((data_0, data_1), 0)
        # after RAF
        self.sizes[2] += comp_data.size(0) * 8
        # affected data in the original shape
        if not self.training:
            x = torch.where(cond_1, ss_1, x)
        else:
            x = torch.mul(x, feat_1_) + torch.mul(ss_1, 1 - feat_1_)

    # quantization
    xsize = list(x.size())
    x = x.view(*(xsize + [1]))
    quant_dist = torch.pow(x - self.centers, 2)
    softout = torch.sum(self.centers * nn.functional.softmax(-quant_dist, dim=-1), dim=-1)
    minval, index = torch.min(quant_dist, dim=-1, keepdim=True)
    hardout = torch.sum(self.centers * (minval == quant_dist), dim=-1)
    x = softout
    # x = softout + (hardout - softout).detach()

    if self.use_subsampling:
        comp_data = comp_data.view(*(list(comp_data.size()) + [1]))
        quant_dist = torch.pow(comp_data - self.centers, 2)
        index2 = torch.min(quant_dist, dim=-1, keepdim=True)[1]
        # after Q
        self.sizes[3] += index2.view(-1).size(0) * 3
        # run-length coding on bitmap
        huffman = HuffmanCoding()
        real_size = len(huffman.compress(index2.view(-1).cpu().numpy())) * 4  # bits
        rle_len1 = mask_compression(mask_1.view(-1).cpu().numpy())
        real_size += rle_len1
        # after lossless
        self.sizes[4] += real_size
        filter_loss = torch.mean(feat_1)
        real_cr = 1 / 16. * real_size / (H * W * C * B * 8)
        softmax_dist = nn.functional.softmax(-quant_dist, dim=-1)
        soft_prob = torch.mean(softmax_dist, dim=0)
        entropy = -torch.sum(torch.mul(soft_prob, torch.log(soft_prob)))
        return x, (filter_loss, real_cr, entropy)
    else:
        self.sizes[2] += index.view(-1).size(0) * 3
        huffman = HuffmanCoding()
        real_size = len(huffman.compress(index.view(-1).cpu().numpy())) * 4
        self.sizes[3] += real_size
        real_cr = 1 / 16. * real_size / (H * W * C * B * 8)
        return x, real_cr
#!/usr/bin/env python3
from huffman import HuffmanCoding
import pickle
import urllib.request

# Generates a Huffman tree from the Pride and Prejudice corpus from
# Project Gutenberg: https://www.gutenberg.org/ebooks/1342
# https://www.gutenberg.org/files/1342/1342-0.txt
corpus_url = "https://www.gutenberg.org/files/1342/1342-0.txt"

h = HuffmanCoding()
txt = urllib.request.urlopen(corpus_url).read().decode('utf-8')
# print(txt)
txt = txt.replace("“", "\"")
txt = txt.replace("”", "\"")
output_path = h.generate_tree(txt)

# save tree to file
tree_loc = "tree.bin"
with open(tree_loc, 'wb') as binary_file:
    pickle.dump(h, binary_file)
print("Tree generated at {}".format(tree_loc))
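# The counterpart that consumes tree.bin might look like this (a sketch:
# pickle restores the trained HuffmanCoding object with its code tables, so
# new text can be encoded without regenerating the tree from the corpus):
import pickle

with open("tree.bin", 'rb') as binary_file:
    h2 = pickle.load(binary_file)
# h2 now carries the codes built from the Pride and Prejudice corpus.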
def upload_file():
    f = request.files['file']
    tag = request.form['tag']
    data = bytes(f.read())
    input_file_size = len(data)
    filename, file_extension = os.path.splitext(f.filename)

    if tag == "huffman":
        h = HuffmanCoding(data)
        huffman_file_size = h.compress(f.filename)
        return jsonify({
            'success': True,
            'fileSize': input_file_size,
            'HuffmanEncoding': {
                'compressionRatio': huffman_file_size / input_file_size,
                'compressionFactor': input_file_size / huffman_file_size,
                'savingPercentage': (input_file_size - huffman_file_size) / input_file_size,
                'fileSize': huffman_file_size
            },
        })

    if tag == "shannon":
        ShannonCompress(data, f.filename)
        shf_file_size = os.path.getsize(filename + ".shf")
        os.remove(filename + ".shf")
        return jsonify({
            'success': True,
            'fileSize': input_file_size,
            'ShannonFano': {
                'compressionRatio': shf_file_size / input_file_size,
                'compressionFactor': input_file_size / shf_file_size,
                'savingPercentage': (input_file_size - shf_file_size) / input_file_size,
                'fileSize': shf_file_size
            },
        })

    if tag == "lempel":
        LempelZivWelch(data, f.filename, 8)
        lzw_file_size = os.path.getsize(filename + ".lzw")
        os.remove(filename + ".lzw")
        return jsonify({
            'success': True,
            'fileSize': input_file_size,
            'LempelZivWelch': {
                'compressionRatio': lzw_file_size / input_file_size,
                'compressionFactor': input_file_size / lzw_file_size,
                'savingPercentage': (input_file_size - lzw_file_size) / input_file_size,
                'fileSize': lzw_file_size
            }
        })

    if tag == "rle":
        RunLengthEncoding(data, f.filename)
        rle_file_size = os.path.getsize(filename + ".rle")
        os.remove(filename + ".rle")
        return jsonify({
            'success': True,
            'fileSize': input_file_size,
            'RunLengthEncoding': {
                'compressionRatio': rle_file_size / input_file_size,
                'compressionFactor': input_file_size / rle_file_size,
                'savingPercentage': (input_file_size - rle_file_size) / input_file_size,
                'fileSize': rle_file_size
            },
        })

    return jsonify({'success': False, "message": "Please pass a valid tag"})
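# Hypothetical invocation of this handler (the route and port are assumptions;
# only the view function body is shown above):
#   curl -F "file=@sample.txt" -F "tag=huffman" http://localhost:5000/upload
# which would answer with JSON along the lines of:
#   {"success": true, "fileSize": 1024, "HuffmanEncoding": {"compressionRatio": 0.6, ...}}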
print("Invalid Switch/Usage " + sys.argv[1]) print("Usage :\n") print("To compress : \npython " + sys.argv[0] + " -c filename.txt [dictfile.dict]\n") print("To decompress : \npython " + sys.argv[0] + " -x filename.bin [dictfile.dict]") print( "filename.dict is optional, to be used if the dictionary was saved under a different name." ) exit(0) if [sys.argv[1]] == ['-c']: print() pathf = sys.argv[2] dictf = '' if len(sys.argv) > 3: dictf = sys.argv[3] h = HuffmanCoding(pathf) out = h.compress() h.save_codes(dictf) h.get_code() h.get_freq() elif [sys.argv[1]] == ['-x']: print() pathf = sys.argv[2] dictf = '' if len(sys.argv) > 3: dictf = sys.argv[3] h = HuffmanCoding(pathf, dictf) h.decompress()
"""User input """ txtin = raw_input("Write some symbols(blank for sample case):") txtin = TEST if txtin=="" else txtin txtout = txtin """Extract frecuency of each symbol of set """ symb2freq = defaultdict(int) for ch in txtin: symb2freq[ch] += 1 """Implementation of Huffman Algorithm """ start = time.time() huff = HuffmanCoding() huff.encode(symb2freq) end = time.time() time_lapse = end - start """Conversion from Huffman Coding Tree to Coding table """ coding_table = huff.tree_to_table() """Outputs """ print "Codes table" print "Symbol\tFrec\tCode" for coding in coding_table: print "\t".join(map(str,coding)) # Replace at the input text the symbol with the propper code
def main():
    parser = argparse.ArgumentParser(description='Word2vec')
    parser.add_argument('mode', metavar='mode', type=str,
                        help='"SG" for skipgram, "CBOW" for CBOW')
    parser.add_argument('part', metavar='partition', type=str,
                        help='"part" if you want to train on a part of corpus, "full" if you want to train on full corpus')
    parser.add_argument('mode2', metavar='mode2', type=str,
                        help="0 for Hierarchical Softmax, 1 or more for Negative Sampling, 'None' for None of two")
    parser.add_argument('use_subsampling', metavar='subsample', type=str,
                        help="0 for not using subsampling, 1 for using subsampling")
    args = parser.parse_args()
    mode = args.mode
    part = args.part
    mode2 = args.mode2
    subsample = args.use_subsampling

    # Load and tokenize corpus
    print("loading...")
    if part == "part":
        text = open('text8', mode='r').readlines()[0][:1000000]  # Load a part of corpus for debugging
    elif part == "full":
        text = open('text8', mode='r').readlines()[0]  # Load full corpus for submission
    else:
        print("Unknown argument : " + part)
        exit()

    print("tokenizing...")
    corpus = text.split()
    frequency = Counter(corpus)
    processed = []
    # Discard rare words
    for word in corpus:
        if frequency[word] > 4:
            processed.append(word)
    vocabulary = set(processed)

    # Assign an index number to a word
    word2ind = {}
    word2ind[" "] = 0
    i = 1
    for word in vocabulary:
        word2ind[word] = i
        i += 1
    ind2word = {}
    for k, v in word2ind.items():
        ind2word[v] = k

    print("Vocabulary size")
    print(len(word2ind))

    # Create Huffman coding
    freq = dict()
    freq[0] = 0
    total_freq = 0
    for word in vocabulary:
        freq[word2ind[word]] = frequency[word]
        total_freq += frequency[word]

    # Subsampling
    if subsample == "1":
        freq_subsampling = {}
        for word in vocabulary:
            freq_subsampling[word] = frequency[word] / total_freq
        # calculate subsampling probability
        prob_subsampling = {}
        for word in vocabulary:
            prob_subsampling[word] = max(0, 1 - math.sqrt(0.001 / freq_subsampling[word]))
        # print(prob_subsampling)
        # exit()
        subsampled_corpus = []
        discard = 0
        for word in processed:
            prob = prob_subsampling[word]
            random_prob = np.random.rand()
            if random_prob > prob:
                subsampled_corpus.append(word)
            else:
                discard += 1
        print(len(processed))
        print("Discard : " + str(discard))
        processed = subsampled_corpus

    huffmanCode = HuffmanCoding()
    codes, nonleaf_ind = huffmanCode.build(freq)

    # Negative sampling
    freqtable = [0, 0, 0]
    for k, v in frequency.items():
        f = int(v ** 0.75)
        for _ in range(f):
            if k in word2ind.keys():
                freqtable.append(word2ind[k])

    # Training section
    emb, _ = word2vec_trainer(processed, word2ind, codes=codes, freqtable=freqtable,
                              nonleaf_ind=nonleaf_ind, mode=mode, mode2=mode2,
                              use_subsample=subsample, dimension=64,
                              learning_rate=0.05, iteration=50000)
    Analogical_Reasoning_Task(emb, word2ind, ind2word)
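# What the Huffman build contributes here: `codes` maps each word index to its
# bit path in the tree, and `nonleaf_ind` identifies the inner nodes whose
# vectors are the parameters trained by hierarchical softmax. On a toy
# frequency dict, e.g.
#   codes, nonleaf_ind = HuffmanCoding().build({0: 10, 1: 5, 2: 1})
# the most frequent index (0) gets the shortest code, so frequent words touch
# fewer inner nodes per update. (The toy numbers are illustrative only.)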
from huffman import HuffmanCoding
import sys
from pathlib import Path
import time

import cv2

path = "tiger.bmp"
image = cv2.imread(path, 0)
cv2.imwrite("tiger_gray.bmp", image)

h = HuffmanCoding(path)
output_path, image_shape = h.compress()
print("Compressed file path: " + output_path)

a = Path("tiger_gray.bmp").stat().st_size
b = Path(output_path).stat().st_size

print("Calculating size")
for i in range(10):
    print(".", end='')
    time.sleep(1)

decom_path = h.decompress(output_path, image_shape)
print("compression percent", 100 * (a - b) / a)
print("Decompressed file path: " + decom_path)
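# Note on the arithmetic: 100 * (a - b) / a is the space saving, not a size
# ratio. E.g. a = 1000 bytes compressed to b = 400 bytes prints
# "compression percent 60.0", meaning 60% of the original size was saved.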
def decode_huffman(model, enc, text, context, bits_per_word, device='cpu'):
    # inp is a list of token indices
    # context is a list of token indices
    inp = enc.encode(text)
    i = 0
    while i < len(inp):
        if inp[i] == 628:
            inp[i] = 198
            inp[i + 1:i + 1] = [198]
            i += 2
        else:
            i += 1

    context = torch.tensor(context[-1022:], device=device, dtype=torch.long)
    prev = context
    past = None

    message = []
    with torch.no_grad():
        i = 0
        while i < len(inp):
            if past and past[0].shape[3] >= 1023:
                raise RuntimeError

            logits, past = model(prev.unsqueeze(0), past=past)
            past = limit_past(past)
            logits[0, -1, -1] = -1e10  # endoftext can't happen
            logits[0, -1, 628] = -1e10  # 2 newlines can't happen
            logits, indices = logits[0, -1, :].sort(descending=True)

            # Get the top 2**bits options
            indices = indices[:2**bits_per_word]
            log_probs = F.log_softmax(logits, dim=-1)[:2**bits_per_word]
            probs = torch.exp(log_probs)

            if inp[i] not in indices:
                true_token_text = enc.decoder[inp[i]]
                for rank_idx in range(2**bits_per_word):
                    prop_token_text = enc.decoder[indices[rank_idx].item()]
                    # common case that is not caught
                    if inp[i] == 128 and indices[rank_idx] == 198:
                        rank = rank_idx
                        inp[i] = indices[rank_idx].item()
                        break

                    # Is there a more likely prefix token that could be the actual token generated?
                    if len(prop_token_text) <= len(true_token_text) and \
                            prop_token_text == true_token_text[:len(prop_token_text)]:
                        rank = rank_idx
                        suffix = true_token_text[len(prop_token_text):]
                        suffix_tokens = enc.encode(suffix)  # a list
                        inp[i] = indices[rank_idx].item()
                        inp[i + 1:i + 1] = suffix_tokens  # insert suffix tokens into list
                        break

                    # Is there a more likely longer token that could be the actual token generated?
                    elif len(prop_token_text) > len(true_token_text) and \
                            true_token_text == prop_token_text[:len(true_token_text)]:
                        whole_text = true_token_text
                        num_extra = 1
                        while len(whole_text) < len(prop_token_text):
                            whole_text += enc.decoder[inp[i + num_extra]]
                            num_extra += 1
                        if prop_token_text == whole_text[:len(prop_token_text)]:
                            rank = rank_idx
                            inp[i] = indices[rank_idx].item()
                            for j in range(1, num_extra):
                                del inp[i + j]

                            if len(whole_text) > len(prop_token_text):
                                suffix = whole_text[len(prop_token_text):]
                                suffix_tokens = enc.encode(suffix)  # a list
                                inp[i + 1:i + 1] = suffix_tokens  # insert suffix tokens into list
                            break
                else:
                    print('Unable to fix BPE error: token received: %s=%d, text: %s' %
                          (true_token_text, inp[i], text))
                    rank = 0
            else:
                rank = (indices == inp[i]).nonzero().item()

            probs_array = probs.cpu().numpy()
            coding = HuffmanCoding()
            coding.make_heap_from_array(probs_array)
            coding.merge_nodes()
            coding.make_codes()

            tokens_t = map(int, coding.codes[rank])
            message.extend(tokens_t)
            prev = torch.tensor([inp[i]], device=device, dtype=torch.long)
            i += 1

    return message
# # cv2.resizeWindow("deltaback", 1000, 1000)
# # cv2.imshow("deltaback", img)
# print("Redecoded entropy: ")
# print(shannon_entropy(img))

filenames = glob.glob("images/*.png")
images = [cv2.imread(img) for img in filenames]
sum_ratio = 0
for img in images:
    img = img[:, :, 0]
    img = delta_encode(img)
    h = HuffmanCoding(img, os.getcwd() + "/test")
    h.compress()
    img = h.decompress(os.getcwd() + "/test.bin")
    img = delta_decode(img)
    rawsize = os.stat('raw.bin')
    testsize = os.stat('test.bin')
    ratio = float(testsize.st_size) / float(rawsize.st_size)
    sum_ratio += ratio
    print("Redecoded entropy: ")
    print(shannon_entropy(img))
    print("Compression ratio: ")
    print(ratio)
def main():
    parser = argparse.ArgumentParser(description='Word2vec')
    parser.add_argument('mode', metavar='mode', type=str,
                        help='"SG" for skipgram, "CBOW" for CBOW')
    parser.add_argument('ns', metavar='negative_samples', type=int,
                        help='0 for hierarchical softmax, the other numbers would be the number of negative samples')
    parser.add_argument('part', metavar='partition', type=str,
                        help='"part" if you want to train on a part of corpus, "full" if you want to train on full corpus')
    args = parser.parse_args()
    mode = args.mode
    part = args.part
    ns = args.ns

    # Load and preprocess corpus
    print("loading...")
    if part == "part":
        text = open('text8', mode='r').readlines()[0][:1000000]  # Load a part of corpus for debugging
    elif part == "full":
        text = open('text8', mode='r').readlines()[0]  # Load full corpus for submission
    else:
        print("Unknown argument : " + part)
        exit()

    print("preprocessing...")
    # subsampling of frequent words
    corpus = text.split()
    stats = Counter(corpus)
    words = []
    # Discard rare words
    for word in corpus:
        if stats[word] > 4:
            words.append(word)
    vocab = set(words)

    # Give an index number to a word
    w2i = {}
    w2i[" "] = 0
    i = 1
    for word in vocab:
        w2i[word] = i
        i += 1
    i2w = {}
    for k, v in w2i.items():
        i2w[v] = k

    # Code dict for hierarchical softmax
    freqdict = {}
    for word in vocab:
        freqdict[w2i[word]] = stats[word]
    codedict = HuffmanCoding().build(freqdict)

    # Frequency table for negative sampling
    freqtable = [0, 0, 0]
    for k, v in stats.items():
        f = int(v ** 0.75)
        for _ in range(f):
            if k in w2i.keys():
                freqtable.append(w2i[k])

    # Make training set
    print("build training set...")
    input_set = []
    target_set = []
    window_size = 5
    if mode == "CBOW":
        for j in range(len(words)):
            if j < window_size:
                input_set.append([0 for _ in range(window_size - j)] +
                                 [w2i[words[k]] for k in range(j)] +
                                 [w2i[words[j + k + 1]] for k in range(window_size)])
                target_set.append(w2i[words[j]])
            elif j >= len(words) - window_size:
                input_set.append([w2i[words[j - k - 1]] for k in range(window_size)] +
                                 [w2i[words[len(words) - k - 1]] for k in range(len(words) - j - 1)] +
                                 [0 for _ in range(j + window_size - len(words) + 1)])
                target_set.append(w2i[words[j]])
            else:
                input_set.append([w2i[words[j - k - 1]] for k in range(window_size)] +
                                 [w2i[words[j + k + 1]] for k in range(window_size)])
                target_set.append(w2i[words[j]])
    if mode == "SG":
        for j in range(len(words)):
            if j < window_size:
                input_set += [w2i[words[j]] for _ in range(window_size * 2)]
                target_set += [0 for _ in range(window_size - j)] + \
                              [w2i[words[k]] for k in range(j)] + \
                              [w2i[words[j + k + 1]] for k in range(window_size)]
            elif j >= len(words) - window_size:
                input_set += [w2i[words[j]] for _ in range(window_size * 2)]
                target_set += [w2i[words[j - k - 1]] for k in range(window_size)] + \
                              [w2i[words[len(words) - k - 1]] for k in range(len(words) - j - 1)] + \
                              [0 for _ in range(j + window_size - len(words) + 1)]
            else:
                input_set += [w2i[words[j]] for _ in range(window_size * 2)]
                target_set += [w2i[words[j - k - 1]] for k in range(window_size)] + \
                              [w2i[words[j + k + 1]] for k in range(window_size)]

    print("Vocabulary size")
    print(len(w2i))
    print()

    # Training section
    emb, _ = word2vec_trainer(input_set, target_set, len(w2i), codedict, freqtable,
                              mode=mode, NS=ns, dimension=300, epoch=1, learning_rate=0.01)
    Analogical_Reasoning_Task(emb, w2i, i2w, mode, part, ns)
def use_huffman(filename, wordlength=14):
    h = HuffmanCoding(filename, wordlength)
    output_path = h.compress()
    # h.decompress(output_path)
    return output_path
from huffman import HuffmanCoding
import sys

path = "textfile.txt"

h = HuffmanCoding(path)
output_path = h.compress()
print("Compressed file path: " + output_path)

decom_path = h.decompress(output_path)
print("Decompressed file path: " + decom_path)
from huffman import HuffmanCoding
import sys

if __name__ == "__main__":
    path = sys.argv[1]  # argv[1] = file name
    h = HuffmanCoding(path)  # path, codes, heap and reverse mapping are created
    print("Compressing...")
    output_path = h.compress()
    print(f"Compressed file: {output_path}\n")
    print("Decompressing...")
    # print('otput_com', output_path)
    output_path = h.decompress(output_path)  # pass the compressed file path as the argument
    print(f"Decompressed file: {output_path}")