def __init__(self, inPutPath, outPutPath, isPaired, readNum, bucketIndexLen,
             lossless, verbose):
    """Initialise decoder state and open the compressed input streams.

    Args:
        inPutPath: Path prefix of the compressed input. The files
            ``inPutPath + ".bifurL"`` and ``inPutPath + ".bifurR"`` are
            opened for binary reading here.
        outPutPath: Path prefix used to build the output file names
            (".dna", "_1.dna", "_2.dna").
        isPaired: Stored as ``self.paired``.
        readNum: Stored as ``self.readNum``.
        bucketIndexLen: Stored as both ``self.kmerLen`` and
            ``self.indexLen``.
        lossless: Stored as ``self.lossless``.
        verbose: Stored as ``self.verbose``.

    Statement order is kept as in the original because the helper calls
    (``openFileLeft``, ``openFileRight``, ``openOutFile``) run against
    whatever attributes are already set at that point.
    """
    self.mutiDict = {}  # kmers for buckets
    self.sequenceTable = []  # stores sequences for output
    self.kmerLen = bucketIndexLen
    self.indexLen = bucketIndexLen
    self.bucketDict = {}  # nested_dict(2, int)
    self.encodeBucketPath = {}
    self.newNodeNum = 0
    self.simpleNodeNum = 0
    self.tipNodeNum = 0
    self.bifurNodeNum = 0
    self.deleteBifurRatio = 0.2
    self.inPutPath = inPutPath
    self.outPutPath = outPutPath

    # Nucleotide lookup tables. (The original assigned num2dna twice with
    # the same literal; the redundant second assignment was removed.)
    self.dna2num = {"A": 0, "C": 1, "G": 2, "T": 3}
    self.num2dna = {0: "A", 1: "C", 2: "G", 3: "T"}
    self.recdna = {"A": "T", "C": "G", "G": "C", "T": "A"}
    self.dna2bit = {"A": '0b00', "C": '0b01', "G": '0b10', "T": '0b11'}

    self.firstSeq = BitStream()
    # NOTE(review): numFlag is overwritten further down with sream();
    # kept here in case the helpers called in between read it.
    self.numFlag = BitStream()

    # Adaptive frequency tables, each starting from a flat distribution.
    self.freq3 = arithmeticcoding.SimpleFrequencyTable(
        arithmeticcoding.FlatFrequencyTable(3))
    self.freq4 = arithmeticcoding.SimpleFrequencyTable(
        arithmeticcoding.FlatFrequencyTable(4))
    self.freqs = arithmeticcoding.SimpleFrequencyTable(
        arithmeticcoding.FlatFrequencyTable(4))

    # Despite the "bitout" names these are *input* bit streams.
    self.bitoutL = arithmeticcoding.BitInputStream(
        open(self.inPutPath + ".bifurL", "rb"))
    self.bitoutR = arithmeticcoding.BitInputStream(
        open(self.inPutPath + ".bifurR", "rb"))
    self.decodeSeqPathL = self.openFileLeft()
    self.decodeSeqPathR = self.openFileRight()

    self.outFileName = outPutPath + ".dna"
    self.paired = isPaired
    self.seqLen = 0  # length for current read
    self.outPairFileName = [outPutPath + "_1.dna", outPutPath + "_2.dna"]
    self.outFile = None
    self.outPairFile = None
    self.readNum = readNum
    self.seqence = ""  # encoded sequence
    self.bucketIndex = []  # bucket index
    self.bucketCov = []  # number of reads in each bucket
    self.readIndexPos = []  # index positions in each read
    self.readLen = []
    # NOTE(review): `sream` is not defined in the visible source; confirm
    # it is a real name in this module and not a typo.
    self.readrc = sream()  # read in forward or backward
    self.readN = {
        "flag": sream(),
        "pos": [],
        "l": []
    }  # N in read: indicator, number, position and length
    self.numFlag = sream()  # new-node indicator
    self.lossless = lossless
    self.verbose = verbose
    self.openOutFile()  # prepare output file
def compress(quantized, output_file):
    """Pickle ``quantized`` and write it to ``output_file`` arithmetic-coded.

    The pickled bytes are encoded one at a time against an adaptive
    257-symbol frequency table: symbols 0-255 are byte values and symbol
    256 is written last as the end-of-stream marker.

    Args:
        quantized: Object to serialise with ``pickle.dumps``.
        output_file: Path of the file to create or overwrite.
    """
    data = pickle.dumps(quantized)
    with open(output_file, "wb") as file:
        bitout = arithmeticcoding.BitOutputStream(file)
        initfreqs = arithmeticcoding.FlatFrequencyTable(257)
        freqs = arithmeticcoding.SimpleFrequencyTable(initfreqs)
        enc = arithmeticcoding.ArithmeticEncoder(32, bitout)
        for symbol in data:  # iterating bytes yields ints 0..255
            enc.write(freqs, symbol)
            freqs.increment(symbol)
        enc.write(freqs, 256)  # EOF
        enc.finish()  # Flush remaining code bits
        # The bit stream must be closed explicitly; closing only the raw
        # file would skip whatever the bit stream still buffers.
        # (Presumably close() pads and emits the last partial byte, as in
        # the reference arithmeticcoding module - confirm against the
        # module in use.)
        bitout.close()
def decomparess(inputfile, outfile, model):
    """Decode a model-assisted arithmetic-coded text file into ``outfile``.

    The stream is read as repeated pairs ``(guesses, literal)``:
    ``guesses`` is how many upcoming characters are to be produced by
    ``predict`` and ``literal`` is the code point of the character that
    follows them. ``MAGIC_EOF`` in either position ends decoding.

    Args:
        inputfile: Path of the compressed input.
        outfile: Path of the text file to write.
        model: Passed through to ``predict``.

    Relies on the module-level names ``AE_SIZE``, ``MAGIC_EOF``,
    ``maxlen``, ``indices_char`` and ``predict``.
    """
    bitin = arithmeticcoding.BitInputStream(open(inputfile, "rb"))
    try:
        initfreqs = arithmeticcoding.FlatFrequencyTable(AE_SIZE)
        freqs = arithmeticcoding.SimpleFrequencyTable(initfreqs)
        dec = arithmeticcoding.ArithmeticDecoder(bitin)
        prev_chars = []
        i = 0
        with open(outfile, "w") as out:
            while True:
                guesses = dec.read(freqs)
                if guesses == MAGIC_EOF:
                    break
                print('guesses', guesses)
                freqs.increment(guesses)
                for _ in range(guesses):
                    char = predict(prev_chars, model, indices_char)
                    out.write(char)
                    # A predicted character is part of the text, so it
                    # must enter the context window before the next
                    # prediction; otherwise every guess in a run would be
                    # made from the same stale context.
                    prev_chars.append(char)
                    if len(prev_chars) > maxlen:
                        prev_chars.pop(0)
                print("i", i)
                literal = dec.read(freqs)
                if literal == MAGIC_EOF:
                    # The stream may end right after a guess count; do not
                    # emit the marker as a character.
                    break
                print('lit', chr(literal))
                out.write(chr(literal))
                freqs.increment(literal)
                prev_chars.append(chr(literal))
                if len(prev_chars) > maxlen:
                    prev_chars.pop(0)
                i = i + 1
    finally:
        # Release the input stream even if decoding or file I/O fails.
        bitin.close()
def get_frequencies(filepath):
    """Count byte occurrences in a file.

    Args:
        filepath: Path of the file to scan (opened in binary mode).

    Returns:
        An ``arithmeticcoding.SimpleFrequencyTable`` with 257 entries:
        entries 0-255 hold the byte counts, entry 256 is left at 0.
    """
    freqs = arithmeticcoding.SimpleFrequencyTable([0] * 257)
    with open(filepath, "rb") as source:
        # Read in blocks instead of one byte per call; the resulting
        # counts are identical.
        while True:
            chunk = source.read(65536)
            if not chunk:
                break
            for byte in chunk:
                freqs.increment(byte)
    return freqs
def get_frequencies(self, inp, frequencies, num_symbols):
    """Wrap ``frequencies`` in a ``SimpleFrequencyTable`` and return it.

    ``inp`` and ``num_symbols`` are accepted but not used; an earlier
    version that counted the symbols of ``inp`` itself was commented out.
    """
    return arithmeticcoding.SimpleFrequencyTable(frequencies)
def compressTree(self, node, overall_freqs, N):
    """Arithmetic-code a tree breadth-first, starting from ``node``.

    Nodes are visited through a FIFO queue. For a node whose ``v`` is
    greater than 1, each non-None child's ``v`` is encoded with a
    frequency table obtained from ``ec().binomial_encoder_frequencies``.
    For a node whose ``v`` equals 1, the ``c`` of each child with
    ``v == 1`` is encoded against ``overall_freqs`` directly.

    Args:
        node: Root node; nodes expose ``v``, ``c`` and ``childNodes``.
        overall_freqs: Frequency list; sliced as ``overall_freqs[i:]``
            for the i-th child and used whole for ``v == 1`` nodes.
        N: Not used in this method.

    Returns:
        Whatever ``enc.finish()`` returns.

    NOTE(review): the source this was recovered from had lost its
    indentation. The nesting below (in particular ``i += 1`` running once
    per child position) is a reconstruction - confirm against the
    original file.
    """
    # Original note: n is the number of nodes in the hidden layer and pw is
    # the list of all the normalized probabilities; using cumulative
    # frequencies means there is no need to normalize.
    enc = arithmeticcoding.ArithmeticEncoder()
    q = deque([node])
    #self.j = 0
    while len(q) != 0:
        temp = q.popleft()
        if temp.v > 1:
            # tempValue is the part of temp.v not yet accounted for by the
            # children encoded so far.
            tempValue = temp.v
            i = 0
            for child in temp.childNodes:
                if child != None:
                    if tempValue > 0:
                        q.append(child)
                        # The binomial encoder can convert to frequencies.
                        # Original note: convert to binary independently
                        # and check the compression ratio to confirm the
                        # correct amount of compression.
                        binomial_frequencies = ec().binomial_encoder_frequencies(overall_freqs[i:], tempValue)
                        freqs = arithmeticcoding.SimpleFrequencyTable(binomial_frequencies)
                        enc.write(freqs, child.v)
                        tempValue = tempValue - child.v
                        #a = a + '1011'
                # presumably i is the child's position in childNodes
                i += 1
                #print('Compressing Tree...',self.j)
                #self.j += 1
            #print (i)
        elif temp.v == 1:
            for child in temp.childNodes:
                if child != None:
                    if child.v == 1:
                        symbol = child.c
                        q.append(child)
                        freqs = arithmeticcoding.SimpleFrequencyTable(overall_freqs)
                        enc.write(freqs, symbol)
    compressed_tree = enc.finish()
    return compressed_tree
def read_frequencies(bitin):
    """Read 256 big-endian 32-bit counts from ``bitin`` into a table.

    Each count is assembled bit by bit from ``bitin.read_no_eof()``. A
    final entry of 1 is appended for the EOF symbol.

    Returns:
        An ``arithmeticcoding.SimpleFrequencyTable`` with 257 entries.
    """
    counts = []
    for _ in range(256):
        value = 0
        for _bit in range(32):
            # Most significant bit arrives first.
            value = (value << 1) | bitin.read_no_eof()
        counts.append(value)
    counts.append(1)  # EOF symbol
    return arithmeticcoding.SimpleFrequencyTable(counts)
def decompress(bitin, out):
    """Decode an adaptive arithmetic-coded byte stream from ``bitin``.

    Each decoded symbol below 256 is written to ``out`` as a single byte
    and counted in the adaptive table; symbol 256 ends decoding.
    """
    model = arithmeticcoding.SimpleFrequencyTable(
        arithmeticcoding.FlatFrequencyTable(257))
    decoder = arithmeticcoding.ArithmeticDecoder(32, bitin)
    value = decoder.read(model)
    while value != 256:  # 256 is the EOF symbol
        out.write(bytes([value]))
        model.increment(value)
        value = decoder.read(model)
def generate_freqs(pro, first_step=False, resolution=1e9):
    """Build a frequency table from a probability vector.

    The table has ``len(characters) + 1`` entries, all set to 1. Unless
    ``first_step`` is True, entry ``i`` is replaced by
    ``pro[i] * resolution`` converted to int64 whenever that value
    exceeds 1.

    Args:
        pro: Array of probabilities; only read when ``first_step`` is
            False.
        first_step: When not ``False``, ``pro`` is ignored.
        resolution: Scale factor applied to each probability.

    Returns:
        The populated ``arithmeticcoding.SimpleFrequencyTable``.
    """
    alphabet_size = len(characters)
    table = arithmeticcoding.SimpleFrequencyTable([0] * (1 + alphabet_size))
    for index in range(alphabet_size):
        table.set(index, 1)
    if first_step is False:
        for index in range(pro.shape[0]):
            scaled = (pro[index] * resolution).astype(np.int64)
            if scaled > 1:
                table.set(index, scaled)
    table.set(alphabet_size, 1)  # \n
    return table
def compress(inp, bitout):
    """Arithmetic-code every byte of ``inp`` into ``bitout``.

    Bytes are read one at a time and encoded against an adaptive
    257-symbol table. Symbol 256 is written as the EOF marker before the
    encoder is finished.
    """
    model = arithmeticcoding.SimpleFrequencyTable(
        arithmeticcoding.FlatFrequencyTable(257))
    encoder = arithmeticcoding.ArithmeticEncoder(32, bitout)
    chunk = inp.read(1)
    while len(chunk) != 0:
        byte = chunk[0]
        encoder.write(model, byte)
        model.increment(byte)
        chunk = inp.read(1)
    encoder.write(model, 256)  # EOF
    encoder.finish()  # flush remaining code bits
def comparess(file1, model, indices_char):
    """Compress the text file ``file1`` into ``file1 + '.comp'``.

    The output is a sequence of ``(guesses, literal)`` pairs: ``guesses``
    is the number of consecutive characters ``predict`` got right
    (capped at 255) and ``literal`` is the code point of the character
    that ended the run. The first ``maxlen`` characters are always
    written as ``(0, literal)``. A trailing run of correct guesses is
    written as a bare count, followed by ``MAGIC_EOF``.

    Args:
        file1: Path of the text file to compress.
        model: Passed through to ``predict``.
        indices_char: Passed through to ``predict``.

    Returns:
        None.

    Relies on the module-level names ``AE_SIZE``, ``MAGIC_EOF``,
    ``maxlen`` and ``predict``.
    """
    # This is painfully slow: on the order of .02 seconds per character
    # guess, i.e. ~16 minutes for a 50k character file. If at all possible
    # it should be revised so that it mostly runs on the GPU.
    with open(file1, 'r') as source:
        f1 = source.read()
    data_size = len(f1)
    i = 0
    bitout = arithmeticcoding.BitOutputStream(open(file1 + '.comp', "wb"))
    try:
        initfreqs = arithmeticcoding.FlatFrequencyTable(AE_SIZE)
        freqs = arithmeticcoding.SimpleFrequencyTable(initfreqs)
        enc = arithmeticcoding.ArithmeticEncoder(bitout)
        guesses_right = 0
        while i < data_size:
            current = ord(f1[i])
            if i < maxlen:
                # Always 'guessing' zero correctly before maxlen
                enc.write(freqs, 0)
                freqs.increment(0)
                enc.write(freqs, current)
                freqs.increment(current)
            else:
                guess = predict(f1[(i - maxlen):i], model, indices_char)
                if (f1[i] == guess and guesses_right < 255):
                    guesses_right += 1
                    print("Guessed", f1[i], "correctly")
                else:
                    enc.write(freqs, guesses_right)
                    print("Wrong guess. Outputing", guesses_right,
                          "correct guesses")
                    freqs.increment(guesses_right)
                    print(i, "Outputing char", current)
                    enc.write(freqs, current)
                    freqs.increment(current)
                    guesses_right = 0
            if (i % 100 == 0):
                print("i:", i)
            i += 1
        if guesses_right > 0:
            enc.write(freqs, guesses_right)
            # Every other guess count is followed by a table update; do
            # the same here so the model used for the EOF symbol matches
            # a reader that updates after each count it reads.
            freqs.increment(guesses_right)
        enc.write(freqs, MAGIC_EOF)
        print("out eof sanity check")
        enc.finish()
    finally:
        # Close the output stream even if prediction or encoding fails.
        bitout.close()
    return None
def decompress(input_file):
    """Decode an arithmetic-coded pickle from ``input_file`` and load it.

    Symbols are decoded against an adaptive 257-symbol table until the
    EOF symbol (256) is read; the collected bytes are then unpickled.

    Args:
        input_file: Path of the compressed file.

    Returns:
        The object produced by ``pickle.loads`` on the decoded bytes.
    """
    decode = bytearray()
    with open(input_file, "rb") as inp:
        bitin = arithmeticcoding.BitInputStream(inp)
        initfreqs = arithmeticcoding.FlatFrequencyTable(257)
        freqs = arithmeticcoding.SimpleFrequencyTable(initfreqs)
        dec = arithmeticcoding.ArithmeticDecoder(32, bitin)
        while True:
            symbol = dec.read(freqs)
            if symbol == 256:  # EOF symbol
                break
            # Append the int directly instead of allocating a one-byte
            # bytes object per symbol.
            decode.append(symbol)
            freqs.increment(symbol)
    # SECURITY: pickle.loads can execute arbitrary code. Only call this on
    # files that come from a trusted source.
    return pickle.loads(decode)
def __init__(self, path, ispaired, kmerLen, verbose, sequenceTable):
    """Initialise encoder state and open the compressed output streams.

    Args:
        path: Output path prefix. The files ``path + ".bifurL"`` and
            ``path + ".bifurR"`` are opened for binary writing here.
        ispaired: Stored as ``self.paired``.
        kmerLen: Stored as both ``self.kmerLen`` and ``self.indexLen``.
        verbose: Stored as ``self.verbose``.
        sequenceTable: Stored as ``self.sequenceTable``.

    Statement order is kept as in the original because the helper calls
    (``openFileLeft``, ``openFileRight``, ``removeOutputFile``) run
    against whatever attributes are already set at that point.
    """
    self.mutiDict = {}  # kmers for buckets
    self.sequenceTable = sequenceTable
    self.kmerLen = kmerLen
    self.indexLen = kmerLen
    self.paired = ispaired
    self.seqLen = 0
    #self.bucketDict = defaultdict(lambda : defaultdict(dict))
    self.bucketDict = {}  # nested_dict(2, int)
    self.encodeBucketPath = {}
    self.newNodeNum = 0
    self.simpleNodeNum = 0
    self.tipNodeNum = 0
    self.bifurNodeNum = 0
    self.deleteBifurRatio = 0.2
    self.outPutPath = path

    # Nucleotide lookup tables. (The original assigned num2dna twice with
    # the same literal; the redundant second assignment was removed.)
    self.dna2num = {"A": 0, "C": 1, "G": 2, "T": 3}
    self.num2dna = {0: "A", 1: "C", 2: "G", 3: "T"}
    self.dna2bit = {"A": '0b00', "C": '0b01', "G": '0b10', "T": '0b11'}

    self.firstSeq = BitStream()
    self.numFlag = BitStream()

    # Adaptive frequency tables, each starting from a flat distribution.
    self.freq3 = arithmeticcoding.SimpleFrequencyTable(
        arithmeticcoding.FlatFrequencyTable(3))
    self.freq4 = arithmeticcoding.SimpleFrequencyTable(
        arithmeticcoding.FlatFrequencyTable(4))
    self.freqs = arithmeticcoding.SimpleFrequencyTable(
        arithmeticcoding.FlatFrequencyTable(4))

    self.bitoutL = arithmeticcoding.BitOutputStream(
        open(self.outPutPath + ".bifurL", "wb"))
    self.bitoutR = arithmeticcoding.BitOutputStream(
        open(self.outPutPath + ".bifurR", "wb"))
    self.encodeSeqPathL = self.openFileLeft()
    self.encodeSeqPathR = self.openFileRight()
    self.verbose = verbose
    self.removeOutputFile()
def decompress_next(self, new_freq_table_256):
    """Decode one symbol, optionally installing a new frequency table first.

    Args:
        new_freq_table_256: When this is a ``list`` or ``set``, its items
            plus one extra entry of 1 replace ``self.freqsTable``. Any
            other value leaves the current table in place.

    Returns:
        The decoded symbol. Symbols below 256 are also written to
        ``self.out`` as a single byte.
    """
    if isinstance(new_freq_table_256, (list, set)):
        extended = list(new_freq_table_256) + [1]
        self.freqsTable = arithmeticcoding.SimpleFrequencyTable(extended)
    #self.decoder = arithmeticcoding.ArithmeticDecoder(32, self.bitin)
    decoded = self.decoder.read(self.freqsTable)
    if decoded < 256:
        self.out.write(bytes([decoded]))
    return decoded
def compress(snp, numsymbol):
    """Arithmetic-code a 3-D array and return the encoder's bit count.

    After ``np.squeeze`` the array must have exactly three axes. Values
    are encoded with the last axis outermost, then the first axis, then
    the second, against an adaptive table of ``numsymbol`` symbols.
    Symbol ``numsymbol - 1`` is written last as the EOF marker.

    Returns:
        ``enc.bit_nums`` after the encoder has been finished.
    """
    table = arithmeticcoding.SimpleFrequencyTable(
        arithmeticcoding.FlatFrequencyTable(numsymbol))
    enc = arithmeticcoding.ArithmeticEncoder(32)
    volume = np.squeeze(snp)
    _height, _width, planes = volume.shape
    for plane_index in range(planes):
        # .flat walks the 2-D slice row by row, i.e. first axis outer,
        # second axis inner.
        for value in volume[:, :, plane_index].flat:
            enc.write(table, value)
            table.increment(value)
    enc.write(table, numsymbol - 1)  # EOF
    enc.finish()  # flush remaining code bits
    return enc.bit_nums
def read_frequencies(bitin):
    """Read 256 32-bit counts from ``bitin`` and return them as a table.

    Each count comes from ``read_int(bitin, 32)``. A final entry of 1 is
    appended for the EOF symbol.
    """
    counts = [read_int(bitin, 32) for _ in range(256)] + [1]
    return arithmeticcoding.SimpleFrequencyTable(counts)
def __init__(self, symbols, hassubctx):
    """Create a context with an all-zero table of ``symbols`` entries.

    ``self.subcontexts`` is a list of ``symbols`` ``None`` slots when
    ``hassubctx`` is truthy, otherwise ``None``.
    """
    self.frequencies = arithmeticcoding.SimpleFrequencyTable([0] * symbols)
    if hassubctx:
        self.subcontexts = [None] * symbols
    else:
        self.subcontexts = None
def inferenceNN(self, x, M, N, overall_freqs, L, activationFunction):
    """Compute one layer's output while decoding its tree-coded weights.

    Symbols are read from an ``ArithmeticDecoder`` built on ``L`` and
    processed through a FIFO queue of node values that starts with
    ``N``. Each decoded weight index ``c`` adds
    ``uc().index_to_weight(c) * x[level]`` to one or more entries of the
    output. Every decoded symbol is also re-encoded with a fresh encoder
    whose result is discarded.

    Args:
        x: Input vector, indexed by ``level``.
        M: Upper bound on ``level``; the loop stops once it is reached.
        N: Number of hidden nodes (the weights are of dimension MxN);
            also the length of the output.
        overall_freqs: Frequency list for the weight symbols.
        L: Passed to ``arithmeticcoding.ArithmeticDecoder``.
        activationFunction: ``'ReLU'``, ``'sigmoid'`` or ``None``; any
            other value also leaves the output untouched.

    Returns:
        Tuple ``(y, avg_queue_length, max_queue_length)`` where ``y`` is
        a NumPy array. ``avg_queue_length`` is 0 when no symbol was
        decoded.

    Side effects:
        Sets ``self.w`` to the number of decoded symbols and prints
        progress information.

    NOTE(review): the source this was recovered from had lost its
    indentation; the nesting below is reconstructed from syntax and data
    flow - confirm against the original file.
    """
    y = [0] * N
    enc = arithmeticcoding.ArithmeticEncoder()
    dec = arithmeticcoding.ArithmeticDecoder(L)
    q = deque([N])
    self.w = 0
    # Each queued value v is charged floor(2*log2(v+1)+1) towards the
    # queue-length statistics.
    initial_length = floor(2 * log2(N + 1) + 1)
    tot_queue_length = initial_length
    max_queue_length = initial_length
    current_queue_length = initial_length
    j = 0  # index of the next output entry to fill
    level = 0  # index into x
    flag = 0  # set once a non-zero child was seen at this level
    flagp = 0  # ensures the queue length is printed only once
    k = len(overall_freqs)
    print('M:', M, 'N:', N)
    while len(q) != 0 and level < M:
        currentNodeValue = q.popleft()
        current_queue_length -= floor(2 * log2(currentNodeValue + 1) + 1)
        if flagp == 0:
            print('current_queue_length', current_queue_length)
            flagp = 1
        if currentNodeValue > 1:
            c = 0  # colour initialised with 0
            # The k-th colour need not be encoded.
            while c <= k - 1 and currentNodeValue > 0:
                binomial_frequencies = ec().binomial_encoder_frequencies(
                    overall_freqs[c:], currentNodeValue)
                freqs = arithmeticcoding.SimpleFrequencyTable(
                    binomial_frequencies)
                childNodeValue = dec.read(freqs)
                enc.write(freqs, childNodeValue)
                currentNodeValue -= childNodeValue
                q.append(childNodeValue)
                current_queue_length += floor(
                    2 * log2(childNodeValue + 1) + 1)
                max_queue_length = max(max_queue_length,
                                       current_queue_length)
                tot_queue_length += current_queue_length
                self.w += 1
                if childNodeValue > 0:
                    flag = 1
                # The next childNodeValue outputs all share weight c.
                for i in range(childNodeValue):
                    y[j + i] += uc().index_to_weight(c) * x[level]
                c = c + 1
                j = (j + childNodeValue) % N
                if j == 0 and flag:
                    level = level + 1
                    flag = 0
        elif currentNodeValue == 1:
            freqs = arithmeticcoding.SimpleFrequencyTable(overall_freqs)
            c = dec.read(freqs)
            enc.write(freqs, c)
            q.append(1)
            current_queue_length += 3
            max_queue_length = max(max_queue_length, current_queue_length)
            tot_queue_length += current_queue_length
            self.w += 1
            # A single node owns exactly output j. The original indexed
            # y[j + i] with `i` left over from an earlier loop (or not
            # defined at all), which hit the wrong slot or raised.
            y[j] += uc().index_to_weight(c) * x[level]
            j = (j + 1) % N
            if j == 0:
                level += 1
    # Avoid ZeroDivisionError when the loop decoded nothing.
    avg_queue_length = tot_queue_length / self.w if self.w else 0
    enc.finish()  # result unused; return it here if ever needed
    y = np.array(y)
    if activationFunction == 'ReLU':
        y = uc().ReLU(y)
    elif activationFunction == 'sigmoid':
        y = uc().sigmoid(y)
    # activationFunction is None (or anything else): y is left unchanged
    return y, avg_queue_length, max_queue_length
def read_frequencies(self, frequencies):
    """Return ``frequencies`` wrapped in a ``SimpleFrequencyTable``."""
    table = arithmeticcoding.SimpleFrequencyTable(frequencies)
    return table
def compress_next(self, new_freq_table_256, symbol_number):
    """Encode one symbol, optionally installing a new frequency table first.

    Args:
        new_freq_table_256: When this is a ``list`` or ``set``, its items
            plus one extra entry of 1 replace ``self.freqsTable``. Any
            other value leaves the current table in place.
        symbol_number: Symbol passed to ``self.encoder.write``.
    """
    if isinstance(new_freq_table_256, (list, set)):
        extended = list(new_freq_table_256) + [1]
        self.freqsTable = arithmeticcoding.SimpleFrequencyTable(extended)
    #self.encoder = arithmeticcoding.ArithmeticEncoder(32, self.bitout)
    self.encoder.write(self.freqsTable, symbol_number)
    ## set new frequency for the symbol
    #self.freqsTable.set(symbol_number, freq_pred)

## Returns a frequency table based on the bytes in the given file.
## Also contains an extra entry for symbol 256, whose frequency is set to 0.
# def get_frequencies(self, filepath):
#     freqs = arithmeticcoding.SimpleFrequencyTable([0] * 257)
#     with open(filepath, "rb") as input:
#         while True:
#             b = input.read(1)
#             if len(b) == 0:
#                 break
#             freqs.increment(b[0])
#     return freqs

# def write_frequencies(self, bitout, freqs):
#     for i in range(256):
#         write_int(bitout, 32, freqs.get(i))

# def compress(self, freqs, inp, bitout):
#     enc = arithmeticcoding.ArithmeticEncoder(32, bitout)
#     while True:
#         symbol = inp.read(1)
#         if len(symbol) == 0:
#             break
#         enc.write(freqs, symbol[0])
#     enc.write(freqs, 256)  # EOF
#     enc.finish()  # Flush remaining code bits

## Writes an unsigned integer of the given bit width to the given stream.
#def write_int(bitout, numbits, value):
#    for i in reversed(range(numbits)):
#        bitout.write((value >> i) & 1)  # Big endian
#
#
## Command line main application function.
#def main(args): # # Handle command line arguments # if len(args) != 2: # sys.exit("Usage: python arithmeticcompress.py InputFile OutputFile") # inputfile, outputfile = args # # # Read input file once to compute symbol frequencies # freqs = get_frequencies(inputfile) # freqs.increment(256) # EOF symbol gets a frequency of 1 # # # Read input file again, compress with arithmetic coding, and write output file # with open(inputfile, "rb") as inp, \ # contextlib.closing(arithmeticcoding.BitOutputStream(open(outputfile, "wb"))) as bitout: # write_frequencies(bitout, freqs) # compress(freqs, inp, bitout) # # ## Main launcher #if __name__ == "__main__": # main(sys.argv[1:])