def __init__(self, inPutPath, outPutPath, isPaired, readNum, bucketIndexLen,
             lossless, verbose):
    """Set up decoder state, open the encoded side files and the output file.

    Statement order is kept as in the original because helper methods
    (``openFileLeft``, ``openFileRight``, ``openOutFile``) are invoked while
    the instance is still being populated.

    Args:
        inPutPath: Path prefix of the encoded input; ``.bifurL`` and
            ``.bifurR`` are appended to locate the two bit streams.
        outPutPath: Path prefix of the decoded output (``.dna``,
            ``_1.dna`` and ``_2.dna`` names are derived from it).
        isPaired: Stored as ``self.paired``.
        readNum: Stored as ``self.readNum``.
        bucketIndexLen: Used for both ``self.kmerLen`` and ``self.indexLen``.
        lossless: Stored as ``self.lossless``.
        verbose: Stored as ``self.verbose``.
    """
    self.mutiDict = {}  # kmers for buckets
    self.sequenceTable = []  # store sequence for output
    self.kmerLen = bucketIndexLen
    self.indexLen = bucketIndexLen
    self.bucketDict = {}  # nested_dict(2, int)
    self.encodeBucketPath = {}
    self.newNodeNum = 0
    self.simpleNodeNum = 0
    self.tipNodeNum = 0
    self.bifurNodeNum = 0
    self.deleteBifurRatio = 0.2
    self.inPutPath = inPutPath
    self.outPutPath = outPutPath

    # Nucleotide lookup tables. The original assigned num2dna twice with the
    # same literal; the redundant second assignment has been dropped.
    self.dna2num = {"A": 0, "C": 1, "G": 2, "T": 3}
    self.num2dna = {0: "A", 1: "C", 2: "G", 3: "T"}
    self.recdna = {"A": "T", "C": "G", "G": "C", "T": "A"}
    self.dna2bit = {"A": '0b00', "C": '0b01', "G": '0b10', "T": '0b11'}

    self.firstSeq = BitStream()
    # NOTE(review): numFlag is re-assigned further down with sream(); this
    # first value is kept only because helper methods run in between.
    self.numFlag = BitStream()

    # Adaptive frequency tables, each starting from a flat distribution.
    self.freq3 = arithmeticcoding.SimpleFrequencyTable(
        arithmeticcoding.FlatFrequencyTable(3))
    self.freq4 = arithmeticcoding.SimpleFrequencyTable(
        arithmeticcoding.FlatFrequencyTable(4))
    self.freqs = arithmeticcoding.SimpleFrequencyTable(
        arithmeticcoding.FlatFrequencyTable(4))

    # Despite the "bitout" names these are *input* bit streams.
    self.bitoutL = arithmeticcoding.BitInputStream(
        open(self.inPutPath + ".bifurL", "rb"))
    self.bitoutR = arithmeticcoding.BitInputStream(
        open(self.inPutPath + ".bifurR", "rb"))
    self.decodeSeqPathL = self.openFileLeft()
    self.decodeSeqPathR = self.openFileRight()

    self.outFileName = outPutPath + ".dna"
    self.paired = isPaired
    self.seqLen = 0  # length for current read
    self.outPairFileName = [outPutPath + "_1.dna", outPutPath + "_2.dna"]
    self.outFile = None
    self.outPairFile = None
    self.readNum = readNum
    self.seqence = ""  # encode seq
    self.bucketIndex = []  # bucket index
    self.bucketCov = []  # reads number in bucket
    self.readIndexPos = []  # index positions in each read
    self.readLen = []
    self.readrc = sream()  # read in forward or backward
    # N in read: indicator flag, position and length
    self.readN = {"flag": sream(), "pos": [], "l": []}
    self.numFlag = sream()  # new nodes indicator
    self.lossless = lossless
    self.verbose = verbose
    self.openOutFile()  # prepare output file
def compress(quantized, output_file):
    """Pickle ``quantized`` and store it arithmetic-coded in ``output_file``.

    The pickled bytes are encoded one byte at a time with an adaptive
    frequency table over 257 symbols (byte values 0-255 plus symbol 256,
    which marks end-of-stream).

    Args:
        quantized: Any picklable object.
        output_file: Path of the file to create or overwrite.
    """
    data = pickle.dumps(quantized)
    with open(output_file, "wb") as file:
        bitout = arithmeticcoding.BitOutputStream(file)
        initfreqs = arithmeticcoding.FlatFrequencyTable(257)
        freqs = arithmeticcoding.SimpleFrequencyTable(initfreqs)
        enc = arithmeticcoding.ArithmeticEncoder(32, bitout)
        for symbol in data:
            # Encode one byte, then let the model adapt to it.
            enc.write(freqs, symbol)
            freqs.increment(symbol)
        enc.write(freqs, 256)  # EOF
        enc.finish()  # Flush remaining code bits
        # The bit stream buffers a partial byte; closing it pads and writes
        # that byte. Without this the last few code bits never reach the
        # file. Closing also closes ``file``; the ``with`` exit is then a
        # harmless second close.
        bitout.close()
def decomparess(inputfile, outfile, model):
    """Decode ``inputfile`` into text written to ``outfile``.

    The stream is a sequence of (guess count, literal) pairs terminated by
    ``MAGIC_EOF``. For every pair, ``guess count`` characters are regenerated
    with ``predict`` and then the literal character is emitted.

    Relies on the module-level names ``AE_SIZE``, ``MAGIC_EOF``, ``maxlen``,
    ``indices_char`` and ``predict``.

    Args:
        inputfile: Path of the arithmetic-coded input.
        outfile: Path of the text file to create.
        model: Passed through unchanged to ``predict``.
    """
    bitin = arithmeticcoding.BitInputStream(open(inputfile, "rb"))
    try:
        initfreqs = arithmeticcoding.FlatFrequencyTable(AE_SIZE)
        freqs = arithmeticcoding.SimpleFrequencyTable(initfreqs)
        dec = arithmeticcoding.ArithmeticDecoder(bitin)
        prev_chars = []

        def remember(char):
            # Sliding window of the last ``maxlen`` emitted characters.
            prev_chars.append(char)
            if len(prev_chars) > maxlen:
                prev_chars.pop(0)

        i = 0
        with open(outfile, "w") as out:
            while True:
                guesses = dec.read(freqs)
                if guesses == MAGIC_EOF:
                    break
                print('guesses', guesses)
                freqs.increment(guesses)
                for _ in range(guesses):
                    # NOTE(review): the context is passed as a list of
                    # characters here - confirm ``predict`` accepts that.
                    char = predict(prev_chars, model, indices_char)
                    out.write(char)
                    # Predicted characters are part of the text, so they must
                    # enter the context too; otherwise every prediction in
                    # this run would see the same stale window.
                    remember(char)
                print("i", i)
                literal = dec.read(freqs)
                if literal == MAGIC_EOF:
                    # A trailing run of guesses is followed directly by the
                    # end marker rather than by a literal character.
                    break
                print('lit', chr(literal))
                out.write(chr(literal))
                freqs.increment(literal)
                remember(chr(literal))
                i = i + 1
    finally:
        # Release the input file even when decoding fails part-way.
        bitin.close()
def start(self, dictionary_size=256):
    """Open the output file and create the encoder objects.

    Sets ``self.dictionary_size``, ``self.bitout``, ``self.freqsTable`` and
    ``self.encoder``. The frequency table is flat and has one more entry
    than ``dictionary_size``.

    Args:
        dictionary_size: Stored as ``self.dictionary_size``.
    """
    self.dictionary_size = dictionary_size

    sink = open(self.outputfile, "wb")
    self.bitout = arithmeticcoding.BitOutputStream(sink)

    # A non-uniform SimpleFrequencyTable start-up was left commented out in
    # the original; the flat table is what is actually used.
    table_size = self.dictionary_size + 1
    self.freqsTable = arithmeticcoding.FlatFrequencyTable(table_size)

    self.encoder = arithmeticcoding.ArithmeticEncoder(32, self.bitout)
def start(self, dictionary_size=256):
    """Open the input and output files and create the decoder objects.

    Sets ``self.dictionary_size``, ``self.inp``, ``self.out``,
    ``self.bitin``, ``self.freqsTable`` and ``self.decoder``. The frequency
    table is flat and has one more entry than ``dictionary_size``.

    Args:
        dictionary_size: Stored as ``self.dictionary_size``.

    Raises:
        OSError: If either file cannot be opened. When the output file is
            the one that fails, the already-opened input file is closed
            before the error propagates.
    """
    self.dictionary_size = dictionary_size
    self.inp = open(self.inputfile, "rb")
    try:
        self.out = open(self.outputfile, "wb")
    except Exception:
        # Do not leak the input handle when the output cannot be created.
        self.inp.close()
        raise
    self.bitin = arithmeticcoding.BitInputStream(self.inp)
    # A non-uniform SimpleFrequencyTable start-up was left commented out in
    # the original; the flat table is what is actually used.
    self.freqsTable = arithmeticcoding.FlatFrequencyTable(
        self.dictionary_size + 1)
    self.decoder = arithmeticcoding.ArithmeticDecoder(32, self.bitin)
def decompress(bitin, out):
    """Decode bytes from ``bitin`` and write them to ``out`` until EOF.

    Uses an adaptive frequency table over 257 symbols; symbol 256 is the
    end-of-stream marker and is not written.

    Args:
        bitin: Bit input stream handed to ``ArithmeticDecoder``.
        out: Binary writable object; receives one ``bytes`` of length 1 per
            decoded symbol.
    """
    eof_symbol = 256
    model = arithmeticcoding.SimpleFrequencyTable(
        arithmeticcoding.FlatFrequencyTable(eof_symbol + 1))
    decoder = arithmeticcoding.ArithmeticDecoder(32, bitin)

    # iter() with a sentinel stops as soon as the EOF symbol is decoded.
    for value in iter(lambda: decoder.read(model), eof_symbol):
        out.write(bytes([value]))
        model.increment(value)
def __init__(self, order, symbollimit, escapesymbol):
    """Validate the arguments and build the initial model state.

    Args:
        order: Model order; must be -1 or greater.
        symbollimit: Number of symbols; must be greater than 0.
        escapesymbol: Escape symbol; must satisfy
            ``0 <= escapesymbol < symbollimit``.

    Raises:
        ValueError: If any argument is out of range.
    """
    # Guard clauses, checked in the same order as a single or-chain would.
    if order < -1:
        raise ValueError()
    if symbollimit <= 0:
        raise ValueError()
    if not (0 <= escapesymbol < symbollimit):
        raise ValueError()

    self.model_order = order
    self.symbol_limit = symbollimit
    self.escape_symbol = escapesymbol

    if order >= 0:
        # The second argument is True only for orders of 1 and above.
        self.root_context = PpmModel.Context(symbollimit, order >= 1)
        self.root_context.frequencies.increment(escapesymbol)
    else:
        self.root_context = None

    flat_table = arithmeticcoding.FlatFrequencyTable(symbollimit)
    self.order_minus1_freqs = flat_table
def compress(inp, bitout):
    """Arithmetic-encode every byte read from ``inp`` into ``bitout``.

    Uses an adaptive frequency table over 257 symbols; symbol 256 is written
    last as the end-of-stream marker. Neither stream is closed here.

    Args:
        inp: Binary readable object, consumed one byte at a time.
        bitout: Bit output stream handed to ``ArithmeticEncoder``.
    """
    eof_symbol = 256
    model = arithmeticcoding.SimpleFrequencyTable(
        arithmeticcoding.FlatFrequencyTable(eof_symbol + 1))
    encoder = arithmeticcoding.ArithmeticEncoder(32, bitout)

    chunk = inp.read(1)
    while len(chunk) != 0:
        byte = chunk[0]
        encoder.write(model, byte)
        model.increment(byte)
        chunk = inp.read(1)

    encoder.write(model, eof_symbol)  # EOF
    encoder.finish()  # Flush remaining code bits
def comparess(file1, model, indices_char):
    """Compress text file ``file1`` into ``file1 + '.comp'``.

    For each character past the first ``maxlen`` the model is asked to
    predict it. Runs of correct predictions are emitted as a count (capped
    at 255), followed by the first character that was not predicted. The
    first ``maxlen`` characters are always emitted as count 0 plus the
    literal. The stream ends with ``MAGIC_EOF``.

    Relies on the module-level names ``AE_SIZE``, ``MAGIC_EOF``, ``maxlen``
    and ``predict``.

    This is slow: the original author measured roughly 0.02 s per predicted
    character (about 16 minutes for a 50k character file) and suggested
    moving the prediction to the GPU.

    Args:
        file1: Path of the text file to compress.
        model: Passed through unchanged to ``predict``.
        indices_char: Passed through unchanged to ``predict``.

    Returns:
        None.
    """
    with open(file1, 'r') as source:
        f1 = source.read()
    data_size = len(f1)

    bitout = arithmeticcoding.BitOutputStream(open(file1 + '.comp', "wb"))
    try:
        initfreqs = arithmeticcoding.FlatFrequencyTable(AE_SIZE)
        freqs = arithmeticcoding.SimpleFrequencyTable(initfreqs)
        enc = arithmeticcoding.ArithmeticEncoder(bitout)
        guesses_right = 0

        for i in range(data_size):
            current = ord(f1[i])
            if i < maxlen:
                # Not enough context yet: emit "zero correct guesses" and
                # then the literal character.
                enc.write(freqs, 0)
                freqs.increment(0)
                enc.write(freqs, current)
                freqs.increment(current)
            else:
                guess = predict(f1[(i - maxlen):i], model, indices_char)
                if (f1[i] == guess and guesses_right < 255):
                    guesses_right += 1
                    print("Guessed", f1[i], "correctly")
                else:
                    enc.write(freqs, guesses_right)
                    print("Wrong guess. Outputing", guesses_right,
                          "correct guesses")
                    freqs.increment(guesses_right)
                    print(i, "Outputing char", current)
                    enc.write(freqs, current)
                    freqs.increment(current)
                    guesses_right = 0
            if (i % 100 == 0):
                print("i:", i)

        if guesses_right > 0:
            enc.write(freqs, guesses_right)
            # Keep the adaptive model in step with the decoder, which
            # increments after every guess count it reads; without this the
            # EOF marker below is coded against a different table.
            freqs.increment(guesses_right)
        enc.write(freqs, MAGIC_EOF)
        print("out eof sanity check")
        enc.finish()
    finally:
        # Flush and release the output file even if encoding fails.
        bitout.close()
    return None
def decompress(input_file):
    """Decode ``input_file`` and return the unpickled object it contains.

    The file is read with an adaptive frequency table over 257 symbols;
    symbol 256 marks end-of-stream. The decoded bytes are then passed to
    ``pickle.loads``.

    NOTE(review): ``pickle.loads`` can execute arbitrary code - only use
    this on files from a trusted source.

    Args:
        input_file: Path of the arithmetic-coded file.

    Returns:
        The object produced by unpickling the decoded bytes.
    """
    eof_symbol = 256
    payload = bytearray()
    with open(input_file, "rb") as inp:
        bitin = arithmeticcoding.BitInputStream(inp)
        model = arithmeticcoding.SimpleFrequencyTable(
            arithmeticcoding.FlatFrequencyTable(eof_symbol + 1))
        decoder = arithmeticcoding.ArithmeticDecoder(32, bitin)

        symbol = decoder.read(model)
        while symbol != eof_symbol:
            payload.append(symbol)
            model.increment(symbol)
            symbol = decoder.read(model)
    return pickle.loads(payload)
def __init__(self, path, ispaired, kmerLen, verbose, sequenceTable):
    """Set up encoder state and open the ``.bifurL`` / ``.bifurR`` outputs.

    Statement order is kept as in the original because helper methods
    (``openFileLeft``, ``openFileRight``, ``removeOutputFile``) are invoked
    while the instance is still being populated.

    Args:
        path: Output path prefix; ``.bifurL`` and ``.bifurR`` are appended
            for the two bit streams.
        ispaired: Stored as ``self.paired``.
        kmerLen: Used for both ``self.kmerLen`` and ``self.indexLen``.
        verbose: Stored as ``self.verbose``.
        sequenceTable: Stored as ``self.sequenceTable``.
    """
    self.mutiDict = {}  # kmers for buckets
    self.sequenceTable = sequenceTable
    self.kmerLen = kmerLen
    self.indexLen = kmerLen
    self.paired = ispaired
    self.seqLen = 0
    self.bucketDict = {}  # nested_dict(2, int)
    self.encodeBucketPath = {}
    self.newNodeNum = 0
    self.simpleNodeNum = 0
    self.tipNodeNum = 0
    self.bifurNodeNum = 0
    self.deleteBifurRatio = 0.2
    self.outPutPath = path

    # Nucleotide lookup tables. The original assigned num2dna twice with the
    # same literal; the redundant second assignment has been dropped.
    self.dna2num = {"A": 0, "C": 1, "G": 2, "T": 3}
    self.num2dna = {0: "A", 1: "C", 2: "G", 3: "T"}
    self.dna2bit = {"A": '0b00', "C": '0b01', "G": '0b10', "T": '0b11'}

    self.firstSeq = BitStream()
    self.numFlag = BitStream()

    # Adaptive frequency tables, each starting from a flat distribution.
    self.freq3 = arithmeticcoding.SimpleFrequencyTable(
        arithmeticcoding.FlatFrequencyTable(3))
    self.freq4 = arithmeticcoding.SimpleFrequencyTable(
        arithmeticcoding.FlatFrequencyTable(4))
    self.freqs = arithmeticcoding.SimpleFrequencyTable(
        arithmeticcoding.FlatFrequencyTable(4))

    self.bitoutL = arithmeticcoding.BitOutputStream(
        open(self.outPutPath + ".bifurL", "wb"))
    self.bitoutR = arithmeticcoding.BitOutputStream(
        open(self.outPutPath + ".bifurR", "wb"))
    self.encodeSeqPathL = self.openFileLeft()
    self.encodeSeqPathR = self.openFileRight()
    self.verbose = verbose
    self.removeOutputFile()
def compress(snp, numsymbol):
    """Arithmetic-encode a 3-D array and return the encoder's bit count.

    ``snp`` is squeezed and must then have exactly three axes. Values are
    visited with the last axis outermost, then the first axis, then the
    second. After all values, symbol ``numsymbol - 1`` is written as the
    end marker.

    Args:
        snp: Array whose squeezed shape has three axes; each element is
            used directly as a symbol.
        numsymbol: Size of the frequency table.

    Returns:
        ``enc.bit_nums`` as reported by the encoder after ``finish()``.
    """
    model = arithmeticcoding.SimpleFrequencyTable(
        arithmeticcoding.FlatFrequencyTable(numsymbol))
    enc = arithmeticcoding.ArithmeticEncoder(32)

    volume = np.squeeze(snp)
    height, width, depth = volume.shape

    # ndindex walks (c, i, j) with c slowest, the same order as three nested
    # loops over channel, rows, cols.
    for c, i, j in np.ndindex(depth, height, width):
        value = volume[i, j, c]
        enc.write(model, value)
        model.increment(value)

    enc.write(model, numsymbol - 1)  # EOF
    enc.finish()  # Flush remaining code bits
    return enc.bit_nums
def __init__(self, order, symbollimit, escapesymbol):
    """Validate the arguments and build the initial frequency tables.

    Args:
        order: Order of the model; must be at least -1.
        symbollimit: Symbol limit; must be at least 1.
        escapesymbol: Escape symbol; must be non-negative and smaller than
            ``symbollimit``.

    Raises:
        ValueError: If any argument violates the constraints above.
    """
    if order < -1 or symbollimit <= 0 or not (0 <= escapesymbol < symbollimit):
        raise ValueError()

    self.model_order, self.symbol_limit, self.escape_symbol = (
        order, symbollimit, escapesymbol)

    # Building the frequency tables: a root context exists only for
    # non-negative orders, and it starts with one count on the escape symbol.
    self.root_context = None
    if order >= 0:
        self.root_context = PpmModel.Context(symbollimit, order >= 1)
        self.root_context.frequencies.increment(escapesymbol)

    self.order_minus1_freqs = arithmeticcoding.FlatFrequencyTable(
        symbollimit)