def test_chunkpos_iter():
    """Check the (begin, end) windows yielded by `chunkpos_iter`.

    Each case compares the complete series of yielded windows with the
    expected one, so that missing or extra windows make the test fail
    (pairing both series with `zip()` would silently ignore them).
    """
    seq = tuple(range(10))
    nsize = 3

    # Expected windows for w = 5; consecutive windows share
    # 2 (= nsize - 1) positions:
    #
    # index    0 1 2 3 4 5 6 7 8 9
    # [0, 5)   x x x x x
    # [3, 8)         x x x x x
    # [6, 10)              x x x x
    w = 5
    assert tuple(chunkpos_iter(nsize, len(seq), w)) == (
        (0, 5), (3, 8), (6, len(seq)))

    w = 8
    assert tuple(chunkpos_iter(nsize, len(seq), w)) == (
        (0, 8), (6, len(seq)))

    # windows size larger than sequence
    w = int(len(seq) * 1.1)
    assert tuple(chunkpos_iter(nsize, len(seq), w)) == (
        (0, len(seq)), )
def add(self, seq, hashbuffer=array.array('Q', [0,]*250)):
    """
    Add all sub-sequences of length `self.nsize` found in the sequence "seq".

    The sequence is processed in overlapping slices given by
    `chunkpos_iter`, each slice being hashed in one call to the hashing
    function and the resulting hashes handed over to `self._add`.

    - seq: a bytes-like sequence that can be sliced, and the slices be
      consumed by the function in the property `hashfun` (given to the
      constructor)
    - hashbuffer: a buffer array to store hash values during batch C calls.
      Its length is the width of the slices taken from `seq` and must be
      at least `nsize`. The default array is created once, when the
      function is defined, and is shared by every call that does not
      provide its own buffer; pass a dedicated array if calls must not
      share it (e.g. concurrent use).
    """
    # Attributes are bound to local names once, ahead of the loop.
    hashfun = self._hashfun
    seed = self._seed
    heap = self._heap
    nsize = self._nsize
    lseq = len(seq)
    w = len(hashbuffer)
    assert nsize <= w, \
        'hashbuffer (length %i) is shorter than nsize (%i)' % (w, nsize)
    anynew = self._anynew
    make_elt = self._make_elt
    extracthash = self._extracthash

    # Hash of the element currently at the top of the heap, or the
    # initial value when the heap is still empty.
    if len(heap) > 0:
        heaptop = extracthash(heap[0])
    else:
        heaptop = self._initheap

    for slice_beg, slice_end in chunkpos_iter(nsize, lseq, w):
        subs = seq[slice_beg:slice_end]  # safe: no out-of-bound in Python
        # The return value of `hashfun` is used as the number of
        # sub-sequences handled in this batch.
        nsubs = hashfun(subs, nsize, hashbuffer, seed)
        heaptop = self._add(subs, nsubs, hashbuffer, heaptop,
                            extracthash, make_elt, self._replace, anynew)
        self._nvisited += nsubs
def reads_in_chunks(reader, chunksize, nsize, metrics):
    """
    Group the sequences coming from `reader` into chunks.

    - reader: iterable of (header, sequence, quality) triples; only the
      sequence is used
    - chunksize: cumulated sequence length at which a chunk is yielded.
      A single sequence longer than `chunksize` is not added to the
      current chunk: it is cut into slices with `chunkpos_iter` and
      each slice is yielded on its own as a 1-tuple
    - nsize: length of the sub-sequences, passed on to `chunkpos_iter`
    - metrics: mutable sequence updated in place; `metrics[0]` is set to
      the number of records read so far and `metrics[1]` is incremented
      by the length of the sequences yielded (each sequence counted
      once, including the ones yielded as several slices)

    Yields lists of sequences (or 1-tuples for the slices of long
    sequences). Nothing is yielded for an empty `reader`, and no empty
    chunk is yielded at the end.

    A progress line is printed on stdout every 50000 records, and the
    total number of records is printed once the reader is exhausted.
    """
    progress_step = 50000
    chunk = list()
    currentsize = 0
    n = 0  # keeps the final report valid when `reader` is empty
    for n, (_header, sequence, _quality) in enumerate(reader, 1):
        if n % progress_step == 0:
            print('\r %i entries' % n, end='', flush=True)
        # Updated first so that records taking the `continue` branch
        # below are counted too.
        metrics[0] = n
        lseq = len(sequence)
        if lseq > chunksize:
            # The sequence length is counted once, not once per slice.
            metrics[1] += lseq
            for beg, end in chunkpos_iter(nsize, lseq, chunksize):
                yield (sequence[beg:end], )
            continue
        chunk.append(sequence)
        currentsize += lseq
        if currentsize >= chunksize:
            metrics[1] += currentsize
            yield chunk
            chunk = list()
            currentsize = 0
    # Flush the sequences left over, if any.
    if chunk:
        metrics[1] += currentsize
        yield chunk
    # `n` is 1-based: it is already the number of records read.
    print('\r %i records' % n, end='', flush=True)
def test_chunkpos_iter():
    """Check the (begin, end) windows yielded by `chunkpos_iter`.

    For each window width, the complete series of yielded windows must
    equal the expected series: a missing or an extra window fails the
    test (pairing both series with `zip()` would silently ignore them).
    """
    seq = tuple(range(10))
    lseq = len(seq)
    nsize = 3
    # (window width, expected windows)
    cases = (
        (5, ((0, 5), (3, 8), (6, lseq))),
        (8, ((0, 8), (6, lseq))),
        # windows size larger than sequence
        (int(lseq * 1.1), ((0, lseq), )),
    )
    for w, expected in cases:
        assert tuple(chunkpos_iter(nsize, lseq, w)) == expected
def test_sketch_map_sequences():
    """Check `Sketch.map_sequences` on a random sequence given in chunks.

    A 250-long random DNA sequence is cut into overlapping chunks of
    width 100 with `chunkpos_iter`; the sketch built from the chunks
    must report as many visited sub-sequences as the whole sequence
    contains, that is `len(sequence) - nsize + 1`.
    """
    nsize = 21
    maxsize = 10
    mashingpumpkins.parallel.Sketch.initializer(
        minhashsketch.MaxSketch, nsize, maxsize, hasharray, DEFAULT_SEED)

    # Fixed seed: the same sequence is generated on every run.
    random.seed(123)
    alphabet = (b'A', b'T', b'G', b'C')
    sequence = b''.join(random.choice(alphabet) for _ in range(250))

    positions = chunkpos_iter(nsize, len(sequence), 100)
    sequences = (sequence[beg:end] for beg, end in positions)
    mhs = mashingpumpkins.parallel.Sketch.map_sequences(sequences)

    assert mhs.nsize == nsize
    assert mhs.maxsize == maxsize
    assert mhs.nvisited == len(sequence) - nsize + 1