예제 #1
0
def test_chunkpos_iter():
    """Check the (begin, end) pairs produced by `chunkpos_iter`.

    A sequence of length 10 is chunked with n-grams of size 3 and three
    different window widths (5, 8, and a width exceeding the sequence).
    """

    seq = tuple(range(10))
    nsize = 3
    lseq = len(seq)

    def check(w, expected):
        # zip() stops at the shorter of its inputs, so the pairs are only
        # compared position by position; a difference in the number of
        # chunks is not detected here.
        for observed, wanted in zip(chunkpos_iter(nsize, lseq, w), expected):
            assert observed == wanted

    # Expected layout for w = 5:
    #
    # |0 1 2 3 4 5 6 7 8 9|
    #  |---------|     :  :
    #  :     |---------|  :
    #  :     :   : |------|
    #  0     :   5 :   :  :
    #        3     :   8  :
    #              6      |
    check(5, ((0, 5), (3, 8), (6, lseq)))

    check(8, ((0, 8), (6, lseq)))

    # window size larger than the sequence
    check(int(lseq * 1.1), ((0, lseq), ))
예제 #2
0
    def add(self, seq, hashbuffer=array.array('Q', [0,]*250)):
        """Add all sub-sequences of length `self.nsize` found in `seq`.

        The sequence is walked in chunks delimited by `chunkpos_iter`, the
        chunk width being the length of `hashbuffer`. Each chunk is passed
        to the hash function, and the resulting hash values are handed to
        `self._add` together with the current top of the heap.

        - seq: a bytes-like sequence that can be sliced, the slices being
               consumed by the function in the property `hashfun` (given
               to the constructor)
        - hashbuffer: a buffer array to store hash values during batch C
               calls. Its length must be at least `self.nsize`. The default
               array is built once, when the method is defined, and is
               therefore shared as scratch space by every call that omits
               this argument.

        Nothing is returned. `self._nvisited` is incremented by the value
        returned by the hash function for each chunk (presumably the number
        of hash values written to the buffer -- confirm against `hashfun`).
        """
        hashfun = self._hashfun
        seed = self._seed
        heap = self._heap
        nsize = self._nsize
        lseq = len(seq)

        # The buffer receives the hashes of one chunk at a time, so its
        # length is used as the width of the chunking window.
        w = len(hashbuffer)
        assert nsize <= w

        anynew = self._anynew
        make_elt = self._make_elt
        extracthash = self._extracthash
        # Start from the hash of the current heap root, or from the
        # initial value when the heap is still empty.
        if len(heap) > 0:
            heaptop = extracthash(heap[0])
        else:
            heaptop = self._initheap

        for slice_beg, slice_end in chunkpos_iter(nsize, lseq, w):
            subs = seq[slice_beg:slice_end]  # safe: no out-of-bound in Python
            nsubs = hashfun(subs, nsize, hashbuffer, seed)
            # `_add` returns the updated heap top, carried to the next chunk.
            heaptop = self._add(subs, nsubs, hashbuffer, heaptop,
                                extracthash, make_elt, self._replace, anynew)
            self._nvisited += nsubs
예제 #3
0
def reads_in_chunks(reader, chunksize, nsize, metrics):
    """Group the sequences of `reader` into chunks of about `chunksize`.

    - reader: iterable of (header, sequence, quality) triples; only the
              sequence is used
    - chunksize: cumulated sequence length at which a chunk is emitted,
              and length above which a single sequence is split
    - nsize: passed to `chunkpos_iter` when a long sequence is split
    - metrics: mutable sequence updated in place; `metrics[0]` is set to
              the number of records read once the reader is exhausted,
              and `metrics[1]` is incremented by the length of the
              sequences emitted

    Yields either a list of whole sequences, or, for a sequence longer
    than `chunksize`, one-element tuples each holding a slice of that
    sequence as delimited by `chunkpos_iter`.

    Progress is printed to standard output every 50000 records, and the
    total number of records is printed at the end.
    """
    chunk = list()
    currentsize = 0
    progress_step = 50000
    # Keep `n` defined when `reader` yields nothing at all.
    n = 0
    for n, (header, sequence, quality) in enumerate(reader, 1):
        if n % progress_step == 0:
            print('\r    %i entries' % n, end='', flush=True)
        lseq = len(sequence)
        if lseq > chunksize:
            # Count the long sequence once, not once per piece.
            metrics[1] += lseq
            for beg, end in chunkpos_iter(nsize, lseq, chunksize):
                yield (sequence[beg:end], )
            continue
        chunk.append(sequence)
        currentsize += lseq
        if currentsize >= chunksize:
            metrics[1] += currentsize
            yield chunk
            chunk = list()
            currentsize = 0
    metrics[0] = n
    # Flush what is left, but never emit an empty trailing chunk.
    if chunk:
        metrics[1] += currentsize
        yield chunk
    # `n` already counts from 1, so it is the number of records.
    print('\r    %i records' % n, end='', flush=True)
예제 #4
0
def test_chunkpos_iter():
    """Compare the output of `chunkpos_iter` with expected positions.

    Three window widths are tried on a sequence of length 10 with an
    n-gram size of 3; the last width is larger than the sequence.
    """

    seq = tuple(range(10))
    lseq = len(seq)
    nsize = 3

    # One entry per window width: (w, expected (begin, end) pairs)
    cases = (
        (5, ((0, 5), (3, 8), (6, lseq))),
        (8, ((0, 8), (6, lseq))),
        # windows size larger than sequence
        (int(lseq * 1.1), ((0, lseq), )),
    )

    for w, expected in cases:
        # zip() ends with the shorter input: pairs are checked in order,
        # the number of chunks produced is not.
        for got, want in zip(chunkpos_iter(nsize, lseq, w), expected):
            assert got == want
예제 #5
0
def test_sketch_map_sequences():
    """Check `Sketch.map_sequences` on the chunks of a random sequence.

    A 250-letter random DNA sequence is cut with `chunkpos_iter` (window
    of 100), the chunks are given to `map_sequences`, and the resulting
    object is checked for its `nsize`, `maxsize` and `nvisited`.
    """

    nsize = 21
    maxsize = 10
    mashingpumpkins.parallel.Sketch.initializer(
        minhashsketch.MaxSketch, nsize, maxsize, hasharray, DEFAULT_SEED
    )

    # Fixed seed so the sequence is the same on every run
    random.seed(123)
    alphabet = (b'A', b'T', b'G', b'C')
    sequence = b''.join(random.choice(alphabet) for _ in range(250))
    lseq = len(sequence)

    # Lazy iterator over the chunks; consumed by map_sequences
    chunks = (sequence[start:stop]
              for start, stop in chunkpos_iter(nsize, lseq, 100))
    mhs = mashingpumpkins.parallel.Sketch.map_sequences(chunks)

    assert mhs.nsize == nsize
    assert mhs.maxsize == maxsize
    # Number of n-grams of size `nsize` in the whole sequence
    assert mhs.nvisited == lseq - nsize + 1