Exemplo n.º 1
0
    def restart_barcode_4sets(self, AT_max, GC_max, prev_bc4sets_fpath, tmp_fpath=None):
        """
        Resume a barcode 4-set search from the output file of a previous run.

        The previous file is read as repeated records of four barcode lines followed by one
        blank separator line. Every barcode read is re-added with ``_add_barcode``, the sets
        are optionally rewritten to ``tmp_fpath``, and ``find_barcode_4sets`` is then called
        to continue after the largest known barcode that starts with 'A'.

            AT_max, GC_max: passed through unchanged to find_barcode_4sets
            prev_bc4sets_fpath :str: path of the previous run's 4-set file
            tmp_fpath :str: optional output path; when given it is overwritten with the
                previous sets and then handed to find_barcode_4sets

        Raises AssertionError if a barcode has the wrong length or a separator line is not
        blank.
        """
        with open(prev_bc4sets_fpath) as f:
            while True:
                try:
                    seq_4set = [next(f).strip() for _ in range(4)]
                except StopIteration:
                    # End of file. A trailing record with fewer than four lines is dropped.
                    break
                assert all(len(seq) == self.bc_len for seq in seq_4set), 'Prev bcs not specified length'
                self.dna_barcode_4sets.append(seq_4set)
                # The default of '' tolerates a file whose last set is not followed by a
                # blank line, instead of letting StopIteration escape from this method.
                separator = next(f, '')
                assert not separator.strip(), separator

        log.info('Read previous barcodes file')

        # Sort once; the same ordering is used for re-adding and for rewriting the file.
        sorted_4sets = sorted(self.dna_barcode_4sets)

        for set_num, seq_4set in enumerate(sorted_4sets, start=1):
            for seq in seq_4set:
                self._add_barcode(dna2num(seq))
            log.info('Added prev set {}: {}'.format(set_num, seq_4set))

        if tmp_fpath:
            with open(tmp_fpath, 'w') as out:
                for seq_4set in sorted_4sets:
                    out.write('\n'.join(seq_4set) + '\n\n')

        # The search resumes after the lexicographically largest 'A' barcode. With no
        # 'A' barcodes at all (e.g. an empty previous file) the search starts from scratch.
        a_seqs = [seq for seq in self.dna_barcodes if seq.startswith('A')]
        last_prev_Aidx = dna2num(max(a_seqs)) if a_seqs else None

        self.find_barcode_4sets(
            AT_max,
            GC_max,
            tmp_fpath=tmp_fpath,
            last_prev_Aidx=last_prev_Aidx,
        )
Exemplo n.º 2
0
 def dna_nums_given_nerr_tup(nerr_tup):
     """
     Return a list of ``seqtools.dna2num`` values, one per sequence produced by
     ``self._freediv_subsphere_given_counts`` for the given error counts.

         nerr_tup: 3-item iterable unpacked as (nsub, ndel, nins)
     """
     nsub, ndel, nins = nerr_tup
     subsphere = self._freediv_subsphere_given_counts(nsub, ndel, nins)
     dna_nums = []
     for subsphere_seq in subsphere:
         dna_nums.append(seqtools.dna2num(subsphere_seq))
     return dna_nums
Exemplo n.º 3
0
    def restart_Conway_closure(self, prev_fpath, tmp_fpath=None):
        """
        Resume a barcode search from a file of previously found barcodes.

        Each non-blank line of ``prev_fpath`` is converted with ``dna2num`` and re-added via
        ``_add_barcode``. The sequence-index iterator is then advanced up to and including
        the first index that is >= the largest previous index, and every later index that
        ``_idx_is_available`` accepts is added as a new barcode.

            prev_fpath :str: file with one previously found barcode per line
            tmp_fpath :str: optional output path; when given it is overwritten with the
                previous barcodes and each newly found barcode is appended to it
        """
        log.info('Restarting {}...'.format(prev_fpath))
        # Context manager so the input file is closed; blank lines are not barcodes.
        with open(prev_fpath) as f:
            prev_bc_idxs = [dna2num(line.strip()) for line in f if line.strip()]
        for bc_idx in prev_bc_idxs:
            self._add_barcode(bc_idx)
            log.info('Adding previous {}: {}'.format(len(self.barcodes),
                                                     num2dna(bc_idx, self.bc_len)))
        # tmp_fpath defaults to None, so only write when a path was actually supplied.
        if tmp_fpath:
            with open(tmp_fpath, 'w') as out:
                for bc_idx in prev_bc_idxs:
                    out.write('{}\n'.format(num2dna(bc_idx, self.bc_len)))

        seq_iter = self.seq_idx_iter_func()
        if prev_bc_idxs:
            # Fast-forward past everything the previous run already examined.
            max_prev = max(prev_bc_idxs)
            for bc_idx in seq_iter:
                if bc_idx >= max_prev:
                    break
            else:
                # Iterator ran out before reaching the last previous barcode: nothing
                # is left to search, so stop cleanly rather than leak StopIteration.
                log.info('Sequence iterator exhausted before reaching last previous barcode: {}'
                         .format(num2dna(max_prev, self.bc_len)))
                return
            log.info('Reached last previous barcode: {}'.format(num2dna(max_prev, self.bc_len)))
            log.info('Restarting after {}'.format(num2dna(bc_idx, self.bc_len)))

        for seq_idx in seq_iter:
            if self._idx_is_available(seq_idx):
                self._add_barcode(seq_idx)
                log.info('Found barcode {}: {}'.format(len(self.barcodes),
                                                       num2dna(seq_idx, self.bc_len)))
                if tmp_fpath:
                    # Re-opened in append mode for every barcode so progress is on disk
                    # as soon as each barcode is found.
                    with open(tmp_fpath, 'a') as out:
                        out.write('{}\n'.format(num2dna(seq_idx, self.bc_len)))
Exemplo n.º 4
0
 def decode(self, seq):
     """
     Look up ``seq`` in the codebook and return the matching codeword.

     The codebook stores 1-based positions into ``self._codewords``; a stored value of 0
     yields ``None``.
     """
     stored = self._codebook[seqtools.dna2num(seq)]
     if stored != 0:
         # Stored ids are offset by one relative to the codeword list.
         return self._codewords[stored - 1]
     return None
Exemplo n.º 5
0
 def exclude_barcodes(self, exclude_fpath):
     """
     Mark every barcode listed in ``exclude_fpath`` as reserved.

     Each non-blank line is converted with ``dna2num`` and the corresponding entry of
     ``self.reserved_words`` is set to 1.

         exclude_fpath :str: file with one barcode per line

     Raises AssertionError (with the offending barcode as message) if an entry is
     already reserved.
     """
     log.info('Excluding barcodes in {}...'.format(exclude_fpath))
     # Context manager so the file handle is closed; blank lines are not barcodes.
     with open(exclude_fpath) as f:
         exclude_bc_idxs = [dna2num(line.strip()) for line in f if line.strip()]
     for seq_idx in exclude_bc_idxs:
         assert self.reserved_words[seq_idx] == 0, num2dna(
             seq_idx, self.bc_len)
         self.reserved_words[seq_idx] = 1
Exemplo n.º 6
0
    def find_barcode_4sets(
        self,
        AT_max,
        GC_max,
        seqs_so_far=None,
        prev_spheres=None,
        tmp_fpath=None,
        last_prev_Aidx=None,
    ):
        """
        A barcode 4-set is here defined as a set of four barcodes such that no two barcodes have
        the same base in the same position. I.e., all four bases are in each position in exactly
        one barcode.

        The search is recursive: each call extends ``seqs_so_far`` by one barcode whose first
        base is ``bases[len(seqs_so_far)]`` and whose decode sphere is disjoint from every
        sphere in ``prev_spheres``. A completed set of four is added with ``_add_barcode``,
        appended to ``self.dna_barcode_4sets`` and, if ``tmp_fpath`` is given, appended to
        that file.

            AT_max, GC_max: passed to idx_seq_iterator_avoiding_prev_bases
            seqs_so_far :list: DNA strings already chosen for the current set (default: none)
            prev_spheres :list: decode spheres (sets) of the sequences in seqs_so_far
            tmp_fpath :str: optional file to which completed sets are appended
            last_prev_Aidx :int: optional index from a previous run; candidates up to and
                including the first index >= this value are skipped
        """
        # None sentinels instead of mutable default arguments.
        seqs_so_far = [] if seqs_so_far is None else seqs_so_far
        prev_spheres = [] if prev_spheres is None else prev_spheres

        first_base = bases[len(seqs_so_far)]

        seq_idx_iter_func = idx_seq_iterator_avoiding_prev_bases(
            self.bc_len, AT_max, GC_max, first_base, seqs_so_far)

        # Restart previous run (For 'A' seqs only)
        seq_idx_iter = seq_idx_iter_func()
        if last_prev_Aidx is not None:
            for seq_idx in seq_idx_iter:
                if seq_idx >= last_prev_Aidx:
                    break

        for seq_idx in seq_idx_iter:
            if not self._idx_is_available(seq_idx):
                # Unavailable index: flag it as reserved if it is not flagged yet.
                if self.reserved_words[seq_idx] == 0:
                    self.reserved_words[seq_idx] = 1
                continue

            seq_sphere = set(self.iterate_decode_sphere(seq_idx))
            # Reject the candidate if its sphere overlaps any sphere already in the set;
            # isdisjoint avoids building the intersection just to test for emptiness.
            if any(not prev_sphere.isdisjoint(seq_sphere) for prev_sphere in prev_spheres):
                continue

            new_seqs = seqs_so_far + [num2dna(seq_idx, self.bc_len)]
            new_spheres = prev_spheres + [seq_sphere]
            if len(new_seqs) == 4:
                assert first_base == bases[-1], new_seqs
                for seq in new_seqs:
                    self._add_barcode(dna2num(seq))
                if tmp_fpath:
                    with open(tmp_fpath, 'a') as out:
                        out.write('\n'.join(new_seqs) + '\n\n')
                self.dna_barcode_4sets.append(new_seqs)
                log.info('Found barcode set {}: {}'.format(
                    len(self.dna_barcode_4sets), new_seqs))
                return

            self.find_barcode_4sets(AT_max, GC_max, new_seqs,
                                    new_spheres, tmp_fpath)
            # Only the outermost level (first barcode of a set) keeps iterating after a
            # recursive attempt; deeper levels return after their first viable
            # candidate, whether or not a full set was completed.
            if len(new_seqs) > 1:
                return
Exemplo n.º 7
0
    def build_codebook_from_codewords(self, codewords, max_err):
        """
        Build the decoding codebook from a collection of codewords.

        The codewords are stored sorted in ``self._codewords``. The codebook is a flat
        unsigned-integer array of ``4**self.cw_len + 1`` zero-initialised entries; for every
        sequence in the ``FreeDivSphere`` of radius ``max_err`` around a codeword, the entry
        at ``seqtools.dna2num(seq)`` is set to that codeword's 1-based position in the sorted
        list. If a sequence appears in more than one sphere, the later codeword wins.

            codewords :iterable: list or set of codewords
            max_err :int: max correctible error

        Raises ValueError for 2**64 or more codewords, and RuntimeError if the codebook
        would not fit in the currently available memory.
        """
        self.max_err = max_err
        self._codewords = sorted(codewords)
        self._set_cw_len()

        # Pick the narrowest unsigned dtype able to hold every 1-based codeword id.
        n_codewords = len(self._codewords)
        dtype_choices = (
            (np.uint8, 1),
            (np.uint16, 2),
            (np.uint32, 4),
            (np.uint64, 8),
        )
        for dtype, cw_bytes in dtype_choices:
            if n_codewords < 2**(8 * cw_bytes):
                break
        else:
            raise ValueError('More than 2^64 barcodes currently not supported')

        # Refuse up front rather than fail midway through a huge allocation.
        space_size = 4**self.cw_len + 1
        needed_bytes = space_size * cw_bytes
        available_bytes = psutil.virtual_memory().available
        if needed_bytes > available_bytes:
            raise RuntimeError(
                'Not enough memory. {:,d} bytes needed, {:,d} bytes available'.
                format(needed_bytes, available_bytes))

        self._codebook = np.zeros((space_size, ), dtype=dtype)

        for cw_idx, codeword in enumerate(self._codewords, start=1):
            for sphere_seq in FreeDivSphere.FreeDivSphere(codeword, self.max_err):
                self._codebook[seqtools.dna2num(sphere_seq)] = cw_idx
Exemplo n.º 8
0
 def dnastr_codeword_is_available(self, dnastring):
     """
     Return the result of ``self._idx_is_available`` for the codeword given as a DNA string.

         dnastring :str: DNA sequence, converted to an index with ``dna2num``
     """
     seq_idx = dna2num(dnastring)
     # The result must be returned; without it this predicate always yielded None.
     return self._idx_is_available(seq_idx)
Exemplo n.º 9
0
 def add_dnastr_nonbarcode_codeword(self, dnastring):
     """
     Convert ``dnastring`` with ``dna2num`` and pass the resulting index to
     ``add_idx_nonbarcode_codeword``. Returns None.
     """
     self.add_idx_nonbarcode_codeword(dna2num(dnastring))
Exemplo n.º 10
0
 def iterate_idxs():
     """
     Yield ``dna2num(seq)`` for every sequence from the iterator factory built by
     ``seq_iterator_generator(*args, **kw_args)`` (names taken from the enclosing scope).
     """
     make_seq_iter = seq_iterator_generator(*args, **kw_args)
     for dna_seq in make_seq_iter():
         seq_idx = dna2num(dna_seq)
         yield seq_idx
Exemplo n.º 11
0
 def iterate_good_barcodes():
     """
     Yield ``seqtools.dna2num(seq)`` for each entry of ``bc_list`` (taken from the
     enclosing scope), in list order.
     """
     for barcode in bc_list:
         barcode_idx = seqtools.dna2num(barcode)
         yield barcode_idx
Exemplo n.º 12
0
 def iterate_seqs():
     """
     Yield ``dna2num(seq)`` for every sequence from the iterator factory returned by
     ``possible_barcode_iterator(k, AT_max, GC_max)`` (arguments from the enclosing scope).
     """
     make_barcode_iter = possible_barcode_iterator(k, AT_max, GC_max)
     for dna_seq in make_barcode_iter():
         seq_idx = dna2num(dna_seq)
         yield seq_idx