Пример #1
0
    def close_enough_old(self, primer, sequence, errors) :
        """Return True when primer and sequence differ in at most `errors` positions.

        The two inputs are compared position by position (no insertions or
        deletions are considered); the comparison stops at the end of the
        shorter of the two because the pairs come from zip().  Two symbols
        count as different when IUPAC.equal() says they are not equal.
        """
        mismatches = sum(1 for s, p in zip(sequence, primer) if not IUPAC.equal(s, p))

        return mismatches <= errors
Пример #2
0
    def close_enough(self, primer, sequence, diff) :
        """Return True if primer aligns to the start of sequence within `diff` errors.

        Recursive search over three moves at every step: pair the two leading
        symbols (free when IUPAC.equal() accepts them, otherwise one error),
        skip the leading primer symbol (one error) or skip the leading
        sequence symbol (one error).  Running out of error budget fails the
        branch; exhausting either input while the budget is still
        non-negative succeeds, so whatever remains of the other input is
        not penalised.
        """
        # budget exhausted on the way here
        if diff < 0 :
            return False

        # one side fully consumed: the leftover of the other side is free
        if len(primer) == 0 or len(sequence) == 0 :
            return True

        one_error = diff - 1

        if IUPAC.equal(sequence[0], primer[0]) :
            paired_budget = diff
        else :
            paired_budget = one_error

        primer_rest = primer[1:]
        sequence_rest = sequence[1:]

        # pair the two leading symbols
        if self.close_enough(primer_rest, sequence_rest, paired_budget) :
            return True

        # drop the leading primer symbol
        if self.close_enough(primer_rest, sequence, one_error) :
            return True

        # drop the leading sequence symbol
        return self.close_enough(primer, sequence_rest, one_error)
Пример #3
0
    def accept(self, seq) :
        """Test whether `seq` matches this filter's primer and optionally clip it.

        The whole of seq.sequence (not just its first self.len symbols) is
        handed to IUPAC.close_enough() together with self.primer and the
        error allowance self.err.

        When the match succeeds and self.clip is set, seq.remove_mid(self.len)
        is called on the sequence object, so `seq` is modified in place.

        Returns whatever IUPAC.close_enough() returned.
        """
        ret = IUPAC.close_enough(self.primer, seq.sequence, self.err)

        if ret and self.clip :
            # primer part of sequence may be longer or
            # shorter, but it does not really matter
            # as terminal gaps are not included in our
            # definition of identity
            seq.remove_mid(self.len)

        return ret
Пример #4
0
    def read_nematodes(self, fastq_fname, fprimer, rprimer, diffs, length) :
        """Collect the region following the forward primer from Nematoda records.

        Every record of `fastq_fname` whose id contains 'Nematoda' is
        ungapped and back-translated (via the record's own ungap() and
        back_translate() methods).  For each such record
        IUPAC.seq_position(fprimer, sequence, diffs) is used to find the
        forward primer; when it is found (result != -1) the slice of up to
        `length` symbols that starts right after the primer is kept, unless
        that slice contains an 'N'.

        Parameters:
            fastq_fname -- path handed to FastqFile
            fprimer     -- forward primer searched for in every sequence
            rprimer     -- accepted for interface compatibility, currently unused
            diffs       -- passed through to IUPAC.seq_position
            length      -- maximum number of symbols kept after the primer

        Returns a tuple (hits, acc2name):
            hits     -- list of (label, subsequence) pairs
            acc2name -- dict from label to the tail of the record id starting
                        at 'Nematoda' (filled for every Nematoda record, not
                        only for the ones that yielded a hit)

        The label is the first whitespace-separated token of the record id
        with its first character removed.
        """
        candidates = []
        acc2name = {}

        # read in sequences
        fq = FastqFile(fastq_fname)
        fq.open()

        # close the file even if a record fails to parse or process
        try :
            for seq in fq :
                if 'Nematoda' not in seq.id :
                    continue

                seq.ungap()
                seq.back_translate()

                new_id = seq.id.split()[0][1:]
                candidates.append((new_id, seq.sequence))

                acc2name[new_id] = seq.id[seq.id.find('Nematoda'):]
        finally :
            fq.close()

        # test sequences
        p = Progress("Looking for primer sequences", len(candidates), False)
        p.start()

        hits = []
        primer_len = len(fprimer)

        for label,seq in candidates :
            findex = IUPAC.seq_position(fprimer, seq, diffs)

            if findex != -1 :
                start = findex + primer_len
                # NOTE(review): the slice may be shorter than `length` when the
                # sequence ends early; such short slices are kept as they are
                shortseq = seq[start : start + length]

                if 'N' not in shortseq :
                    hits.append((label, shortseq))

            p.increment()

        p.end()

        return hits,acc2name
Пример #5
0
    def distance(self, aligned) :
        """Return the identity score of a pairwise alignment.

        `aligned` holds the two aligned sequences at index 0 and 1, with '-'
        marking gaps.  Despite the method name, the value returned is
        (leng - diff) / leng, i.e. 1.0 when no difference is counted, where
        leng is the length of the shorter aligned string.

        How differences are counted:
          * columns where both sequences have a gap are ignored
          * a run of gap columns is scored only on its first column
          * leading gap columns are ignored altogether
          * a gap run that reaches the end of the alignment is not counted
            (assumes IUPAC.equal() reports a gap against a base as unequal,
            so that the run was counted once and must be taken back --
            TODO confirm)
          * any other column counts when IUPAC.equal() rejects the pair

        Raises ZeroDivisionError when either aligned string is empty.
        """
        first, second = aligned[0], aligned[1]
        leng = float(min(len(first), len(second)))

        last_gap = True     # start as if inside a gap so leading gaps are skipped
        compared = False    # becomes True once any column has been scored
        diff = 0

        for c1,c2 in zip(first, second) :
            if (c1 == '-') and (c2 == '-') :
                continue

            gap = '-' in (c1,c2)

            # only the first column of a gap run is scored
            if last_gap and gap :
                continue

            if not IUPAC.equal(c1, c2) :
                diff += 1

            compared = True
            last_gap = gap

        # take back the terminal gap run, but only if one was really scored;
        # when no column was compared at all last_gap is just its initial
        # value and subtracting would push the result above 1.0
        if last_gap and compared :
            diff -= 1

        return (leng - diff) / leng
Пример #6
0
    def extract(self, sff, outdir, primer, primer_errors, barcode, barcode_errors, max_homopolymer) :
        """Filter the reads of an SFF file and write their flowgrams to flows.dat.

        For every record of the SFF file (sff.get_filename(), parsed with
        Bio.SeqIO) the flow values are scanned in groups of four.  Each value
        is divided by 100; a value above 0.5 counts as signal and a signal
        value below 0.7 additionally counts as noise.  Scanning stops at the
        first group that contains noise or no signal at all.  The number of
        flows in the groups accepted so far (capped at 450) is the read's
        usable flow length.

        A read is kept when its usable flow length is at least 360 and both
        the barcode and the primer, taken from the start of the
        quality-clipped bases, pass IUPAC.close_enough() with the given
        error allowances.

        The kept reads are written to <outdir>/flows.dat: a header line with
        the number of reads and the longest flowgram length, then one line
        per read with its id, usable flow length and all flow values
        (divided by 100, two decimals).

        Returns (number_of_kept_reads, path_of_flows_dat), or (0, None) when
        no read is kept (no file is written in that case).

        Exits the process with status 1 if BioPython cannot be imported.
        """
        try :
            from Bio import SeqIO
        except ImportError :
            sys.stderr.write("BioPython not installed (only required for working with SFF files)\n")
            sys.exit(1)

        min_flows = 360     # reads with fewer usable flows are discarded
        max_flows = 450     # usable flow length is capped at this value

        barcode_len = len(barcode)
        primer_len = len(primer)

        raw_seq_total = 0

        names = []
        flows = []
        flowlens = []

        for record in SeqIO.parse(sff.get_filename(), "sff") :
            raw_seq_total += 1
            good_bases = record.seq[record.annotations["clip_qual_left"] : record.annotations["clip_qual_right"]]
            barcode_seq = good_bases[:barcode_len]
            primer_seq = good_bases[barcode_len : barcode_len + primer_len]

            flow_values = record.annotations["flow_values"]
            clean_groups = 0

            for start in range(0, len(flow_values), 4) :
                signal = 0
                noise = 0

                for offset in range(4) :
                    flow = float(flow_values[start + offset]) / 100.0

                    # NOTE(review): this only leaves the inner loop; the group
                    # is then still judged on the flows seen before the
                    # over-long homopolymer -- confirm this is intended
                    if int(flow + 0.5) > max_homopolymer :
                        break

                    if flow > 0.5 :
                        signal += 1
                        if flow < 0.7 :
                            noise += 1

                if noise > 0 or signal == 0 :
                    break

                clean_groups += 1

            new_length = min(clean_groups * 4, max_flows)

            if new_length >= min_flows and \
                    IUPAC.close_enough(barcode, barcode_seq, barcode_errors) and \
                    IUPAC.close_enough(primer, primer_seq, primer_errors) :
                flows.append(flow_values)
                flowlens.append(new_length)
                names.append(record.id)

        if len(flows) == 0 :
            self.log.info("kept 0/%d sequences" % raw_seq_total)
            return 0, None

        # output pyronoise input file
        # see http://userweb.eng.gla.ac.uk/christopher.quince/Software/PyroNoise.html
        flows_fname = join(outdir, "flows.dat")
        out = open(flows_fname, 'w')

        # make sure the handle is released even if a write fails
        try :
            out.write("%d %d\n" % (len(flows), max([ len(values) for values in flows ])))

            for name, flowlen, values in zip(names, flowlens, flows) :
                fields = [ name, str(flowlen) ] + [ "%.2f" % (float(v) / 100.0) for v in values ]
                out.write(" ".join(fields) + "\n")
        finally :
            out.close()

        self.log.info("kept %d/%d sequences" % (len(flows), raw_seq_total))
        return len(flows), flows_fname