示例#1
0
文件: scores.py 项目: ajm/seance
    def build_distance_matrix(self, fname) :
        """Score every pair of sequences stored in a sequence file.

        All sequences in `fname` are read through FastqFile, then
        self.distance() is called once for each unordered pair.

        Parameters:
            fname: path handed to FastqFile.

        Returns:
            A tuple (dist, labels).
            dist   -- dict keyed by (label, label) tuples. Each pair's score
                      is stored under both key orders, and every
                      (label, label) self-pair is stored as 1.0.
            labels -- list of the labels in the order they were read.
        """
        tmp = []
        fq = FastqFile(fname)
        fq.open()

        # make sure the file is closed even if reading fails part-way through
        try :
            for seq in fq :
                # the first character of the id is dropped
                # (presumably the record marker -- TODO confirm)
                tmp.append((seq.id[1:], seq.sequence))
        finally :
            fq.close()

        # one progress tick per unordered pair, i.e. n * (n - 1) / 2 ticks
        p = Progress("Calculating distance matrix", (len(tmp)*(len(tmp)-1)) / 2.0, False)
        p.start()

        dist = {}
        for index, (label1, seq1) in enumerate(tmp) :
            # NOTE(review): self-pairs are hard-coded to 1.0, which suggests
            # self.distance() returns a similarity-like score -- confirm
            dist[(label1,label1)] = 1.0

            # only look at later sequences so each pair is computed once
            for label2,seq2 in tmp[index+1:] :
                score = self.distance([seq1,seq2])
                dist[(label1,label2)] = score
                dist[(label2,label1)] = score

                p.increment()

        p.end()

        return dist, [ label for label, seq in tmp ]
示例#2
0
文件: workflow.py 项目: ajm/seance
    def __read_fasta(self, filename, include=None) :
        """Read the sequences in a file into a dict keyed by shortened id.

        Parameters:
            filename: path handed to FastqFile.
            include:  optional string. When given (and non-empty), only
                      sequences whose full id contains it are kept; the
                      comparison is case-insensitive.

        Returns:
            dict mapping each shortened id to its sequence object. The
            shortened id is the first whitespace-delimited token of the
            original id with its first character removed, and it is also
            written back to seq.id. If two sequences share a shortened id
            the later one replaces the earlier one.
        """
        tmp = {}

        # lower-case the filter once instead of testing it per sequence
        needle = include.lower() if include else None

        f = FastqFile(filename)
        f.open()

        # make sure the file is closed even if reading fails part-way through
        try :
            for seq in f :
                # the filter is applied to the full id, before shortening
                if needle and needle not in seq.id.lower() :
                    continue

                seq.id = seq.id.split()[0][1:]
                tmp[seq.id] = seq
        finally :
            f.close()

        self.log.info("read %d centroid sequences" % len(tmp))

        return tmp
示例#3
0
文件: workflow.py 项目: ajm/seance
    def __count(self, fasta) :
        """Return the number of sequences FastqFile yields for `fasta`.

        Parameters:
            fasta: path handed to FastqFile.

        Returns:
            int, 0 when the file yields no sequences.
        """
        fq = FastqFile(fasta)
        fq.open()

        # make sure the file is closed even if reading fails part-way through
        try :
            return sum(1 for _ in fq)
        finally :
            fq.close()
示例#4
0
文件: scores.py 项目: ajm/seance
    def read_nematodes(self, fastq_fname, fprimer, rprimer, diffs, length) :
        """Cut out the region after the forward primer in 'Nematoda' sequences.

        Sequences whose id contains 'Nematoda' are read from `fastq_fname`
        and have ungap() and back_translate() called on them. Each one is
        then searched for `fprimer` with IUPAC.seq_position(); when the
        primer is found, the `length` characters directly after it are kept,
        unless that stretch contains an 'N'.

        Parameters:
            fastq_fname: path handed to FastqFile.
            fprimer:     forward primer to search for.
            rprimer:     accepted for interface compatibility but not used
                         (the reverse-primer check is disabled).
            diffs:       passed straight to IUPAC.seq_position()
                         (presumably the allowed number of mismatches --
                         TODO confirm).
            length:      number of characters to keep after the primer.

        Returns:
            A tuple (seqs, acc2name).
            seqs     -- list of (label, subsequence) tuples for sequences
                        where the primer was found and the region is N-free.
            acc2name -- dict mapping the label of every 'Nematoda' sequence
                        read (whether or not the primer was found) to the
                        part of its id starting at 'Nematoda'.
        """
        tmp = []
        acc2name = {}

        # read in sequences
        fq = FastqFile(fastq_fname)
        fq.open()

        # make sure the file is closed even if reading fails part-way through
        try :
            for seq in fq :
                if 'Nematoda' not in seq.id :
                    continue

                seq.ungap()
                seq.back_translate()

                # label = first token of the id minus its first character
                new_id = seq.id.split()[0][1:]
                tmp.append((new_id, seq.sequence))

                acc2name[new_id] = seq.id[seq.id.find('Nematoda'):]
        finally :
            fq.close()

        # test sequences
        p = Progress("Looking for primer sequences", len(tmp), False)
        p.start()

        tmp2 = []
        primer_len = len(fprimer)

        for label,seq in tmp :
            findex = IUPAC.seq_position(fprimer, seq, diffs)

            # -1 means the primer was not found
            if findex != -1 :
                start = findex + primer_len

                # slicing does not fail on short sequences, so the region
                # may hold fewer than `length` characters
                shortseq = seq[start : start + length]
                if 'N' not in shortseq :
                    tmp2.append((label, shortseq))

            # tick for every sequence, whether or not it was kept
            p.increment()

        p.end()

        return tmp2,acc2name
示例#5
0
文件: tools.py 项目: ajm/seance
    def run(self, sff, outdir, forward_primer, barcode, barcode_errors, max_homopolymer) :
        """Process one SFF file and write the result as '<basename>.fasta'.

        self.extract() is called first. Depending on how many sequences it
        reports, the output file is either left empty (0), written directly
        from the SFF file via BioPython (1), or produced by running
        PyroDist, FCluster and PyroNoise in turn (2 or more). In the last
        case the intermediate 'flows' files in `outdir` are deleted
        afterwards.

        Parameters:
            sff:             SffFile to process.
            outdir:          directory for the output and intermediate files.
            forward_primer, barcode, barcode_errors, max_homopolymer:
                             passed unchanged to self.extract().

        Returns:
            FastqFile wrapping the output file. Every record is written as
            '>seqN NumDuplicates=D' followed by the sequence.

        Raises:
            ExternalProgramError: if `sff` is not an SffFile.

        Exits the process with status 1 if BioPython is needed (single
        sequence case) but cannot be imported.
        """
        if not isinstance(sff, SffFile) :
            raise ExternalProgramError("argument is not an SffFile")

        output_name = abspath(join(outdir, sff.get_basename() + '.fasta'))

        numseq,fname = self.extract(sff, outdir, forward_primer, barcode, barcode_errors, max_homopolymer)

        # just so the rest of the pipeline can be run and there be a record
        # of the sample containing zero sequences
        if numseq == 0 :
            open(output_name, 'w').close()
            return FastqFile(output_name)

        # well... this is a mess
        # PyroDist does not like being given 1 sequence
        if numseq == 1 :
            try :
                from Bio import SeqIO
            except ImportError:
                print >> sys.stderr, "BioPython not installed (only required for working with SFF files)"
                sys.exit(1)

            fout = open(output_name, 'w')

            # close the output even if parsing the SFF file fails
            try :
                # NOTE(review): every record parsed from the SFF file is
                # written with the id 'seq0' -- confirm only one is expected
                for r in SeqIO.parse(sff.get_filename(), 'sff-trim') :
                    print >> fout, ">seq0 NumDuplicates=1\n%s" % r.seq
            finally :
                fout.close()

            return FastqFile(output_name)

        outfile = join(outdir, "flows")

        distfile = PyroDist().run(fname, outfile)
        listfile = FCluster().run(distfile, outfile)

        # the return value is not needed: the result is read back from
        # outfile + "_cd.fa" below
        PyroNoise().run(fname, listfile, outfile)

        # read fa file
        # add NumDuplicates fields
        # output with correct file name
        fout = open(output_name, 'w')

        # close both handles even if a record cannot be parsed
        try :
            f = FastqFile(outfile + "_cd.fa")
            f.open()

            try :
                for count, seq in enumerate(f) :
                    # the duplicate count is the last '_'-separated field of the id
                    dups = int(seq.id.split('_')[-1])
                    print >> fout, ">seq%d NumDuplicates=%d\n%s" % (count, dups, seq.sequence)
            finally :
                f.close()
        finally :
            fout.close()

        # delete intermediate files
        # NOTE(review): rmtree() raises if `outfile` is not an existing
        # directory, and os.remove() raises on directories -- confirm what
        # the external tools leave behind
        shutil.rmtree(outfile)
        for leftover in glob.glob(outfile + '*') :
            os.remove(leftover)

        return FastqFile(output_name)
示例#6
0
文件: tools.py 项目: ajm/seance
    def get_names(self, fasta_fname, method, perc_identity, db_fname=None) :
        if method not in ('blast', 'taxonomy', 'blastlocal') :
            self.log.error("'%s' is not a valid labelling method" % method)
            sys.exit(1)

        # build query_name -> query_length dict
        query_length = {}
        f = FastqFile(fasta_fname)
        f.open()
        for s in f :
            query_name = s.id[1:s.id.index(' ')] if ' ' in s.id else s.id[1:]
            query_length[query_name] = float(len(s))

        f.close()
        # built

        if db_fname is not None :
            print "reading %s ..." % db_fname
            acc2tax = {}
            f = FastqFile(db_fname)
            f.open()
            for s in f :
                acc,tax = s.id.strip().split(' ', 1)
                if ";" not in tax :
                    print "Warning: sequence with accession %s has strange taxonomical identifier (%s)" % (acc[1:], tax)
                acc2tax[acc[1:]] = tax
            f.close()
            print "%d database sequences map to %d taxonomical identifiers..." % (len(acc2tax), len(set(acc2tax.values())))

            self.make_local_db(db_fname)
            command = self.local_command % (fasta_fname, db_fname, int(100 * perc_identity))
        else :
            command = self.remote_command % (fasta_fname, int(100 * perc_identity))

        print "running queries..."
        s,o = commands.getstatusoutput(command)

        # qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore
        # 
        # i want the top scoring hits in terms of percent identity to the query
        # i.e. ((hit_length * identity) / query_length)
        #

        if s != 0 :
            self.log.error("blastn returned %d" % s)
            sys.exit(1)

        if o.strip() == '' :
            return {}

        names = collections.defaultdict(list)

        for line in o.split('\n') :
            fields = line.split(',')
            try :
                name = int(fields[0])

            except ValueError :
                name = fields[0]

            try :
                pident = float(fields[2]) / 100.0
                length = int(fields[3])
                evalue = float(fields[-2])
                bitscore = float(fields[-1])

            except ValueError, ve :
                self.log.warn("problem with blast result (%s), skipping..." % (str(ve)))
                self.log.debug(line)
                continue

            score = (pident * length) / query_length[name]

            # only keep the highest scoring hits
            if score < perc_identity :
                continue


            if method == 'blastlocal' :
                names[name].append(acc2tax.get(fields[1], "unknown"))
            else :
                try :
                    desc = fields[1].split('|')

                    if re.match(".+\.\d+", desc[3]) :
                        names[name].append((desc[3], fields[2]))

                except IndexError :
                    self.log.warn("could not split line from blastn: %s" % str(fields))
                    continue