def build_distance_matrix(self, fname):
    """
    Evaluate self.distance for every pair of sequences stored in fname.

    fname is read through FastqFile; each record contributes a label (its
    id with the first character dropped) and its sequence.

    Returns a tuple (dist, labels):
      dist   - dict keyed by (label_a, label_b). Every unordered pair is
               stored under both key orders, and 1.0 is stored for each
               (label, label) key.
      labels - list of labels in the order they were read from the file.
    """
    records = []
    fq = FastqFile(fname)
    fq.open()
    try:
        for seq in fq:
            records.append((seq.id[1:], seq.sequence))
    finally:
        # always release the handle, even if a record fails to parse
        fq.close()

    total = len(records)
    # one progress tick per unordered pair: n * (n - 1) / 2
    p = Progress("Calculating distance matrix", (total * (total - 1)) / 2.0, False)
    p.start()

    dist = {}
    for index, (label1, seq1) in enumerate(records):
        # NOTE(review): the diagonal is 1.0 rather than 0.0, which suggests
        # self.distance yields a similarity-like score - confirm.
        dist[(label1, label1)] = 1.0

        # only look at records after this one; the mirrored key covers the rest
        for label2, seq2 in records[index + 1:]:
            value = self.distance([seq1, seq2])
            dist[(label1, label2)] = value
            dist[(label2, label1)] = value
            p.increment()

    p.end()

    return dist, [label for label, _ in records]
def __read_fasta(self, filename, include=None):
    """
    Load the records of filename (read through FastqFile) into a dict.

    filename - path handed to FastqFile
    include  - optional substring; when given (and non-empty) only records
               whose full id contains it, compared case-insensitively,
               are kept

    Each kept record has its id rewritten to the first whitespace-separated
    token of the original id with the leading character removed, and is
    stored under that new id. A later record with the same id replaces an
    earlier one.

    Returns the dict of id -> record and logs how many entries it holds.
    """
    wanted = include
    if wanted:
        wanted = wanted.lower()

    records = {}
    f = FastqFile(filename)
    f.open()
    try:
        for seq in f:
            # the filter is matched against the complete, untrimmed id
            if wanted and wanted not in seq.id.lower():
                continue

            seq.id = seq.id.split()[0][1:]
            records[seq.id] = seq
    finally:
        # always release the handle, even if a record fails to parse
        f.close()

    self.log.info("read %d centroid sequences" % len(records))

    return records
def __count(self, fasta):
    """
    Return the number of records FastqFile yields when iterating fasta.

    fasta - path handed to FastqFile
    """
    fq = FastqFile(fasta)
    fq.open()
    try:
        return sum(1 for _ in fq)
    finally:
        # runs before the return completes, so the handle is closed on
        # both the normal path and when iteration raises
        fq.close()
def read_nematodes(self, fastq_fname, fprimer, rprimer, diffs, length):
    """
    Collect the region following the forward primer from every record of
    fastq_fname whose id mentions 'Nematoda'.

    fastq_fname - path handed to FastqFile
    fprimer     - forward primer, located with IUPAC.seq_position
    rprimer     - currently unused (the reverse-primer check is disabled)
    diffs       - passed straight to IUPAC.seq_position; presumably the
                  number of mismatches tolerated - confirm against IUPAC
    length      - maximum number of characters kept after the primer

    Returns a tuple (fragments, acc2name):
      fragments - list of (label, fragment) for records where the primer
                  was found and the fragment contains no 'N'
      acc2name  - dict mapping label -> the part of the original id from
                  'Nematoda' onwards, for every 'Nematoda' record read
                  (including ones later rejected by the primer test)

    A label is the first whitespace-separated token of the id with its
    leading character removed.
    """
    candidates = []
    acc2name = {}

    # read in sequences
    fq = FastqFile(fastq_fname)
    fq.open()
    try:
        for seq in fq:
            if 'Nematoda' not in seq.id:
                continue

            seq.ungap()
            seq.back_translate()

            new_id = seq.id.split()[0][1:]
            candidates.append((new_id, seq.sequence))
            acc2name[new_id] = seq.id[seq.id.find('Nematoda'):]
    finally:
        # always release the handle, even if a record fails to parse
        fq.close()

    # test sequences
    p = Progress("Looking for primer sequences", len(candidates), False)
    p.start()

    fragments = []
    for label, sequence in candidates:
        findex = IUPAC.seq_position(fprimer, sequence, diffs)

        # -1 is treated as "primer not found"
        if findex != -1:
            start = findex + len(fprimer)
            # slicing silently yields a shorter fragment when the sequence
            # ends less than `length` characters after the primer
            shortseq = sequence[start:start + length]

            if 'N' not in shortseq:
                fragments.append((label, shortseq))

        p.increment()

    p.end()

    return fragments, acc2name
def run(self, sff, outdir, forward_primer, barcode, barcode_errors, max_homopolymer):
    """
    Denoise the reads of an SFF file and write them to
    <outdir>/<sff basename>.fasta.

    sff must be an SffFile; the remaining arguments are forwarded
    unchanged to self.extract, which returns (number of sequences, file
    name).

    What happens next depends on the number of sequences extracted:
      0  - an empty output file is created so that later pipeline stages
           still find a record of the sample
      1  - PyroDist is skipped; the reads of the SFF file are parsed with
           BioPython ('sff-trim') and written out directly
      >1 - PyroDist, FCluster and PyroNoise are run in turn, the resulting
           "<outdir>/flows_cd.fa" is rewritten with sequential ids and
           NumDuplicates fields, and the intermediate files are deleted

    Returns a FastqFile wrapping the output file.
    Raises ExternalProgramError if sff is not an SffFile. Exits the
    process with status 1 if BioPython is needed but cannot be imported.
    """
    if not isinstance(sff, SffFile):
        raise ExternalProgramError("argument is not an SffFile")

    output_name = abspath(join(outdir, sff.get_basename() + '.fasta'))

    numseq, fname = self.extract(sff, outdir, forward_primer, barcode, barcode_errors, max_homopolymer)

    # just so the rest of the pipeline can be run and there be a record
    # of the sample containing zero sequences
    if numseq == 0:
        open(output_name, 'w').close()
        return FastqFile(output_name)

    # well... this is a mess
    # PyroDist does not like being given 1 sequence
    if numseq == 1:
        try:
            from Bio import SeqIO
        except ImportError:
            sys.stderr.write("BioPython not installed (only required for working with SFF files)\n")
            sys.exit(1)

        # NOTE(review): every read returned by the parser is written under
        # the id "seq0"; this relies on the SFF file yielding a single
        # usable read - confirm.
        fout = open(output_name, 'w')
        try:
            for r in SeqIO.parse(sff.get_filename(), 'sff-trim'):
                fout.write(">seq0 NumDuplicates=1\n%s\n" % r.seq)
        finally:
            fout.close()

        return FastqFile(output_name)

    outfile = join(outdir, "flows")

    # each stage consumes what the previous one returned
    distfile = PyroDist().run(fname, outfile)
    listfile = FCluster().run(distfile, outfile)
    # the return value is not needed: the result is read from
    # outfile + "_cd.fa" below
    PyroNoise().run(fname, listfile, outfile)

    # read fa file
    # add NumDuplicates fields
    # output with correct file name
    fout = open(output_name, 'w')
    try:
        f = FastqFile(outfile + "_cd.fa")
        f.open()
        try:
            for count, seq in enumerate(f):
                # the duplicate count is the integer after the last '_' in the id
                dups = int(seq.id.split('_')[-1])
                fout.write(">seq%d NumDuplicates=%d\n%s\n" % (count, dups, seq.sequence))
        finally:
            f.close()
    finally:
        # close the output even when a record id cannot be parsed
        fout.close()

    # delete intermediate files
    shutil.rmtree(outfile)
    for leftover in glob.glob(outfile + '*'):
        os.remove(leftover)

    return FastqFile(output_name)
def get_names(self, fasta_fname, method, perc_identity, db_fname=None) : if method not in ('blast', 'taxonomy', 'blastlocal') : self.log.error("'%s' is not a valid labelling method" % method) sys.exit(1) # build query_name -> query_length dict query_length = {} f = FastqFile(fasta_fname) f.open() for s in f : query_name = s.id[1:s.id.index(' ')] if ' ' in s.id else s.id[1:] query_length[query_name] = float(len(s)) f.close() # built if db_fname is not None : print "reading %s ..." % db_fname acc2tax = {} f = FastqFile(db_fname) f.open() for s in f : acc,tax = s.id.strip().split(' ', 1) if ";" not in tax : print "Warning: sequence with accession %s has strange taxonomical identifier (%s)" % (acc[1:], tax) acc2tax[acc[1:]] = tax f.close() print "%d database sequences map to %d taxonomical identifiers..." % (len(acc2tax), len(set(acc2tax.values()))) self.make_local_db(db_fname) command = self.local_command % (fasta_fname, db_fname, int(100 * perc_identity)) else : command = self.remote_command % (fasta_fname, int(100 * perc_identity)) print "running queries..." s,o = commands.getstatusoutput(command) # qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore # # i want the top scoring hits in terms of percent identity to the query # i.e. ((hit_length * identity) / query_length) # if s != 0 : self.log.error("blastn returned %d" % s) sys.exit(1) if o.strip() == '' : return {} names = collections.defaultdict(list) for line in o.split('\n') : fields = line.split(',') try : name = int(fields[0]) except ValueError : name = fields[0] try : pident = float(fields[2]) / 100.0 length = int(fields[3]) evalue = float(fields[-2]) bitscore = float(fields[-1]) except ValueError, ve : self.log.warn("problem with blast result (%s), skipping..." 
% (str(ve))) self.log.debug(line) continue score = (pident * length) / query_length[name] # only keep the highest scoring hits if score < perc_identity : continue if method == 'blastlocal' : names[name].append(acc2tax.get(fields[1], "unknown")) else : try : desc = fields[1].split('|') if re.match(".+\.\d+", desc[3]) : names[name].append((desc[3], fields[2])) except IndexError : self.log.warn("could not split line from blastn: %s" % str(fields)) continue