def getfiltseqs(fname):
    """Return the set of (name, sequence) pairs stored in a filter file.

    Files ending in ".bam" or ".sam" are read through ``openxam`` and yield
    ``(record.query_name, str(record.seq))``; every other file is opened with
    ``openfastx`` and parsed by ``SeqIO.parse``, yielding
    ``(record.id, str(record.seq))``.

    Parameters:
        fname: path of the file holding the filter sequences.

    Returns:
        set of ``(name, sequence)`` tuples (duplicates collapse).
    """
    fext = os.path.splitext(fname)[1]
    if fext in (".bam", ".sam"):
        return {(record.query_name, str(record.seq)) for record in openxam(fname)}

    # Only the format name and the open handle are needed from openfastx.
    _, _, ftype, fh = openfastx(fname)
    try:
        # The set is fully built before the handle is released.
        return {(record.id, str(record.seq)) for record in SeqIO.parse(fh, ftype)}
    finally:
        # The original never closed this handle.
        fh.close()
def gettargetseqs(fname):
    """Return an iterator over the records of a target file.

    Files ending in ".bam" or ".sam" are opened with ``openxam(fname, "r")``;
    every other file is opened with ``openfastx`` and handed to
    ``SeqIO.parse``.

    Parameters:
        fname: path of the file holding the target sequences.

    Returns:
        Whatever ``openxam`` returns for alignment files, otherwise the
        ``SeqIO.parse`` iterator. The FASTX handle has to stay open while the
        caller consumes that lazy iterator, so it is intentionally not closed
        here.
    """
    fext = os.path.splitext(fname)[1]
    if fext in (".bam", ".sam"):
        return openxam(fname, "r")

    _, _, ftype, fh = openfastx(fname)
    # The original ended with an unreachable `return(set(filt_seqs))` that
    # referenced an undefined name; it has been dropped.
    return SeqIO.parse(fh, ftype)
# Choose the processing route from the extensions of the input and output files.
ifpathnoext, ifext = os.path.splitext(args["INFNAME"])
ofpathnoext, ofext = os.path.splitext(args["OUTFNAME"])

# A non-alignment input combined with an alignment output is rejected up front.
if not isxam(ifext) and isxam(ofext):
    raise TypeError("ERROR: cannot output BAM from a FASTX input")

if isxam(ifext):
    infile = pysam.AlignmentFile(args["INFNAME"])
    if isxam(ofext):
        # Alignment in, alignment out: the input file serves as header template.
        # NOTE(review): the output is always opened in "wb" mode — if isxam()
        # also accepts ".sam", a .sam output name still gets BAM content; confirm.
        outfile = pysam.AlignmentFile(args["OUTFNAME"], mode="wb", template=infile)
        for aln in filteralignments(infile,lengths,reqnts,args["nmism"],args["nmaps"],args["endtoend"]):
            outfile.write(aln)
        # NOTE(review): neither infile nor outfile is closed in the visible code.
    else:
        # Alignment in, FASTX out: each kept alignment is written as one record.
        ofpathnoext,ofext,oftype,ofh = openfastx(args["OUTFNAME"],mode="wt")
        for aln in filteralignments(infile,lengths,reqnts,args["nmism"],args["nmaps"],args["endtoend"]):
            seq = Seq(aln.seq)
            # NOTE(review): these attributes are set on the Seq object, while the
            # SeqRecord below receives its id explicitly — they look redundant.
            seq.name = seq.id = seq.description = aln.query_name
            # Reverse-strand alignments are reverse-complemented before writing.
            if aln.is_reverse:
                seq = seq.reverse_complement()
            record = SeqRecord(seq,id=aln.query_name,description="")
            SeqIO.write(record,ofh,oftype)
        ofh.close()
else:
    # FASTX in, FASTX out: open the parser and the output handle.
    # NOTE(review): the visible chunk ends here; the records are presumably
    # filtered and written by code that follows this point.
    fpathnoext,fext,ftype,fh = openfastx(args["INFNAME"])
    infile = SeqIO.parse(fh,ftype)
    ofpathnoext,ofext,oftype,ofh = openfastx(args["OUTFNAME"],mode="wt")
if record.is_reverse: seq = seq.reverse_complement() target_seqs.append((record.query_name,str(seq))) except TypeError: seq = str(record.seq) target_seqs.append((record.id,str(seq))) filter_seqs = getfiltseqs(ffilters) return filterseqs(target_seqs,filter_seqs) if __name__ == "__main__": args = parsearguments() if not args["OUTFNAME"].endswith(".fasta"): args["OUTFNAME"] += ".fasta" matched_target_seqs,matching_filter_seqs = filterseqs_from_files(args["TARGETS"],args["FILTERS"]) if args["output_targets"]: ofpathnoext,ofext,oftype,ofh = openfastx("targets_" + args["OUTFNAME"],mode="wt") for id,seq in matched_target_seqs: ofh.write(">{}\n{}\n".format(id,seq)) ofh.close() if args["output_filters"]: ofpathnoext,ofext,oftype,ofh = openfastx("filters_" + args["OUTFNAME"],mode="wt") for id,seq in matching_filter_seqs: ofh.write(">{}\n{}\n".format(id,seq)) ofh.close()
return(vars(parser.parse_args())) if __name__ == "__main__": import argparse,sys,collections,os from tstk.io import openfastx,parsefastx from tstk.common import revcomp import pysam args = parsearguments() fpathnoext, fext = os.path.splitext(args["INFNAME"]) if fext in [".bam",".sam"]: entries = [entry for entry in pysam.AlignmentFile(args["INFNAME"],"rb") if dict(entry.get_tags())["HI"] == 1]# only get the first entry in case of multi-mappers entries = [revcomp(str(e.seq)) if e.is_reverse else str(e.seq) for e in entries] counter = collections.Counter(entries) #only include one hit from the multi mappers else: fpathnoext,fext,ftype,fh = openfastx(args["INFNAME"]) seqs = [str(seq) for name,seq,qual in parsefastx(fh)] counter = collections.Counter(seqs) fh.close() ofpathnoext,ofext,oftype,ofh = openfastx(args["OUTFNAME"].replace("fastq","fasta"),mode='wt') seqid = 1 for seq in counter.most_common(): ofh.write(">{}-{}\n{}\n".format(seqid,seq[1],seq[0])) seqid += 1 ofh.close()
def getcounts(SEQFILE,minlength=18,maxlength=33,uncollapse=False,normfw=None,normrv=None,normnreads=False,stranded=False,noN=False,c3p=False,chrnames=None,nmaps=1,t2u=False,rvonly=False,fwonly=False):
    """Count terminal nucleotides per read length in a BAM or FASTA/FASTQ file.

    Parameters:
        SEQFILE: path of the input. A ".bam" extension is read with pysam;
            anything else is opened with ``openfastx`` and parsed by ``SeqIO``.
        minlength, maxlength: inclusive range of sequence lengths to tabulate.
        uncollapse: for FASTX input whose ids look like "<id>-<count>", weight
            each record by its count. The request is dropped with a warning if
            the first record's id does not match or the format is fastq.
        normfw, normrv: explicit divisors for the forward / reverse counts;
            they take precedence over ``normnreads``.
        normnreads: divide by the number of records. For FASTX this is the sum
            of the record weights; for BAM, the number of alignments whose
            "HI" tag equals 1.
        stranded: for BAM input, tally reverse-strand reads separately ("rv").
            Reverse counts are only normalised when this is set.
        noN: drop the "N" column and skip sequences whose nucleotide is "N".
        c3p: look at the last nucleotide instead of the first one.
        chrnames: optional list of reference names restricting the BAM reads.
        nmaps: skip BAM alignments whose "NH" tag exceeds this value;
            alignments without an "NH" tag are kept.
        t2u: rename the "T" column to "U" in the result.
        rvonly, fwonly: accepted for interface compatibility; not used here.

    Returns:
        ``{"fw": counts, "rv": counts_rv}`` where each value maps
        length -> {nucleotide: count}. "rv" stays all-zero unless ``stranded``
        is set on BAM input.

    Raises:
        KeyError: if ``normnreads`` is used on a BAM alignment lacking an "HI"
            tag, or a nucleotide outside the tabulated alphabet is met.
    """
    from tstk.io import openfastx
    from Bio import SeqIO
    from Bio.Seq import Seq
    import numpy as np
    import os,re

    fpath = SEQFILE
    ext = os.path.splitext(fpath)[1]

    # Keys keep the numpy integer type the original produced.
    lengths = np.arange(minlength,maxlength+1)
    nucleotides = "ACTG" if noN else "ACTGN"
    counts = {key: dict.fromkeys(nucleotides,0) for key in lengths}
    counts_rv = {key: dict.fromkeys(nucleotides,0) for key in lengths}

    nucpos = -1 if c3p else 0
    nrecords = 0

    if ext == ".bam":
        import pysam

        with pysam.AlignmentFile(fpath) as bamfile:
            if chrnames:
                # Stream the chromosomes one after another instead of loading
                # every read into a list first.
                reads = (read for chrname in chrnames for read in bamfile.fetch(chrname))
            else:
                reads = bamfile.fetch()
            for read in reads:
                tags = dict(read.get_tags())
                if "NH" in tags and tags["NH"] > nmaps:
                    continue
                if not read.seq:
                    # No stored sequence: nothing to index into.
                    continue
                if read.is_reverse:
                    # Reverse-strand alignments are reverse-complemented before counting.
                    seq = Seq(read.seq).reverse_complement()
                else:
                    seq = read.seq
                firstnuc = seq[nucpos]
                if noN and firstnuc == "N":
                    continue
                if len(seq) not in counts:
                    continue
                if stranded and read.is_reverse:
                    counts_rv[len(seq)][firstnuc] += 1
                else:
                    counts[len(seq)][firstnuc] += 1

        if normnreads:
            # Library size = number of first hits (HI == 1), counted on a fresh
            # handle that is closed afterwards.
            with pysam.AlignmentFile(fpath) as bamfile:
                if chrnames:
                    alignments = (r for chrname in chrnames for r in bamfile.fetch(chrname))
                else:
                    alignments = bamfile
                nrecords = sum(1 for r in alignments if dict(r.get_tags())["HI"] == 1)
    else:
        reid = re.compile(r"^\d+-\d+$")
        checkheader = True
        fpathnoext,fext,ftype,fh = openfastx(fpath)
        try:
            for record in SeqIO.parse(fh,ftype):
                if uncollapse and checkheader:
                    # Validate the request once, on the first record.
                    checkheader = False
                    if not reid.match(record.id):
                        print("WARNING: the fasta header doesn't seem to match the 'id-count' format. Ignoring uncollapse request.")
                        uncollapse = False
                    if ftype == "fastq":
                        print("WARNING: file format is fastq, ignoring uncollapse request")
                        uncollapse = False
                # Parse the weight only while uncollapse is still in effect;
                # previously a rejected header was parsed anyway and could
                # crash or yield a bogus count.
                count = int(record.id.split("-")[1]) if uncollapse else 1
                nrecords += count
                if len(record.seq) == 0:
                    continue
                firstnuc = record.seq[nucpos]
                if noN and firstnuc == "N":
                    continue
                if len(record.seq) in counts:
                    counts[len(record.seq)][firstnuc] += count
        finally:
            fh.close()

    normfactor_fw = None
    normfactor_rv = None
    if normnreads:
        normfactor_fw = nrecords
        normfactor_rv = nrecords
    if normfw:
        if normnreads:
            print("WARNING: normalisation to library size specified together with specific normalisation factor. Using the specific factor for FW reads")
        normfactor_fw = normfw
    if normrv:
        if normnreads:
            print("WARNING: normalisation to library size specified together with specific normalisation factor. Using the specific factor for RV reads")
        normfactor_rv = normrv

    # A factor of None or 0 leaves the raw counts untouched.
    if normfactor_fw:
        for l in counts:
            for n in counts[l]:
                counts[l][n] = counts[l][n] / normfactor_fw
    if stranded and normfactor_rv:
        for l in counts_rv:
            for n in counts_rv[l]:
                counts_rv[l][n] = counts_rv[l][n] / normfactor_rv

    if t2u:
        for table in (counts, counts_rv):
            for l in table:
                table[l]["U"] = table[l].pop("T")

    return {"fw":counts,"rv":counts_rv}