Example no. 1
def getfiltseqs(fname):
    """Return the set of (name, sequence) tuples found in a BAM/SAM or FASTA/FASTQ file."""
    fpathnoext, fext = os.path.splitext(fname)
    if fext in [".bam",".sam"]:
        # openxam() is expected to yield pysam alignment records
        filt_seqs = [(record.query_name,str(record.seq)) for record in openxam(fname)]
    else:
        # openfastx() (tstk.io) returns (path without extension, extension, format, handle)
        fpathnoext,fext,ftype,fh = openfastx(fname)
        filt_seqs = [(record.id,str(record.seq)) for record in SeqIO.parse(fh,ftype)]

    return set(filt_seqs)
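
A minimal usage sketch for getfiltseqs(), assuming the openxam()/openfastx() helpers and the os/Bio.SeqIO imports used by the snippet are already in scope; the file names are hypothetical:

# Because getfiltseqs() returns a set of (name, sequence) tuples, results from
# different file types can be intersected directly.
fasta_seqs = getfiltseqs("filters.fasta")   # hypothetical FASTA input
bam_seqs = getfiltseqs("alignments.bam")    # hypothetical BAM input
shared = fasta_seqs & bam_seqs
print("{} sequences appear in both files".format(len(shared)))
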
Example no. 2
def gettargetseqs(fname):
    """Return an iterator over the records in a BAM/SAM or FASTA/FASTQ file."""
    fpathnoext, fext = os.path.splitext(fname)
    if fext in [".bam",".sam"]:
        return openxam(fname,"r")
    else:
        fpathnoext,fext,ftype,fh = openfastx(fname)
        return SeqIO.parse(fh,ftype)
Example no. 3
    ifpathnoext, ifext = os.path.splitext(args["INFNAME"])
    ofpathnoext, ofext = os.path.splitext(args["OUTFNAME"])

    if not isxam(ifext) and isxam(ofext):
        raise TypeError("ERROR: cannot output BAM from a FASTX input")

    if isxam(ifext):
        infile = pysam.AlignmentFile(args["INFNAME"])

        if isxam(ofext):
            outfile = pysam.AlignmentFile(args["OUTFNAME"], mode="wb", template=infile)
            for aln in filteralignments(infile,lengths,reqnts,args["nmism"],args["nmaps"],args["endtoend"]):
                outfile.write(aln)
        else:
            ofpathnoext,ofext,oftype,ofh = openfastx(args["OUTFNAME"],mode="wt")
            for aln in filteralignments(infile,lengths,reqnts,args["nmism"],args["nmaps"],args["endtoend"]):
                # pysam returns reads in reference orientation; restore the original strand
                seq = Seq(aln.seq)
                if aln.is_reverse:
                    seq = seq.reverse_complement()
                record = SeqRecord(seq,id=aln.query_name,description="")
                SeqIO.write(record,ofh,oftype)

            ofh.close()
    else:
        fpathnoext,fext,ftype,fh = openfastx(args["INFNAME"])
        infile = SeqIO.parse(fh,ftype)

        ofpathnoext,ofext,oftype,ofh = openfastx(args["OUTFNAME"],mode="wt")
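
The snippet is cut off at this point. As a hedged, self-contained sketch of the FASTX-to-FASTA pattern the truncated branch presumably follows (file names are hypothetical and the original's filtering step is not shown):

from Bio import SeqIO
from tstk.io import openfastx

fpathnoext, fext, ftype, fh = openfastx("reads.fastq")                         # hypothetical input
ofpathnoext, ofext, oftype, ofh = openfastx("reads_filtered.fasta", mode="wt") # hypothetical output
for record in SeqIO.parse(fh, ftype):
    SeqIO.write(record, ofh, oftype)
fh.close()
ofh.close()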
Example no. 4
            if record.is_reverse:
                seq = seq.reverse_complement()
            target_seqs.append((record.query_name,str(seq)))
        except TypeError:
            seq = str(record.seq)
            target_seqs.append((record.id,str(seq)))

    filter_seqs = getfiltseqs(ffilters)

    return filterseqs(target_seqs,filter_seqs)

if __name__ == "__main__":
    args = parsearguments()

    if not args["OUTFNAME"].endswith(".fasta"):
        args["OUTFNAME"] += ".fasta"

    matched_target_seqs,matching_filter_seqs = filterseqs_from_files(args["TARGETS"],args["FILTERS"])

    if args["output_targets"]:
        ofpathnoext,ofext,oftype,ofh = openfastx("targets_" + args["OUTFNAME"],mode="wt")
        for id,seq in matched_target_seqs:
           ofh.write(">{}\n{}\n".format(id,seq))
        ofh.close()

    if args["output_filters"]:
        ofpathnoext,ofext,oftype,ofh = openfastx("filters_" + args["OUTFNAME"],mode="wt")
        for id,seq in matching_filter_seqs:
           ofh.write(">{}\n{}\n".format(id,seq))
        ofh.close()
Example no. 5
    return vars(parser.parse_args())

if __name__ == "__main__":
    import argparse,sys,collections,os
    from tstk.io import openfastx,parsefastx
    from tstk.common import revcomp
    import pysam

    args = parsearguments()

    fpathnoext, fext = os.path.splitext(args["INFNAME"])

    if fext in [".bam",".sam"]:
        entries = [entry for entry in pysam.AlignmentFile(args["INFNAME"],"rb") if dict(entry.get_tags())["HI"] == 1]# only get the first entry in case of multi-mappers
        entries = [revcomp(str(e.seq)) if e.is_reverse else str(e.seq) for e in entries]
        counter = collections.Counter(entries) #only include one hit from the multi mappers
    else:
        fpathnoext,fext,ftype,fh = openfastx(args["INFNAME"])
        seqs = [str(seq) for name,seq,qual in parsefastx(fh)]
        counter = collections.Counter(seqs)
        fh.close()

    ofpathnoext,ofext,oftype,ofh = openfastx(args["OUTFNAME"].replace("fastq","fasta"),mode='wt')

    for seqid,(seq,count) in enumerate(counter.most_common(),start=1):
        ofh.write(">{}-{}\n{}\n".format(seqid,count,seq))

    ofh.close()
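
For reference, the loop above writes one FASTA record per unique sequence, most abundant first, with a running id and the read count joined by a dash in the header. With a hypothetical input containing the reads AAAA, AAAA and CCCC, the output would be:

>1-2
AAAA
>2-1
CCCC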
Example no. 6
def getcounts(SEQFILE,minlength=18,maxlength=33,uncollapse=False,normfw=None,normrv=None,normnreads=False,stranded=False,noN=False,c3p=False,chrnames=None,nmaps=1,t2u=False,rvonly=False,fwonly=False):

    from tstk.io import openfastx
    from Bio import SeqIO
    from Bio.Seq import Seq
    import numpy as np
    import sys,os,re

    fpath = SEQFILE
    ext = os.path.splitext(fpath)[1]

    lengths = np.arange(minlength,maxlength+1)

    if noN:
        counts = {key: {"A":0,"C":0,"T":0,"G":0} for key in lengths}
        counts_rv = {key: {"A":0,"C":0,"T":0,"G":0} for key in lengths}
    else:
        counts = {key: {"A":0,"C":0,"T":0,"G":0,"N":0} for key in lengths}
        counts_rv = {key: {"A":0,"C":0,"T":0,"G":0,"N":0} for key in lengths}

    # c3p: inspect the 3'-most nucleotide instead of the 5'-most one
    if c3p:
        nucpos = -1
    else:
        nucpos = 0

    nrecords = 0
    if ext == ".bam":
        import pysam
        bamfile = pysam.AlignmentFile(fpath)
        if chrnames:
            reads = []
            for chrname in chrnames:
                reads += bamfile.fetch(chrname)
        else:
            reads = bamfile.fetch()

        for read in reads:
            tags = dict(read.get_tags())
            if "NH" not in tags or tags["NH"] <= nmaps:
                if read.is_reverse:
                    seq = Seq(read.seq)
                    seq = seq.reverse_complement()
                else:
                    seq = read.seq

                firstnuc = seq[nucpos]
                if not noN or firstnuc != "N":
                    if len(seq) in lengths:
                        if stranded and read.is_reverse:
                            counts_rv[len(seq)][firstnuc] += 1
                        else:
                            counts[len(seq)][firstnuc] += 1

        if normnreads:
            if chrnames:
                nrecords = 0 
                for chrname in chrnames:
                    nrecords += sum(1 for r in pysam.AlignmentFile(fpath).fetch(chrname) if dict(r.get_tags())["HI"] == 1)
            else:
                nrecords = sum(1 for r in pysam.AlignmentFile(fpath) if dict(r.get_tags())["HI"] == 1)
    else:
        checkheader = True

        fpathnoext,fext,ftype,fh = openfastx(fpath)

        for record in SeqIO.parse(fh,ftype):
            if uncollapse:
                if checkheader:
                    reid = re.compile(r"^\d+-\d+$")
                    if not reid.match(record.id):
                        print("WARNING: the fasta header doesn't seem to match the 'id-count' format. Ignoring uncollapse request.")
                        uncollapse = False
                    checkheader = False

                if ftype == "fastq":
                    print("WARNING: file format is fastq, ignoring uncollapse request")
                    uncollapse = False
                    count = 1
                else:
                    count = int(record.id.split("-")[1])
            else:
                count = 1

            nrecords += count

            firstnuc = record.seq[nucpos]
            if not noN or firstnuc != "N":
                if len(record.seq) in lengths:
                    counts[len(record.seq)][firstnuc] += count

    normfactor_fw = None
    normfactor_rv = None

    if normnreads:
        normfactor_fw = nrecords
        normfactor_rv = nrecords

    if normfw:
        if normnreads:
            print("WARNING: normalisation to library size specified together with specific normalisation factor. Using the specific factor for FW reads")
        normfactor_fw = normfw

    if normrv:
        if normnreads:
            print("WARNING: normalisation to library size specified together with specific normalisation factor. Using the specific factor for RV reads")
        normfactor_rv = normrv

    if normfactor_fw: 
        for l in counts:
            for n in counts[l]:
                counts[l][n] = counts[l][n] / normfactor_fw

    if stranded and normfactor_rv:
        for l in counts_rv:
            for n in counts_rv[l]:
                counts_rv[l][n] = counts_rv[l][n] / normfactor_rv

    try:
        fh.close()
    except NameError:
        pass

    if t2u:
        for l in counts:
            counts[l]["U"] = counts[l].pop("T")
        for l in counts_rv:
            counts_rv[l]["U"] = counts_rv[l].pop("T")

    counts = {"fw":counts,"rv":counts_rv}

    return counts
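
A minimal usage sketch for getcounts(), assuming a coordinate-sorted and indexed BAM file; the path and parameter choices are illustrative only:

# Tabulate 5' nucleotide composition per read length (18-33 nt) for uniquely
# mapping reads, normalised to the number of primary alignments (HI tag == 1).
counts = getcounts("sample.bam", minlength=18, maxlength=33, nmaps=1, normnreads=True)
for length in sorted(counts["fw"]):
    print(length, counts["fw"][length])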