예제 #1
0
def producer(info):
    """Run trimal on one consensus-plus-reads alignment and return its output.

    ``info`` is indexed positionally: ``info[0]`` names the temporary FASTA
    file (``<info[0]>.fasta``), ``info[1]`` is the consensus sequence, and
    ``info[2]`` / ``info[3]`` are tab-separated sequence ids and sequences
    that are paired up in order.

    Returns ``(fasta, log, info[0])`` where ``log`` is the last non-empty
    line of trimal's stdout and ``fasta`` is every non-empty line before it.
    Returns ``None`` when any step fails (malformed ``info``, trimal not
    runnable, empty output, ...).
    """
    try:
        inputdata = "%s.fasta" % (str(info[0]))
        consensus = str(info[1])
        seqs = [
            ">%s\n%s" % (str(i), str(s))
            for i, s in zip(info[2].split("\t"), info[3].split("\t"))
        ]
        try:
            with open(inputdata, "w") as o:
                o.write(">%(seqID)s\n%(seq)s\n" % dict(seqID=CONSENSUS_NAME,
                                                       seq=consensus))
                o.write("%s\n" % ("\n".join(seqs)))
            cline = """trimal -in %s  -fasta -gt 0.8 -st 0.001 -cons 60 -colnumbering""" % (
                inputdata)
            child = subprocess.Popen(cline,
                                     stdout=subprocess.PIPE,
                                     universal_newlines=True,
                                     shell=(sys.platform != "win32"))
            # stderr is not piped, so the second element is always None.
            sout, _ = child.communicate()
        finally:
            # Remove the temporary FASTA even when writing or trimal failed,
            # so failed jobs do not leave files behind.
            removeFiles([inputdata])

        lines = [line for line in sout.splitlines() if line]  # drop empty lines
        # -colnumbering makes trimal print the column map as the final line.
        fasta = "\n".join(lines[:-1])
        log = lines[-1]
        return fasta, log, info[0]
    except Exception:
        # Best-effort worker: report failure as None, but let
        # KeyboardInterrupt / SystemExit propagate.
        return None
예제 #2
0
def samToBamBuffers(samdat, fprefix):
    """Convert SAM text to BAM plus index and return both as byte buffers.

    ``samdat`` is piped through ``samtools view -bS -`` and the result through
    ``samtools sort``. The sorted output is written to ``<fprefix>.tmp.bam``
    only so that ``samtools index`` can build ``<fprefix>.tmp.bam.bai``; both
    temporary files are removed before returning.

    Returns ``(bamdat, bamidxdat)``.
    Raises ``IOError`` if the index file was not produced.
    """
    use_shell = (sys.platform != "win32")
    bamout = "%s.tmp.bam" % (fprefix)
    baiout = "%s.bai" % (bamout)

    # SAM text -> BAM, entirely through pipes.
    child = subprocess.Popen("""samtools view -bS -""",
                             stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             shell=use_shell,
                             close_fds=True)
    sout, serr = child.communicate(samdat)

    # NOTE(review): this relies on the legacy `samtools sort -o <prefix>`
    # form that streams the sorted BAM to stdout; newer samtools treats -o as
    # an output file name -- confirm against the installed version.
    cline = """samtools sort - -o %s.tmp""" % (fprefix)
    child2 = subprocess.Popen(cline,
                              stdin=subprocess.PIPE,
                              stderr=subprocess.PIPE,
                              stdout=subprocess.PIPE,
                              shell=use_shell,
                              close_fds=True)
    bamdat, berr = child2.communicate(sout)

    try:
        with open(bamout, "wb") as o:
            o.write(bamdat)

        os.system("""samtools index %s > /dev/null 2> /dev/null""" % (bamout))
        with open(baiout, "rb") as idx:
            bamidxdat = idx.read()
    finally:
        # Clean up whatever was created, even if indexing failed.
        removeFiles([p for p in (bamout, baiout) if os.path.exists(p)])

    return bamdat, bamidxdat
예제 #3
0
def samToBamBuffers(samdat, fprefix):
    """Turn SAM text into an in-memory BAM and BAM index.

    The SAM data is fed to ``samtools view -bS -`` and its output to
    ``samtools sort``. The result is staged in ``<fprefix>.tmp.bam`` so that
    ``samtools index`` can create ``<fprefix>.tmp.bam.bai``; the staged files
    are deleted again before the function returns.

    Returns a ``(bamdat, bamidxdat)`` tuple of raw bytes.
    Raises ``IOError`` when no index file could be read.
    """
    run_in_shell = (sys.platform != "win32")
    bamout = "%s.tmp.bam" % (fprefix)
    bam_index = "%s.bai" % (bamout)

    # Step 1: SAM -> BAM over stdin/stdout.
    viewer = subprocess.Popen("""samtools view -bS -""",
                              stdin=subprocess.PIPE,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE,
                              shell=run_in_shell,
                              close_fds=True)
    sout, serr = viewer.communicate(samdat)

    # Step 2: sort.
    # NOTE(review): `sort - -o <prefix>` is the legacy samtools form that
    # writes the sorted BAM to stdout; verify with the deployed samtools.
    sorter = subprocess.Popen("""samtools sort - -o %s.tmp""" % (fprefix),
                              stdin=subprocess.PIPE,
                              stderr=subprocess.PIPE,
                              stdout=subprocess.PIPE,
                              shell=run_in_shell,
                              close_fds=True)
    bamdat, berr = sorter.communicate(sout)

    try:
        # Step 3: index needs a file on disk.
        with open(bamout, "wb") as o:
            o.write(bamdat)

        os.system("""samtools index %s > /dev/null 2> /dev/null""" % (bamout))
        with open(bam_index, "rb") as handle:
            bamidxdat = handle.read()
    finally:
        # Never leave staging files behind, even when indexing failed.
        removeFiles([p for p in (bamout, bam_index) if os.path.exists(p)])

    return bamdat, bamidxdat
예제 #4
0
    def run(self):
        """Build the combined SAM/BAM for this MSA and return it in memory.

        For every reference returned by ``self.parseMSA``, the BAM file
        ``<self.samdir>/<refName>.bam`` is opened and handed to
        ``self.processSingleRef`` together with the combined output SAM
        (``<self.name>.sam``). The SAM is then converted, sorted and indexed
        with samtools, and ``self.updateConsensus`` is run on the result.
        All intermediate files are removed before returning.

        Returns ``(samdat, bamdat, bamidxdat, self.name, groupdata.values(),
        newcon)``.
        """
        samout = "%s.sam" % (self.name)
        bamout = "%s.bam" % (self.name)
        use_shell = (sys.platform != "win32")

        msaInfo = self.parseMSA(StringIO.StringIO(self.data))

        refs = [(r.id, len(r.seq)) for r in SeqIO.parse(StringIO.StringIO(self.data), "fasta")]

        readgroups, refIDToReadgroup, groupdata = self.generateReadGroups(msaInfo.keys())

        # Only the first FASTA record is used as the @SQ entry.
        header = dict(HD=dict(VN='1.0'), SQ=[{'LN': refs[0][1], 'SN': refs[0][0]}], RG=readgroups)
        outfile = pysam.Samfile(samout, "wh", header=header)
        try:
            for refName, refVals in msaInfo.iteritems():
                # Entries with exactly three values are treated as forward.
                isReversed = len(refVals) != 3
                samid = os.path.join(self.samdir, "%s.bam" % refName)
                samfile = pysam.Samfile(samid, "rb")
                try:
                    coverage = self.processSingleRef(samfile, refName, refVals[0], refVals[1], refVals[2], isReversed,
                                                     outfile, header, refIDToReadgroup[refName])
                    groupdata[refName].append(coverage)
                finally:
                    # Close the per-reference BAM even if processing raised.
                    samfile.close()
        finally:
            outfile.close()

        # need it in memory anyways...
        with open(samout, "rb") as samhandle:
            samdat = samhandle.read()

        child = subprocess.Popen("""samtools view -bS -""",
                                 stdin=subprocess.PIPE,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,
                                 shell=use_shell,
                                 close_fds=True)
        sout, serr = child.communicate(samdat)

        # NOTE(review): legacy `samtools sort -o <prefix>` form (sorted BAM on
        # stdout) -- confirm against the installed samtools version.
        cline = """samtools sort - -o %s""" % (self.name)
        child2 = subprocess.Popen(cline,
                                  stdin=subprocess.PIPE,
                                  stderr=subprocess.PIPE,
                                  stdout=subprocess.PIPE,
                                  shell=use_shell,
                                  close_fds=True)
        bamdat, berr = child2.communicate(sout)
        with open(bamout, "wb") as o:
            o.write(bamdat)

        os.system("""samtools index %s > /dev/null 2> /dev/null""" % (bamout))
        with open("%s.bai" % (bamout), "rb") as idxhandle:
            bamidxdat = idxhandle.read()

        newcon = self.updateConsensus(bamout)
        if len(newcon) != int(refs[0][1]):
            # Diagnostic only: the result is still returned.
            print >> sys.stderr, "Con length mismatch : %s" % (bamout)
            print >> sys.stderr, self.data
        removeFiles([samout, bamout, "%s.bai" % (bamout)])
        return samdat, bamdat, bamidxdat, self.name, groupdata.values(), newcon
예제 #5
0
def find_shared_regions(args):
    """Run the USRCH search command and return its output as token rows.

    ``args`` must provide ``input1``, ``input2``, ``threads`` and ``quiet``.
    The command template ``USRCH`` is filled in with those values and a
    temporary output file created in the current directory. When
    ``args.quiet`` is set, the command's stdout/stderr go to ``/dev/null``.

    Returns a list with one entry per output line, each a list of the
    whitespace-separated fields on that line. The temporary file is removed
    even if the command or the parsing fails.
    """
    handle, tmpname = mkstemp(dir=".")
    # Only the path is needed; the external command writes the file itself.
    os.close(handle)
    try:
        qaction = " > /dev/null 2> /dev/null " if args.quiet else ""
        cline = USRCH % dict(input=args.input1, database=args.input2,
                             output=tmpname, threads=args.threads,
                             tail=qaction)
        child = subprocess.Popen(str(cline), shell=(sys.platform != "win32"))
        child.wait()
        with open(tmpname) as results:
            data = [line.strip().split() for line in results]
    finally:
        removeFiles([tmpname])
    return data
예제 #6
0
    def genGo(self):
        """Generate the Go protobuf sources and collect them under ``../gen``.

        Calls ``self.genPbBin()`` first, changes the working directory to
        ``XlsToolDir``, runs ``protoc`` with ``--go_out`` over every
        ``.proto`` file there, moves the ``*.pb.go`` and ``*.bytes`` files to
        ``<SelfPath>/../gen`` and finally deletes intermediate files from
        ``XlsToolDir``.
        """
        self.genPbBin()

        os.chdir(XlsToolDir)

        # Print the protoc version, then generate xxx.pb.go.
        for command in ("protoc --version",
                        "protoc -I . --go_out=. ./*.proto"):
            os.system(command)

        # Move the generated files to the destination folder.
        go_pb_dir = os.path.join(SelfPath, "../gen")
        for pattern in ("*.pb.go", "*.bytes"):
            utils.moveFiles(XlsToolDir, go_pb_dir, [pattern])

        # Remove leftover files.
        leftovers = ["*_pb2.py", "*.pyc", "*.log", "*.txt", "*.proto"]
        utils.removeFiles(XlsToolDir, leftovers)
예제 #7
0
def producer(info):
    """Trim one alignment with trimal and hand back the trimmed FASTA.

    Positional layout of ``info``: ``info[0]`` is used to name the temporary
    input file ``<info[0]>.fasta``; ``info[1]`` is the consensus sequence;
    ``info[2]`` and ``info[3]`` are tab-separated ids and sequences, matched
    pairwise.

    Returns ``(fasta, log, info[0])``: ``log`` is the final non-empty line of
    trimal's stdout, ``fasta`` is all non-empty lines preceding it. Any
    failure (bad ``info``, trimal unavailable, no output) yields ``None``.
    """
    try:
        inputdata = "%s.fasta" % (str(info[0]))
        consensus = str(info[1])
        ids = info[2].split("\t")
        sequences = info[3].split("\t")
        seqs = [">%s\n%s" % (str(i), str(s)) for i, s in zip(ids, sequences)]
        try:
            with open(inputdata, "w") as o:
                o.write(">%(seqID)s\n%(seq)s\n" % dict(seqID=CONSENSUS_NAME, seq=consensus))
                o.write("%s\n" % ("\n".join(seqs)))
            cline = """trimal -in %s  -fasta -gt 0.8 -st 0.001 -cons 60 -colnumbering""" % (inputdata)
            child = subprocess.Popen(cline,
                                     stdout=subprocess.PIPE,
                                     universal_newlines=True,
                                     shell=(sys.platform != "win32"))
            # Only stdout is piped; the stderr slot is always None.
            sout, _ = child.communicate()
        finally:
            # The temporary FASTA must go away on failure as well as success.
            removeFiles([inputdata])

        nonempty = [line for line in sout.splitlines() if line]  # strip empty lines
        # With -colnumbering the column map is the last line trimal prints.
        fasta = "\n".join(nonempty[:-1])
        log = nonempty[-1]
        return fasta, log, info[0]
    except Exception:
        # Signal failure with None without trapping KeyboardInterrupt or
        # SystemExit the way a bare ``except:`` would.
        return None
예제 #8
0
def producer(info):
    """Split a BAM per group, run ``varscan.sh`` and return its raw output.

    Positional layout of ``info``: ``info[0]`` is the file index used to name
    all working files, ``info[1]`` the consensus sequence, ``info[2]`` the
    BAM data piped into ``samtools view``, and ``info[3]`` a tab-separated
    list of names passed to ``samtools view -r`` (presumably read-group ids
    -- confirm against the caller).

    Returns ``(fileidx, dat, hdr)`` where ``dat`` is the stdout of
    ``varscan.sh`` and ``hdr`` pairs each name from ``info[3]`` with the
    matching column (10th onwards) of the first single-``#`` header line;
    ``hdr`` is empty when no such line exists.
    """
    fileidx = str(info[0])

    inputdata = "%s.con.fasta" % (fileidx)
    consensus = str(info[1])

    with open(inputdata, "w") as o:
        o.write(">%(seqID)s\n%(seq)s\n" % dict(seqID=CONSENSUS_NAME, seq=consensus))
    os.system("""samtools faidx %s""" % (inputdata))

    bamfile = "%s.bam" % (fileidx)
    idsfile = "%s.ids" % (fileidx)
    groups = info[3].split("\t")

    # One BAM per group, each listed in the ids file consumed by varscan.sh.
    # Append mode is kept so an existing ids file is extended, as before.
    with open(idsfile, "a") as ids:
        for g in groups:
            groupbam = "%s_%s" % (g, bamfile)
            cline = """samtools view -bhr "%s" - > %s""" % (g, groupbam)
            child = subprocess.Popen(cline,
                                     stdin=subprocess.PIPE,
                                     stderr=subprocess.PIPE,
                                     shell=(sys.platform != "win32"),
                                     close_fds=True)
            child.communicate(info[2])
            ids.write("%s\n" % (groupbam))

    cline = """varscan.sh %s %s %s 2> /dev/null""" % (idsfile, bamfile, inputdata)
    child = subprocess.Popen(cline,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             shell=(sys.platform != "win32"),
                             close_fds=True)
    dat, err = child.communicate()

    hdr = []
    for l in StringIO.StringIO(dat):
        # The column header starts with a single '#'; '##' lines are skipped.
        # startswith() avoids an IndexError on a one-character line.
        if l.startswith('#') and not l.startswith('##'):
            hdr = list(zip(groups, l.split()[9:]))
            break

    removeFiles([inputdata, "%s.fai" % (inputdata)])
    return fileidx, dat, hdr
예제 #9
0
def producer(info):
    """Run ``computeData`` over one VCF/BAM pair and return VCF and JSON text.

    Positional layout of ``info``: ``info[0]`` is the file id used to name the
    temporary files, ``info[1]`` the VCF text, ``info[2]`` the BAM bytes and
    ``info[3]`` the BAM index bytes.

    Returns ``(info[0], modvcf, jsonstr)`` where ``modvcf`` is everything
    written through the ``vcf.Writer`` and ``jsonstr`` is the compact JSON
    encoding of ``computeData``'s return value. Returns ``None`` when the VCF
    cannot be parsed or contains no record.
    """
    import os  # local import: only needed for the cleanup guard below

    fileid = str(info[0])

    # Probe the VCF first so that nothing is written for unusable input.
    try:
        line = vcf.Reader(StringIO.StringIO(info[1])).next()
    except Exception:
        return None
    if not line:
        return None

    bamfile = "%s.bam" % (fileid)
    bamidxfile = "%s.bam.bai" % (fileid)
    try:
        with open(bamfile, "wb") as o:
            o.write(info[2])
        with open(bamidxfile, "wb") as o:
            o.write(info[3])

        # Fresh reader: the probe above already consumed the first record.
        vcfInput = vcf.Reader(StringIO.StringIO(info[1]))
        vcfohndl = StringIO.StringIO()
        vcfOutput = vcf.Writer(vcfohndl, vcfInput)
        data = computeData(vcfInput, vcfOutput, bamfile, 0)
        jsonstr = json.dumps(data, separators=(',', ':'))
        vcfohndl.flush()
        modvcf = vcfohndl.getvalue()
        vcfohndl.close()
    finally:
        # Remove the temporary BAM/index even when computeData raised.
        removeFiles([p for p in (bamfile, bamidxfile) if os.path.exists(p)])

    return info[0], modvcf, jsonstr
예제 #10
0
def producer(info):
    """Process one VCF with its BAM through ``computeData``.

    ``info`` is read positionally: ``info[0]`` file id (names the temporary
    files), ``info[1]`` VCF text, ``info[2]`` BAM bytes, ``info[3]`` BAM index
    bytes.

    Returns ``(info[0], modvcf, jsonstr)``: ``modvcf`` is the text emitted
    via the ``vcf.Writer`` and ``jsonstr`` the compact JSON dump of what
    ``computeData`` returned. ``None`` is returned when the VCF is
    unparsable or has no records.
    """
    import os  # local import: used only to guard the cleanup

    fileid = str(info[0])

    # Cheap validity check before touching the disk.
    try:
        first_record = vcf.Reader(StringIO.StringIO(info[1])).next()
    except Exception:
        return None
    if not first_record:
        return None

    bamfile = "%s.bam" % (fileid)
    bamidxfile = "%s.bam.bai" % (fileid)
    try:
        for path, payload in ((bamfile, info[2]), (bamidxfile, info[3])):
            with open(path, "wb") as o:
                o.write(payload)

        # Re-create the reader; the validity check consumed one record.
        vcfInput = vcf.Reader(StringIO.StringIO(info[1]))
        vcfohndl = StringIO.StringIO()
        vcfOutput = vcf.Writer(vcfohndl, vcfInput)
        data = computeData(vcfInput, vcfOutput, bamfile, 0)
        jsonstr = json.dumps(data, separators=(',', ':'))
        vcfohndl.flush()
        modvcf = vcfohndl.getvalue()
        vcfohndl.close()
    finally:
        # Temporary files must not outlive a failed computeData call.
        removeFiles([p for p in (bamfile, bamidxfile) if os.path.exists(p)])

    return info[0], modvcf, jsonstr
예제 #11
0
def producer(info):
    """Re-write a BAM so it only covers the alignment columns that were kept.

    Positional layout of ``info``: ``info[0]`` file index (names all working
    files), ``info[1]`` comma-separated kept column indices (optionally
    prefixed with ``#ColumnsMap``), ``info[2]`` consensus sequence,
    ``info[3]`` / ``info[4]`` tab-separated ids and sequences, ``info[5]``
    BAM bytes, ``info[6]`` BAM index bytes.

    Returns ``(fileidx, samdat, bamdat, bamidxdat, newcoverage)`` where
    ``newcoverage`` counts written reads keyed by the last ``_``-separated
    token of the read name. Returns ``None`` when no read with a CIGAR string
    was found.
    """
    fileidx = str(info[0])
    consensus = ">%(seqID)s\n%(seq)s\n" % dict(seqID=CONSENSUS_NAME,
                                               seq=str(info[2]))
    seqs = [
        ">%s\n%s" % (str(i), str(s))
        for i, s in zip(info[3].split("\t"), info[4].split("\t"))
    ]

    fastafile = StringIO.StringIO(consensus + "\n".join(seqs))

    bamfile = "%s.bam" % (fileidx)
    bamidxfile = "%s.bam.bai" % (fileidx)
    samout = "%s.trim.sam" % (fileidx)
    with open(bamfile, "wb") as o:
        o.write(info[5])
    with open(bamidxfile, "wb") as o:
        o.write(info[6])

    refs = [(r.id, len(r.seq)) for r in SeqIO.parse(fastafile, "fasta")]

    sfile = pysam.Samfile(bamfile)

    # Same header as the input, but @SQ replaced by the first FASTA record.
    hdr = sfile.header.copy()
    hdr['SQ'] = [{'LN': refs[0][1], 'SN': refs[0][0]}]

    # NOTE(review): eval() executes whatever info[1] contains; this is only
    # safe while info[1] is trusted tool output. Consider explicit int parsing.
    indices = eval("[" + info[1].replace("#ColumnsMap", "") + "]")
    # Set copy for O(1) membership tests inside the per-base loops.
    kept = set(indices)
    maxidx = max(indices) + 1

    # shift[i] = number of removed columns strictly before column i.
    shift = []
    total = 0
    for i in xrange(maxidx):
        shift.append(total)
        if i not in kept:
            total += 1

    outfile = pysam.Samfile(samout, "wh", header=hdr)
    newcoverage = defaultdict(int)

    # The output always contains a header, so track explicitly whether any
    # usable read was seen.
    hasSeqs = False
    for read in sfile.fetch():
        # Reads without a CIGAR string are skipped instead of aborting the
        # whole file.
        if read.cigarstring is None:
            continue
        hasSeqs = True

        cigar = expandCigar(read.cigarstring)
        start = read.pos
        newseq = []
        newcig = []
        newqual = []
        back = 0  # deletions seen so far; offsets idx into query/qual
        for idx, op in enumerate(cigar):
            column = start + idx
            # stop if we exceed the reference length
            if column >= maxidx:
                break
            is_deletion = op.upper() == 'D'
            if column not in kept:
                # Column removed by trimming: drop it, but still account for
                # a deletion so query/qual offsets stay aligned.
                if is_deletion:
                    back += 1
                continue

            # Column was kept: keep its CIGAR op.
            newcig.append(op)
            if is_deletion:
                # A deletion has no base to copy.
                back += 1
            else:
                newseq.append(read.query[idx - back])
                if read.qqual:
                    newqual.append(read.qqual[idx - back])

        # Reads left with no bases are not written.
        if newseq:
            trimmed = "".join(newseq)
            read.pos = max((start - shift[start]), 0)
            read.seq = trimmed
            read.cigarstring = compressCigar("".join(newcig))
            if newqual:
                read.qual = "".join(newqual)
            else:
                # No qualities available: fill with a constant placeholder.
                read.qual = "I" * len(trimmed)
            newcoverage[read.qname.split("_")[-1]] += 1
            outfile.write(read)
    outfile.close()
    # Release the input BAM before its file is deleted.
    sfile.close()

    with open(samout) as samhandle:
        samdat = samhandle.read()
    removeFiles([bamfile, bamidxfile, samout])
    # The consumer is expected to treat None as "skip this entry"; an empty
    # SAM is not passed on to samToBam.
    if not hasSeqs:
        return None
    bamdat, bamidxdat = samToBam(samdat, "%s.trim" % (fileidx), buffers=True)
    return fileidx, samdat, bamdat, bamidxdat, newcoverage
예제 #12
0
def producer(info):
    """Trim the reads of one BAM to the alignment columns listed in ``info[1]``.

    ``info`` is read positionally: ``info[0]`` file index (prefix of every
    working file), ``info[1]`` comma-separated indices of kept columns
    (an optional ``#ColumnsMap`` marker is stripped), ``info[2]`` consensus
    sequence, ``info[3]`` / ``info[4]`` tab-separated ids and sequences,
    ``info[5]`` BAM bytes and ``info[6]`` BAM index bytes.

    Returns ``(fileidx, samdat, bamdat, bamidxdat, newcoverage)``;
    ``newcoverage`` maps the last ``_``-separated token of each written read
    name to the number of reads written. Returns ``None`` if the BAM held no
    read with a CIGAR string.
    """
    fileidx = str(info[0])
    consensus = ">%(seqID)s\n%(seq)s\n" % dict(seqID=CONSENSUS_NAME, seq=str(info[2]))
    seqs = [">%s\n%s" % (str(i), str(s)) for i, s in zip(info[3].split("\t"), info[4].split("\t"))]

    fastafile = StringIO.StringIO(consensus + "\n".join(seqs))

    bamfile = "%s.bam" % (fileidx)
    bamidxfile = "%s.bam.bai" % (fileidx)
    samout = "%s.trim.sam" % (fileidx)
    with open(bamfile, "wb") as o:
        o.write(info[5])
    with open(bamidxfile, "wb") as o:
        o.write(info[6])

    refs = [(r.id, len(r.seq)) for r in SeqIO.parse(fastafile, "fasta")]

    sfile = pysam.Samfile(bamfile)

    # Reuse the input header, substituting the first FASTA record as @SQ.
    hdr = sfile.header.copy()
    hdr['SQ'] = [{'LN': refs[0][1], 'SN': refs[0][0]}]

    # NOTE(review): eval() runs arbitrary expressions from info[1]; acceptable
    # only for trusted input. Parsing the integers explicitly would be safer.
    indices = eval("[" + info[1].replace("#ColumnsMap", "") + "]")
    # Membership is tested once per base below, so use a set instead of a list.
    kept_columns = set(indices)
    maxidx = max(indices) + 1

    # shift[i] counts the removed columns that precede column i.
    shift = []
    removed = 0
    for col in xrange(maxidx):
        shift.append(removed)
        if col not in kept_columns:
            removed += 1

    outfile = pysam.Samfile(samout, "wh", header=hdr)
    newcoverage = defaultdict(int)

    # A header is always written, so emptiness has to be tracked by a flag.
    hasSeqs = False
    for read in sfile.fetch():
        # Skip (rather than abort on) reads that carry no CIGAR string.
        if read.cigarstring is None:
            continue
        hasSeqs = True

        cigar = expandCigar(read.cigarstring)
        origin = read.pos
        seq_parts = []
        cig_parts = []
        qual_parts = []
        back = 0  # number of deletions passed; corrects query/qual indexing
        for idx, op in enumerate(cigar):
            # stop if we exceed the reference length
            if origin + idx >= maxidx:
                break
            deleted = op.upper() == 'D'
            if (origin + idx) not in kept_columns:
                # This column was trimmed away; a deletion here still shifts
                # the query/qual offset.
                if deleted:
                    back += 1
                continue

            # Kept column: its CIGAR op survives.
            cig_parts.append(op)
            if deleted:
                # Nothing to copy for a deletion.
                back += 1
            else:
                seq_parts.append(read.query[idx - back])
                if read.qqual:
                    qual_parts.append(read.qqual[idx - back])

        # Only reads that still have bases are written out.
        if seq_parts:
            newseq = "".join(seq_parts)
            read.pos = max((origin - shift[origin]), 0)
            read.seq = newseq
            read.cigarstring = compressCigar("".join(cig_parts))
            if qual_parts:
                read.qual = "".join(qual_parts)
            else:
                # Placeholder qualities when the read had none.
                read.qual = "I" * len(newseq)
            newcoverage[read.qname.split("_")[-1]] += 1
            outfile.write(read)
    outfile.close()
    # Close the input BAM before removing its file.
    sfile.close()

    with open(samout) as handle:
        samdat = handle.read()
    removeFiles([bamfile, bamidxfile, samout])
    # None tells the consumer to skip this entry; samToBam is never called
    # with a header-only SAM.
    if not hasSeqs:
        return None
    bamdat, bamidxdat = samToBam(samdat, "%s.trim" % (fileidx), buffers=True)
    return fileidx, samdat, bamdat, bamidxdat, newcoverage