import array
import os
import string
import subprocess

import pysam

# "bed" is assumed to be the pipeline's local helper module for BED interval lists;
# it must provide the merge(), subtract() and write() functions used below.
import bed


def run(cfg):
    ''' build a TVC input BAM from the trimmed reads, run the Torrent Variant Caller,
        and turn the resulting primitive variants into an smCounter ROI BED '''
    # get params
    print("tvc: start...")
    readSet = cfg.readSet
    uBam = cfg.uBam
    numCpus = cfg.numCores
    samtoolsMem = cfg.samtoolsMem
    samtoolsDir = cfg.samtoolsDir
    vcflibDir = cfg.vcflibDir
    flowTagsNeeded = set(("ZP", "ZA", "ZG", "ZB", "ZC", "ZM", "ZF", "RG"))

    # open raw read file *.basecaller.bam
    bam = pysam.AlignmentFile(uBam, "rb", check_sq=False)

    # save BAM header in a string for later
    bamHeaderRaw = bam.text

    # open output file, and cutadapt 3' trim file
    fileout = open(readSet + ".tvc.flowtags.txt", "w")
    fileIn3 = open(readSet + ".cutadapt.3.R1.txt", "r")

    # create a dict of read id -> UMI tag
    umi_dict = {}
    tag_name = "mi"
    with open(readSet + ".umi.tag.txt", "r") as IN:
        for line in IN:
            read_id, umi, umi_qual = line.strip('\n').split('\t')
            umi_dict[read_id] = umi
    print("\nDone creating readID -> UMI dict\n")

    # merge 5' trim, 3' trim, and raw read flow tags
    for line in open(readSet + ".cutadapt.5.R1.txt", "r"):
        vals5 = line.strip().split("\t")
        readId = vals5[0]
        umi = umi_dict["@" + readId]

        # spin the 3' trim file, and the raw read file, forward
        while True:
            line = fileIn3.readline()
            vals3 = line.strip().split("\t")
            readId3 = vals3[0]
            read = bam.next()
            if read.query_name != readId3:
                raise Exception("3' cutadapt trim info file not in same order as raw read file")
            if readId3 == readId:
                break

        # debug check
        if readId3 != readId:
            raise Exception("3' and 5' cutadapt trim files out of sync")

        # skip to next read if 5' adapter not found
        if int(vals5[1]) == -1:
            continue

        # debug check
        if int(vals3[1]) == -1:
            raise Exception("trim of both 5' and 3' adapters expected")

        # pad fields dropped by line.strip(), so the fixed indices below are safe
        while len(vals3) < 11:
            vals3.append("")
        while len(vals5) < 11:
            vals5.append("")

        # avoid hassles of pysam type conversions for optional tags, by using SAM text format (might be slow)
        samVals = read.tostring(bam).split("\t")

        # get barcode region quality from raw read
        umiC = samVals[9][0:12]
        umiQ = samVals[10][0:12]
        if umiC != umi:
            raise Exception("unexpected barcode sync")

        # get trimmed sequences
        seq3 = vals3[5] + vals3[6]          # 6 should be empty string
        seq5 = umi + vals5[4] + vals5[5]    # 4 should be empty string

        # get trimmed qual vals
        qual3 = vals3[9] + vals3[10]        # 10 should be empty string
        qual5 = umiQ + vals5[8] + vals5[9]  # 8 should be empty string

        # debug check on read length
        readLenRaw = len(samVals[9])
        readLenTrm = len(seq5) + len(vals5[6]) + len(seq3)
        if readLenTrm != readLenRaw:
            raise Exception("Length of trimmed read not equal to raw read!")

        # get raw read flow signal tags
        outvec = [readId, seq5, qual5, seq3, qual3]
        for tag in samVals[11:]:
            if tag[0:2] in flowTagsNeeded:
                outvec.append(tag)

        # write to disk
        fileout.write("|".join(outvec))
        fileout.write("\n")

    # done
    fileout.close()
    fileIn3.close()
    bam.close()

    # sort the trimmed seq / flow tag file by read id
    cmd = "sort -k1,1 -t\| --parallel={0} {1}.tvc.flowtags.txt > {1}.tvc.flowtags.sorted.txt".format(numCpus, readSet)
    subprocess.check_call(cmd, shell=True)
    os.remove("{}.tvc.flowtags.txt".format(readSet))

    # sort oligoClip file by read id
    cmd = samtoolsDir + "samtools sort -n -m " + samtoolsMem + " -@" + numCpus \
        + " -T " + readSet \
        + " -o " + readSet + ".tvc.temp.bam " \
        + readSet + ".bam " \
        + " > " + readSet + ".tvc.sort.log 2>&1 "
    subprocess.check_call(cmd, shell=True)

    # set up reverse complement
    dnaComplementTranslation = string.maketrans("ATGC", "TACG")

    # open readId-sorted main BAM file, build header for TVC output bam
    bamIn = pysam.AlignmentFile(readSet + ".tvc.temp.bam", "rb")

    # dump bam header to sam file, because we are using an older version of pysam
    # that cannot directly take text lines for the header
    headerTagsNeeded = set(["CO", "RG", "PG"])
    fileOut = open(readSet + ".tvc.header.sam", "w")
    for line in bamIn.text.split("\n"):  # init with TMAP tags
        if len(line) > 3 and line[1:3] != "RG":
            fileOut.write(line)
            fileOut.write("\n")
    for line in bamHeaderRaw.split("\n"):  # add Ion BaseCaller flow tags
        if line[1:3] in headerTagsNeeded:
            fileOut.write(line)
            fileOut.write("\n")
    fileOut.close()
    samHeaderOnly = pysam.AlignmentFile(readSet + ".tvc.header.sam", "r")

    # open output BAM file
    bamOut = pysam.AlignmentFile(readSet + ".tvc.bam", "wb", template=samHeaderOnly)
    samHeaderOnly.close()
    os.remove(readSet + ".tvc.header.sam")

    # make TVC input file - add hard clipped regions back as soft-clipped alignments
    fileIn = open(readSet + ".tvc.flowtags.sorted.txt", "r")
    for read in bamIn:
        # drop fake R2 (primer side)
        if not read.is_read2:
            continue

        # change back to single end
        read.is_read1 = False
        read.is_read2 = False
        read.is_paired = False
        read.mate_is_reverse = False
        read.mate_is_unmapped = False

        # parse readId
        vals = read.query_name.split(":")
        #readIdBam = ":".join(vals[0:-2])
        readIdBam = read.query_name

        # spin the flowtag file forward (not all reads are in the bam)
        readId = None
        while True:
            line = fileIn.readline()
            vals = line.strip().split("|")
            (readId, seq5, qual5, seq3, qual3) = vals[0:5]
            if readId == readIdBam:
                break

        # debug check
        if readId is None:
            raise Exception("missing read id in TVC flowtag merge")

        # handle negative strand alignment
        if read.is_reverse:
            tmp = seq5
            seq5 = seq3
            seq3 = tmp
            seq5 = seq5[::-1]
            seq5 = seq5.translate(dnaComplementTranslation)
            seq3 = seq3[::-1]
            seq3 = seq3.translate(dnaComplementTranslation)
            tmp = qual5[::-1]
            qual5 = qual3[::-1]
            qual3 = tmp

        # copy the cigar
        cigar = list(read.cigar)

        # add 5' trim back on
        (op, bases) = cigar[0]
        if op == 4:
            cigar[0] = (op, bases + len(seq5))
        else:
            cigar.insert(0, (4, len(seq5)))

        # add 3' trim back on
        (op, bases) = cigar[-1]
        if op == 4:
            cigar[-1] = (op, bases + len(seq3))
        else:
            cigar.append((4, len(seq3)))

        # save cigar edits
        read.cigar = cigar

        # pysam requires saving qual values first
        qual = read.qual

        # fix up the seq
        read.query_sequence = seq5 + read.query_sequence + seq3

        # fix up the quality
        read.qual = qual5 + qual + qual3

        # add flow quality tags (limit the split to 3 fields in case the tag value itself contains ':')
        for tag in vals[5:]:
            (tagName, tagType, tagVal) = tag.split(":", 2)
            if tagType == "Z":
                pass
            elif tagType == "i":
                tagVal = int(tagVal)
            elif tagType == "B":
                if tagVal.startswith("f,"):
                    tagVal = array.array("f", [float(x) for x in tagVal[2:].split(",")])
                elif tagVal.startswith("i,"):
                    tagVal = array.array("i", [int(x) for x in tagVal[2:].split(",")])
                elif tagVal.startswith("s,"):
                    tagVal = array.array("h", [int(x) for x in tagVal[2:].split(",")])
                else:
                    raise Exception("unsupported BAM array tag subtype: " + tagVal[0:2])
            else:
                raise Exception("unsupported BAM tag type: " + tagType)
            read.set_tag(tagName, tagVal)

        # output modified read
        bamOut.write(read)

    # done
    fileIn.close()
    bamIn.close()
    bamOut.close()
    os.remove(readSet + ".tvc.temp.bam")

    # sort final TVC input bam
    cmd = samtoolsDir + "samtools sort -m " + samtoolsMem + " -@" + numCpus \
        + " -T " + readSet \
        + " -o " + readSet + ".tvc.sorted.bam " \
        + readSet + ".tvc.bam " \
        + " > " + readSet + ".tvc.sort.log 2>&1 "
    subprocess.check_call(cmd, shell=True)

    # index final TVC input bam
    cmd = samtoolsDir + "samtools index " + readSet + ".tvc.sorted.bam"
    subprocess.check_call(cmd, shell=True)

    # run TVC
    roiBedFile = cfg.roiBedFile
    torrentBinDir = cfg.torrentBinDir
    torrentGenomeFile = cfg.genomeFile
    torrentVcfFile = readSet + ".tvc.vcf"
    cmd = os.path.join(torrentBinDir, "tvc") + " --output-dir _TVC_ " \
        + " -n " + numCpus \
        + " -b " + readSet + ".tvc.sorted.bam" \
        + " -t " + roiBedFile \
        + " -r " + torrentGenomeFile \
        + " -o " + torrentVcfFile \
        + " --snp-min-allele-freq 0.005" \
        + " --snp-min-cov-each-strand 0 " \
        + " --snp-min-coverage 3" \
        + " --snp-min-var-coverage 2" \
        + " --snp-min-variant-score 6" \
        + " --snp-strand-bias 1" \
        + " --snp-strand-bias-pval 0" \
        + " --mnp-min-allele-freq 0.005" \
        + " --mnp-min-cov-each-strand 0" \
        + " --mnp-min-coverage 3" \
        + " --mnp-min-var-coverage 2" \
        + " --mnp-min-variant-score 6" \
        + " --mnp-strand-bias 1" \
        + " --mnp-strand-bias-pval 0" \
        + " --indel-min-allele-freq 0.05" \
        + " --indel-min-cov-each-strand 0" \
        + " --indel-min-coverage 3" \
        + " --indel-min-var-coverage 2" \
        + " --indel-min-variant-score 10" \
        + " --indel-strand-bias 1" \
        + " --indel-strand-bias-pval 0" \
        + " > " + readSet + ".tvc.log 2>&1"
    print("tvc: command line is " + cmd)
    subprocess.check_call(cmd, shell=True)
    print("tvc: done running TVC")

    # move TVC VCF to current directory
    os.rename("_TVC_/" + torrentVcfFile, torrentVcfFile)

    # call up vcflib commands to split multi-allelic variants, keep needed genotype tags, and get primitives
    cmd = "{0}vcfbreakmulti {1}.tvc.vcf | " \
        + "{0}vcfkeepgeno - DP AF AD VF | " \
        + "{0}vcfallelicprimitives --tag-parsed AP > {1}.tvc.primitives.vcf 2> {1}.tvc.vcflib.log"
    cmd = cmd.format(vcflibDir, readSet)
    subprocess.check_call(cmd, shell=True)

    # (1) drop TVC primitive variants that have allele fraction below 0.05
    # (2) make BED file for smCounter - regions +/- 10 bp from a TVC primitive variant
    bedTvc = []
    fileout = open(readSet + ".tvc.primitives.temp.vcf", "w")
    for line in open(readSet + ".tvc.primitives.vcf", "r"):
        # echo VCF header
        if line.startswith("#"):
            fileout.write(line)
            continue

        # parse line
        chrom, pos, id, ref, alt, qual, filter, info, format, sampleId = line.strip().split("\t")

        # make sure data is as expected
        if alt.find(",") >= 0:
            raise Exception("tvc: not expecting multi-allelic variant in primitives file")

        # get left location, zero-based
        locL = int(pos) - 1

        # look for indel, include right flanking base
        altLen = len(alt)
        refLen = len(ref)
        if altLen == refLen:       # SNP or MNP
            locR = locL + refLen
            isIndel = False
        else:                      # INDEL
            locR = locL + refLen + 1
            isIndel = True

        # get AF tag - assumed to be always present
        alleleFraction = None
        for tag in info.split(";"):
            if tag.find("=") > 0:
                tagName, tagVal = tag.split("=")
                if tagName == "AF":
                    if tagVal.find(",") >= 0:
                        raise Exception("tvc: not expecting TVC primitives to be multi-allelic")
                    alleleFraction = float(tagVal)
        if alleleFraction is None:
            raise Exception("tvc: AF tag missing from TVC primitives record")

        # drop TVC primitive variants with low allele fraction
        if (isIndel and alleleFraction < 0.05) or alleleFraction < 0.005:
            continue

        # echo line to new TVC VCF primitives file
        fileout.write(line)

        # save region, with 10 bp flanking
        locL = max(0, locL - 10)
        locR += 10
        if chrom == "chrM" and locR > 16569:  # horrific hack for chrM NC_012920 reference
            locR = 16569
        if locL < locR:
            bedTvc.append((chrom, locL, locR))

    # close filtered TVC VCF primitives file, rename for later use
    fileout.close()
    os.rename(readSet + ".tvc.primitives.temp.vcf", readSet + ".tvc.primitives.vcf")

    # merge BED and write to disk
    bedTvc = bed.merge(bedTvc)
    bed.write(bedTvc, readSet + ".tvc_roi.bed")
    print("tvc: done running TVC and making smCounter ROI bed")
def geneCov(gene, genePrimers, fragLen):
    ''' get gene coverage using exon models and a max fragment length
        based on step01.py by John Dicarlo '''
    # init bed coverage
    bedCovOneGene = []
    bedTrackSet = set()
    bedWarnings = []

    # loop over RNAs
    for rnaId in gene:
        rnaLen = 0
        bedExons = []

        # get exons
        firstExon = True
        for (geneName, strand, chrom, exonStart, exonEnd) in gene[rnaId]:
            exonStart = int(exonStart)
            exonEnd = int(exonEnd)
            if firstExon:
                geneLocL = exonStart
                geneLocR = exonEnd
                firstExon = False
            else:
                geneLocL = min(exonStart, geneLocL)
                geneLocR = max(exonEnd, geneLocR)
            rnaLen += exonEnd - exonStart
            bedExons.append((chrom, exonStart, exonEnd))
        bedExons.sort()

        # init coverage for this RNA
        bedCovOneRna = []

        # loop over primers, make RNA coverage BED tracks
        for (chrom, locDna5, locDna3, strand, primer) in genePrimers:
            locDna5 = int(locDna5)
            locDna3 = int(locDna3)
            strand = int(strand)

            # get primer 3' end position on the RNA
            exonsLen = 0
            locRna3 = None
            for (chrom, locL, locR) in bedExons:
                if locL <= locDna3 < locR:
                    locRna3 = exonsLen + locDna3 - locL
                    break
                exonsLen += (locR - locL)

            # check if primer matches this RNA
            if locRna3 is None:
                continue

            # get primer 5' end position on the RNA
            primerLen = len(primer)
            if strand == 0:
                locRna5 = locRna3 - primerLen + 1
            else:
                locRna5 = locRna3 + primerLen - 1

            # get DNA position of end of fragment
            if strand == 0:
                locRnaEnd = min(locRna5 + fragLen - 1, rnaLen - 1)
            else:
                locRnaEnd = max(locRna5 - fragLen + 1, 0)
            locL_ = 0
            locR_ = 0
            locDnaEnd = None
            for (chrom, locL, locR) in bedExons:
                locR_ += (locR - locL)
                if locL_ <= locRnaEnd < locR_:
                    locDnaEnd = locL + locRnaEnd - locL_
                    break
                locL_ = locR_
            if locDnaEnd is None:
                raise Exception("could not map RNA fragment end back to DNA")

            # frag coverage region
            if strand == 0:
                bedDelete = [(chrom, geneLocL, locDna3 + 1), (chrom, locDnaEnd + 1, geneLocR)]
            else:
                bedDelete = [(chrom, geneLocL, locDnaEnd), (chrom, locDna3, geneLocR)]
            bedCov = bed.subtract(bedExons, bedDelete)

            # save coverage across whole RNA
            bedCovOneRna.extend(bedCov)

            # make subtraction bed for full frag, including primer
            if strand == 0:
                bedDelete = [(chrom, geneLocL, locDna5), (chrom, locDnaEnd + 1, geneLocR)]
            else:
                bedDelete = [(chrom, geneLocL, locDnaEnd), (chrom, locDna5 + 1, geneLocR)]

            # do bed subtraction to get enrichment frag
            bedFrag = bed.subtract(bedExons, bedDelete)
            bedFrag = bed.merge(bedFrag)  # should not do anything

            # get size of enrichment frag (might be less than fragLen at ends of RNA)
            bpFrag = sum((x[2] - x[1] for x in bedFrag))

            # convert bedFrag to a one-row BED12 record
            bedLocL = bedFrag[0][1]
            bedLocR = bedFrag[-1][2]
            if strand == 0:
                bedStrand = "+"
                bedThickStart = locDna3 + 1
                bedThickStop = bedLocR
            else:
                bedStrand = "-"
                bedThickStart = bedLocL
                bedThickStop = locDna3
            if bedThickStart >= bedThickStop:
                bedWarnings.append((chrom, locDna5, locDna3, strand, primer, geneName, rnaId, bedThickStart, bedThickStop))
            numBlocks = len(bedFrag)
            blockSizes = ",".join([str(x[2] - x[1]) for x in bedFrag])
            blockStarts = ",".join([str(x[1] - bedLocL) for x in bedFrag])
            bedScore = 0
            bedOne = (chrom, bedLocL, bedLocR, geneName, bedScore, bedStrand, bedThickStart, bedThickStop, 0, numBlocks, blockSizes, blockStarts)
            #bedOne = (chrom, bedLocL, bedLocR, bpFrag, bedScore, bedStrand, bedThickStart, bedThickStop, 0, numBlocks, blockSizes, blockStarts)
            bedTrackSet.add(bedOne)

        # update BED for all RNAs coverage
        bedCovOneGene.extend(bedCovOneRna)

    # post processing
    bedCovOneGene.sort()
    bedTrackSet = list(bedTrackSet)
    bedTrackSet.sort()
    return (bedCovOneGene, bedTrackSet, bedWarnings)
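# --- usage sketch for geneCov (illustrative only) -----------------------------
# geneCov() expects, per the unpacking above:
#   gene        : dict rnaId -> list of (geneName, strand, chrom, exonStart, exonEnd) exon rows
#   genePrimers : list of (chrom, locDna5, locDna3, strand, primer) rows, strand 0 = "+", 1 = "-"
#   fragLen     : maximum fragment length in bases
# The transcript, coordinates and primer below are made-up placeholders, and the
# call assumes the local "bed" module (merge/subtract) used above is importable.
def _geneCov_example():
    # one hypothetical transcript with two exons on chr1
    gene = {
        "NM_000000.1": [
            ("GENE1", "+", "chr1", 1000, 1200),
            ("GENE1", "+", "chr1", 1500, 1800),
        ]
    }
    # one forward-strand primer (20 bp) whose 3' end lands inside exon 1
    genePrimers = [("chr1", 1081, 1100, 0, "ACGTACGTACGTACGTACGT")]
    bedCov, bedTracks, warnings = geneCov(gene, genePrimers, fragLen=150)
    print("coverage intervals: {}".format(bedCov))
    print("BED12 tracks:       {}".format(bedTracks))
    print("warnings:           {}".format(warnings))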