from pysam import Samfile


# Script-level snippet: `ParseSNPsitesLine` is a project helper that parses one
# SNP-site line (see the hedged sketch after this function).
def callbase(bamfile, snpsites, out):
    BF = Samfile(bamfile, 'rb')  # open the BAM file
    SF = open(snpsites, 'r')  # file containing the SNP site info
    RF = open(out, 'w')  # result file
    RF.write('ref_name\tpos\tRbase\tAbase\tA\tT\tC\tG\tN\tothers\n')
    for i in SF:
        if i.startswith('#'):
            continue
        line = ParseSNPsitesLine(i)
        vcf_pos = line.pos - 1  # convert 1-based to 0-based
        vcf_refname = line.chrom
        print('processing: %s %s...' % (vcf_refname, str(vcf_pos)))
        At, Tt, Ct, Gt, Nt, othert = 0, 0, 0, 0, 0, 0
        for col in BF.pileup(vcf_refname, vcf_pos, vcf_pos + 1):
            if col.pos != vcf_pos:
                continue
            vcf_Rbase = line.Rbase
            vcf_Abase = line.Abase
            for j in col.pileups:
                if j.query_position is None:
                    # deletion or reference skip: the read has no base here
                    othert += 1
                    continue
                yourbase = j.alignment.query_sequence[j.query_position]
                if yourbase == 'A':
                    At += 1
                elif yourbase == 'T':
                    Tt += 1
                elif yourbase == 'C':
                    Ct += 1
                elif yourbase == 'G':
                    Gt += 1
                elif yourbase == 'N':
                    Nt += 1
                else:
                    othert += 1
            RF.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (
                vcf_refname, str(vcf_pos + 1), vcf_Rbase, vcf_Abase,
                str(At), str(Tt), str(Ct), str(Gt), str(Nt), str(othert)))
    SF.close()
    RF.close()
    BF.close()
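# `ParseSNPsitesLine` is not shown in the snippet above. A minimal sketch of
# what it plausibly does, assuming VCF-like columns (CHROM, POS, ID, REF, ALT)
# with a 1-based POS; the real parser may differ.
from collections import namedtuple

SNPSite = namedtuple('SNPSite', ['chrom', 'pos', 'Rbase', 'Abase'])


def ParseSNPsitesLine(raw_line):
    fields = raw_line.rstrip('\n').split('\t')
    # VCF column order: CHROM, POS, ID, REF, ALT
    return SNPSite(chrom=fields[0], pos=int(fields[1]),
                   Rbase=fields[3], Abase=fields[4])


# Example call (hypothetical file names): count per-base support at every
# listed SNP site and write a TSV of A/T/C/G/N/other counts.
# callbase('sample.bam', 'snp_sites.vcf', 'base_counts.tsv')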
# Method snippet: pysam's Samfile and the project-level Helper class are
# assumed to be imported by the enclosing module.
def removeEdgeMismatches(self, bamFile, minDistance, minBaseQual):
    startTime = Helper.getTime()
    minDistance = int(minDistance)
    counter = 0
    j = 0
    num_lines = len(self.variantDict)
    Helper.info(" [%s] remove mismatches within the first %s bp of read edges"
                % (startTime.strftime("%c"), str(minDistance)),
                self.logFile, self.textField)

    bamFile = Samfile(bamFile, "rb")

    # iterate over a copy of the keys because variants are deleted below
    for varKey in list(self.variantDict.keys()):
        variant = self.variantDict[varKey]
        counter += 1
        if counter % 10000 == 0:
            Helper.status('%s mm parsed' % counter, self.logFile,
                          self.textField, "grey")

        keepSNP = False
        varPos = variant.position - 1
        # walk over the pileup columns which overlap this position
        pileupIter = bamFile.pileup(variant.chromosome, variant.position - 1,
                                    variant.position)
        for x in pileupIter:
            if x.pos != varPos:
                continue
            for pileupread in x.pileups:  # walk through the single reads
                if pileupread.is_del or pileupread.is_refskip:
                    continue
                # distance of the variant position from the nearest read end
                if pileupread.alignment.is_reverse:
                    distance = abs(pileupread.alignment.alen -
                                   pileupread.query_position)
                else:
                    distance = pileupread.query_position
                if distance >= minDistance:
                    # check read base and base quality
                    readBase = pileupread.alignment.query_sequence[
                        pileupread.query_position]
                    baseQual = pileupread.alignment.query_qualities[
                        pileupread.query_position]
                    if readBase == variant.alt and baseQual >= minBaseQual:
                        keepSNP = True
        if not keepSNP:
            j += 1
            del self.variantDict[varKey]

    Helper.status('%s of %s variants were deleted' % (j, num_lines),
                  self.logFile, self.textField, "black")
    Helper.printTimeDiff(startTime, self.logFile, self.textField)
    bamFile.close()
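# Hypothetical usage sketch: `variantSet` stands for whatever object this
# method is bound to (it must provide variantDict, logFile and textField);
# the BAM path and thresholds are made up. Variants supported only by read
# bases within 6 bp of a read end, or below base quality 25, are removed.
# variantSet.removeEdgeMismatches("sample.bam", minDistance=6, minBaseQual=25)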
# Method snippet: os, sys, pysam's Samfile and the project-level Helper class
# are assumed to be imported by the enclosing module.
def blatSearch(self, variants, outFile, minBaseQual, minMissmatch):
    startTime = Helper.getTime()
    Helper.info(" [%s] Search non uniquely mapped reads"
                % (startTime.strftime("%c")),
                self.rnaEdit.logFile, self.rnaEdit.textField)
    bamFile = Samfile(self.bamFile, "rb")

    # create a FASTA file of the variant-overlapping reads for blat to remap;
    # it is only (re)created if it does not exist or is empty
    tempFasta = outFile + "_tmp.fa"
    if not os.path.isfile(tempFasta) or not os.path.getsize(tempFasta) > 0:
        tempFastaFile = open(tempFasta, "w+")
        mmNumberTotal = len(variants.variantDict)

        #############################################
        #########     CREATE FASTA FILE      #######
        #############################################
        Helper.info(" [%s] Create fasta file for blat "
                    % (startTime.strftime("%c")),
                    self.rnaEdit.logFile, self.rnaEdit.textField)
        counter = 1
        if len(variants.variantDict.keys()) == 0:
            Helper.error("No Variants left", self.rnaEdit.logFile,
                         self.rnaEdit.textField)
        for varKey in variants.variantDict.keys():
            variant = variants.variantDict[varKey]
            varPos = variant.position - 1
            pileupIter = bamFile.pileup(variant.chromosome,
                                        variant.position - 1,
                                        variant.position)
            alignments = []
            for x in pileupIter:
                if x.pos != varPos:
                    continue
                # loop over the reads of that position
                for pileupread in x.pileups:
                    if pileupread.is_del or pileupread.is_refskip:
                        continue
                    readBase = pileupread.alignment.query_sequence[
                        pileupread.query_position]
                    baseQual = pileupread.alignment.query_qualities[
                        pileupread.query_position]
                    if readBase == variant.alt and baseQual >= minBaseQual:
                        alignments.append(pileupread.alignment.query_sequence)
            if len(alignments) >= minMissmatch:
                missmatchReadCount = 0
                for sequence in alignments:
                    tempFastaFile.write(
                        "> " + variant.chromosome + "-" +
                        str(variant.position) + "-" + variant.ref + "-" +
                        variant.alt + "-" + str(missmatchReadCount) +
                        "\n" + sequence + "\n")
                    missmatchReadCount += 1
            counter += 1
            if counter % 1000 == 0:
                sys.stdout.write("\r" + str(counter) + " of " +
                                 str(mmNumberTotal) + " variants done")
                Helper.info(str(counter) + " of " + str(mmNumberTotal) +
                            " variants done",
                            self.rnaEdit.logFile, self.rnaEdit.textField)
                sys.stdout.flush()
        Helper.info("\n created fasta file " + tempFasta,
                    self.rnaEdit.logFile, self.rnaEdit.textField)
        Helper.printTimeDiff(startTime, self.rnaEdit.logFile,
                             self.rnaEdit.textField)
        tempFastaFile.close()

    #############################
    #####   do blat search  #####
    #############################
    pslFile = outFile + ".psl"
    if not os.path.isfile(pslFile) or not os.path.getsize(pslFile) > 0:
        cmd = [self.rnaEdit.params.sourceDir + "blat",
               "-stepSize=5", "-repMatch=2253", "-minScore=20",
               "-minIdentity=0", "-noHead",
               self.rnaEdit.params.refGenome, tempFasta, pslFile]
        Helper.proceedCommand("do blat search for unique reads", cmd,
                              tempFasta, "None", self.rnaEdit)

    Helper.info(" [%s] Blat finished" % (startTime.strftime("%c")),
                self.rnaEdit.logFile, self.rnaEdit.textField)
    Helper.info(" [%s] Parse blat output to look for non uniquely mapped reads"
                % (startTime.strftime("%c")),
                self.rnaEdit.logFile, self.rnaEdit.textField)

    if not os.path.isfile(outFile):
        # open the psl file and summarize the blat hits per variant
        pslFile = open(pslFile, "r")
        blatDict = {}
        for line in pslFile:
            pslFields = line.split()
            chr, pos, ref, alt, mmReadCount = pslFields[9].split("-")
            varTuple = (chr, int(pos), ref, alt)
            try:
                # number of matches, target name, block count, block sizes,
                # target starts
                blatScore = [pslFields[0], pslFields[13], pslFields[17],
                             pslFields[18], pslFields[20]]
            except IndexError:
                Helper.warning("Not enough values in '%s' (skip)" % line,
                               self.rnaEdit.logFile, self.rnaEdit.textField)
                continue
            if varTuple in blatDict:
                blatDict[varTuple] = blatDict[varTuple] + [blatScore]
            else:
                blatDict[varTuple] = [blatScore]

        siteDict = {}
        discardDict = {}
        Helper.info(" [%s] Analyse blat hits (slow)"
                    % (startTime.strftime("%c")),
                    self.rnaEdit.logFile, self.rnaEdit.textField)

        # loop over all blat hits of the mismatch reads to observe the
        # number of alignments per read
        for varTuple in blatDict.keys():
            keepSNP = False
            chr, pos, ref, alt = varTuple
            pslLine = blatDict[varTuple]
            largestScore = 0
            largestScoreLine = pslLine[0]
            scoreArray = []
            # look for the largest blat score and remember that hit
            for blatHit in pslLine:
                lineScore = int(blatHit[0])
                scoreArray.append(lineScore)
                if lineScore > largestScore:
                    largestScore = lineScore
                    largestScoreLine = blatHit
            scoreArray.sort(reverse=True)
            if len(scoreArray) < 2:
                # only one blat hit exists; pad with a zero score
                scoreArray.append(0)
            # keep the read if the best hit is on the same chromosome and the
            # second-best hit scores below 95 percent of the best hit
            if chr == largestScoreLine[1] and scoreArray[1] < scoreArray[0] * 0.95:
                blockCount = int(largestScoreLine[2])
                blockSizes = largestScoreLine[3].split(",")[:-1]
                blockStarts = largestScoreLine[4].split(",")[:-1]
                for i in range(blockCount):
                    startPos = int(blockStarts[i]) + 1
                    endPos = startPos + int(blockSizes[i])
                    # check whether the alignment block overlaps the mismatch
                    if pos >= startPos and pos < endPos:
                        keepSNP = True
            if keepSNP:
                siteDict[varTuple] = siteDict.get(varTuple, 0) + 1
            else:
                # the read does not pass the blat criteria
                discardDict[varTuple] = discardDict.get(varTuple, 0) + 1
        pslFile.close()

        ######################################################################
        #####   loop through variants and delete invalid variants       #####
        ######################################################################
        Helper.info(" [%s] Deleting invalid variants"
                    % (startTime.strftime("%c")),
                    self.rnaEdit.logFile, self.rnaEdit.textField)
        mmNumberTotal = 0
        mmNumberTooSmall = 0
        mmReadsSmallerDiscardReads = 0
        # iterate over a copy of the keys because variants are deleted below
        for key in list(variants.variantDict.keys()):
            numberBlatReads = 0
            numberDiscardReads = 0
            if key in siteDict:
                numberBlatReads = siteDict[key]
            if key in discardDict:
                numberDiscardReads = discardDict[key]
            if numberBlatReads <= minMissmatch and numberBlatReads <= numberDiscardReads:
                del variants.variantDict[key]
            # count statistics
            if numberBlatReads < minMissmatch:
                mmNumberTooSmall += 1
            elif numberBlatReads < numberDiscardReads:
                # more reads failed the blat criteria than passed it
                mmReadsSmallerDiscardReads += 1
            mmNumberTotal += 1

        if not self.rnaEdit.params.keepTemp:
            os.remove(tempFasta)
            os.remove(pslFile.name)

        # output statistics
        mmPassedNumber = mmNumberTotal - (mmNumberTooSmall +
                                          mmReadsSmallerDiscardReads)
        Helper.info("\t\t %d out of %d passed blat criteria"
                    % (mmPassedNumber, mmNumberTotal),
                    self.rnaEdit.logFile, self.rnaEdit.textField)
        Helper.info("\t\t %d mismatches had fewer than %d mismatching reads."
                    % (mmNumberTooSmall, minMissmatch),
                    self.rnaEdit.logFile, self.rnaEdit.textField)
        Helper.info("\t\t %d mismatches had more misaligned reads than correct ones."
                    % mmReadsSmallerDiscardReads,
                    self.rnaEdit.logFile, self.rnaEdit.textField)
        Helper.printTimeDiff(startTime, self.rnaEdit.logFile,
                             self.rnaEdit.textField)
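# Hypothetical usage sketch: `readFilter` stands for whatever object this
# method is bound to (it must provide bamFile and rnaEdit); the output prefix
# and thresholds are made up. Reads supporting each variant are remapped with
# blat, and variants whose supporting reads align better elsewhere are removed
# from variants.variantDict.
# readFilter.blatSearch(variants, "sample_blat", minBaseQual=25, minMissmatch=2)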
import logging
from collections import Counter, namedtuple
from itertools import groupby

from pysam import Samfile

# `parse_region` and `SamInfo` are project-level helpers assumed to be
# importable from the enclosing package.


def bam_depth_with_nm(args):
    """
    * unmapped reads are discarded
    * reads clipped on both ends are discarded
    * reads clipped at one end are included
    * multimapped reads are included
    * counts are stratified by NM (edit distance to the reference)

    default mode: pos is 1-based
    summary mode: per-contig covered-position counts
    """
    sam = Samfile(args.bam)
    if args.region:
        c, s, e = parse_region(args.region)
        it = sam.pileup(reference=c, start=s, end=e,
                        max_depth=args.max_depth)
    else:
        it = sam.pileup(max_depth=args.max_depth)
    sam_info = SamInfo(sam)

    def cond(prec):
        rec = prec.alignment
        if rec.is_unmapped:
            return False
        read = sam_info.get_read_info(rec)
        if read.overhang > 0:
            return False
        return True

    max_key = 'NM_more'
    nm_keys = ['NM' + str(nm) for nm in range(args.max_nm + 1)] + [max_key]

    def get_key(prec):
        rec = prec.alignment
        nm = rec.get_tag('NM')
        # `<=` so that nm == max_nm lands in its own column; with the original
        # `<` the NM{max_nm} column was always zero
        if nm <= args.max_nm:
            return 'NM' + str(nm)
        return max_key

    header = ['contig', 'pos'] + nm_keys

    def iter_table(it):
        Record = namedtuple('Record', header)
        for pcol in it:
            ps = filter(cond, pcol.pileups)
            counts = Counter(map(get_key, ps))
            yield Record(pcol.reference_name, pcol.pos + 1,
                         *(counts[k] for k in nm_keys))

    summary_header = ['contig', 'length', 'covered'] + nm_keys

    def iter_summary(it):
        """
        NMx is the number of covered positions whose minimum read edit
        distance to the reference is x.
        """
        Record = namedtuple('Record', summary_header)

        def get_min_nm(row):
            for k in nm_keys:
                if getattr(row, k) > 0:
                    return k

        it1 = iter_table(it)
        for contig, rows in groupby(it1, lambda row: row.contig):
            length = sam_info.get_length(contig)
            counts = Counter([get_min_nm(row) for row in rows])
            nm_counts = [counts[k] for k in nm_keys]
            covered = sum(nm_counts)
            yield Record(contig, length, covered, *nm_counts)

    read_count_header = ['contig', 'length', 'total'] + nm_keys

    def iter_read_counts(it):
        """
        NMx is the number of reads whose edit distance to the reference
        is x, summed over covered positions.
        """
        Record = namedtuple('Record', read_count_header)
        it1 = iter_table(it)
        for contig, rows in groupby(it1, lambda row: row.contig):
            length = sam_info.get_length(contig)
            rows = list(rows)
            counts = {}
            for k in nm_keys:
                counts[k] = sum(getattr(row, k) for row in rows)
            nm_counts = [counts[k] for k in nm_keys]
            total = sum(nm_counts)
            yield Record(contig, length, total, *nm_counts)

    if args.summary:
        logging.info('Emit coverage summary')
        print(*summary_header, sep='\t')
        for row in iter_summary(it):
            print(*row, sep='\t')
    elif args.read_count:
        logging.info('Emit read counts')
        print(*read_count_header, sep='\t')
        for row in iter_read_counts(it):
            print(*row, sep='\t')
    else:
        print(*header, sep='\t')  # header
        for row in iter_table(it):
            print(*row, sep='\t')
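# Hypothetical command-line driver for bam_depth_with_nm(), assuming the
# argparse attributes implied by the function body (bam, region, max_depth,
# max_nm, summary, read_count); the original tool's CLI wiring may differ.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('bam')
    parser.add_argument('--region', help='e.g. chr1:1000-2000')
    parser.add_argument('--max-depth', type=int, default=8000)
    parser.add_argument('--max-nm', type=int, default=5)
    parser.add_argument('--summary', action='store_true')
    parser.add_argument('--read-count', action='store_true')
    bam_depth_with_nm(parser.parse_args())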
# Module snippet: itertools, logging, pyensembl, typechecks, pysam's Samfile,
# and the local Locus/Pileup/PileupCollection/PileupElement/to_locus helpers
# are assumed to be imported at module level.
def from_bam(pysam_samfile, loci, normalized_contig_names=True):
    '''
    Create a PileupCollection for a set of loci from a BAM file.

    Parameters
    ----------
    pysam_samfile : `pysam.Samfile` instance, or filename string to a BAM
        file. The BAM file must be indexed.

    loci : list of Locus instances
        Loci to collect pileups for.

    normalized_contig_names : whether the contig names have been normalized
        (e.g. pyensembl removes the 'chr' prefix). Set to true to
        de-normalize the names when querying the BAM file.

    Returns
    ----------
    PileupCollection instance containing pileups for the specified loci.
    All alignments in the BAM file are included (e.g. duplicate reads,
    secondary alignments, etc.). See `PileupCollection.filter` if these
    need to be removed.
    '''
    loci = [to_locus(obj) for obj in loci]

    close_on_completion = False
    if typechecks.is_string(pysam_samfile):
        pysam_samfile = Samfile(pysam_samfile)
        close_on_completion = True

    try:
        # Map from pyensembl normalized chromosome names used in Variant to
        # the names used in the BAM file.
        if normalized_contig_names:
            chromosome_name_map = {}
            for name in pysam_samfile.references:
                normalized = pyensembl.locus.normalize_chromosome(name)
                chromosome_name_map[normalized] = name
                chromosome_name_map[name] = name
        else:
            chromosome_name_map = None

        result = PileupCollection({})

        # Optimization: we sort variants so our BAM reads are localized.
        locus_iterator = itertools.chain.from_iterable(
            (Locus.from_interbase_coordinates(locus_interval.contig, pos)
             for pos in locus_interval.positions)
            for locus_interval in sorted(loci))
        for locus in locus_iterator:
            result.pileups[locus] = Pileup(locus, [])
            if normalized_contig_names:
                try:
                    chromosome = chromosome_name_map[locus.contig]
                except KeyError:
                    logging.warning("No such contig in bam: %s" % locus.contig)
                    continue
            else:
                chromosome = locus.contig
            columns = pysam_samfile.pileup(
                chromosome,
                locus.position,
                locus.position + 1,  # exclusive, 0-indexed
                truncate=True,
                stepper="nofilter")
            try:
                column = next(columns)
            except StopIteration:
                # No reads align to this locus.
                continue

            # Note that storing the pileups here is necessary, since the
            # subsequent assertion will invalidate our column.
            pileups = column.pileups
            assert list(columns) == []  # column is invalid after this.
            for pileup_read in pileups:
                if not pileup_read.is_refskip:
                    element = PileupElement.from_pysam_alignment(
                        locus, pileup_read)
                    result.pileups[locus].append(element)
        return result
    finally:
        if close_on_completion:
            pysam_samfile.close()
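# Hypothetical usage sketch for from_bam(): the BAM path and coordinates are
# made up, and the two-argument Locus.from_interbase_coordinates call mirrors
# the one inside from_bam itself (interbase, i.e. 0-based half-open).
loci = [Locus.from_interbase_coordinates('20', 9999995)]
collection = from_bam('sample.bam', loci)
for locus, pileup in collection.pileups.items():
    print(locus, pileup)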
import errno
import os

from pysam import Samfile

# `prealloc_func` is a module-level helper (see the hedged sketch after this
# function).


def BamFile(bam_path):
    """Return an enclosed function to read read depths from ``bam_path``.

    .. code-block:: python

        >>> from chanjo.depth_reader import BamFile
        >>> read_depths = BamFile('./alignment.bam')

    Args:
        bam_path (path): path to alignment BAM-file

    Returns:
        function: function to read from the BAM-file
    """
    # raise an error if the file doesn't exist
    if not os.path.exists(bam_path):
        raise OSError(errno.ENOENT, bam_path)

    bam = Samfile(bam_path)

    try:
        bam.pileup()
    except ValueError:
        # catch error when BAM-file isn't indexed (+ ".bai" file)
        raise OSError(
            errno.ENOENT,
            "BAM-file (%s) must be indexed." % os.path.basename(bam_path)
        )

    def reader(contig, start, end):
        """Generate a list of read depths for each position (start, end).

        The `numpy` array is used to optimize performance when building
        and slicing the list.

        This function depends on `Pysam` >=0.7.5 since the ``truncate``
        option wasn't available in previous versions.

        .. code-block:: python

            >>> read_depths = BamFile('./alignment.bam')
            >>> read_depths('17', 1, 5)
            array([3., 4., 4., 5., 4.])

        .. note::
            Positions are expected to be 1-based at both ends. In other
            words; if start=1, end=9 you should expect read depths for
            base pair positions 1-9 to be returned.

        Args:
            contig (str): contig/chromosome id (str) of interest
            start (int): first position of the interval (1-based)
            end (int): last position of the interval (1-based)

        Returns:
            list or numpy.array: array of read depths for *each* position
                in the interval
        """
        # convert start to 0-based since this is what pysam expects!
        pysam_start = start - 1

        # pysam expects contig as bytes in Python 2
        pysam_contig = str(contig)

        # check that we don't have a negative start position
        if pysam_start < 0:
            raise ValueError("Start position must be > 0, not %d" % start)

        # preallocate an array of 0 read depth for each position;
        # pysam excludes positions with 0 read depth
        read_depths = prealloc_func(end - pysam_start)

        try:
            # overwrite read-covered positions (>0 read depth)
            # ``truncate`` ensures it starts and ends on the given positions
            # note: ``col.pos`` is 0-based, as is ``pysam_start``
            for col in bam.pileup(pysam_contig, pysam_start, end,
                                  truncate=True):
                read_depths[col.pos - pysam_start] = col.n
        except ValueError as ve:
            # catch errors where the contig doesn't exist in the BAM-file
            raise ValueError(
                "Must use contig that exists in the BAM-file. Error: %s" % ve)

        return read_depths

    return reader
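# `prealloc_func` is referenced but not shown in this snippet. A minimal
# sketch of what it plausibly is (a zero-filled numpy preallocation, matching
# the docstring's numpy output); the real chanjo implementation may differ.
import numpy as np


def prealloc_func(size):
    # one zeroed read-depth slot per position in the requested interval
    return np.zeros(size)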