import pysam

files = ["m54119_180806_194558.subreads.bam",
         "m54119_180807_160930.subreads.bam",
         "m54119_180808_103633.subreads.bam"]

total_size = 0
for fn in files:
    bam = pysam.AlignmentFile(fn, 'rb', check_header=False, check_sq=False)
    for index, read in enumerate(bam):
        total_size += len(read.seq)
        if index % 1000000 == 0:
            print(total_size)
            print(index)
            print(fn)
def sampleNameBam(bamFile):
    """get @RG SM: information as sample name from BAM header"""
    bam = pysam.AlignmentFile(bamFile)
    name = bam.header['RG'][0]['SM']
    return name
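# A minimal usage sketch for sampleNameBam; "sample.bam" is a placeholder
# for any BAM whose header carries an @RG line with an SM field.
import pysam

print(sampleNameBam("sample.bam"))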
import pysam

infile = pysam.AlignmentFile("-", "rb")
outfile = pysam.AlignmentFile("-", "w", template=infile)
for s in infile:
    outfile.write(s)
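# The same template-copy pattern also works with named files instead of
# stdin/stdout; a minimal sketch where "in.bam" and "out.sam" are placeholders.
import pysam

with pysam.AlignmentFile("in.bam", "rb") as inf, \
        pysam.AlignmentFile("out.sam", "w", template=inf) as outf:
    for s in inf:
        outf.write(s)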
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--template-bam-file", dest="filename_genome_bam",
                      type="string",
                      help="input bam file for header information [%default]")

    parser.add_option("-s", "--contigs-tsv-file", dest="filename_contigs",
                      type="string",
                      help="filename with contig sizes [%default]")

    parser.add_option("-o", "--colour", dest="colour_mismatches",
                      action="store_true",
                      help="mismatches will use colour differences (CM tag) [%default]")

    parser.add_option("-i", "--ignore-mismatches", dest="ignore_mismatches",
                      action="store_true",
                      help="ignore mismatches [%default]")

    parser.add_option("-c", "--remove-contigs", dest="remove_contigs",
                      type="string",
                      help="','-separated list of contigs to remove [%default]")

    parser.add_option("-f", "--force-output", dest="force",
                      action="store_true",
                      help="force overwriting of existing files [%default]")

    parser.add_option("-u", "--unique", dest="unique", action="store_true",
                      help="remove reads not matching uniquely [%default]")

    parser.set_defaults(
        filename_genome_bam=None,
        filename_gtf=None,
        filename_mismapped=None,
        remove_contigs=None,
        force=False,
        unique=False,
        colour_mismatches=False,
        ignore_mismatches=False,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    genomefile, referencenames, referencelengths = None, None, None

    if options.filename_genome_bam:
        genomefile = pysam.AlignmentFile(options.filename_genome_bam, "rb")
    elif options.filename_contigs:
        contigs = IOTools.ReadMap(IOTools.openFile(options.filename_contigs))
        data = list(zip(*list(contigs.items())))
        referencenames, referencelengths = data[0], list(map(int, data[1]))
    else:
        raise ValueError(
            "please provide either --template-bam-file or --contigs-tsv-file")

    infile = pysam.AlignmentFile("-", "rb")
    outfile = pysam.AlignmentFile("-", "wb",
                                  template=genomefile,
                                  referencenames=referencenames,
                                  referencelengths=referencelengths)

    if options.colour_mismatches:
        tag = "CM"
    else:
        tag = "NM"

    nambiguous = 0
    ninput = 0
    nunmapped = 0
    ncigar = 0
    nfull = 0
    noutput = 0

    contig2tid = dict([(y, x) for x, y in enumerate(outfile.references)])

    for qname, readgroup in itertools.groupby(infile, lambda x: x.qname):
        ninput += 1
        reads = list(readgroup)
        if reads[0].is_unmapped:
            nunmapped += 1
            continue

        # filter for best match
        best = min([x.opt(tag) for x in reads])
        reads = [x for x in reads if x.opt(tag) == best]
        if len(reads) > 1:
            nambiguous += 1
            continue

        read = reads[0]

        # reject complicated matches (indels, etc)
        # to simplify calculations below.
        if len(read.cigar) > 1:
            ncigar += 1
            continue

        # set NH flag to latest count
        t = dict(read.tags)
        t['NH'] = 1
        read.tags = list(t.items())

        sname = infile.getrname(read.tid)

        contig, first_exon_start, middle, last_exon_end, splice, strand = \
            sname.split("|")
        first_exon_end, last_exon_start = middle.split("-")
        first_exon_start, first_exon_end, last_exon_start, last_exon_end = \
            list(map(int, (first_exon_start, first_exon_end,
                           last_exon_start, last_exon_end)))
        first_exon_end += 1

        total = first_exon_end - first_exon_start + \
            last_exon_end - last_exon_start
        first_exon_length = first_exon_end - first_exon_start

        match1 = first_exon_length - read.pos
        intron_length = last_exon_start - first_exon_end
        match2 = read.qlen - match1

        # match lies fully in one exon - ignore
        if match1 <= 0 or match2 <= 0:
            nfull += 1
            continue

        # increment pos
        read.pos = first_exon_start + read.pos
        read.tid = contig2tid[contig]
        # 3 = BAM_CREF_SKIP
        read.cigar = [(0, match1), (3, intron_length), (0, match2)]
        outfile.write(read)
        noutput += 1

    outfile.close()
    if genomefile:
        genomefile.close()

    c = E.Counter()
    c.input = ninput
    c.output = noutput
    c.full = nfull
    c.cigar = ncigar
    c.ambiguous = nambiguous
    c.unmapped = nunmapped

    E.info("%s" % str(c))

    # write footer and output benchmark information.
    E.Stop()
def count_bam_file_length(bam_file: str) -> int:
    """Get length of BAM indexed file"""
    import pysam
    return pysam.AlignmentFile(bam_file).count()
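# Usage sketch for count_bam_file_length; "aligned.sorted.bam" is a
# placeholder. Note that .count() without a region iterates all mapped
# reads and therefore needs an index (.bai) next to the BAM.
n_reads = count_bam_file_length("aligned.sorted.bam")
print(n_reads)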
def makemut(args, chrom, start, end, vaf, ins, avoid, alignopts):
    '''if ins is a sequence, it will be inserted at start,
       otherwise delete from start to end'''

    if args.seed is not None:
        random.seed(int(args.seed) + int(start))

    mutid = chrom + '_' + str(start) + '_' + str(end) + '_' + str(vaf)
    if ins is None:
        mutid += ':DEL'
    else:
        mutid += ':INS:' + ins

    bamfile = pysam.AlignmentFile(args.bamFileName, 'rb')
    bammate = pysam.AlignmentFile(
        args.bamFileName, 'rb')  # use for mates to avoid iterator problems
    reffile = pysam.Fastafile(args.refFasta)
    vcffile = pysam.VariantFile(args.germline, 'r') if args.germline is not None else None
    tmpbams = []

    is_insertion = ins is not None
    is_deletion = ins is None

    snvfrac = float(args.snvfrac)

    mutstr = get_mutstr(chrom, start, end, ins, reffile)

    del_ln = 0
    if is_deletion:
        del_ln = end - start

    mutpos = start
    mutpos_list = [start]

    # optional CNV file
    cnv = None
    if (args.cnvfile):
        cnv = pysam.Tabixfile(args.cnvfile, 'r')

    log = open(
        'addindel_logs_' + os.path.basename(args.outBamFile) + '/' +
        os.path.basename(args.outBamFile) + "." + "_".join(
            (chrom, str(start), str(end))) + ".log", 'w')

    tmpoutbamname = args.tmpdir + "/" + mutid + ".tmpbam." + str(uuid4()) + ".bam"
    logger.info("%s creating tmp bam: %s" % (mutid, tmpoutbamname))
    outbam_muts = pysam.AlignmentFile(tmpoutbamname, 'wb', template=bamfile)

    mutfail, hasSNP, maxfrac, outreads, mutreads, mutmates = mutation.mutate(
        args, log, bamfile, bammate, chrom, mutpos, mutpos + del_ln + 1,
        mutpos_list, avoid=avoid, mutid_list=[mutid],
        is_insertion=is_insertion, is_deletion=is_deletion, ins_seq=ins,
        reffile=reffile, indel_start=start, indel_end=end, vcffile=vcffile)

    if mutfail:
        outbam_muts.close()
        os.remove(tmpoutbamname)
        return None

    # pick reads to change
    readlist = []
    for extqname, read in outreads.items():
        if read.seq != mutreads[extqname]:
            readlist.append(extqname)

    logger.info("%s len(readlist): %d" % (mutid, len(readlist)))
    readlist.sort()
    random.shuffle(readlist)

    if len(readlist) < int(args.mindepth):
        logger.warning("%s skipped, too few reads in region: %d" %
                       (mutid, len(readlist)))
        outbam_muts.close()
        os.remove(tmpoutbamname)
        return None

    if vaf is None:
        vaf = float(args.mutfrac)  # default minor allele freq if not otherwise specified

    if cnv:  # cnv file is present
        if chrom in cnv.contigs:
            for cnregion in cnv.fetch(chrom, start, end):
                cn = float(cnregion.strip().split()[3])  # expect chrom,start,end,CN
                logger.info(mutid + "\t" + ' '.join(
                    ("copy number in snp region:", chrom, str(start),
                     str(end), "=", str(cn))))
                if float(cn) > 0.0:
                    vaf = vaf / float(cn)
                else:
                    vaf = 0.0
                logger.info("%s adjusted VAF: %f" % (mutid, vaf))
    else:
        logger.info("%s selected VAF: %f" % (mutid, vaf))

    lastread = int(len(readlist) * vaf)

    # pick at least args.minmutreads if possible
    if lastread < int(args.minmutreads):
        if len(readlist) > int(args.minmutreads):
            lastread = int(args.minmutreads)
            logger.warning("%s forced %d reads" % (mutid, lastread))
        else:
            logger.warning("%s dropped site with fewer reads than --minmutreads" % mutid)
            os.remove(tmpoutbamname)
            return None

    readtrack = dd(list)

    for readname in readlist:
        orig_name, readpos, pairend = readname.split(',')
        readtrack[orig_name].append('%s,%s' % (readpos, pairend))

    usedreads = 0
    newreadlist = []

    for orig_name in readtrack:
        for read_instance in readtrack[orig_name]:
            newreadlist.append(orig_name + ',' + read_instance)
            usedreads += 1
        if usedreads >= lastread:
            break

    readlist = newreadlist

    logger.info("%s picked: %d reads" % (mutid, len(readlist)))

    wrote = 0
    nmut = 0
    mut_out = {}
    # change reads from .bam to mutated sequences
    for extqname, read in outreads.items():
        if read.seq != mutreads[extqname]:
            if not args.nomut and extqname in readlist:
                qual = read.qual  # changing seq resets qual (see pysam API docs)
                read.seq = mutreads[extqname]  # make mutation
                read.qual = qual
                nmut += 1
        if not hasSNP or args.force:
            wrote += 1
            mut_out[extqname] = read

    muts_written = {}

    for extqname in mut_out:
        if extqname not in muts_written:
            outbam_muts.write(mut_out[extqname])
            muts_written[extqname] = True

            if mutmates[extqname] is not None:
                # is mate also in mutated list?
                mate_read = mutmates[extqname]

                pairname = 'F'  # read is first in pair
                if mate_read.is_read2:
                    pairname = 'S'  # read is second in pair
                if not mate_read.is_paired:
                    pairname = 'U'  # read is unpaired

                mateqname = ','.join((mate_read.qname, str(mate_read.pos), pairname))

                if mateqname in mut_out:
                    # yes: output mutated mate
                    outbam_muts.write(mut_out[mateqname])
                    muts_written[mateqname] = True
                else:
                    # no: output original mate
                    outbam_muts.write(mate_read)

    logger.info("%s wrote: %d, mutated: %d" % (mutid, wrote, nmut))

    if not hasSNP or args.force:
        outbam_muts.close()
        aligners.remap_bam(args.aligner, tmpoutbamname, args.refFasta,
                           alignopts, threads=int(args.alignerthreads),
                           mutid=mutid, paired=(not args.single),
                           insane=args.insane)

        outbam_muts = pysam.AlignmentFile(tmpoutbamname, 'rb')
        coverwindow = 1
        incover = countReadCoverage(bamfile, chrom, mutpos - coverwindow,
                                    mutpos + del_ln + coverwindow)
        outcover = countReadCoverage(outbam_muts, chrom, mutpos - coverwindow,
                                     mutpos + del_ln + coverwindow)

        avgincover = float(sum(incover)) / float(len(incover))
        avgoutcover = float(sum(outcover)) / float(len(outcover))
        spikein_frac = 0.0
        if wrote > 0:
            spikein_frac = float(nmut) / float(wrote)

        # qc cutoff for final snv depth
        if (avgoutcover > 0 and avgincover > 0 and
                avgoutcover / avgincover >= float(args.coverdiff)) or args.force:
            tmpbams.append(tmpoutbamname)
            indelstr = ''
            if is_insertion:
                indelstr = ':'.join(('INS', chrom, str(start), ins))
            else:
                indelstr = ':'.join(('DEL', chrom, str(start), str(end)))

            snvstr = chrom + ":" + str(start) + "-" + str(end) + \
                " (VAF=" + str(vaf) + ")"
            log.write("\t".join(("indel", indelstr, str(mutpos), mutstr,
                                 str(avgincover), str(avgoutcover),
                                 str(spikein_frac), str(maxfrac))) + "\n")
        else:
            outbam_muts.close()
            os.remove(tmpoutbamname)
            if os.path.exists(tmpoutbamname + '.bai'):
                os.remove(tmpoutbamname + '.bai')

            logger.warning("%s dropped for outcover/incover < %s" %
                           (mutid, str(args.coverdiff)))
            return None

    outbam_muts.close()
    bamfile.close()
    bammate.close()
    log.close()

    return sorted(tmpbams)
def get_position_matrix(bam, chrom, start, stop, reffile, stepper='all'):
    """
    Given a coordinate range, return a dataframe containing positional coverage.

    :param bam: basestring
        BAM file name
    :param chrom: basestring
        chromosome (ie. chr1)
    :param start: int
        start position (start at this position)
    :param stop: int
        stop position (do not exceed this position)
    :param reffile: basestring
        reference fasta file
    :param stepper: stepper
    :return:
    """
    total_reads = 0
    reference = pybedtools.BedTool.seq([chrom, 0, stop], reffile)
    infile = pysam.AlignmentFile(bam, "rb", reference_filename=reffile)
    count = start  # running counter for each position added
    alphabet = {}
    positions = []
    offset = 0
    max_offset = 0
    MAX_DEPTH = 10000000
    check = start

    for pileupcolumn in infile.pileup(chrom, start, stop, stepper=stepper,
                                      max_depth=MAX_DEPTH):
        if pileupcolumn.pos >= start:
            st = ""
            # print("pileuppos: {}".format(pileupcolumn.pos))
            # print("count: {}".format(count))
            if count >= (stop) or pileupcolumn.pos >= stop:
                # I think this works because of sorted reads?
                break

            """ if there is no read coverage at the beginning positions """
            while (count < pileupcolumn.pos):
                alphabet['A'] = 0
                alphabet['T'] = 0
                alphabet['C'] = 0
                alphabet['G'] = 0
                alphabet['del'] = 0
                alphabet['ref'] = reference[pileupcolumn.reference_pos].upper()
                # print(alphabet)
                positions.append(alphabet)
                alphabet = {}
                # print("ADDING COUNT")
                count = count + 1
                # print('{}\t0'.format(count))

            # print str(pp.pos)+'\t'+str(pp.n)
            # print(len(pileupcolumn.pileups))
            for pileupread in pileupcolumn.pileups:  # for each pileup read
                total_reads = total_reads + 1
                if not pileupread.is_del and not pileupread.indel and \
                        not pileupread.is_refskip:
                    st = st + pileupread.alignment.query_sequence[
                        pileupread.query_position]
                elif pileupread.is_del:
                    st = st + 'd'
                elif pileupread.is_refskip:
                    st = st + 's'
                else:
                    st = st + '-'
            # print(st)
            # print("ADDING: {} at step: {}, at pos: {}".format(st, count, pileupcolumn.reference_pos))
            alphabet['A'] = st.count('A')
            alphabet['T'] = st.count('T')
            alphabet['C'] = st.count('C')
            alphabet['G'] = st.count('G')
            alphabet['del'] = st.count('d')
            alphabet['ref'] = reference[pileupcolumn.reference_pos].upper()
            count = count + 1
            # print(alphabet)
            positions.append(alphabet)
            alphabet = {}
            # print('{} '.format(count)),

    """ If there are positions in the end without read coverage """
    while count < stop:
        # count = count + 1
        alphabet['A'] = 0
        alphabet['T'] = 0
        alphabet['C'] = 0
        alphabet['G'] = 0
        alphabet['del'] = 0
        alphabet['ref'] = reference[count].upper()
        # print(alphabet)
        count = count + 1
        positions.append(alphabet)

    # print(start, stop, len(positions), max_offset, check-start)
    # print(total_reads)
    return pd.DataFrame(positions)
def __init__(
    self,
    sites,
    bam,
    logs_dir,
    # send_end,
    name,
    cells=None,
    log_every=25000,
    pad_left=15,
    pad_right=20,
    min_reads=2,
    min_freq=0.65,
    site_keys=None,
    add_seq=False,
    chr_tag="",
    **kwargs,
):
    super().__init__(name=name, **kwargs)

    self.sites = sites
    self.chr_tag = chr_tag
    self.bam = pysam.AlignmentFile(bam, "r")
    self.logs_dir = logs_dir
    self.log_every = int(log_every)
    # self.send_end = send_end
    self.pad_left = int(pad_left)
    self.pad_right = int(pad_right)
    self.min_reads = min_reads
    self.min_freq = min_freq
    self.add_seq = add_seq

    self.iter = 0
    self.n_corr = 0
    self.n_umi = 0
    self.rm_reads = []

    self.info = dict()
    self.info["umi"] = dict(
        cfreq=[], freq=[], count=[], nucl=[], qual=[], avg_qual=[],
        motif_len=[], variants=[], chrom=[], start=[], stop=[],
    )
    self.info["global"] = dict(
        n_umi=[], n_umi_corr=[], n_reads=[], n_reads_corr=[],
        f_min_reads=[], f_min_freq=[], nucl=[], motif_len=[],
        variants=[], variants_kept=[], chrom=[], start=[], stop=[],
    )
    self.sites_kept = set()

    if cells is not None:
        self.cells = set(
            map(lambda x: x.split()[0], open(cells, "r").read().splitlines())
        )
        self.has_cells = True
    else:
        self.cells = set()
        self.has_cells = False

    if site_keys is None:
        self.site_keys = OrderedDict(
            {v: i for i, v in enumerate(["chrom", "start", "stop"])}
        )
    else:
        self.site_keys = OrderedDict(site_keys)

    self.variant_f = open(self.pr_path.format("variants", "txt"), "w")
    self.variant_f.write(
        "chrom start cb a1 a2 n-umi-1 n-umi-2 n-reads-1 n-reads-2\n"
    )
def rawAssignment(SRAList, Names, Params):
    # changes: 04.Jul :: restructure h5 file - creates h5 files with keys
    # like "For_rpm/I" and sets "Position" to index
    makeDirectory("6-AssignRaw")
    makeDirectory("6-AssignRaw/Reports")
    # include_mapped_twice influences how the normalisation factor is
    # computed; see below
    include_mapped_twice = Params['MappedTwice']  # includes reads mapped twice NH:i:2
    save_csv = bool(Params["Save2csv"])  # save output to tab delim csv in addition to hdf5
    rlmin = int(Params["ReadLenMiN"])
    rlmax = int(Params["ReadLenMaX"])
    Mapping = Params["Mapping"]  # Mapping 5 or 3 prime end
    rlrange = str(rlmin) + "-" + str(rlmax)  # read length range 4 filename

    for iN in Names:
        BamName = "5-Aligned/" + iN + ".bam"  # sorted and indexed BAM
        bamfile = pysam.AlignmentFile(BamName, "rb")  # open BAM file
        outfile_for = "6-AssignRaw/" + iN + "_raw" + "_For.txt"
        outfile_rev = "6-AssignRaw/" + iN + "_raw" + "_Rev.txt"
        outfile_hdf = "6-AssignRaw/" + iN + ".h5"
        outf_idx_hdf = "6-AssignRaw/" + iN + "_" + "idx" + ".h5"
        LogFileName = "6-AssignRaw/Reports/" + iN + "_log.txt"
        LOG_FILE = open(LogFileName, "wt")

        # counters for log
        c2_twice = c_once = total_no = 0

        # empty dataframes for collecting data
        df_for_sum = pd.DataFrame()
        df_rev_sum = pd.DataFrame()

        # Process Log
        report = "\nBamFile: {}\nrlmin: {}\nrlmax: {}\nName: {}\nMapping: {}".format(
            BamName, rlmin, rlmax, iN, Mapping)
        LOG_FILE.write(report + "\n")
        print(report, "\n")

        # yeastChr() gives an ordered list
        for ref in yeastChr():
            c1 = 0
            c2 = 0
            reads_mapped_ref = 0
            ref_total_read_count = 0

            defF = defaultdict(list)  # DefaultDict For
            defR = defaultdict(list)  # DefaultDict Rev
            ForDict = {}  # Collecting data For
            RevDict = {}  # Collecting data Rev

            for read in bamfile.fetch(ref):
                ref_total_read_count += 1  # collect no. of reads

                if (read.get_tag("NH") == 1):
                    c1 += 1
                elif (read.get_tag("NH") == 2):
                    c2 += 1
                else:
                    pass

                # NH tag (NH:i:1) tells how many times a read is mapped to the genome
                if (read.get_tag("NH") == 1):
                    reads_mapped_ref += 1
                    readl = read.query_length  # get read length
                    # Redefining leftmost & rightmost
                    if not read.is_reverse:  # read is Forward
                        beg = read.reference_start  # 5'
                        end = read.reference_end - 1  # 3' correct by -1
                    else:  # read is Reverse
                        beg = read.reference_end - 1  # 5' correct by -1
                        end = read.reference_start  # 3'

                    if Mapping == "5":
                        defR[readl].append(beg) if read.is_reverse else defF[readl].append(beg)
                    if Mapping == "3":
                        defR[readl].append(end) if read.is_reverse else defF[readl].append(end)

                # to include those mapped twice
                if (read.get_tag("NH") == 2) & (include_mapped_twice == "Yes"):
                    reads_mapped_ref += 1
                    readl = read.query_length  # get read length
                    # Redefining leftmost & rightmost
                    if not read.is_reverse:  # read is Forward
                        beg = read.reference_start  # 5'
                        end = read.reference_end - 1  # 3' correct by -1
                    else:  # read is Reverse
                        beg = read.reference_end - 1  # 5' correct by -1
                        end = read.reference_start  # 3'

                    if Mapping == "5":
                        defR[readl].append(beg) if read.is_reverse else defF[readl].append(beg)
                    if Mapping == "3":
                        defR[readl].append(end) if read.is_reverse else defF[readl].append(end)

            dummy = [0]
            for rlen in range(rlmin, rlmax + 1):
                ForDict[rlen] = Counter(defF.get(rlen, dummy))  # .get() method: if rlen
                RevDict[rlen] = Counter(defR.get(rlen, dummy))  # doesn't exist, use dummy

            df_for = update_df(pd.DataFrame(ForDict), Chr=ref, strand="+")
            df_rev = update_df(pd.DataFrame(RevDict), Chr=ref, strand="-")
            df_for_sum = pd.concat([df_for_sum, df_for],
                                   ignore_index=True)  # collect summary table
            df_rev_sum = pd.concat([df_rev_sum, df_rev],
                                   ignore_index=True)  # collect summary table

            # Log_File per Chr
            report = "{:<5s}\t{:>10,d} reads".format(ref, reads_mapped_ref)
            LOG_FILE.write(report + "\n")
            print(report)

            # Reset/collect counter data
            c_once += c1
            c2_twice += c2  # mapped twice
            total_no += ref_total_read_count
            reads_mapped_ref = ref_total_read_count = 0

        # Per Name !!!
        # convert int column names to str
        df_for_sum.rename(columns={i: str(i) for i in range(rlmin, rlmax + 1)},
                          inplace=True)  # num col_names to str
        df_rev_sum.rename(columns={i: str(i) for i in range(rlmin, rlmax + 1)},
                          inplace=True)  # num col_names to str

        ## Log Report summary
        report = "\nTotal No of reads {:>11,} mapped to genome\n".format(total_no)
        report += "Number of reads {:>11,d} mapped once to genome\n".format(c_once)
        report += "Number of reads {:>11,d} mapped twice to genome\n".format(
            c2_twice)  # reported if MappedTwice == True
        report += "Number of reads {:>11,d} mapped more than counted already\n".format(
            total_no - (c_once + c2_twice))
        LOG_FILE.write(report)
        print(report)

        report = "\nOutput tables are stored:"
        LOG_FILE.write(report + "\n")
        print(report)

        if save_csv == True:
            df_for_sum.to_csv(outfile_for, sep='\t', header=True, index=True)  # csv table output
            df_rev_sum.to_csv(outfile_rev, sep='\t', header=True, index=True)  # csv table output
            report = "{}\n{}\n".format(outfile_for, outfile_rev)
            LOG_FILE.write(report + "\n")
            print(report)

        report = "{}\tkeys: 'For_raw', 'Rev_raw'".format(outfile_hdf)
        store = pd.HDFStore(outfile_hdf, complevel=5, complib="zlib", mode="w")
        store.put("For_raw", df_for_sum, format="table", data_columns=True)
        store.put("Rev_raw", df_rev_sum, format="table", data_columns=True)
        LOG_FILE.write("\n" + report + "\n")
        print(report)

        # Convert to RPMs
        report = "\n Converting raw -> rpm \n"
        LOG_FILE.write(report + "\n")
        print(report)
        report = ""

        # ## Convert RAW -> RPM
        # include_mapped_twice == "Yes": reads mapped twice are included in
        # the RPM normalisation
        normFactor = 0
        if include_mapped_twice == "Yes":
            l = [0 for read in bamfile.fetch() if read.get_tag("NH") <= 2]  # reads mapped once & twice
            normFactor = len(l) / 10**6  # normalisation factor
            report = "Normalization factor {} is computed based on reads mapped once and twice {:,}".format(
                normFactor, len(l))
        else:
            l = [0 for read in bamfile.fetch() if read.get_tag("NH") == 1]  # reads mapped once
            normFactor = len(l) / 10**6  # normalisation factor
            report = "Normalization factor {} is computed based on reads mapped once {:,}".format(
                normFactor, len(l))

        LOG_FILE.write(report + "\n")
        print(report)
        report = ""

        col2norm = [str(i) for i in (range(rlmin, rlmax + 1))] + ["sum"]
        for iX in col2norm:  # normalization
            df_for_sum[iX] = df_for_sum[iX] / normFactor
            df_rev_sum[iX] = df_rev_sum[iX] / normFactor
            line = "Normal factor for {} - {:7.4f}".format(iX, normFactor)
            report += line + "\n"
            print(line)

        LOG_FILE.write(report + "\n")
        print("")

        if save_csv == True:
            outfile_for, outfile_rev = outfile_for.replace("_raw", "_rpm"), \
                outfile_rev.replace("_raw", "_rpm")
            df_for_sum.to_csv(outfile_for, sep='\t', header=True, index=True)  # csv table output
            df_rev_sum.to_csv(outfile_rev, sep='\t', header=True, index=True)  # csv table output
            report = "{}\n{}\n".format(outfile_for, outfile_rev)
            LOG_FILE.write(report + "\n")
            print(report)

        store.put("For_rpm", df_for_sum, format="table", data_columns=True)
        store.put("Rev_rpm", df_rev_sum, format="table", data_columns=True)
        store.close()

        report = "\n{}\tkeys: 'For_rpm', 'Rev_rpm'\n".format(outfile_hdf)
        report += "\n{}\tTime taken thus far: {}".format(iN, time.time() - Start)
        LOG_FILE.write(report + "\n")
        print(report)

        # restructure hdf
        infile = outfile_hdf
        outfile = outf_idx_hdf
        restructurate_hd5(infile, outfile, close_outfile=True)
        report = "Restructure hdf\nInfile: {}\nOutfile: {}".format(infile, outfile)
        LOG_FILE.write(report + "\n")
        print(report, "\n")

        LOG_FILE.close()
        bamfile.close()
def main():
    args = parse_args()
    og_bam = args.original_bam
    read_set_out = args.read_set_out
    read_set = args.read_set
    files_to_delete = args.files_to_delete
    files_to_delete_path = args.files_to_delete_path
    new_file_path = args.new_file_path
    new_bams = args.new_bam_filename
    new_fastqs = args.new_fastq_filename

    print(dir(gzip_module))

    bam_files = []
    fastq_files = []
    original_reads = set()
    new_reads = set()
    to_remove = []

    if og_bam is None and read_set is None:
        raise ValueError('--original_bam or --read_set parameter must be set.')
    if og_bam is not None and read_set_out is None:
        raise ValueError(
            '--read_set_out parameter must be set when using --original_bam.')
    if og_bam is None and read_set_out is not None:
        raise ValueError(
            '--original_bam parameter must be set when using --read_set_out.')
    if og_bam is not None and read_set is not None:
        raise ValueError(
            '--original_bam and --read_set parameters cannot both be set. Only use one.')
    if new_bams is not None and new_fastqs is not None:
        raise ValueError(
            '--new_bam and --new_fastq parameters cannot both be set. Only use one.')
    if new_bams is None and new_fastqs is None:
        raise ValueError(
            'Either --new_bam_filename or --new_fastq_filename must be set.')
    if files_to_delete is not None and files_to_delete_path is None:
        print("Using './' path as default filepath for files indicated by "
              "--files_to_delete parameter.")
    if new_file_path is None:
        print("Using './' path as default filepath for new BAMs or new FASTQs.")

    if og_bam is not None and read_set is None:
        # original bam parameter set and read_set parameter not set.
        samfile = pysam.AlignmentFile(os.path.abspath(og_bam), 'rb')
        for read in samfile.fetch(until_eof=True):
            if read.is_read1:
                original_reads.add(read.query_name + '/1')
            elif read.is_read2:
                original_reads.add(read.query_name + '/2')
            else:
                original_reads.add(read.query_name)
        p = subprocess.Popen('gzip > ' + read_set_out + '.gz',
                             bufsize=-1, shell=True, stdin=subprocess.PIPE)
        # encode for Python 3: Popen pipes expect bytes
        p.stdin.write('\n'.join(original_reads).encode())

    elif og_bam is None and read_set is not None:
        # read set parameter set and original bam parameter not set.
        if '.gz' in os.path.basename(read_set):
            with gzip_module.open(read_set, 'rb') as f:
                original_reads = set(r.rstrip() for r in f)
        else:
            with open(read_set) as f:
                original_reads = set(r.rstrip() for r in f)

    # Parse new bam or new fastqs into set
    if new_bams is not None and new_fastqs is None:
        if len(new_bams) == 1:
            if '*' in new_bams[0]:
                bam_files = glob.glob(
                    os.path.join(new_file_path, os.path.basename(new_bams[0])))
            else:
                bam_files.append(
                    os.path.join(new_file_path, os.path.basename(new_bams[0])))
        else:
            bam_files = [
                os.path.join(new_file_path, os.path.basename(b)) for b in new_bams
            ]
        for bam_file in bam_files:
            samfile = pysam.AlignmentFile(os.path.abspath(bam_file), 'rb')
            for read in samfile.fetch(until_eof=True):
                if read.is_read1:
                    new_reads.add(read.query_name + '/1')
                elif read.is_read2:
                    new_reads.add(read.query_name + '/2')
                else:
                    new_reads.add(read.query_name)

    elif new_bams is None and new_fastqs is not None:
        if len(new_fastqs) == 1:
            if '*' in new_fastqs[0]:
                fastq_files = glob.glob(
                    os.path.join(new_file_path, os.path.basename(new_fastqs[0])))
            else:
                fastq_files.append(
                    os.path.join(new_file_path, os.path.basename(new_fastqs[0])))
        else:
            fastq_files = [
                os.path.join(new_file_path, os.path.basename(f)) for f in new_fastqs
            ]
        for fastq_file in fastq_files:
            abs_fastq_file = os.path.abspath(fastq_file)
            if '.gz' in fastq_file:
                with gzip_module.open(abs_fastq_file) as fastq:
                    i = 0
                    for line in fastq:
                        if i % 4 == 0:
                            qname = line.rstrip()
                            new_reads.add(qname[1:])
                        i += 1
            else:
                with open(abs_fastq_file) as fastq:
                    i = 0
                    for line in fastq:
                        if i % 4 == 0:
                            qname = line.rstrip()
                            new_reads.add(qname[1:])
                        i += 1

    # Check if the two sets are the same.
    if not len(original_reads - new_reads) == 0:
        print(original_reads - new_reads)
        raise ValueError(
            "FAIL: Read names are missing in the new files compared to original file.")
    elif not len(new_reads - original_reads) == 0:
        print(new_reads - original_reads)
        raise ValueError(
            "FAIL: More read names in new file(s) compared to original file.")
    else:
        print("SUCCESS: Read names in original file matches read names in new file(s).")

    to_remove = []
    if files_to_delete is not None:
        if len(files_to_delete) == 1:
            if '*' in files_to_delete[0]:
                # glob the single pattern (files_to_delete[0], not the list itself)
                to_remove = glob.glob(
                    os.path.join(files_to_delete_path,
                                 os.path.basename(files_to_delete[0])))
            else:
                to_remove.append(
                    os.path.join(files_to_delete_path,
                                 os.path.basename(files_to_delete[0])))
        else:
            to_remove = [
                os.path.join(files_to_delete_path, os.path.basename(f))
                for f in files_to_delete
            ]
        subprocess.call(["rm"] + to_remove)
    return
def openFile(self, dataFile):
    return pysam.AlignmentFile(dataFile)
    return distances[-1]


# criteria
# the boundary of real and noisy STAMPs
num_of_STAMPs = int(sys.argv[3])
# num_of_STAMPs = 500
edit_distance_threthold = 1
minimal_mapping_quality = 10
bam_file = sys.argv[2]

with open(sys.argv[1], 'r') as f:
    selected_STAMPs = [next(f).split('\t')[1] for x in range(num_of_STAMPs)]

f = pysam.AlignmentFile(bam_file, 'rb')

output_bam_noisy = bam_file.replace('.bam', '') + '_mapQ' + \
    str(minimal_mapping_quality) + '_below' + \
    str(num_of_STAMPs) + '.bam'
f_output_noisy = pysam.AlignmentFile(output_bam_noisy, 'wb', template=f)

output_bam_real = bam_file.replace('.bam', '') + '_mapQ' + \
    str(minimal_mapping_quality) + '_above' + \
    str(num_of_STAMPs) + '.bam'
f_output_real = pysam.AlignmentFile(output_bam_real, 'wb', template=f)

for read in f.fetch(until_eof=True):
    if read.mapping_quality >= minimal_mapping_quality:
        indicator = int()
        for i in read.tags:
            if i[0] == 'XC':
def _parse_sam_file_and_vcf(
    cls,
    samfile,
    query_vcf_file,
    flank_length,
    allow_mismatches,
    exclude_regions=None,
    max_soft_clipped=3,
    number_ns=0,
):
    if exclude_regions is None:
        exclude_regions = {}

    found = []
    match_flag = []
    correct_allele = []
    gt_conf = []
    allele = []

    samfile_handle = pysam.AlignmentFile(samfile, "r")
    sam_previous_record_name = None
    for sam_record in samfile_handle.fetch(until_eof=True):
        if sam_record.query_name == sam_previous_record_name:
            continue
        sam_previous_record_name = sam_record.query_name
        found_conf = False
        found_allele = False

        # see if excluded region in bed file
        ref, start, ref_num, var_num, allele_num = sam_record.query_name.rsplit(
            ".", maxsplit=5)
        start = int(start) + flank_length
        exclude = False
        for ref_name in exclude_regions.keys():
            end = int(start) + 1
            interval = pyfastaq.intervals.Interval(start, end)
            exclude = EvaluateRecall._interval_intersects_an_interval_in_list(
                interval, exclude_regions[ref_name])
        if exclude:
            found.append("Exclude")
            gt_conf.append(0)
            allele.append("0")
            continue

        match = EvaluateRecall._check_if_sam_match_is_good(
            sam_record,
            flank_length,
            query_sequence=sam_record.query_sequence,
            allow_mismatches=allow_mismatches,
            max_soft_clipped=max_soft_clipped,
        )
        alignment_start = str(sam_record).split("\t")[3]
        match_flag.append(match)
        if match == "Good":
            logging.debug("SAM record is a good match")
            logging.debug("SAM record reference is %s" % sam_record.reference_name)
            ref_name, expected_start, vcf_pos_index, vcf_record_index, allele_index = \
                sam_record.reference_name.rsplit(".", maxsplit=4)

            vcf_reader = pysam.VariantFile(query_vcf_file)
            vcf_interval_start = (int(expected_start) + int(alignment_start) +
                                  flank_length - 2 - number_ns)
            vcf_interval_end = (int(expected_start) + int(alignment_start) +
                                flank_length - number_ns)
            logging.debug("Find VCF records matching ref %s in interval [%i,%i]" %
                          (ref_name, vcf_interval_start, vcf_interval_end))
            for i, vcf_record in enumerate(
                    vcf_reader.fetch(ref_name, vcf_interval_start, vcf_interval_end)):
                if i == int(vcf_pos_index):
                    sample_name = vcf_record.samples.keys()[0]
                    if ("GT" in vcf_record.format.keys() and
                            len(set(vcf_record.samples[sample_name]["GT"])) == 1):
                        if int(allele_index) == int(
                                vcf_record.samples[sample_name]["GT"][0]):
                            found.append("1")
                            allele.append(str(allele_index))
                            correct_allele.append("1")
                            found_allele = True
                            if "GT_CONF" in vcf_record.format.keys():
                                gt_conf.append(int(float(
                                    vcf_record.samples[sample_name]["GT_CONF"])))
                                found_conf = True

        if not found_allele:
            found.append("0")
            allele.append("0")
            correct_allele.append("0")
        if not found_conf:
            gt_conf.append(0)

    assert len(found) == len(gt_conf)
    assert len(found) == len(allele)
    assert len(found) == len(match_flag)
    assert len(found) == len(correct_allele)
    return found, gt_conf, allele, match_flag, correct_allele
"--fasta", help="Input fasta file", required=True) parser.add_argument("-l", "--length", help="Read length", default=100) parser.add_argument("-o", "--output", help="Output BAM file", required=True) parser.add_argument("-s", "--seed", help="Random seed", default=2) parser.add_argument("--verbose", help="Verbose mode", action="store_true") opts = parser.parse_args() seqs = loadSeqsFromFasta(opts.fasta) header = makeHeader(seqs) with pysam.AlignmentFile(opts.output, "wb", header=header) as outf: for i, seq in enumerate(seqs): n = 0 if opts.verbose: print("Chromosome: {}".format(seq)) for pos in range(len(seqs[seq]) - int(opts.length)): n += 1 a = pysam.AlignedSegment() a.query_name = "read_" + str(pos) + "_" + str(n) a.query_sequence = seqs[seq][pos:pos + int(opts.length)] a.flag = 0 a.reference_id = i a.reference_start = pos a.mapping_quality = 20 a.cigarstring = str(opts.length) + 'M'
def to_bam(string):
    return pysam.AlignmentFile(string, mode="rb")
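# Usage sketch for to_bam: it is handy as an argparse type converter, so the
# file is opened during argument parsing; "reads.bam" is a placeholder.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("bam", type=to_bam)
args = parser.parse_args(["reads.bam"])
print(args.bam.references[:3])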
        chrom_ind += chrom_span
        if len(gene_merge_list) != 0:
            merge_dict[sort_df.loc[i, 'name2']] = gene_merge_list
        i += sym_span
    return merge_dict


samfile_path = '/Users/liuzhen/intern/data/test/tophat_map_g1/accepted_hits.bam'
refgene_path = '/Users/liuzhen/intern/data/test/refGene.txt'
result_path = '/Users/liuzhen/intern/data/test/RPKM_gene_dpsfd_avglen_result.txt'

refgene_column = [
    'bin', 'name', 'chrom', 'strand', 'txStart', 'txEnd', 'cdsStart',
    'cdsEnd', 'exonCount', 'exonStarts', 'exonEnds', 'score', 'name2',
    'cdsStartStat', 'cdsEndStat', 'exonFrames'
]

samfile = pysam.AlignmentFile(samfile_path, 'rb')
refgene_df = pd.read_table(refgene_path, sep='\t', header=None,
                           names=refgene_column)
mapped_reads_amount = samfile.count()
merge_dict = mergeRefGeneDf(refgene_df)

result_list = []
for gene in merge_dict:
    read_counts = 0
    exon_length = 0
    chrom_num = 0
    trans_id_list = []
    tss_list = []
    chrom_list = []
def main(args):
    logger.info("starting %s called with args: %s" %
                (sys.argv[0], ' '.join(sys.argv)))
    bedfile = open(args.varFileName, 'r')
    reffile = pysam.Fastafile(args.refFasta)

    if not os.path.exists(args.bamFileName + '.bai'):
        logger.error("input bam must be indexed, no .bai file found for %s" %
                     args.bamFileName)
        sys.exit(1)

    alignopts = {}
    if args.alignopts is not None:
        alignopts = dict([o.split(':') for o in args.alignopts.split(',')])

    aligners.checkoptions(args.aligner, alignopts)

    # load readlist to avoid, if specified
    avoid = None
    if args.avoidreads is not None:
        avoid = dictlist(args.avoidreads)

    # make a temporary file to hold mutated reads
    outbam_mutsfile = "addindel." + str(uuid4()) + ".muts.bam"
    bamfile = pysam.AlignmentFile(args.bamFileName, 'rb')
    outbam_muts = pysam.AlignmentFile(outbam_mutsfile, 'wb', template=bamfile)
    outbam_muts.close()
    bamfile.close()

    tmpbams = []

    if not os.path.exists(args.tmpdir):
        os.mkdir(args.tmpdir)
        logger.info("created tmp directory: %s" % args.tmpdir)

    if not os.path.exists('addindel_logs_' + os.path.basename(args.outBamFile)):
        os.mkdir('addindel_logs_' + os.path.basename(args.outBamFile))
        logger.info("created directory: addindel_logs_%s" %
                    os.path.basename(args.outBamFile))

    assert os.path.exists('addindel_logs_' + os.path.basename(args.outBamFile)), \
        "could not create output directory!"
    assert os.path.exists(args.tmpdir), "could not create temporary directory!"

    pool = Pool(processes=int(args.procs))
    results = []

    ntried = 0
    for bedline in bedfile:
        if ntried < int(args.numsnvs) or int(args.numsnvs) == 0:
            c = bedline.strip().split()
            chrom = c[0]
            start = int(c[1])
            end = int(c[2])
            vaf = float(c[3])
            type = c[4]
            ins = None

            assert type in ('INS', 'DEL')
            if type == 'INS':
                ins = c[5]

            # make mutation (submit job to thread pool)
            result = pool.apply_async(
                makemut, [args, chrom, start, end, vaf, ins, avoid, alignopts])
            results.append(result)
            ntried += 1

    for result in results:
        tmpbamlist = result.get()
        if tmpbamlist is not None:
            for tmpbam in tmpbamlist:
                if os.path.exists(tmpbam):
                    tmpbams.append(tmpbam)

    if len(tmpbams) == 0:
        logger.error("no successful mutations")
        sys.exit()

    tmpbams.sort()

    # merge tmp bams
    if len(tmpbams) == 1:
        os.rename(tmpbams[0], outbam_mutsfile)
    elif len(tmpbams) > 1:
        mergebams(tmpbams, outbam_mutsfile, maxopen=int(args.maxopen))

    bedfile.close()

    # cleanup
    for bam in tmpbams:
        if os.path.exists(bam):
            os.remove(bam)
        if os.path.exists(bam + '.bai'):
            os.remove(bam + '.bai')

    if os.listdir(args.tmpdir) == []:
        os.rmdir(args.tmpdir)

    if args.skipmerge:
        logger.info("skipping merge, please merge reads from %s manually." %
                    outbam_mutsfile)
    else:
        if args.tagreads:
            from bamsurgeon.markreads import markreads
            tmp_tag_bam = 'tag.%s.bam' % str(uuid4())
            markreads(outbam_mutsfile, tmp_tag_bam)
            move(tmp_tag_bam, outbam_mutsfile)
            logger.info("tagged reads.")

        logger.info("done making mutations, merging mutations into %s --> %s" %
                    (args.bamFileName, args.outBamFile))
        replace(args.bamFileName, outbam_mutsfile, args.outBamFile,
                seed=args.seed)

        # cleanup
        os.remove(outbam_mutsfile)

    var_basename = '.'.join(os.path.basename(args.varFileName).split('.')[:-1])
    bam_basename = '.'.join(os.path.basename(args.outBamFile).split('.')[:-1])

    vcf_fn = bam_basename + '.addindel.' + var_basename + '.vcf'

    makevcf.write_vcf_indel(
        'addindel_logs_' + os.path.basename(args.outBamFile),
        args.refFasta, vcf_fn)

    logger.info('vcf output written to ' + vcf_fn)
import pysam
import sys

samfile = pysam.AlignmentFile(sys.argv[1], "r")
for align in samfile:
    if not align.is_supplementary \
            and not align.is_secondary \
            and not align.is_unmapped \
            and align.reference_length >= 500 \
            and align.mapping_quality > 0:
        try:
            sa = align.get_tag('SA')
        except KeyError:
            # exclude alignments if a supplementary alignment exists;
            # only reads without an SA tag reach this branch
            ref = samfile.get_reference_name(align.reference_id)
            ref, contig = ref.split("__")
            print("%s\t%s\t%s\t%s" % (ref, contig, align.query_name, align.alen))
import pysam

sam = pysam.AlignmentFile("SRR3189743.join.aligned.sort.bam")

# print(sam.header)
#sam.count(contig=sam.references[100])
#sam.count_coverage(contig=sam.references[100])
#sam.pileup(contig=sam.references[100])
#for c in sorted([(sam.count(contig=c), c) for c in sam.references]):
#    print(f"{c[0]}\t{c[1]}")

target_contig = sam.references[0]

# select significant column
cc = sam.count_coverage(contig=target_contig)
# [ for i in range(len(cc)) ]  # incomplete comprehension, left unfinished

# divide reads into new tmp bams
# recur for new bams

sam.close()
def retrieve_reads(in_bam, in_vcf, quality, out_list):
    print("\033[32m%s\033[0m Reading VCF" %
          (time.strftime('[%H:%M:%S]', time.localtime(time.time()))))
    snp_db, snp_pos_db = read_vcf(in_vcf)

    reads_db = {}
    for chrn in snp_pos_db:
        reads_db[chrn] = {}
        for pos in snp_pos_db[chrn]:
            reads_db[chrn][pos] = {'ref': {}, 'alt': {}}

    bamfile = pysam.AlignmentFile(in_bam, 'rb')
    print("\033[32m%s\033[0m Reading bam" %
          (time.strftime('[%H:%M:%S]', time.localtime(time.time()))))
    for line in bamfile:
        read_name = line.query_name
        flag = bin(line.flag)
        #if line.mapping_quality < quality or line.mapping_quality == 255:
        #    # filter data with mapping quality
        #    continue
        if flag[-3] == '1':
            # this flag means segment unmapped
            continue
        if len(flag) > 7 and flag[-5] == '1':
            # this flag means query seq is reverse complemented
            is_rev = True
        else:
            is_rev = False
        chrn = line.reference_name
        sp = line.reference_start
        ep = sp + line.reference_length
        search_pos = get_pos_in_range(snp_pos_db[chrn], sp, ep)
        cigar = line.cigartuples
        query_sequence = line.query_sequence
        alignment_length = line.query_alignment_length
        mapping_quality = line.mapping_quality

        for i in search_pos:
            offset = i - sp - 1
            if offset > len(query_sequence):
                continue
            ref = snp_db[chrn][i][0]
            alt = snp_db[chrn][i][1]
            query_base = get_base_pos_with_offset(cigar, query_sequence,
                                                  offset, is_rev)
            if query_base.lower() == ref.lower():
                if read_name not in reads_db[chrn][i]['alt']:
                    reads_db[chrn][i]['ref'][read_name] = [
                        mapping_quality, alignment_length
                    ]
                else:
                    amapq, amapl = reads_db[chrn][i]['alt'][read_name]
                    if mapping_quality > amapq:
                        reads_db[chrn][i]['alt'].pop(read_name)
                        reads_db[chrn][i]['ref'][read_name] = [
                            mapping_quality, alignment_length
                        ]
                    elif alignment_length > amapl:
                        reads_db[chrn][i]['alt'].pop(read_name)
                        reads_db[chrn][i]['ref'][read_name] = [
                            mapping_quality, alignment_length
                        ]
            elif query_base.lower() == alt.lower():
                if read_name not in reads_db[chrn][i]['ref']:
                    reads_db[chrn][i]['alt'][read_name] = [
                        mapping_quality, alignment_length
                    ]
                else:
                    rmapq, rmapl = reads_db[chrn][i]['ref'][read_name]
                    if mapping_quality > rmapq:
                        reads_db[chrn][i]['ref'].pop(read_name)
                        reads_db[chrn][i]['alt'][read_name] = [
                            mapping_quality, alignment_length
                        ]
                    elif alignment_length > rmapl:
                        reads_db[chrn][i]['ref'].pop(read_name)
                        reads_db[chrn][i]['alt'][read_name] = [
                            mapping_quality, alignment_length
                        ]
    bamfile.close()

    print("\033[32m%s\033[0m Writing result" %
          (time.strftime('[%H:%M:%S]', time.localtime(time.time()))))
    with open(out_list, 'w') as f_out:
        for chrn in sorted(reads_db):
            for pos in sorted(reads_db[chrn]):
                f_out.write("%s,%d,%s," %
                            (chrn, pos, '|'.join(snp_db[chrn][pos])))
                ref_list = []
                for read_name in sorted(reads_db[chrn][pos]['ref']):
                    ref_list.append(
                        "%s(%d;%d)" %
                        (read_name, reads_db[chrn][pos]['ref'][read_name][0],
                         reads_db[chrn][pos]['ref'][read_name][1]))
                alt_list = []
                for read_name in sorted(reads_db[chrn][pos]['alt']):
                    alt_list.append(
                        "%s(%d;%d)" %
                        (read_name, reads_db[chrn][pos]['alt'][read_name][0],
                         reads_db[chrn][pos]['alt'][read_name][1]))
                f_out.write("%s,%s\n" % ('|'.join(ref_list), '|'.join(alt_list)))
    print("\033[32m%s\033[0m Finished" %
          (time.strftime('[%H:%M:%S]', time.localtime(time.time()))))
def main():
    start = timeit.default_timer()
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)-15s [%(processName)s.%(levelname)s] %(message)s")
    parser = argparse.ArgumentParser(
        description="Utility for retrieving tumor specific kmers in RNA-seq reads.")
    parser.add_argument("--Kmer_file", required=True, type=str, nargs='?',
                        help="provide Kmer file here")
    parser.add_argument("--input_bam_file", required=True, type=str, nargs='?',
                        help="provide input bam file path here")
    parser.add_argument("--out_bam_file", required=True, type=str, nargs='?',
                        help="provide output bam file path here")
    args = parser.parse_args()

    cigar_map = {0: 'M', 1: 'I', 2: 'D', 3: 'N', 4: 'S',
                 5: 'H', 6: 'P', 7: '=', 8: 'X', 9: 'B'}
    samfile = pysam.AlignmentFile(args.input_bam_file, "rb")
    kmer_reads = pysam.AlignmentFile(args.out_bam_file, "wb", template=samfile)

    trie = ahocorasick.Automaton()
    with open(args.Kmer_file) as f:
        for line in f:
            line_split = line.strip().split('\t')
            trie.add_word(line_split[0], line_split[0])
    trie.make_automaton()
    logging.info("finished making automaton")

    for read in samfile.fetch():
        if not read.is_unmapped and not read.is_secondary and read.is_proper_pair \
                and read.is_paired and not read.is_duplicate \
                and not read.is_supplementary and 'N' in read.cigarstring:
            for end, kmer in trie.iter(read.query_sequence.upper()):
                start_index = end - len(kmer) + 1
                end_index = end + 1
                if all(ref_pos is None for ref_pos in
                       read.get_reference_positions(
                           full_length=True)[start_index:end_index]):
                    continue
                quality_string = read.to_string().split('\t')[10][start_index:end_index]
                kmer_read = pysam.AlignedSegment()
                kmer_read.query_name = read.query_name
                kmer_read.query_sequence = \
                    read.query_sequence[start_index:end_index].upper()
                kmer_read.flag = read.flag
                kmer_read.reference_id = read.reference_id
                for ref_pos in read.get_reference_positions(
                        full_length=True)[start_index:end_index]:
                    if ref_pos is not None:
                        kmer_read.reference_start = ref_pos
                        break
                kmer_read.mapping_quality = read.mapping_quality

                current_ind = 0
                in_kmer = False
                cigarString_temp = ""
                for operation, count in read.cigartuples:
                    if current_ind >= end_index:
                        break
                    if cigar_map[operation] == 'N' or cigar_map[operation] == 'D':
                        if in_kmer:
                            cigarString_temp += str(count) + cigar_map[operation]
                    elif cigar_map[operation] == 'M' or cigar_map[operation] == 'I' \
                            or cigar_map[operation] == 'S':
                        if current_ind + count > start_index:
                            cigarString_temp += str(
                                min(end_index, current_ind + count) -
                                max(current_ind, start_index)) + cigar_map[operation]
                            in_kmer = True
                        current_ind += count
                    else:
                        logging.warn('Unexpected cigar op {}'.format(
                            cigar_map[operation]))
                if "S" in cigarString_temp or "N" not in cigarString_temp:
                    continue
                kmer_read.cigarstring = cigarString_temp
                kmer_read.query_qualities = \
                    pysam.qualitystring_to_array(quality_string)
                kmer_reads.write(kmer_read)

    kmer_reads.close()
    samfile.close()
    logging.info("Done!")
    stop = timeit.default_timer()
    logging.info("total search time: {}".format(stop - start))
def check_raw_alignments(df, args, pon):
    # get soft-clip position and direction
    clips = []
    for chrA, posA, contA, chrB, posB, contB, idx, svlen, spanning in zip(
            df.chrA, df.posA, df.contigA, df.chrB, df.posB, df.contigB,
            df.index, df.svlen, df.spanning):
        if spanning:
            clips.append((chrA, posA, 3, idx, chrA == chrB, svlen))
            clips.append((chrB, posB, 3, idx, chrA == chrB, svlen))
        else:
            if contA:
                start_lower = contA[0].islower()
                end_lower = contA[-1].islower()
                if start_lower and not end_lower:
                    clip_side = 0
                elif not start_lower and end_lower:
                    clip_side = 1
                else:  # start_lower and end_lower
                    clip_side = 3  # any side
                clips.append((chrA, posA, clip_side, idx, chrA == chrB, svlen))
            if contB:
                start_lower = contB[0].islower()
                end_lower = contB[-1].islower()
                if start_lower and not end_lower:
                    clip_side = 0
                elif not start_lower and end_lower:
                    clip_side = 1
                else:
                    clip_side = 3
                clips.append((chrB, posB, clip_side, idx, chrA == chrB, svlen))

    clips = sorted(clips, key=lambda x: (x[0], x[1]))

    opts = {"bam": "rb", "cram": "rc", "sam": "r", "-": "rb", "stdin": "rb"}
    pad = 20
    found = set([])
    for pth, _ in pon:
        # open alignment file
        kind = pth.split(".")[-1]
        bam_mode = opts[kind]

        pysam.set_verbosity(0)
        infile = pysam.AlignmentFile(
            pth, bam_mode, threads=1,
            reference_filename=None if kind != "cram" else args["ref"])
        pysam.set_verbosity(3)

        for chrom, pos, cs, index, intra, svlen in clips:
            if index in found:
                continue
            for a in infile.fetch(chrom, pos - pad if pos - pad > 0 else 0,
                                  pos + pad):
                if not a.cigartuples:
                    continue
                # if pos == 3786481 and a.cigartuples[-1][0] == 4:
                #     echo(a.cigartuples, abs(pos - a.pos), abs(pos - a.reference_end))
                if a.cigartuples[0][0] == 4 and cs != 1:
                    current_pos = a.pos
                    if abs(current_pos - pos) < 8:
                        found.add(index)
                        break
                if a.cigartuples[-1][0] == 4 and cs != 0:
                    current_pos = a.reference_end
                    if abs(current_pos - pos) < 8:
                        found.add(index)
                        break

    df = df.drop(found)
    return df
def bam_worker(bam_q, progress_q, worker_i):
    worker = worker_i

    slices = 0
    crumbs = 0
    covered_snps = 0

    bam = pysam.AlignmentFile(bam_path)

    while True:
        work_block = bam_q.get()
        if work_block is None:
            progress_q.put({
                "pos": None,
                "worker_i": worker_i,
                "slices": slices,
                "crumbs": crumbs,
                "covered_snps": covered_snps,
            })
            break

        reads = {}
        dreads = set([])

        for p_col in bam.pileup(reference=target_contig,
                                start=work_block["start"] - 1,
                                stop=work_block["end"],
                                ignore_overlaps=False,
                                min_base_quality=0,
                                stepper=stepper):

            if p_col.reference_pos + 1 > end_pos:
                # Ignore positions beyond the end_pos
                break

            if vcf_handler["region"][p_col.reference_pos + 1] != 1:
                # Ignore non-SNPs
                continue

            for p_read in p_col.pileups:

                curr_read_1or2 = 0
                if p_read.alignment.is_paired:
                    if p_read.alignment.is_read1:
                        curr_read_1or2 = 1
                    elif p_read.alignment.is_read2:
                        curr_read_1or2 = 2
                    else:
                        #TODO Probably indicative of bad data
                        pass

                curr_read_name = "%s_%s_%d" % (p_read.alignment.query_name,
                                               str(p_read.alignment.flag),
                                               curr_read_1or2)

                # Convert 0-based reference_start to 1-based position
                # (to match region array and 1-based VCF)
                LEFTMOST_1pos = p_read.alignment.reference_start + 1

                # Special case: Consider reads that begin before the
                # start_pos, but overlap the 0th block
                if work_block["i"] == 0:
                    if LEFTMOST_1pos < start_pos:
                        # Read starts before the start_pos
                        if p_read.alignment.reference_start + 1 + \
                                p_read.alignment.query_alignment_length < start_pos:
                            # Read ends before the start_pos
                            continue
                        LEFTMOST_1pos = start_pos
                else:
                    # This read begins before the start of the current
                    # (non-0) block and will have already been covered
                    # by the block that preceded it
                    if LEFTMOST_1pos < work_block["start"]:
                        continue

                sequence = None
                qual = None
                if p_read.is_del:
                    # TODO Not sure about how to estimate quality of deletion?
                    sequence = "-" * (abs(p_read.indel) + 1)
                    qual = p_read.alignment.query_qualities[
                        p_read.query_position_or_next] * (abs(p_read.indel) + 1)
                elif p_read.indel > 0:
                    # p_read.indel peeks to next CIGAR and determines whether
                    # the base FOLLOWING this one is an insertion or not
                    sequence = p_read.alignment.query_sequence[
                        p_read.query_position:p_read.query_position + p_read.indel + 1]
                    qual = p_read.alignment.query_qualities[
                        p_read.query_position:p_read.query_position + p_read.indel + 1]
                else:
                    sequence = p_read.alignment.query_sequence[p_read.query_position]
                    qual = p_read.alignment.query_qualities[p_read.query_position]

                if not sequence:
                    print("[WARN] Sequence data seems to not be correctly "
                          "salvaged from read %s" % p_read.alignment.query_name)
                    continue

                if curr_read_name not in reads:
                    reads[curr_read_name] = {
                        # non-inclusive 1pos end
                        "rank": np.sum(vcf_handler["region"][1:LEFTMOST_1pos]),
                        "seq": [],
                        "quals": [],
                        "refs_1pos": [],
                        "read_variants_0pos": [],
                    }
                    if p_read.alignment.query_name in debug_reads:
                        dreads.add(curr_read_name)

                reads[curr_read_name]["seq"].append(sequence)
                reads[curr_read_name]["quals"].append(qual)
                reads[curr_read_name]["refs_1pos"].append(p_col.reference_pos + 1)
                reads[curr_read_name]["read_variants_0pos"].append(
                    p_read.query_position)

        for dread in sorted(dreads):
            r = reads[dread]
            if r:
                for snp_i, ref_pos in enumerate(r["refs_1pos"]):
                    print(dread, ref_pos, r["seq"][snp_i])
                print("RANK", dread, r["rank"])

        if debug_pos:
            for read in reads:
                for d_pos in set(reads[read]["refs_1pos"]) & debug_pos:
                    i = reads[read]["refs_1pos"].index(d_pos)
                    print(read, d_pos, reads[read]["seq"][i])

        num_reads = len(reads)
        for qi, qname in enumerate(reads):
            progress_q.put({"pos": num_reads - (qi + 1), "worker_i": worker_i})

            if not len(reads[qname]["seq"]) > 1:
                # Ignore reads without evidence
                continue
            slices += 1

            rank = reads[qname]["rank"]
            support_len = len(reads[qname]["seq"])

            # b[0] has the effect of capturing the base before any insertion
            support_seq = "".join([b[0] for b in reads[qname]["seq"]])

            covered_snps += len(support_seq.replace("N", "").replace("_", ""))

            # For each position in the supporting sequence (that is, each covered SNP)
            for i in range(0, support_len):
                snp_a = support_seq[i]

                #if support_len == 1:
                #    if rank == 0:
                #        hansel.add_observation('_', snp_a, 0, 1)
                #        hansel.add_observation(snp_a, '_', 1, 2)
                #    else:
                #        hansel.add_observation(snp_a, '_', rank+1, rank+2)

                # For each position in the supporting sequence following i
                for j in range(i + 1, support_len):
                    snp_b = support_seq[j]

                    # Ignore observations which are from an invalid transition
                    if snp_a in ['_', 'N']:
                        continue

                    # Sentinel->A
                    if i == 0 and j == 1 and rank == 0:
                        # If this is the first position in the support
                        # (support_pos == 0) and rank == 0 (that is, this
                        # is the first SNP) and SNPs a, b are adjacent
                        hansel.add_observation('_', snp_a, 0, 1)
                        hansel.add_observation(snp_a, snp_b, 1, 2)
                        crumbs += 1

                    # B->Sentinel
                    elif (j + rank + 1) == vcf_handler["N"] and abs(i - j) == 1:
                        # Last observation (abs(i-j)==1),
                        # that ends on the final SNP (j+rank+1 == N)
                        hansel.add_observation(snp_a, snp_b,
                                               vcf_handler["N"] - 1,
                                               vcf_handler["N"])
                        hansel.add_observation(snp_b, '_',
                                               vcf_handler["N"],
                                               vcf_handler["N"] + 1)
                        crumbs += 1

                    # A regular observation (A->B)
                    else:
                        hansel.add_observation(snp_a, snp_b,
                                               i + rank + 1, j + rank + 1)
                        crumbs += 1

                        if use_end_sentinels:
                            if j == (support_len - 1) and abs(i - j) == 1:
                                # The last SNP on a read needs a sentinel afterward
                                hansel.add_observation(snp_b, '_',
                                                       j + rank + 1,
                                                       j + rank + 2)
def getBs(rlist):
    [r1, r2, r3, r4] = rlist
    currentID = r1.strip().split(' ')[0][1:]
    fastqOutput = "%s%s%s%s" % (r1, r2, r3, r4)
    eachFaPos = faPos[currentID].split(";")
    eachChr = eachFaPos[0]
    eachLeft = int(eachFaPos[1])
    eachRight = int(eachFaPos[2])
    eachLen = int(eachFaPos[3])
    eachFaSeq = faOutput[currentID]

    foFastq = open(outPrefixTmp + "seq1_" + currentID + ".fastq", 'w')
    foFastq.write(fastqOutput)
    foFastq.close()
    foFa = open(outPrefixTmp + "seq2_" + currentID + ".fa", 'w')
    foFa.write(eachFaSeq)
    foFa.close()

    cmd = "minimap2 -ax splice " + strandFastq + " -k14 " + outPrefixTmp + \
        "seq2_" + currentID + ".fa " + outPrefixTmp + "seq1_" + currentID + \
        ".fastq >" + outPrefixTmp + currentID + ".sam 2>/dev/null"
    os.system(cmd)
    if not os.path.exists(outPrefixTmp + currentID + ".sam"):
        return('')

    samfile = pysam.AlignmentFile(outPrefixTmp + currentID + ".sam", "r")
    BSright = []
    Mright = []
    BSleft = []
    Mleft = []
    for read in samfile.fetch():
        if (read.flag & 4 != 4):
            readInfo = getReadInfo(read)
        else:
            continue
        ExonS = readInfo[0]
        ExonE = readInfo[1]
        ExonS_diff = abs(np.array(ExonS) - eachLen - hangLen)
        ExonE_diff = abs(np.array(ExonE) - eachLen + hangLen)
        ExonS_diff_idx = np.where(ExonS_diff == min(ExonS_diff))[0]
        ExonE_diff_idx = np.where(ExonE_diff == min(ExonE_diff))[0]
        commonIdx = set(ExonS_diff_idx - 1) & set(ExonE_diff_idx)
        if len(commonIdx) == 0:
            if len(ExonS_diff_idx) == 1 and len(ExonE_diff_idx) == 1:
                if ExonS_diff_idx[0] == ExonE_diff_idx[0]:
                    if ExonS[ExonS_diff_idx[0]] < eachLen:
                        if ExonS_diff_idx[0] < len(ExonS) - 1:
                            if ExonS[ExonS_diff_idx[0] + 1] > eachLen:
                                commonIdx = [ExonE_diff_idx[0]]
                    else:
                        if ExonE_diff_idx[0] > 0:
                            if ExonE[ExonE_diff_idx[0] - 1] < eachLen:
                                commonIdx = [ExonE_diff_idx[0] - 1]
        for index in commonIdx:
            # 1-based position
            tmpright = eachLeft + ExonE[index] - 1
            tmpleft = eachLeft + ExonS[index + 1] - eachLen
            BSright.append(tmpright)
            BSleft.append(tmpleft)
            Mright.append(genome.sequence({'chr': eachChr,
                                           'start': tmpright + 1,
                                           'stop': tmpright + 2}).upper())
            Mleft.append(genome.sequence({'chr': eachChr,
                                          'start': tmpleft - 2,
                                          'stop': tmpleft - 1}).upper())
    samfile.close()
    os.remove(outPrefixTmp + "seq1_" + currentID + ".fastq")
    os.remove(outPrefixTmp + "seq2_" + currentID + ".fa")
    os.remove(outPrefixTmp + currentID + ".sam")
    if len(BSleft) == 0:
        return('')
    return(currentID + "\t" + eachChr + "\t" +
           ','.join([str(i) for i in BSleft]) + "\t" +
           ','.join([str(i) for i in BSright]) + "\t" +
           ','.join([str(i) for i in Mleft]) + "\t" +
           ','.join([str(i) for i in Mright]))
print("File: " + str(x)) bam = os.path.isfile(os.path.splitext(x)[0] + ".bam") sorted_bam = os.path.isfile(os.path.splitext(x)[0] + ".sorted.bam") bai = os.path.isfile(os.path.splitext(x)[0] + ".sorted.bam.bai") if not (bam): os.system("samtools view -bS " + x + " > " + os.path.splitext(x)[0] + ".bam") if not (sorted_bam): os.system("samtools sort " + os.path.splitext(x)[0] + ".bam " + os.path.splitext(x)[0] + ".sorted") if not (bai): os.system("samtools index " + os.path.splitext(x)[0] + ".sorted.bam") samfile = pysam.AlignmentFile(os.path.splitext(x)[0] + ".sorted.bam", "rb") splitRef = [] splitAlt = [] count = 0.0 countRef = 0.0 countAlt = 0.0 for pileupcolumn in samfile.pileup(chrStr): if pileupcolumn.pos == loc: for pileupread in pileupcolumn.pileups: if not pileupread.is_del: count += 1 if pileupread.alignment.query_sequence[ pileupread.query_position] == refAllele: splitRef.append(pileupread.alignment.query_name)
def _get_snps_queue(snps_q, snps_conn, snps_db_fn, snps_txt_fn, db_safety,
                    pr_refs_fn, pr_ref_filts, whatshap_map_fn,
                    ref_names_and_lens, ref_fn):
    def write_whatshap_alignment(read_id, snp_seq, snp_quals, chrm, strand,
                                 r_st, snp_cigar):
        a = pysam.AlignedSegment()
        a.query_name = read_id
        a.flag = 0 if strand == 1 else 16
        a.reference_id = whatshap_map_fp.get_tid(chrm)
        a.reference_start = r_st
        a.template_length = len(snp_seq)
        a.mapping_quality = WHATSHAP_MAX_QUAL
        a.set_tags([('RG', WHATSHAP_RG_ID)])

        # convert to reference based sequence
        if strand == -1:
            snp_seq = mh.revcomp(snp_seq)
            snp_quals = snp_quals[::-1]
            snp_cigar = snp_cigar[::-1]
        a.query_sequence = snp_seq
        a.query_qualities = array('B', snp_quals)
        a.cigartuples = snp_cigar
        whatshap_map_fp.write(a)
        return

    def get_snp_call():
        # note strand is +1 for fwd or -1 for rev
        r_snp_calls, (read_id, chrm, strand, r_start, ref_seq, read_len,
                      q_st, q_en, cigar) = snps_q.get(block=False)
        snps_db.executemany(ADDMANY_SNPS, [
            (read_id, chrm, strand, pos, alt_lp, snp_ref_seq, snp_alt_seq, snp_id)
            for pos, alt_lps, snp_ref_seq, snp_alt_seqs, snp_id in r_snp_calls
            for alt_lp, snp_alt_seq in zip(alt_lps, snp_alt_seqs)])
        if snps_txt_fp is not None and len(r_snp_calls) > 0:
            snps_txt_fp.write('\n'.join((
                ('\t'.join('{}' for _ in field_names)).format(
                    read_id, chrm, strand, pos,
                    np.log1p(-np.exp(alt_lps).sum()), alt_lp,
                    snp_ref_seq, snp_alt_seq, snp_id)
                for pos, alt_lps, snp_ref_seq, snp_alt_seqs, snp_id in r_snp_calls
                for alt_lp, snp_alt_seq in zip(alt_lps, snp_alt_seqs))) + '\n')
            snps_txt_fp.flush()
        if do_ann_snps:
            if not mapping.read_passes_filters(pr_ref_filts, read_len,
                                               q_st, q_en, cigar):
                return
            snp_seq, snp_quals, snp_cigar = annotate_snps(
                r_start, ref_seq, r_snp_calls, strand)
            if pr_refs_fn is not None:
                pr_refs_fp.write('>{}\n{}\n'.format(read_id, snp_seq))
                pr_refs_fp.flush()
            if whatshap_map_fn is not None:
                write_whatshap_alignment(read_id, snp_seq, snp_quals, chrm,
                                         strand, r_start, snp_cigar)
        return

    snps_db = sqlite3.connect(snps_db_fn)
    if db_safety < 2:
        snps_db.execute(SET_ASYNC_MODE)
    if db_safety < 1:
        snps_db.execute(SET_NO_ROLLBACK_MODE)
    snps_db.execute(CREATE_SNPS_TBLS)

    if snps_txt_fn is None:
        snps_txt_fp = None
    else:
        snps_txt_fp = open(snps_txt_fn, 'w')
        field_names = ('read_id', 'chrm', 'strand', 'pos', 'ref_log_prob',
                       'alt_log_prob', 'ref_seq', 'alt_seq', 'snp_id')
        snps_txt_fp.write('\t'.join(field_names) + '\n')

    if pr_refs_fn is not None:
        pr_refs_fp = open(pr_refs_fn, 'w')

    if whatshap_map_fn is not None:
        _, map_fmt = os.path.splitext(whatshap_map_fn)
        if map_fmt == '.bam':
            w_mode = 'wb'
        elif map_fmt == '.cram':
            w_mode = 'wc'
        elif map_fmt == '.sam':
            w_mode = 'w'
        else:
            raise mh.MegaError('Invalid mapping output format')
        header = {
            'HD': {'VN': '1.4'},
            'SQ': [{'LN': ref_len, 'SN': ref_name}
                   for ref_name, ref_len in sorted(zip(*ref_names_and_lens))],
            'RG': [{'ID': WHATSHAP_RG_ID, 'SM': SAMPLE_NAME}, ],
        }
        whatshap_map_fp = pysam.AlignmentFile(
            whatshap_map_fn, w_mode, header=header, reference_filename=ref_fn)

    do_ann_snps = whatshap_map_fn is not None or pr_refs_fn is not None

    while True:
        try:
            get_snp_call()
        except queue.Empty:
            if snps_conn.poll():
                break
            sleep(0.1)
            continue

    while not snps_q.empty():
        get_snp_call()

    if snps_txt_fp is not None:
        snps_txt_fp.close()
    if pr_refs_fn is not None:
        pr_refs_fp.close()
    if whatshap_map_fn is not None:
        whatshap_map_fp.close()

    snps_db.execute(CREATE_SNPS_IDX)
    snps_db.commit()
    snps_db.close()
    return
def Filter(inputBAM, outputBAM, log, bed, MQ=2, minIdentity=0.8, NM=-1,
           printOnly=False, verbose=True, force=False):
    if (printOnly or checkStep([inputBAM], [outputBAM], force)):
        mappedReads = 0
        unmappedReads = 0
        filteredReads = 0
        mqFiltered = 0
        idFiltered = 0
        nmFiltered = 0
        multimapper = 0

        infile = pysam.AlignmentFile(inputBAM, "rb")
        outfile = pysam.AlignmentFile(outputBAM, "wb", template=infile)

        # Default filtering without bed
        if (bed is None):
            print("#No bed-file supplied. Running default filtering on " +
                  inputBAM + ".", file=log)
            for read in infile:
                if (not read.is_secondary and not read.is_supplementary):
                    if (read.is_unmapped):
                        unmappedReads += 1
                    else:
                        mappedReads += 1

                if (read.is_unmapped):
                    continue
                if (read.mapping_quality < MQ):
                    mqFiltered += 1
                    continue
                if (float(read.get_tag("XI")) < minIdentity):
                    idFiltered += 1
                    continue
                if (NM > -1 and int(read.get_tag("NM")) > NM):
                    nmFiltered += 1
                    continue

                if (not read.is_secondary and not read.is_supplementary):
                    filteredReads += 1
                outfile.write(read)

            print("Criterion\tFiltered reads", file=log)
            print("MQ < " + str(MQ) + "\t" + str(mqFiltered), file=log)
            print("ID < " + str(minIdentity) + "\t" + str(idFiltered),
                  file=log)
            print("NM > " + str(NM) + "\t" + str(nmFiltered), file=log)
            print("MM\t0", file=log)
        else:
            # Multimap retention strategy filtering when bed is supplied
            random.seed(1)
            print("#Bed-file supplied. Running multimap retention filtering "
                  "strategy on " + inputBAM + ".", file=log)
            mappedReads, unmappedReads, filteredReads, mqFiltered, \
                idFiltered, nmFiltered, multimapper = multimapUTRRetainment(
                    infile, outfile, bed, minIdentity, NM, log)

        # Add the number of sequenced and mapped reads to the read group
        # description. Used for creating the summary file.
        inFileBamHeader = outfile.header
        if ('RG' in inFileBamHeader and len(inFileBamHeader['RG']) > 0):
            slamseqInfo = SlamSeqInfo()
            slamseqInfo.SequencedReads = mappedReads + unmappedReads
            slamseqInfo.MappedReads = mappedReads
            slamseqInfo.FilteredReads = filteredReads
            slamseqInfo.MQFilteredReads = mqFiltered
            slamseqInfo.IdFilteredReads = idFiltered
            slamseqInfo.NmFilteredReads = nmFiltered
            slamseqInfo.MultimapperReads = multimapper

            if (bed is not None):
                slamseqInfo.AnnotationName = os.path.basename(bed)
                slamseqInfo.AnnotationMD5 = md5(bed)
            else:
                slamseqInfo.AnnotationName = ""
                slamseqInfo.AnnotationMD5 = ""

            if not isinstance(inFileBamHeader, dict):
                inFileBamHeader = inFileBamHeader.to_dict()
            inFileBamHeader['RG'][0]['DS'] = str(slamseqInfo)

        slamDunkPG = {
            'ID': 'slamdunk',
            'PN': 'slamdunk filter v' + __version__,
            'VN': __bam_version__
        }
        if ('PG' in inFileBamHeader):
            inFileBamHeader['PG'].append(slamDunkPG)
        else:
            inFileBamHeader['PG'] = [slamDunkPG]

        infile.close()
        outfile.close()

        # Sort afterwards
        bamSort(outputBAM, log, inFileBamHeader, verbose)
        pysamIndex(outputBAM)
    else:
        print("Skipped filtering for " + inputBAM, file=log)
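A hedged usage sketch for Filter; the file names are placeholders, and checkStep, bamSort, pysamIndex and the XI identity tag come from the surrounding slamdunk codebase, which must be importable for this to run:

# run the default (no-bed) filtering branch on a hypothetical BAM
with open("filter.log", "w") as log:
    Filter("sample.mapped.bam", "sample.filtered.bam", log,
           bed=None,          # no bed -> default filtering, no multimap handling
           MQ=2,              # drop reads with mapping quality below 2
           minIdentity=0.95,  # drop reads whose XI identity tag is below 0.95
           NM=-1)             # -1 disables the edit-distance (NM) filter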
# all of the non-UK mutations
other_variants = all_mutations - uk_variant_mutations
# L5F is not included in either list
other_variants.remove('L5F')

all_tables = {}
files_list = glob.glob(bam_dir + '/*.mapped.sorted.bam')
# iterate over all bam files:
for file in files_list:
    # empty pileup table; 29903 is the length of the SARS-CoV-2 reference
    pileup_table = pd.DataFrame(np.empty(shape=(29903, 6)) * np.nan,
                                columns=['C', 'A', 'G', 'T', 'N', 'del'],
                                index=list(range(29903)))
    bam = pysam.AlignmentFile(file, 'rb')  # open bam file
    pileup_iter = bam.pileup(stepper='nofilter')  # samtools pileup
    # iterate over reads at each position and count nucleotides, Ns and
    # deletions.
    for position in pileup_iter:
        c = Counter({'C': 0, 'A': 0, 'G': 0, 'T': 0, 'N': 0, 'del': 0})
        for pileupread in position.pileups:
            if not pileupread.is_del and not pileupread.is_refskip:
                c[pileupread.alignment.query_sequence[
                    pileupread.query_position].upper()] += 1
            elif pileupread.is_del:
                c['del'] += 1
            elif pileupread.is_refskip:  # N?
                c['N'] += 1
        pileup_table.loc[position.reference_pos] = pd.Series(c)
    # produce pileup table (for each bam): pos,A,C,T,G,N,del,totaldepth
    pileup_table.index.name = 'pos'
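With the counts filled in, a per-position majority call falls out of the table directly. A minimal sketch, assuming the pileup_table built above; the depth column is an addition, not part of the original table:

# total observed depth per position, then the majority call among
# bases and deletions at covered positions
pileup_table['depth'] = pileup_table[['C', 'A', 'G', 'T', 'N', 'del']].sum(axis=1)
covered = pileup_table[pileup_table['depth'] > 0]
consensus = covered[['C', 'A', 'G', 'T', 'del']].idxmax(axis=1)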
#!/usr/bin/python3

import os
import sys

import numpy as np
import pysam

genome = sys.argv[1]
output_folder = sys.argv[2]

input_fasta = output_folder + "/results/" + genome + \
    "_reconstructed_genome.fna"
# "rb" would be more explicit for a BAM file, but pysam autodetects
samfile = pysam.AlignmentFile(
    output_folder + "/artifacts/sorted_contigs_alignment_on_rgenome.bam", "r")
predicted_file_name = output_folder + "/results/" + genome + "_predictedCDSs"
predicted_filtered_file_name = output_folder + "/results/" + genome + \
    "_predictedCDSs_filtered"
frags_with_no_genes = output_folder + "/results/" + genome + \
    "_frags_with_no_genes.txt"
predicted_filtered_only_genes_file_name = output_folder + "/artifacts/" + \
    genome + "_predictedCDSs_filtered_only_genes.bed"

# pull the CDS start positions and the fasta header out with shell one-liners
gene_positions = os.popen(
    "grep 'CDS' " + predicted_file_name + " | awk '{print $2}'").read()
sequence_identifier = os.popen("grep '>' " + input_fasta).read()
# strip the leading '>' and the trailing newline, keep the first token
sequence_identifier = sequence_identifier[1:-1]
sequence_identifier = sequence_identifier.split(" ")[0]
gene_positions = gene_positions.split("\n")
ref_name = samfile.references[0]
deduced_bases = []
for gene in gene_positions:
    in_reverse_strands = False
    if not gene:
        continue
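The loop body is cut off above; the script goes on to deduce bases at each CDS position from the contig alignments. One hedged way to collect the read bases over a single 0-based reference coordinate with pysam (the helper name and pos0 are illustrative, and fetch assumes the BAM has an index):

def bases_at(samfile, ref_name, pos0):
    """Collect read bases aligned over the 0-based reference position pos0."""
    bases = []
    for read in samfile.fetch(ref_name, pos0, pos0 + 1):
        # matches_only skips insertions/deletions so qpos and rpos are paired
        for qpos, rpos in read.get_aligned_pairs(matches_only=True):
            if rpos == pos0:
                bases.append(read.query_sequence[qpos])
    return bases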
def categorize_no_overlap_outcomes(self, max_reads=None):
    outcomes = defaultdict(list)

    with self.fns['no_overlap_outcome_list'].open('w') as fh:
        fh.write(f'## Generated at {utilities.current_time_string()}\n')

        alignment_groups = self.no_overlap_alignment_groups()
        if max_reads is not None:
            alignment_groups = islice(alignment_groups, max_reads)

        for name, als in self.progress(
                alignment_groups,
                desc='Categorizing non-overlapping read pairs'):
            try:
                pair_layout = layout_module.NonoverlappingPairLayout(
                    als['R1'], als['R2'], self.target_info)
                pair_layout.categorize()
            except:
                print(self.sample_name, name)
                raise

            outcomes[pair_layout.category, pair_layout.subcategory].append(name)

            outcome = self.final_Outcome.from_layout(pair_layout)
            fh.write(f'{outcome}\n')

    # To make plotting easier, for each outcome, make a file listing all of
    # the qnames for the outcome and a bam file (sorted by name) with all of
    # the alignments for these qnames.
    qname_to_outcome = {}
    bam_fhs = {}

    with ExitStack() as stack:
        full_bam_fns = {
            which: self.fns_by_read_type['bam_by_name'][f'{which}_no_overlap']
            for which in ['R1', 'R2']
        }
        full_bam_fhs = {
            which: stack.enter_context(
                pysam.AlignmentFile(full_bam_fns[which]))
            for which in ['R1', 'R2']
        }

        for outcome, qnames in outcomes.items():
            outcome_fns = self.outcome_fns(outcome)
            outcome_fns['dir'].mkdir(exist_ok=True)

            for which in ['R1', 'R2']:
                bam_fn = outcome_fns['bam_by_name'][f'{which}_no_overlap']
                bam_fhs[outcome, which] = stack.enter_context(
                    pysam.AlignmentFile(bam_fn, 'wb',
                                        template=full_bam_fhs[which]))

            fh = stack.enter_context(
                outcome_fns['no_overlap_query_names'].open('w'))
            for qname in qnames:
                qname_to_outcome[qname] = outcome
                fh.write(qname + '\n')

        for which in ['R1', 'R2']:
            for al in full_bam_fhs[which]:
                if al.query_name in qname_to_outcome:
                    outcome = qname_to_outcome[al.query_name]
                    bam_fhs[outcome, which].write(al)
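The per-outcome BAMs above are intended to be name-sorted so that downstream code can group alignments per query. If a re-sort is ever needed, pysam wraps samtools sort; a minimal sketch with illustrative paths:

import pysam

# -n sorts by query name rather than by coordinate
pysam.sort('-n', '-o', 'outcome.by_name.bam', 'outcome.bam')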