def refine(options):
    """
    Refine TE insertion and deletion calls within a group of related samples.
    Use indel calls from other samples in the group, inspect areas of the
    genome in samples where the indel was not called, and look for evidence
    of the same indel with a much lower read count threshold.
    """
    te = pybedtools.BedTool(options.te).sort()
    names = readNames(options.all_samples)
    if options.insertions is not False:
        insertions = getOtherLines(names, options.insertions)
    if options.deletions is not False:
        # format: ([data], [inverse_accessions])
        deletions = getOtherLines(names, options.deletions)
    print("Processing " + options.name)
    chrom_sizes = check_bam(options.conc, options.proc, options.prefix)
    check_bam(options.split, options.proc, options.prefix, make_new_index=True)
    cov = calc_cov(options.conc, 100000, 120000)
    concordant = pysam.AlignmentFile(options.conc, 'rb')
    split_alignments = pysam.AlignmentFile(options.split, 'rb')
    name_indexed = pysam.IndexedReads(split_alignments)
    name_indexed.build()
    if options.deletions is not False:
        print("  checking deletions")
        process_missed(deletions, "deletion", concordant, split_alignments,
                       name_indexed, options.name, te, cov / 5, chrom_sizes)
    if options.insertions is not False:
        print("  checking insertions")
        process_missed(insertions, "insertion", concordant, split_alignments,
                       name_indexed, options.name, te, cov / 10, chrom_sizes)
def get_st_alignments(contigs, st_bam):
    bam = pysam.AlignmentFile(st_bam, 'rc')
    index = pysam.IndexedReads(bam)
    index.build()
    st_alignment = []
    for contig in contigs.contig_id.values:
        aligned_conts = [read.reference_name for read in index.find(contig)
                         if read.reference_name]
        aligned_conts = ','.join(np.unique(aligned_conts)) if len(aligned_conts) > 0 else ''
        st_alignment.append(aligned_conts)
    # get short gene name (first gene of every overlapping set of genes, including fusion genes)
    short_gnames = contigs.overlapping_genes.apply(get_short_gene_name)
    contig_ids, samples = contigs.contig_id, contigs['sample']
    con_names = ['|'.join([s, cid, sg])
                 for cid, s, sg in zip(contig_ids, samples, short_gnames)]
    contigs['expected_ST_alignment'] = con_names
    contigs['real_ST_alignment'] = st_alignment
    return contigs
def extract_reads(options):
    n = get_names(options.names)
    bamfile = pysam.AlignmentFile(options.bam, 'rb')
    name_indexed = pysam.IndexedReads(bamfile)
    name_indexed.build()
    header = bamfile.header.copy()
    # out = pysam.Samfile(options.out, 'wb', header=header)
    for name in n:
        try:
            name_indexed.find(name)
        except KeyError:
            pass
        else:
            iterator = name_indexed.find(name)
            for read in iterator:
                # out.write(read)
                if read.is_reverse:
                    if read.qual:
                        sys.stdout.write("@{0}\n{1}\n+\n{2}\n".format(
                            read.qname, read.seq.translate(revComp)[::-1], read.qual[::-1]))
                    else:
                        sys.stdout.write(">{0}\n{1}\n".format(
                            read.qname, read.seq.translate(revComp)[::-1]))
                else:
                    if read.qual:
                        sys.stdout.write("@{0}\n{1}\n+\n{2}\n".format(
                            read.qname, read.seq, read.qual))
                    else:
                        sys.stdout.write(">{0}\n{1}\n".format(read.qname, read.seq))
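# Note on the snippet above: revComp is assumed to be a str.maketrans-style
# translation table (not shown in the excerpt). A sketch of what it likely
# looks like, so that read.seq.translate(revComp)[::-1] yields the reverse
# complement of the read sequence:
revComp = str.maketrans("ACGTNacgtn", "TGCANtgcan")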
def build(self, id_queries: List[Tuple[str, str, str]], refdict: ReferenceDict):
    log.info("Building BAM Index in memory for fetching VJ queries")
    bam_indexed = pysam.IndexedReads(self.bam)
    bam_indexed.build()
    log.info("Built index in memory for fast retrieval")
    count = 0
    report_interval = max(1, len(id_queries) // 25)
    for id_query in id_queries:
        barcode, umi, query_name = id_query
        read = Read(query_name)
        read.parse_alignments(bam_indexed, refdict)
        if read.top_V is None or read.top_J is None:
            log.error(f"Should have gotten a top V and J for {query_name} but did not!")
        if barcode not in self:
            self[barcode] = Barcode(barcode)
        if umi not in self[barcode]:
            self[barcode][umi] = UMI(umi)
        self[barcode][umi][query_name] = read
        count += 1
        if count % report_interval == 0:
            log.info(f"Stored top alignments for {count} reads "
                     f"({(count / len(id_queries)):.0%}).")
def bam2py(bamfile):
    # this is probably not very efficient as it uses up a lot of memory,
    # but it'll do the job for now
    bamfile = pysam.AlignmentFile(bamfile, 'rb')
    name_indexed = pysam.IndexedReads(bamfile)
    name_indexed.build()
    header = bamfile.header.copy()
    return header, name_indexed
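# A minimal usage sketch for bam2py above; the file paths and read names are
# hypothetical. The returned header can seed a subset BAM while the name index
# drives per-read lookups.
import pysam

header, name_indexed = bam2py("input.bam")
with pysam.AlignmentFile("subset.bam", "wb", header=header) as out:
    for name in ["read_1", "read_2"]:
        try:
            for aln in name_indexed.find(name):
                out.write(aln)
        except KeyError:
            pass  # IndexedReads.find() raises KeyError for absent names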
def rmb_dedup(basename):
    merge_bam_sort = "vector/" + basename + '_merge_vector.sort.bam'
    merge_bam_sort_dedup = "vector/" + basename + '_merge_vector_dedup.bam'
    merge_bam_sort_dedup_sort = "vector/" + basename + '_merge_vector_dedup.sort.bam'
    unique_barcode = "barcode/" + basename + "_barcode_uniq.txt"
    load(unique_barcode)
    # get unique qname list
    data = pd.read_csv(unique_barcode, sep='\t',
                       names=[u'Qname', u'Barcode', u'Freq', u'Length'],
                       low_memory=False)
    uniq_qname_list = data['Qname'].tolist()
    # generate dedup bam
    merge_bam = pysam.AlignmentFile(merge_bam_sort, 'rb')
    dedup_bam = pysam.AlignmentFile(merge_bam_sort_dedup, "wb", template=merge_bam)
    # index bam by name with pysam to generate the dedup bam
    name_indexed = pysam.IndexedReads(merge_bam)
    name_indexed.build()
    for name in uniq_qname_list:
        try:
            name_indexed.find(name)
        except KeyError:
            pass
        else:
            iterator = name_indexed.find(name)
            for x in iterator:
                dedup_bam.write(x)
    merge_bam.close()
    dedup_bam.close()
    pysam.sort("-o", merge_bam_sort_dedup_sort, merge_bam_sort_dedup)
def annotate_contigs(args):
    '''
    Extract aligned contigs from the supplied BAM file and output an
    annotated contig if it contains any novel bits
    '''
    ref_trees, ex_tree, ex_ref = get_gene_lookup(args.tx_ref_file)
    juncs = get_junc_lookup(args.junc_file)
    bam = pysam.AlignmentFile(args.bam_file, 'rc')
    bam_idx = pysam.IndexedReads(bam, multiple_iterators=True)
    bam_idx.build()
    outbam_file_unsort = '%s_unsorted.bam' % os.path.splitext(args.output_bam)[0]
    outbam = pysam.AlignmentFile(outbam_file_unsort, 'wb', template=bam)
    ci_file = args.contig_info_output
    logging.info('Checking contigs for non-reference content...')
    for read in bam.fetch(multiple_iterators=True):
        if read.reference_id < 0:
            logging.info('Skipping unmapped contig %s.' % read.query_name)
            continue
        if not do_any_read_blocks_overlap_exons(read, ex_tree, bam_idx):
            logging.info('Skipping contig %s as it doesn\'t overlap any reference exons.' % read.query_name)
            continue
        if read.query_name in record:
            # we have processed this read already (likely a read's partner)
            continue
        # only consider the contig if at least match_min bases align
        # to the reference and at least match_perc_min of the read aligns
        rlen = read.reference_length
        qlen = float(read.query_length)
        if (rlen < MIN_MATCH_BP) or (rlen / qlen) < MIN_MATCH_PERC:
            logging.info('Skipping contig %s: not enough bases match reference' % read.query_name)
            continue
        allmatch = all([op == constants.CIGAR['match'] for op, val in read.cigar])
        if len(read.get_blocks()) == 1 and allmatch:
            chr_ex = get_chrom_ref_tree(read.reference_name, ex_tree)
            s, e = read.get_blocks()[0]
            if not (chr_ex.overlaps(s, s + 1) and chr_ex.overlaps(e - 1, e)):
                # skip the contig if the contig start or end is outside exons
                logging.info('Skipping contig %s: unspliced contiguous alignment' % read.query_name)
                continue
        is_hardclipped = any([op == constants.CIGAR['hard-clip'] and val >= MIN_CLIP
                              for op, val in read.cigar])
        if is_hardclipped:
            annotate_fusion(args, read, juncs, bam_idx, ex_ref, ref_trees, outbam)
        else:
            annotate_single_read(args, read, juncs, ex_ref, ref_trees, outbam)
    bam.close()
    outbam.close()
    # convert output sam file to bam, sort and index
    pysam.sort('-o', args.output_bam, outbam_file_unsort)
    pysam.index(args.output_bam)
    os.remove(outbam_file_unsort)
def index_bam(filepath):
    """Creates an in-memory index for a BAM file.

    :param filepath: Path to the BAM file.
    """
    index = pysam.IndexedReads(pysam.AlignmentFile(filepath))
    index.build()
    return index
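# Hypothetical usage of index_bam above: fetch every alignment for one read
# name. The path and read name are assumptions for illustration; find() raises
# KeyError when the name is not in the index.
index = index_bam("aligned.bam")
try:
    for aln in index.find("read_42"):
        print(aln.query_name, aln.flag, aln.reference_start)
except KeyError:
    pass  # read name not present in the BAM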
def __init__(self, bam_file: Union[str, Path]):
    """
    Load alignment file and build index.
    """
    logging.debug(f"Loading {bam_file}.")
    self.alignments = pysam.AlignmentFile(bam_file)
    logging.debug(f"Building index for {bam_file}.")
    self.read_index = pysam.IndexedReads(self.alignments, multiple_iterators=True)
    self.read_index.build()
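# Why multiple_iterators=True matters in the constructor above: IndexedReads
# re-opens the file for its lookups, so find() does not disturb a concurrent
# fetch() on the same AlignmentFile. A sketch assuming a hypothetical wrapper
# class named ReadIndexedBam built around the __init__ above:
idx = ReadIndexedBam("aligned.bam")
for read in idx.alignments.fetch(until_eof=True):
    # safe: the lookup uses its own file handle, so the outer fetch()
    # iterator is not reset by this call
    mates = list(idx.read_index.find(read.query_name))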
def parse_cluster(path, bam_path, out_path, opt):
    # load the bam file
    logging.info("Loading the Bam file.")
    bamfile = pysam.AlignmentFile(bam_path, 'rb')
    name_indexed = pysam.IndexedReads(bamfile)
    name_indexed.build()
    # load cluster info
    logging.info("Loading the cluster file.")
    num = 0
    file = open(path, 'r')
    for line in file:
        num += 1
        if num % 100 == 0:
            logging.info("Finished %d clusters." % num)
        seq = line.strip('\n').split('\t')
        chr = seq[0]
        breakpoint = seq[1] + '_' + seq[2] + '_' + seq[3] + '_' + str(len(seq[4:]))
        id_list = seq[4:]
        if len(id_list) < 5:
            continue
        if opt == "fq":
            file_path = "%s%s_%s.fq" % (out_path, chr, breakpoint)
        else:
            file_path = "%s%s_%s.fa" % (out_path, chr, breakpoint)
        out_file = open(file_path, 'w')
        for name in id_list:
            try:
                name_indexed.find(name)
            except KeyError:
                pass
            else:
                iterator = name_indexed.find(name)
                for read in iterator:
                    if read.is_reverse:
                        if opt == 'fq':
                            out_file.write("@{0}\n{1}\n+\n{2}\n".format(
                                read.qname, read.seq.translate(revComp)[::-1], read.qual[::-1]))
                        else:
                            out_file.write(">{0}\n{1}\n".format(
                                read.qname, read.seq.translate(revComp)[::-1]))
                    else:
                        if opt == 'fq':
                            out_file.write("@{0}\n{1}\n+\n{2}\n".format(
                                read.qname, read.seq, read.qual))
                        else:
                            out_file.write(">{0}\n{1}\n".format(read.qname, read.seq))
        out_file.close()
    file.close()
def extract_raw_reads(self, read_names, inbamfile):
    raw_reads = []
    read_names_indexed = pysam.IndexedReads(inbamfile)
    read_names_indexed.build()
    for name in read_names:
        try:
            read_names_indexed.find(name)
        except KeyError:
            pass
        else:
            iterator = read_names_indexed.find(name)
            for x in iterator:
                raw_reads.append(x)
    return raw_reads
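# The probe-then-iterate pattern above calls find() twice per name. Because
# IndexedReads.find() raises KeyError immediately for absent names, a
# behavior-equivalent sketch (hypothetical standalone variant) needs only one
# lookup per name:
def extract_raw_reads_single_lookup(read_names, inbamfile):
    raw_reads = []
    index = pysam.IndexedReads(inbamfile)
    index.build()
    for name in read_names:
        try:
            raw_reads.extend(index.find(name))
        except KeyError:
            continue  # read name not in the BAM
    return raw_reads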
def get_pos(file_path, name):
    bamfile = sam.AlignmentFile(file_path, "rb")
    idx = sam.IndexedReads(bamfile)
    idx.build()
    alignments = idx.find(name)  # iterator over all alignments with this name
    for read in alignments:
        pos = read.reference_start  # take the first alignment only
        break
    bamfile.close()
    return pos
def main(inbam1: str = typer.Option(..., help="BAM from ref1"),
         inbam2: str = typer.Option(..., help="BAM from ref2"),
         ctg_hap1: str = typer.Option(None, help="link this contig in ref1"),
         ctgs_hap1: str = typer.Option(None, help="link these contigs in ref1 (list file)"),
         outdir: str = typer.Option(..., help="output link file from ref1 to ref2"),
         savematrix: str = typer.Option(None, help="save link matrix to this file")):
    """Output the ref1-to-ref2 correspondence based on read alignments.

    Only BAM files are supported.
    """
    t1 = time.time()
    print('load reference info')
    chrom2offset_hap1, chrom2length_hap1, rnames_hap1, offsets_hap1 = load_refinfo(inbam1)
    chrom2offset_hap2, chrom2length_hap2, rnames_hap2, offsets_hap2 = load_refinfo(inbam2)
    print('build reads index')
    alnfile_hap2 = pysam.AlignmentFile(inbam2, 'rb')
    name_indexed_hap2 = pysam.IndexedReads(alnfile_hap2)
    name_indexed_hap2.build()
    print('start link:')
    if ctgs_hap1:
        ctgs_hap1 = [x.strip() for x in open(ctgs_hap1)]
    else:
        ctgs_hap1 = [ctg_hap1]
    for ctg_hap1 in ctgs_hap1:
        print(f'ctg: {ctg_hap1}')
        matrix_R2T = load_pos_to_matrix(inbam1, name_indexed_hap2, ctg_hap1,
                                        chrom2length_hap1, chrom2length_hap2,
                                        chrom2offset_hap2)
        if savematrix:
            save_npz(os.path.join(savematrix, f'{ctg_hap1}.npz'), matrix_R2T)
        # matrix_to_pos(ctg_hap1, matrix_R2T,
        #               os.path.join(outdir, f'{ctg_hap1}.linkpos.tsv.gz'),
        #               rnames_hap2, offsets_hap2, chrom2offset_hap2)
    t2 = time.time()
    runtime = t2 - t1
    h = runtime // 3600
    m = (runtime - h * 3600) // 60
    s = runtime - h * 3600 - m * 60
    print(f'Finished! Executed in {h:.0f}h {m:.0f}m {s:.0f}s ({runtime}s)')
def extract_reads(options):
    n = get_names(options.names)
    bamfile = pysam.AlignmentFile(options.bam, 'rb', check_sq=False)
    name_indexed = pysam.IndexedReads(bamfile)
    name_indexed.build()
    header = bamfile.header.copy()
    out = pysam.Samfile(options.out, 'wb', header=header)
    for name in n:
        try:
            name_indexed.find(name)
        except KeyError:
            pass
        else:
            iterator = name_indexed.find(name)
            for x in iterator:
                out.write(x)
    out.close()  # flush and finalize the output BAM
def unmerge_bams(out_file_prefix, valid_cells, n_cores):
    input_file = out_file_prefix + ".merged.aligned.sorted.bam"
    if not os.path.isdir(out_file_prefix + "_deindexed_bam"):
        os.mkdir(out_file_prefix + "_deindexed_bam")
    with ps.AlignmentFile(input_file, "rb") as f_in:
        indexed = ps.IndexedReads(f_in)
        indexed.build()
        header = f_in.header.copy()
        for cell_id in valid_cells:
            out_file_name = out_file_prefix + "_deindexed_bam/" + cell_id + ".bam"
            with ps.AlignmentFile(out_file_name, "wb", header=header) as f_out:
                iterator = indexed.find(cell_id)
                for i in iterator:
                    f_out.write(i)
def write_anomalous_read_to_bam(bam, split_reads, span_reads, anom_reads, out):
    print('Writing anom reads to file')
    split_reads = np.unique(split_reads['query_name'])
    span_reads = np.unique(span_reads['query_name'])
    anom_reads = np.unique(anom_reads['query_name'])
    # need to filter out any reads that were at any point marked as valid supporting reads
    anom_reads = np.array([x for x in anom_reads if x not in split_reads])
    anom_reads = np.array([x for x in anom_reads if x not in span_reads])
    bamf = pysam.AlignmentFile(bam, "rb")
    index = pysam.IndexedReads(bamf)
    index.build()
    anom_bam = pysam.AlignmentFile("%s_anom_reads.bam" % out, "wb", template=bamf)
    for read_name in anom_reads:
        for read in index.find(read_name):
            anom_bam.write(read)
    anom_bam.close()
def primer_filter(basename, vector_fa, genome, bait_chr, bait_strand, sgRNA_start, sgRNA_end):
    directory_store = vector_fa.rpartition('.')[0]
    print("[PEM-Q Vector Analysis] processing primer filter...")
    pe_bam_sort = basename + '_pe_vector.sort.bam'
    pe_primer_bam = basename + '_primer_vector.bam'
    pe_primer_bam_sort = basename + '_primer_vector.sort.bam'
    primer_list_file = pd.read_csv("primer/bamlist_stitch.txt", sep=' ',
                                   names=["Qname", "Bait_start", "Bait_end"])
    primer_list = primer_list_file["Qname"]
    vector_pe_bam = pysam.AlignmentFile(directory_store + "/" + pe_bam_sort, 'rb')
    vector_primer_bam = pysam.AlignmentFile(directory_store + "/" + pe_primer_bam,
                                            "wb", template=vector_pe_bam)
    vector_pe_bam_indexed = pysam.IndexedReads(vector_pe_bam)
    vector_pe_bam_indexed.build()
    n = 0
    for name in primer_list:
        try:
            vector_pe_bam_indexed.find(name)
        except KeyError:
            pass
        else:
            iterator = vector_pe_bam_indexed.find(name)
            for x in iterator:
                n = n + 1
                vector_primer_bam.write(x)
    print("primer filter left:", n)
    vector_pe_bam.close()
    vector_primer_bam.close()
    pysam.sort("-o", directory_store + "/" + pe_primer_bam_sort,
               directory_store + "/" + pe_primer_bam)
    cmd = "samtools index {}/{}".format(directory_store, pe_primer_bam_sort)
    os.system(cmd)
def filter_reads(alignment_file, readdb, read_dirs, quality_threshold=7, recursive=False, trim=False):
    """Filter fast5 files based on a quality threshold and if there is an alignment

    :param alignment_file: bam alignment file
    :param readdb: readdb or sequence summary file
    :param read_dirs: list of directories
    :param quality_threshold: phred quality score min threshold for passing
    :param recursive: search directories recursively for more fast5 dirs
    :param trim: number of bases to analyze
    """
    assert alignment_file.endswith("bam"), "Alignment file must be in BAM format: {}".format(alignment_file)
    # grab aligned segment
    if trim:
        assert isinstance(trim, int), "Trim needs to be an integer: {}".format(trim)
    else:
        trim = np.inf
    n_bases = 0
    n_files = 0
    with closing(pysam.AlignmentFile(alignment_file, 'rb')) as bamfile:
        name_indexed = pysam.IndexedReads(bamfile)
        name_indexed.build()
        for name, fast5 in parse_read_name_map_file(readdb, read_dirs, recursive=recursive):
            try:
                if trim < n_bases:
                    print("Filtered {} files for {} bases".format(n_files, n_bases))
                    break
                iterator = name_indexed.find(name)
                for aligned_segment in iterator:
                    if aligned_segment.is_secondary or aligned_segment.is_unmapped \
                            or aligned_segment.is_supplementary or aligned_segment.has_tag("SA"):
                        continue
                    # get data and sanity check
                    if aligned_segment.query_qualities is not None:
                        if np.mean(aligned_segment.query_qualities) < quality_threshold:
                            continue
                    n_files += 1
                    n_bases += aligned_segment.query_length
                    yield fast5, aligned_segment
            except KeyError:
                print("Found no alignments for {}".format(fast5))
def extract_reads_by_name_list(self, sf_names, sf_bam, sf_out_bam):
    l_names = self.load_read_names(sf_names)
    bamfile = pysam.AlignmentFile(sf_bam, 'rb', reference_filename=self.sf_reference)
    # hashing keeps the read names in memory
    name_indexed = pysam.IndexedReads(bamfile)
    name_indexed.build()
    header = bamfile.header.copy()
    out = pysam.Samfile(sf_out_bam, 'wb', header=header)
    for name in l_names:
        try:
            name_indexed.find(name)
        except KeyError:
            pass
        else:
            iterator = name_indexed.find(name)
            for x in iterator:  # x is an alignment
                # TODO: check whether this is the first or second read we wanted
                out.write(x)
    out.close()
def get_cigar(file_path, name):
    bamfile = sam.AlignmentFile(file_path, "rb")
    idx = sam.IndexedReads(bamfile)
    idx.build()
    alignments = idx.find(name)
    cigar_align = []
    for read in alignments:
        # tmp = read.get_blocks()
        if not read.is_unmapped:  # if it's mapped
            cigarLine = read.cigar
            for (cigarType, cigarLength) in cigarLine:
                try:
                    if cigarType == 0:  # match
                        for i in range(cigarLength):
                            cigar_align.append('.')
                    elif cigarType == 1:  # insertion
                        for i in range(cigarLength):
                            cigar_align.append('i')
                    elif cigarType == 2:  # deletion
                        for i in range(cigarLength):
                            cigar_align.append('d')
                    elif cigarType == 3:  # skip
                        for i in range(cigarLength):
                            cigar_align.append('s')
                    elif cigarType == 4:  # soft clipping
                        continue
                    elif cigarType == 5:  # hard clipping
                        continue
                    elif cigarType == 6:  # padding
                        for i in range(cigarLength):
                            cigar_align.append('p')
                    else:
                        print("Wrong CIGAR number")
                        sys.exit(1)
                except Exception:
                    print("Problem parsing CIGAR operation")
    return cigar_align
def extract_reads(options):
    n = get_names(options.names)
    bamfile = pysam.AlignmentFile(options.bam, 'rb')
    name_indexed = pysam.IndexedReads(bamfile)
    name_indexed.build()
    out = open(options.out, 'w')
    for name in n:
        try:
            name_indexed.find(name)
        except KeyError:
            pass
        else:
            iterator = name_indexed.find(name)
            for x in iterator:
                start = x.reference_start
                end = x.reference_end
                mq = x.mapping_quality
                chrom = x.reference_name
                rl = x.query_length
                out.write("%s %s %s %s %s %s\n" % (name, chrom, start, end, mq, rl))
    out.close()
def extract_reads_cigar(options):
    reads_file = '%s/read_names.txt' % (options.outdir)
    n = get_names(reads_file)
    bamfile = pysam.AlignmentFile(options.bam, 'rb')
    name_indexed = pysam.IndexedReads(bamfile)
    name_indexed.build()
    header = bamfile.header.copy()
    # out = pysam.Samfile(options.out, 'wb', header=header)
    for name in n:
        try:
            name_indexed.find(name)
        except KeyError:
            pass
        else:
            # first pass: count the alignments for this read name
            iterator = name_indexed.find(name)
            num_alignments = 0
            for x in iterator:
                num_alignments += 1
            multi_align_list = []
            # second pass: inspect each mapped alignment base by base
            iterator = name_indexed.find(name)
            for x in iterator:
                if x.is_unmapped:
                    continue
                cigararray = cigar2array(x.cigar)
                t_name = x.reference_name
                for i in range(len(cigararray)):
                    base_quality = x.query_qualities[i]
                    p = 10 ** (-base_quality / 10)
                    cigar = cigararray[i]
                    print(p, cigar)
def multiprocess_get_summary_info(alignment_file, readdb, read_dirs, get_summary_args,
                                  worker_count=1, debug=False):
    """Multiprocess get summary info"""
    assert alignment_file.endswith("bam"), "Alignment file must be in BAM format: {}".format(alignment_file)
    # grab aligned segments
    data = pd.DataFrame([])
    with closing(pysam.AlignmentFile(alignment_file, 'rb')) as bamfile:
        name_indexed = pysam.IndexedReads(bamfile)
        print("Indexing bam file by read name.")
        name_indexed.build()
        print("Finished.")
        print("Looping through readdb file.")
        if debug:
            for name, fast5, iterator in parse_readdb_wrapper(
                    parse_readdb(readdb, read_dirs), name_indexed):
                pd_line = create_summary_pd(iterator, fast5, name, **get_summary_args)
                data = data.append(pd_line)
        else:
            total, failure, messages, output = multithread.run_service2(
                get_summary_info_service,
                parse_readdb_wrapper(parse_readdb(readdb, read_dirs), name_indexed),
                get_summary_args,
                ['name', "fast5", "sam_lines"],
                worker_count)
            for pd_line in output:
                if isinstance(pd_line, pd.DataFrame):
                    data = data.append(pd_line)
    return data
def readAndFilterBam(config):
    print("Read bam", config.bam_file)
    bam_file_all = pysam.AlignmentFile(filepath_or_object=config.bam_file, mode='rb')
    index = pysam.IndexedReads(bam_file_all)
    index.build()
    reads_names = set()
    for ch in config.contigs:
        for read in bam_file_all.fetch(ch, 0):
            reads_names.add(read.query_name)
    result = []
    for read_name in reads_names:
        reads = [r for r in index.find(read_name)
                 if validRead(r, config) and not r.is_secondary]
        readsInChromosoms = [r for r in reads if r.reference_name in config.contigs]
        if len(reads) < 2 or not readsInChromosoms:
            continue
        segments = [Segment(read) for read in reads]
        segments.sort(key=lambda x: x.query_alignment_start)
        read_aligment = calculateAlignment(segments)
        # print("alignment", read_aligment * 100, "%")
        if read_aligment < config.min_read_aligment:
            continue
        result.append(segments)
    return result, bam_file_all
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="""Split a BAM file containing alignments from a
        SLAM/TUC/TL-seq experiment into labelled/unlabelled BAM files.
        Requires the MD tag. All reads are kept, incl. unmapped,
        secondary/supplementary, etc.; these can be filtered out when
        counting (featureCounts). Optionally, SNPs can be subtracted.""")

    parser.add_argument('bam', help="""The input BAM file (full path).""")

    parser.add_argument('outdir_bam', help="""The output directory (BAM files).""")

    parser.add_argument('outdir_mm', help="""The output directory (mismatch information).""")

    parser.add_argument('name', help="""The output base name without extension.""")

    parser.add_argument('-s', '--subtract', help="""SNPs to be subtracted
                        (GS default format)""", type=str)

    parser.add_argument('--vcf', help="""Use this flag if SNPs are in VCF format""",
                        action='store_true')

    parser.add_argument('-ref', '--ref-base', help="""Conversion reference base.""",
                        choices=['A', 'C', 'G', 'T'], default='T')

    parser.add_argument('-bc', '--base-change', help="""Conversion base (substitution/mismatch).""",
                        choices=['A', 'C', 'G', 'T'], default='C')

    parser.add_argument('-q', '--base-qual',
                        help="The minimum base quality for any given mismatch (default: 20).",
                        type=int, default=20)

    parser.add_argument('--trim5p',
                        help="The number of bases to trim at the 5' ends of reads (default: 0).",
                        type=int, default=0)

    parser.add_argument('--trim3p',
                        help="The number of bases to trim at the 3' ends of reads (default: 0).",
                        type=int, default=0)

    parser.add_argument('--overwrite', help="""If this flag is present, then
                        existing files will be overwritten.""", action='store_true')

    parser.add_argument('-t', '--tmp',
                        help="""Optional argument: where to write temporary files.
                        If not specified, program-specific tmp will be used.""",
                        default=None)

    utils.add_sbatch_options(parser, num_cpus=default_num_cpus, mem=default_mem)
    utils.add_logging_options(parser)
    args = parser.parse_args()
    utils.update_logging(args)

    msg = "[splbam]: {}".format(' '.join(sys.argv))
    logger.info(msg)

    # if using slurm, submit the script
    if args.use_slurm:
        cmd = "{}".format(' '.join(shlex.quote(s) for s in sys.argv))
        utils.check_sbatch(cmd, args=args)
        return

    # check that the output paths exist
    exist = utils.check_files_exist([args.outdir_bam, args.outdir_mm],
                                    raise_on_error=True, logger=logger)

    # check that all input files exist
    input_files = [args.bam]
    if args.subtract:
        input_files.append(args.subtract)
    exist = utils.check_files_exist(input_files, raise_on_error=True, logger=logger)

    # create the output files - BAM
    labelled_filename = '{}.labelled.unsrt.bam'.format(args.name)
    labelled_filename = os.path.join(args.outdir_bam, labelled_filename)

    sorted_labelled_filename = '{}.labelled.bam'.format(args.name)
    sorted_labelled_filename = os.path.join(args.outdir_bam, sorted_labelled_filename)

    unlabelled_filename = '{}.unlabelled.unsrt.bam'.format(args.name)
    unlabelled_filename = os.path.join(args.outdir_bam, unlabelled_filename)

    sorted_unlabelled_filename = '{}.unlabelled.bam'.format(args.name)
    sorted_unlabelled_filename = os.path.join(args.outdir_bam, sorted_unlabelled_filename)

    # create the output files - mismatches
    mismatch_details_filename = '{}.mismatchDetails.tab.gz'.format(args.name)
    mismatch_details_filename = os.path.join(args.outdir_mm, mismatch_details_filename)

    mismatch_filename = '{}.mismatches.tab.gz'.format(args.name)
    mismatch_filename = os.path.join(args.outdir_mm, mismatch_filename)

    mismatch_filename_final = '{}.mismatches-used.tab.gz'.format(args.name)
    mismatch_filename_final = os.path.join(args.outdir_mm, mismatch_filename_final)

    # and check whether they already exist
    out_files = [labelled_filename, sorted_labelled_filename,
                 unlabelled_filename, sorted_unlabelled_filename,
                 mismatch_details_filename, mismatch_filename,
                 mismatch_filename_final]
    all_out_exists = all([os.path.exists(of) for of in out_files])
    if not args.overwrite and all_out_exists:
        msg = "All output files {} already exist. Skipping call.".format(out_files)
        logger.warning(msg)
        return

    msg = "Getting all mismatches"
    logger.info(msg)

    # first get all mismatches
    bam = ps.AlignmentFile(args.bam, "rb")
    SN = (SQ['SN'] for SQ in bam.header['SQ'])
    bam.close()

    mismatch_and_details = utils.apply_parallel_iter(SN,
                                                     args.num_cpus,
                                                     get_mismatches,
                                                     args.bam,
                                                     progress_bar=False,
                                                     backend='multiprocessing')

    all_details = [a for a, b in mismatch_and_details]
    mismatch_count = get_mismatch_details(all_details,
                                          mismatch_details_filename,
                                          mismatch_filename)

    all_mismatches = [b for a, b in mismatch_and_details]
    all_mismatches = pd.concat(all_mismatches)

    # get the conversion of interest
    bc = get_base_vec(args.base_change, '+')
    m_bc = all_mismatches['bases11'] == bc
    m_ref = all_mismatches['ref'] == args.ref_base
    all_mismatches = all_mismatches[m_ref & m_bc]

    # filter on base quality - no offset of 33 needs to be subtracted
    m_qual = all_mismatches['base_qual'] >= args.base_qual

    # below we keep track of discarded mismatches to adjust the rates
    discarded = all_mismatches[~m_qual].copy()
    m = (discarded['read1'] == True) & (discarded['score'] == True)
    discarded_first = discarded[m].shape[0]
    m = (discarded['read1'] == False) & (discarded['score'] == True)
    discarded_second = discarded[m].shape[0]
    all_mismatches = all_mismatches[m_qual]

    # discard mismatches found at read ends
    m_trim5p = all_mismatches['m_pos'] < (args.trim5p - all_mismatches['qstart'])
    m_trim3p = all_mismatches['m_pos'] >= (all_mismatches['rlen'] - args.trim3p)
    discarded = all_mismatches[m_trim5p | m_trim3p].copy()
    m = (discarded['read1'] == True) & (discarded['score'] == True)
    discarded_first += discarded[m].shape[0]
    m = (discarded['read1'] == False) & (discarded['score'] == True)
    discarded_second += discarded[m].shape[0]
    all_mismatches = all_mismatches[~m_trim5p & ~m_trim3p]

    # remove all SNPs from what remains
    if args.subtract:
        # currently GRAND-SLAM snpdata default format
        if args.vcf:
            snps = utils.fmt_convert(args.subtract)
        else:
            snps = pd.read_csv(args.subtract, sep='\t')
        snps = snps.Location.unique()
        # add field
        all_mismatches['Location'] = all_mismatches[['contig', 'start']].apply(
            lambda x: ':'.join([str(s) for s in x]), axis=1)
        discarded = all_mismatches[all_mismatches.Location.isin(snps)].copy()
        m = (discarded['read1'] == True) & (discarded['score'] == True)
        discarded_first += discarded[m].shape[0]
        m = (discarded['read1'] == False) & (discarded['score'] == True)
        discarded_second += discarded[m].shape[0]
        all_mismatches = all_mismatches[~all_mismatches.Location.isin(snps)]

    # adjust final mismatch counts
    m = (mismatch_count.Orientation == 'First') & \
        (mismatch_count.Genomic == 'A') & (mismatch_count.Read == 'G')
    mismatch_count.loc[m, 'Mismatches'] = mismatch_count.loc[m, 'Mismatches'] - discarded_first
    n = (mismatch_count.Orientation == 'Second') & \
        (mismatch_count.Genomic == 'T') & (mismatch_count.Read == 'C')
    mismatch_count.loc[n, 'Mismatches'] = mismatch_count.loc[n, 'Mismatches'] - discarded_second
    mismatch_count = mismatch_count[m | n]
    mismatch_count.to_csv(mismatch_filename_final, sep='\t',
                          index=False, compression='gzip')

    # what remains are true conversions; other reads are classified as unlabelled
    # NOTE: we keep every query_name for which at least one read has a mismatch;
    # this includes read pairs, but also multi-mapping reads
    true_conversions = all_mismatches.name.unique()

    # now split the BAM file
    msg = "Reading the alignments and splitting the input BAM file"
    logger.info(msg)

    # this requires a lot of memory, however...
    bam = ps.AlignmentFile(args.bam, "rb")
    qname_index = ps.IndexedReads(bam)
    qname_index.build()

    # we first "split" the query names, sort by query name, and write each file in turn
    # we don't sort the lists, this would not be faster, the index is just fine
    true_conversions = set(true_conversions)
    all_qnames = set([a.query_name for a in bam.fetch(until_eof=True)])
    all_qnames = all_qnames - true_conversions

    labelled = ps.AlignmentFile(labelled_filename, "wb", template=bam)
    unlabelled = ps.AlignmentFile(unlabelled_filename, "wb", template=bam)

    # labelled/new
    for qname in true_conversions:
        alignments = qname_index.find(qname)
        for a in alignments:
            labelled.write(a)
    labelled.close()

    # unlabelled/old
    for qname in all_qnames:
        alignments = qname_index.find(qname)
        for a in alignments:
            unlabelled.write(a)
    unlabelled.close()
    bam.close()

    # create the bamtools index if it does not already exist
    args.num_cpus = 6  # limit... otherwise this is problematic?!
    args.keep_intermediate_files = False  # delete the unsorted bam files

    utils.sort_bam_file(labelled_filename, sorted_labelled_filename, args)
    utils.index_bam_file(sorted_labelled_filename, args)

    utils.sort_bam_file(unlabelled_filename, sorted_unlabelled_filename, args)
    utils.index_bam_file(sorted_unlabelled_filename, args)
def get_alignment_summary_info_withdb(alignment_file, readdb, read_dirs, pass_threshold=7,
                                      gap_size=10, verbose=False, max_reads=100, number=0):
    """Filter fast5 files based on a quality threshold and if there is an alignment"""
    assert alignment_file.endswith("bam"), "Alignment file must be in BAM format: {}".format(alignment_file)
    # grab aligned segments
    seen_counter = 0
    with closing(pysam.AlignmentFile(alignment_file, 'rb')) as bamfile:
        name_indexed = pysam.IndexedReads(bamfile)
        print("Indexing bam file by read name.")
        name_indexed.build()
        print("Finished.")
        print("Looping through readdb file.")
        for name, fast5 in parse_readdb(readdb, read_dirs):
            try:
                iterator = name_indexed.find(name)
                # only grab up to max_reads reads
                if seen_counter >= max_reads:
                    break
                # need to start the data table with the first row
                if seen_counter == 0:
                    pd_data = get_summary_info_row(name)
                    big_table = pd_data
                else:
                    big_table = big_table.append(pd_data)
                    pd_data = get_summary_info_row(name)
                # start tracking data
                pd_data["seen"] = 1
                seen_counter += 1
                cl_handle = CreateLabels(fast5, kmer_index=2)
                seq_start_time = cl_handle.raw_attributes['start_time']
                pd_data["seq_start_time"] = seq_start_time
                for aligned_segment in iterator:
                    if aligned_segment.is_secondary or aligned_segment.is_unmapped \
                            or aligned_segment.is_supplementary or aligned_segment.has_tag("SA"):
                        if aligned_segment.is_secondary:
                            pd_data["num_secondary_mappings"] += 1
                        if aligned_segment.is_unmapped:
                            pd_data["no_mapping"] = 1
                        if aligned_segment.is_supplementary or aligned_segment.has_tag("SA"):
                            pd_data["chimera_mapping"] += 1
                    else:
                        pd_data["map_q"] = aligned_segment.mapq
                        soft_clipped_percentage = \
                            1 - float(len(aligned_segment.query_alignment_sequence)) / len(aligned_segment.query_sequence)
                        pd_data["soft_clipped_percentage"] = soft_clipped_percentage
                        handle = AlignmentSegmentWrapper(aligned_segment)
                        handle.initialize()
                        accuracy = handle.alignment_accuracy()
                        pd_data["basecalled_accuracy"] = accuracy
                        try:
                            mea = cl_handle.add_mea_labels(number=int(number))
                            sa_full = cl_handle.add_signal_align_predictions(number=int(number),
                                                                             add_basecall=True)
                            all_basecall_data = []
                            for name, basecall_data in cl_handle.aligned_signal.prediction.items():
                                if "guide" in name:
                                    all_basecall_data.extend(basecall_data)
                            alignment_summary = analyze_event_skips(mea, sa_full, all_basecall_data,
                                                                    generate_plot=False)
                            flagged_gaps_summary = flag_large_gaps(alignment_summary, gap_size,
                                                                   verbose=verbose)
                            counter = 0
                            total_distance = 0
                            for gap in flagged_gaps_summary:
                                if gap["mea_peak_distance"] > 10:
                                    counter += 1
                                    total_distance += gap["mea_peak_distance"]
                            if counter > 0:
                                pd_data["num_flagged_gaps"] = counter
                                pd_data["avg_flagged_gap_size"] = float(total_distance) / counter
                            q_score_average = 0
                            if aligned_segment.query_qualities is None:
                                print("Alignment done with fasta instead of fastq, so read qualities will not be reported")
                            else:
                                q_score_average = np.mean(aligned_segment.query_qualities)
                            pd_data["q_score_average"] = q_score_average
                            print("pd_data['q_score_average']", pd_data["q_score_average"][0])
                            if pd_data["q_score_average"][0] > pass_threshold:
                                pd_data["pass"] = 1
                        except Exception as e:
                            pd_data["other_errors"] = 1
                            print("ERROR {}: {}".format(fast5, e), file=sys.stderr)
            except KeyError:
                pd_data["other_errors"] = 1
                print("Found no alignments for {}".format(fast5))
    return big_table
import pysam
from collections import defaultdict

# load and index the CellRanger BAM file
cr_bam = pysam.AlignmentFile(snakemake.input[0], mode="rb")
cr_idx = pysam.IndexedReads(cr_bam)
cr_idx.build()

# load and iterate through the PathSeq BAM file
pathseq_bam = pysam.AlignmentFile(snakemake.input[1], mode="rb")

output = []
# d = defaultdict(lambda: defaultdict(list))
# seg is an AlignedSegment object
for seg in pathseq_bam.fetch(until_eof=True):
    # find() returns an IteratorRowSelection, which contains one or more AlignedSegment objects
    cr_list = list(cr_idx.find(seg.query_name))
    # we assume that all records belonging to the same query name will have the same CB/UB tags;
    # not all records will have the CB tag and the UB tag
    if cr_list[0].has_tag("CB") and cr_list[0].has_tag("UB"):
        CB = cr_list[0].get_tag(tag="CB")
        UB = cr_list[0].get_tag(tag="UB")
        # using set_tags removes all other tags - use set_tag instead
        seg.set_tag("CB", CB, "Z")
        seg.set_tag("UB", UB, "Z")
        # d[CB][UB].append(seg)
    # keep all PathSeq alignments
    output.append(seg)

# write all PathSeq alignments with or without tags
all_pathseq_bam = pysam.AlignmentFile(snakemake.output[0],
def extract_reads(options):
    print('start assigning reads...')
    reads_file = '%s/read_names.txt' % (options.outdir)
    assign_file = '%s/assign_file.txt' % (options.outdir)
    out = open(assign_file, 'w')
    n = get_names(reads_file)
    bamfile = pysam.AlignmentFile(options.bam, 'rb')
    name_indexed = pysam.IndexedReads(bamfile)
    name_indexed.build()
    error, total, remove = 0, 0, 0
    error_set = []
    for name in n:
        try:
            iterator = name_indexed.find(name)
            dict = {}
            pair_dict = {}
            len_dict = {}
            for x in iterator:
                if x.is_unmapped:
                    continue
                t_name = x.reference_name
                # only keep pairs where both mates map to the same reference
                if t_name != x.next_reference_name:
                    continue
                match_num = 0
                soft_num = 0
                all_num = 0
                for ci in x.cigar:
                    if ci[0] == 0:
                        match_num += ci[1]
                    elif ci[0] == 4:
                        soft_num += ci[1]
                    all_num += ci[1]
                if soft_num > 0:
                    continue
                mis_NM = 0
                for ta in x.get_tags():
                    if ta[0] == 'NM':
                        match_num -= ta[1]
                        mis_NM += ta[1]
                if mis_NM > options.max_nm:
                    continue
                focus_len = all_num - soft_num
                if t_name not in dict.keys():
                    dict[t_name] = match_num
                    len_dict[t_name] = focus_len
                    pair_dict[t_name] = 1
                else:
                    dict[t_name] += match_num
                    len_dict[t_name] += focus_len
                    pair_dict[t_name] += 1
            # evaluation
            total += 1
            if len(dict) == 0:
                continue
            for key in dict.keys():
                if len_dict[key] < 0:  # make sure the reads are paired and mapped
                    dict[key] = 0
                else:
                    dict[key] = float(dict[key]) / len_dict[key]
            first_align = check_score(dict, options, name, pair_dict)
            if first_align == 'REMOVE':
                remove += 1
                continue
            print(name, first_align, file=out)
        except KeyError:
            pass
    out.close()
def filter_multiple_adapter(self):
    chek_file = self.adapter_bam_check_sort
    if not os.path.exists(chek_file):
        if not self.fastq_check:
            raise ValueError('merged fastq is needed for adapter check alignment.')
        if not os.path.exists("adapter/adapter.fa"):
            raise ValueError('adapter/adapter.fa is needed for adapter check alignment.')
        # alignment
        print("[PEM-Q] align to check adapter...")
        cmd = "bwa mem -t 8 adapter/adapter -k 5 -L 0 -T 14 {} > {} 2>barcode/bwa_align_adapter.log".format(
            self.fastq_check, self.adapter_sam_check)
        os.system(cmd)
        print("[PEM-Q] " + cmd)
        cmd = "samtools view -S -b -h {} > {} " \
              "&& samtools sort {} > {} " \
              "&& samtools index {}".format(self.adapter_sam_check,
                                            self.adapter_bam_check,
                                            self.adapter_bam_check,
                                            self.adapter_bam_check_sort,
                                            self.adapter_bam_check_sort)
        print("[PEM-Q] sort and index bam...")
        os.system(cmd)
    else:
        print("[PEM-Q] adapter check alignment file exists, skipping...")
    # keep a record of multiple adapters
    multiple_adapt = open("barcode/" + self.basename + "_multiple_adapt.txt", "w")  # file for reads with multiple adapters
    clean_adapt = open("barcode/" + self.basename + "_clean_adapt.txt", "w")  # file for reads without multiple adapters
    bam_file = pysam.AlignmentFile(self.adapter_bam_check_sort, "rb")  # read the sorted bam file
    multiple_adapt_list = []
    clean_adapt_list = []
    for read in bam_file:
        # does this read have supplementary alignments (an SA tag)?
        condition1 = any('SA' == tg[0] for tg in read.get_tags())
        if condition1:
            multiple_adapt_list.append(read.query_name)
            multiple_adapt.write(read.query_name + "\n")
        else:
            clean_adapt_list.append(read.query_name)
            clean_adapt.write(read.query_name + "\n")
    multiple_adapt.close()
    clean_adapt.close()
    bam_file.close()
    # remove reads with multiple adapters
    primer_bam = pysam.AlignmentFile(self.primer_bam, 'rb')
    dedup_bam_sort = primer_bam
    filter_bam = pysam.AlignmentFile(self.filter_bam, "wb", template=dedup_bam_sort)
    name_indexed = pysam.IndexedReads(dedup_bam_sort)
    name_indexed.build()
    for name in clean_adapt_list:
        try:
            name_indexed.find(name)
        except KeyError:
            pass
        else:
            iterator = name_indexed.find(name)
            for x in iterator:
                filter_bam.write(x)
    dedup_bam_sort.close()
    filter_bam.close()
    pysam.sort("-o", self.filter_bam_sort, self.filter_bam)
    primer_bam.close()
    return ()
            continue
        if targetPos in refPairedPositions:
            readTargetPos = queryPairedPositions[refPairedPositions.index(targetPos)]
            if readTargetPos is not None:
                readTargetBase = entry.query_sequence[readTargetPos]
                if readTargetBase.upper() == targetBase:
                    selectedReadNames.add(entry.query_name)

sys.stderr.write("\nFound {} entries carrying a target base\n".format(len(selectedReadNames)))

# index file for pulling out reads
sys.stderr.write("\nBuilding read name index for extracting selected pairs...\n")
readIndex = pysam.IndexedReads(inBam)
readIndex.build()

sys.stderr.write("\nWriting {} selected entries...\n".format(len(selectedReadNames)))
for readName in selectedReadNames:
    entries = readIndex.find(readName)
    for entry in entries:
        outBam.write(entry)

inBam.close()
outBam.close()
targetsFile.close()

sys.stderr.write("\nSorting output bam file...\n")