def __init__(self, fname, referenceFastaFname=None):
    """
    Open a BAM file and populate the reader from its header.

    Parameters
    ----------
    fname : str
        Path to the BAM file; ``~`` is expanded and the path made absolute.
    referenceFastaFname : str, optional
        Path to a reference FASTA, handed to ``_loadReferenceFasta``.

    Raises
    ------
    ValueError
        If a reference FASTA is given while ``self.isUnmapped`` is true.
    """
    self.filename = fname = abspath(expanduser(fname))
    self.peer = AlignmentFile(fname, "rb", check_sq=False)
    self._checkFileCompatibility()
    self._loadReferenceInfo()
    self._loadReadGroupInfo()
    self._loadProgramInfo()

    self.referenceFasta = None
    if referenceFastaFname is not None:
        if self.isUnmapped:
            # Call form of ``raise`` is valid on both Python 2 and Python 3.
            raise ValueError(
                "Unmapped BAM file--reference FASTA should not be given as argument to BamReader")
        self._loadReferenceFasta(referenceFastaFname)
def make_chrom_info(bam):
    """
    Write a ``.chrom_info`` file listing every @SQ entry of a BAM header.

    The output path is ``make_basename(bam) + '.chrom_info'`` and holds one
    ``name<TAB>length`` line per sequence.

    Returns the path of the file that was written.
    """
    base = make_basename(bam)
    chrom_info_filename = base + '.chrom_info'
    with AlignmentFile(bam, 'rb') as alignment, \
            open(chrom_info_filename, 'wt') as chrom_info:
        for row in alignment.header['SQ']:
            # Text mode already translates '\n' to the platform line ending;
            # writing os.linesep here would produce '\r\r\n' on Windows.
            chrom_info.write('{}\t{}\n'.format(row['SN'], row['LN']))
    return chrom_info_filename
def test_anno_1(tmpdir):
    "test --simple version"
    # NOTE(review): the read-layout literal is kept exactly as it appears in
    # this view; its whitespace is presumably significant to make_bam --
    # confirm before reflowing it.
    make_bam(
        tmpdir.strpath,
        """ 123456789_123456789_12 r1 + ........... r1 - ......*.... r2 + .........*. r2 - .....*....... r3 + ........... r3 - ....*...... r4 + ........... r4 - ........... 123456789_123456789_12 """)

    output_path = tmpdir.join("test_MrBam.vcf").strpath
    sam = AlignmentFile(tmpdir.join("test.bam").strpath)
    o = Namespace(query=tmpdir.join("test.vcf").strpath,
                  cfdna=sam,
                  gdna=None,
                  simple=True,
                  verbos=False,
                  fast=False,
                  qual=20,
                  output=output_path)
    anno(o)

    # Expected comma-separated counts in the last sample field, keyed by POS.
    expected = {
        '12': ['0', '0', '1', '1'],
        '16': ['0', '0', '0', '0'],
    }

    # Context manager so the annotated VCF is closed even when an assert fails.
    with open(output_path) as annotated:
        for record in annotated:
            if record.startswith('#'):
                continue
            fields = record.split('\t')
            if fields[1] not in expected:
                raise Exception("unexpected variant call")
            counts = fields[-1].split(':')[-1].strip().split(',')
            assert counts == expected[fields[1]]
def atac(args, logger):
    """
    Rewrite a BAM file with alignment start positions shifted by deltas.

    Reads ``args.input``, writes ``args.output`` with a header produced by
    ``update_header`` from ``args.chrom_sizes``. For each record the delta is
    looked up with ``find_delta``: forward reads are shifted at their start,
    reverse reads at their end (the new start is the shifted end minus the
    read's sequence length).

    Exits the process if ``args.chrom_sizes`` is not set.
    """
    if not args.chrom_sizes:
        exit("Chrom sizes required for bam conversion")

    chrom_mods = build_transform(args.mod, logger)

    # Context managers guarantee both files are closed, so the output BAM is
    # flushed even if a record fails part-way through.
    with AlignmentFile(args.input, 'rb') as input_:
        header = update_header(input_.header.as_dict(), args.chrom_sizes)
        with AlignmentFile(args.output, 'wb', header=header) as output:
            curr_chrom = ""
            for line in input_:
                chrom = input_.references[line.reference_id]
                if chrom != curr_chrom:
                    # Reload the lookup tables only when the chromosome changes.
                    curr_chrom = chrom
                    positions, deltas = get_positions_and_deltas(
                        chrom_mods, curr_chrom, logger)

                if not line.is_reverse:
                    start_delta = find_delta(positions, deltas,
                                             int(line.reference_start))
                    line.reference_start = int(line.reference_start) + start_delta
                else:
                    end_delta = find_delta(positions, deltas,
                                           int(line.reference_end))
                    mapped_end = int(line.reference_end) + end_delta
                    line.reference_start = mapped_end - len(line.seq)
                output.write(line)
def parse(bamfile, minqual):
    """
    Collect clipped reads of a BAM file into a ``Positions`` container.

    A read is skipped when ``isclip(read)`` is exactly ``False`` or when its
    mapping quality is below ``minqual``. Every remaining read is wrapped in
    ``ClipRead`` and added to the position for its reference name and clip
    position.

    Returns the populated ``Positions`` object.
    """
    # ``with`` closes the handle even if a read raises part-way through.
    with AlignmentFile(bamfile, 'rb') as bamhandle:
        positions = Positions()
        for read in bamhandle:
            if isclip(read) is False or read.mapping_quality < minqual:
                continue
            clip = ClipRead(read)
            chrom = bamhandle.get_reference_name(read.reference_id)
            pos = positions.getposition(chrom, clip.getclippos())
            pos.addclipread(clip)
    return positions
def run_project_alignments(args):
    """ Project mapped sam file

    Reads ``args.sam``, replaces each mapped record's start with the position
    returned by ``convert_position_on_haplotype_to_position_on_linear_ref``
    and writes all records (mapped and unmapped) to ``args.out_sam``.
    """
    sam = args.sam
    chromosomes = args.chromosomes.split(",")
    graph_dir = args.data_dir
    linear_ref_paths = {}
    haplotype_paths = {}

    # Both handles are closed on exit, so the output SAM is flushed and the
    # template handle no longer leaks.
    with AlignmentFile(sam) as template, \
            AlignmentFile(args.out_sam, "w", template=template) as out_sam:
        logging.info("Reading linear paths")
        for chromosome in tqdm(chromosomes):
            linear_ref_paths[chromosome] = NumpyIndexedInterval.from_file(
                graph_dir + chromosome + "_linear_pathv2.interval")
            haplotype_paths[chromosome] = NumpyIndexedInterval.from_file(
                args.linear_paths_base_name + "_" + chromosome +
                ".intervalcollection.indexed")

        logging.info("Converting")
        n_unmapped = 0
        for sam_record in tqdm(read_sam(sam),
                               total=number_of_lines_in_file(sam)):
            chromosome = sam_record.chromosome
            if chromosome is None:
                # No chromosome: written through unchanged.
                out_sam.write(sam_record.pysam_object)
                n_unmapped += 1
                continue
            projected_start = convert_position_on_haplotype_to_position_on_linear_ref(
                linear_ref_paths[chromosome], haplotype_paths[chromosome],
                sam_record.start)
            sam_record.set_start(projected_start)
            out_sam.write(sam_record.pysam_object)

    logging.info("%d sam records missed chromosome (unmapped)" % n_unmapped)
def test_pad_softclip_1(tmpdir):
    """pad_softclip should hand back the very same object on a repeated call"""
    # NOTE(review): fixture literal kept exactly as it appears in this view;
    # its whitespace is presumably significant to make_bam -- confirm.
    make_bam(tmpdir.strpath, """ r1 + __.*....... r1 - .*.......__ """)
    o = Namespace(verbos=False, mismatch_limit=-1)

    bam_path = tmpdir.join("test.bam").strpath
    reads = AlignmentFile(bam_path)

    first_result = pad_softclip(reads)
    second_result = pad_softclip(reads)
    assert first_result is second_result
def _recalibrate_reads(bam_path, reference_path, contig, start, end,
                       covariate_kwargs, **kwargs):
    """
    Recalibrate the reads of one region and write them to a fresh BAM file.

    Reads overlapping ``contig:start-end`` are fetched from ``bam_path``,
    passed through ``recalibrate_base_calls`` and written to a uniquely named
    ``out_<uuid>.bam``, which is then indexed.

    Returns the path of the written BAM file.
    """
    # Global to share over multiprocessing. joined_prob contains P(error| d),
    # where d is a descriptor generated by get_covariate_key.
    global joined_prob

    out_path = f'out_{uuid4()}.bam'

    with AlignmentFile(bam_path) as source, \
            FastaFile(reference_path) as fasta:
        # @todo: extract only selected region from fasta file:
        cached_reference = CachedFasta(fasta)

        with AlignmentFile(out_path, header=source.header,
                           mode='wb') as target:
            for record in source.fetch(contig, start, end):
                recalibrate_base_calls(record, cached_reference, joined_prob,
                                       covariate_kwargs)
                target.write(record)

    pysam.index(out_path)
    return out_path
def passes(self):
    """
    Return the number of records per ZMW id, computing it on first use.

    On the first call every query name in ``self.filename`` is read; the
    second ``/``-separated field is taken as an integer id and the third as
    ``start_end``. As a side effect ``self.qnames`` and ``self._lengths``
    (``end - start`` per record) are stored. The result is cached in
    ``self._passes``.
    """
    from collections import Counter
    if self._passes is None:
        # for BAM of 1M reads, takes 10-15 seconds
        from pysam import AlignmentFile
        # Close the file as soon as the names are read instead of leaking it.
        with AlignmentFile(self.filename, check_sq=False) as ccs:
            qnames = [a.qname for a in ccs]
        names = [int(qname.split("/")[1]) for qname in qnames]
        self.qnames = qnames

        lengths = []
        for qname in qnames:
            a, b = qname.split("/")[2].split("_")
            lengths.append(int(b) - int(a))
        self._lengths = lengths

        self._passes = list(Counter(names).values())
    return self._passes
def getAllFragmentSizes(bamfile, lower, upper, atac=1):
    """
    Histogram the fragment sizes of a whole BAM file.

    Only forward-strand reads flagged as proper pairs are counted. The size
    is ``abs(template_length)``, reduced by 8 when ``atac`` is truthy
    (insertion-to-insertion correction).

    Returns a float array of length ``upper - lower``; index ``i`` holds the
    count for size ``lower + i``.
    """
    # ``np.float`` was removed from NumPy; the builtin gives the same float64.
    sizes = np.zeros(upper - lower, dtype=float)
    # loop over samfile; ``with`` closes the handle on errors too
    with AlignmentFile(bamfile) as bamHandle:
        for read in bamHandle:
            if read.is_proper_pair and not read.is_reverse:
                ilen = abs(read.template_length)
                if atac:
                    # correct by 8 base pairs to be insertion to insertion
                    ilen -= 8
                if lower <= ilen < upper:
                    sizes[ilen - lower] += 1
    return sizes
def run_tagging_tasks(args: tuple):
    """ Run tagging for one or more tasks

    Args:
        args (tuple): (alignments_path, temp_dir, timeout_time), arglist

    Returns:
        ``(target_file, meta)`` when at least one molecule was written,
        otherwise ``(None, meta)`` after attempting to delete the output.
        ``meta`` lists the tasks that timed out and the molecule total.
    """
    shared, tasks = args
    alignments_path, temp_dir, timeout_time = shared

    target_file = f"{temp_dir}/{uuid4()}.bam"
    timed_out = []
    molecule_total = 0
    read_groups = dict()

    with AlignmentFile(alignments_path) as alignments, \
            sorted_bam_file(target_file,
                            origin_bam=alignments,
                            mode='wb',
                            fast_compression=False,
                            read_groups=read_groups) as output:
        for task in tasks:
            try:
                stats = run_tagging_task(alignments,
                                         output,
                                         read_groups=read_groups,
                                         timeout_time=timeout_time,
                                         **task)
                molecule_total += stats.get('total_molecules_written', 0)
            except TimeoutError:
                timed_out.append(task)

    meta = {
        'timeout_tasks': timed_out,
        'total_molecules': molecule_total,
    }

    if molecule_total > 0:
        return target_file, meta

    # Nothing was written: best-effort removal of the output and its index.
    try:
        remove(target_file)
        remove(f'{target_file}.bai')
    except Exception as e:
        print(f'Cleaning up failed for {target_file}')
        print(e)
    return None, meta
def bamToBed(alignmentFile, outputBedFile):
    """
    Convert an alignment file to BED, one merged entry set per fragment.

    Consecutive mapped records sharing a query name are treated as one
    fragment; their aligned blocks are collected and passed to
    ``mergeAndOutpuBed`` when the name changes and once more at the end.
    The strand is "-" when the first record of the fragment is a reverse
    read1 or a forward non-read1, otherwise "+". Unmapped records are
    skipped.
    """
    with AlignmentFile(alignmentFile) as inputFhd, \
            smart_out_open(outputBedFile, "w") as outputFhd:
        regions, name, strand = None, None, None
        for segment in inputFhd:
            if segment.is_unmapped:  # unmapped reads
                continue
            blocks = [(segment.reference_name, x[0], x[1])
                      for x in segment.get_blocks()]
            if segment.qname != name:  # reads from new fragments
                if name:
                    mergeAndOutpuBed(regions, outputFhd, name, strand)
                regions = blocks
                name = segment.qname
                # (reverse and read1) or (forward and not read1)
                strand = "-" if segment.is_reverse == segment.is_read1 else "+"
            else:  # reads from same fragments
                regions.extend(blocks)
        # Only flush when at least one mapped record was seen; otherwise
        # ``regions`` is still None and there is nothing to write.
        if regions is not None:
            mergeAndOutpuBed(regions, outputFhd, name, strand)
def fetch_count_read(alignment_file, seq_name, start, end):
    """
    Count the reads that at least partly overlap a chromosomal region.

    @param alignment_file Path to a sam or a bam file
    @param seq_name Name of the sequence the reads are aligned on
    @param start Start genomic coordinate of the region
    @param end End genomic coordinate of the region
    @return Number of records yielded by ``fetch`` for the region
    """
    # Specific imports
    from pysam import AlignmentFile

    # ``with`` closes the file, which the previous version never did.
    with AlignmentFile(alignment_file, "rb") as al:
        return sum(1 for _ in al.fetch(seq_name, start, end))
def get_barcode_frequency_genomewide(bamfile, storage):
    """
    Count reads per barcode (read group) and store the counts in a table.

    When the header has ``RG`` entries, each read-group ID is a barcode and
    reads are assigned by their ``RG`` tag; otherwise all reads are counted
    under the single key ``'dummy'``. A read is counted if it is read1 of a
    proper pair, or if it is not paired.

    Parameters
    ----------
    bamfile : str
        Path to a bamfile.
    storage : str
        Path to the output file, written as tab-separated text with the
        columns ``barcodes`` and ``counts``.
    """
    # ``with`` closes the file even if a read carries an unexpected tag.
    with AlignmentFile(bamfile, 'rb') as afile:
        # Obtain the header information
        use_group = 'RG' in afile.header

        if use_group:
            # extract barcodes
            barcodes = {item['ID']: 0 for item in afile.header['RG']}
        else:
            barcodes = {'dummy': 0}
        print('found {} barcodes'.format(len(barcodes)))

        for aln in afile.fetch(until_eof=True):
            if aln.is_proper_pair and aln.is_read1:
                barcodes[aln.get_tag('RG') if use_group else 'dummy'] += 1
            if not aln.is_paired:
                barcodes[aln.get_tag('RG') if use_group else 'dummy'] += 1

    df = pd.DataFrame({'barcodes': list(barcodes),
                       'counts': list(barcodes.values())})
    df.to_csv(storage, sep='\t', header=True, index=False)
def main(bam, index, flags, flag_filter, min_quality, target, file=stdout, **kwargs):
    """Interpret arguments and dispatch data to subroutines

    Prints the BAM header and then every chopped read (via ``to_string()``)
    to *file*. Diagnostics go to stderr. Always returns 0.
    """
    # NOTE(review): statement nesting below was reconstructed from a view in
    # which indentation was lost -- confirm the placement of the ``else`` and
    # of the trailing stderr messages against the real file.
    if target == "cigar":
        # CIGAR-based chopping needs no integer target.
        chopper, integer_target = cigar_chopper, None
    else:
        chopper, integer_target = relative_chopper, interpret_flags(target)
    ecx = load_index(index)
    with AlignmentFile(bam) as alignment:
        # Emit the header first, without its trailing newline(s).
        print(str(alignment.header).rstrip("\n"), file=file)
        n_skipped = 0
        bam_iterator = progressbar(
            filter_bam(alignment, [flags, flag_filter, min_quality]),
            desc="Chopping", unit="read",
        )
        with errstate(invalid="ignore"):
            for entry in bam_iterator:
                # Entries without a query sequence are ignored entirely.
                if entry.query_sequence:
                    chopped_entry, error = chopper(
                        entry, ecx, integer_target,
                    )
                    if chopped_entry.query_sequence:
                        print(chopped_entry.to_string(), file=file)
                    else:
                        # Chopping left no sequence: count it as skipped.
                        n_skipped += 1
    if n_skipped:
        msg_mask = "Skipped {} reads to be safe (unsure where to chop)"
        print(msg_mask.format(n_skipped), file=stderr)
    warning = [
        "WARNING: Read mapping positions were adjusted and retained;",
        "         this is needed to comply with the SAM spec.",
        "         Do not use these positions for analyses outside of edgeCase!",
    ]
    print("\n".join(warning), file=stderr)
    return 0
def test_pad_softclip_3(tmpdir):
    """pad_softclip should extend positions over soft-clipped bases"""
    # NOTE(review): fixture literal kept exactly as it appears in this view;
    # its whitespace is presumably significant to make_bam -- confirm.
    make_bam(
        tmpdir.strpath,
        """ 123456789_123 r1 + __.*....... r1 - .*......... r2 - ...*....... r2 + .*.......__ """)
    o = Namespace(verbos=False, mismatch_limit=-1)

    bam_path = tmpdir.join("test.bam").strpath
    reads = AlignmentFile(bam_path)
    padded = pad_softclip(reads)

    expected_span = (0, 13)  # 0-based position
    assert padded["r1"] == expected_span
    assert padded["r2"] == expected_span
def test_pad_softclip_2(tmpdir):
    """read names that occur more than twice should be ignored by pad_softclip"""
    # NOTE(review): fixture literal kept exactly as it appears in this view;
    # its whitespace is presumably significant to make_bam -- confirm.
    make_bam(
        tmpdir.strpath,
        """ r1 + __.*....... r1 - .*.......__ r1 - .*.......__ r2 + .*.......__ r2 - .*.......__ """)
    o = Namespace(verbos=False, mismatch_limit=-1)

    bam_path = tmpdir.join("test.bam").strpath
    reads = AlignmentFile(bam_path)
    padded = pad_softclip(reads)

    # Entries whose start position is -1 are the ignored ones.
    kept_starts = [startpos for startpos, length in padded.values()
                   if startpos != -1]
    assert len(kept_starts) == 1
def gather_sv_data(options, collection):
    """
    Register reads of interest that overlap the configured regions.

    For every interval in ``options.region_file`` the reads of
    ``options.bam_file`` overlapping it are fetched. Each read whose query
    name ends with "2d" or starts with "ctg" gets an empty list stored in
    ``collection`` under its query name (resetting any existing entry).
    ``collection`` is modified in place; nothing is returned.
    """
    # Read regions of interest BED file
    regions = BedTool(options.region_file)

    # Read BAM file; ``with`` closes it even if a fetch fails.
    with AlignmentFile(options.bam_file, "rb") as bamfile:
        # Intersect regions
        for reg in regions:
            for read in bamfile.fetch(reg.chrom, reg.start, reg.end):
                name = read.query_name
                if name.endswith("2d") or name.startswith("ctg"):
                    collection[name] = []
def create_table(ctx, input_bam, output_table, alignment_haplotypes):
    """Convert a BAM file to a tabular format sorted by read for downstream analysis"""
    # The docstring is left as-is in case it is surfaced as command help text.
    from pysam import AlignmentFile
    from pore_c import model

    # NOTE(review): tmp_table is only logged below, never written -- confirm
    # whether the temporary file is still intended.
    tmp_table = output_table + ".tmp"
    logger.debug(f"Writing temporary unsorted data to {tmp_table}")

    # Close the BAM once every record has been converted; it previously
    # stayed open for the lifetime of the process.
    with AlignmentFile(input_bam) as af:
        chrom_order = list(af.references)
        assert "NULL" not in chrom_order
        # "NULL" is appended as an extra category after the real references.
        chrom_order.append("NULL")
        logger.debug(f"Chromosome order {chrom_order}")
        align_df = model.AlignmentRecord.to_dataframe(
            [model.AlignmentRecord.from_aligned_segment(a) for a in af],
            chrom_order=chrom_order)

    align_df = align_df.sort_values(["read_name"])
    num_aligns, num_reads = len(align_df), align_df.read_idx.nunique()
    logger.debug(
        f"Writing {num_aligns} alignments for {num_reads} reads to {output_table}"
    )

    if alignment_haplotypes:
        ht_df = pd.read_csv(alignment_haplotypes, sep="\t")
        align_df = model.AlignmentRecord.update_dataframe_with_haplotypes(
            align_df, ht_df)

    align_df.to_parquet(output_table,
                        engine=PQ_ENGINE,
                        index=False,
                        version=PQ_VERSION)

    # Per-alignment-type summary: distinct reads and number of alignments.
    g = align_df.groupby(["align_type"])
    summary = pd.concat({
        "num_reads": g["read_idx"].nunique(),
        "num_aligns": g.size()
    }).unstack(level=0)
    logger.info(f"Mapping summary:\n {summary}")

    haplotype_counts = (align_df.haplotype.value_counts().rename_axis(
        "haplotype").to_frame().rename(columns={"haplotype": "num_aligns"}))
    logger.info(f"Haplotype counts:\n {haplotype_counts}")
def constructDistributions(bamName, lengths):
    '''
    Given a BAM file, constructs a coverage distribution for each long read

    Inputs
    - (str) bamName: BAM file name
    - (dict[(str) refName] = (int) read length) lengths: returns the length
      of the long read given its read name
    Outputs
    - ( dict[(str) refName] = (numpy.array of ints) distribution ) dists:
      contains the coverage distributions for each long read
    '''
    dists = {}
    # ``with`` closes the file; the loop variable no longer shadows ``iter``.
    with AlignmentFile(bamName, 'r') as samfile:
        for alignment in samfile.fetch():
            updateDistribution(dists, lengths,
                               alignment.reference_name,
                               int(alignment.reference_start),
                               alignment.cigartuples)
    return dists
def check_sam_header(input_file):
    '''
    Inspect the header of the sam or bam input file.

    Returns a ``(flag, error)`` pair. ``flag`` is True only when the header
    holds exactly one PG entry whose CL field contains 'bwa sampe'.
    ``error`` is None unless reading the file raised, in which case it is
    the exception text and ``flag`` is False.
    '''
    mode = get_mode_string(input_file, write=False)
    try:
        with AlignmentFile(input_file, mode) as handle:
            header = handle.header
            # no tag for the program that generated the output
            if 'PG' not in header:
                return False, None
            programs = header['PG']
            # only one program tag -> processed only by bwa
            if len(programs) != 1:
                return False, None
            program = programs[0]
            if 'CL' not in program:
                return False, None
            # check program call
            return 'bwa sampe' in program['CL'], None
    # catch any errors caused by pysam being unable to read the file
    except Exception as err:
        return False, str(err)
def extract_barcode(sam, barcode_file, outdir, expected_total=209400000.0):
    """
    Split reads of a SAM/BAM file into per-cell UMI and FASTQ files.

    Parameters
    ----------
    sam : str
        Alignment file whose records carry ``XC`` (cell barcode) and ``XM``
        (UMI) tags.
    barcode_file : str
        Text file with one accepted cell barcode per line.
    outdir : str
        Directory receiving ``<barcode>.umi`` and ``<barcode>.fastq``; both
        are opened in append mode.
    expected_total : float, optional
        Denominator of the progress fraction printed every 100000 records.
        Defaults to the value that was previously hard-coded.
    """
    # Create the hash set for cell names
    with open(barcode_file, 'r') as fin:
        barcodes_filtered = {line.strip() for line in fin}
    print(len(barcodes_filtered))

    total = float(expected_total)
    with AlignmentFile(sam, mode='r') as sam_file:
        for i, aln in enumerate(sam_file.fetch(until_eof=True)):
            # Error to use query_alignment_sequence, use query_sequence instead?
            reads_name = aln.qname
            reads = aln.query_sequence
            cell_barcode = aln.get_tag('XC')
            umi = aln.get_tag('XM')
            quality = aln.qual

            if cell_barcode in barcodes_filtered:
                if len(reads) != len(quality):
                    print("Error, skipped:", reads, quality)
                    continue
                # Open, append and close per record: the previous version
                # leaked two file descriptors for every accepted read.
                with open(outdir + '/' + cell_barcode + '.umi', 'a+') as fout_umi:
                    fout_umi.write(umi + '\n')
                with open(outdir + '/' + cell_barcode + '.fastq', 'a+') as fout_fq:
                    fout_fq.write('@' + reads_name + '\n')
                    fout_fq.write(reads + '\n')
                    fout_fq.write('+\n')
                    fout_fq.write(quality + '\n')

            if i % 100000 == 0:
                print(i / total)
def __init__(self, mode, iterator, aligner, fd=sys.stdout,
             min_coverage=0.90, min_accuracy=0.99, ref_fn=None, groups=None):
    """
    Store the writer settings and open the output alignment file on *fd*.

    Mode ``'wfq'`` opens the file as ``'w'`` with the SAM header disabled;
    any other mode is passed through unchanged. The header is built from
    the aligner's sequence names and lengths plus ``sam_header(groups)``.
    """
    super().__init__()
    self.fd = fd
    self.log = []
    self.mode = mode
    self.aligner = aligner
    self.iterator = iterator
    self.min_coverage = min_coverage
    self.min_accuracy = min_accuracy

    fastq_output = self.mode == 'wfq'
    open_mode = 'w' if fastq_output else self.mode

    header = AlignmentHeader.from_references(
        reference_names=aligner.seq_names,
        reference_lengths=[
            len(aligner.seq(seq_name)) for seq_name in aligner.seq_names
        ],
        text=sam_header(groups),
    )
    self.output = AlignmentFile(
        fd,
        open_mode,
        add_sam_header=not fastq_output,
        reference_filename=ref_fn,
        header=header,
    )
def __init__(self, mode, iterator, aligner, fd=sys.stdout, duplex=False,
             ref_fn=None, groups=None, group_key=None):
    """
    Store the writer settings and open the output alignment file on *fd*.

    ``self.fastq`` is true for mode ``'wfq'``; in that case the file is
    opened as ``'w'`` with the SAM header disabled. Without an aligner the
    header carries no reference sequences.
    """
    super().__init__()
    self.fd = fd
    self.log = []
    self.mode = mode
    self.duplex = duplex
    self.aligner = aligner
    self.iterator = iterator
    self.fastq = mode == 'wfq'
    self.group_key = group_key

    if aligner:
        ref_names = aligner.seq_names
        ref_lengths = [
            len(aligner.seq(seq_name)) for seq_name in aligner.seq_names
        ]
    else:
        ref_names = []
        ref_lengths = []

    header = AlignmentHeader.from_references(
        reference_names=ref_names,
        reference_lengths=ref_lengths,
        text=sam_header(groups),
    )
    open_mode = 'w' if self.fastq else self.mode
    self.output = AlignmentFile(
        fd,
        open_mode,
        add_sam_header=not self.fastq,
        reference_filename=ref_fn,
        header=header,
    )
def bamtag(sam):
    ''' Convert a BAM/SAM with fastqtransformed read names to have UMI and
    cellular barcode tags
    '''
    # The docstring is left as-is in case it is surfaced as command help text.
    # Tags added from the read name: XC (cellular), RX (molecular), XS
    # (sample). Output is SAM with header on stdout.
    from pysam import AlignmentFile

    start_time = time.time()

    sam_file = open_bamfile(sam)
    # ``with`` makes sure the stdout writer is flushed and closed.
    with AlignmentFile("-", "wh", template=sam_file) as out_file:
        track = sam_file.fetch(until_eof=True)

        # peek at first alignment to determine the annotations;
        # ``next(track, None)`` works on Python 2 and 3, unlike
        # ``track.next()``, and tolerates an empty input.
        queryalignment = next(track, None)
        if queryalignment is None:
            logger.info("Processed %d alignments.", 0)
            return
        annotations = detect_alignment_annotations(queryalignment)
        track = itertools.chain([queryalignment], track)

        parser_re = re.compile(construct_transformed_regex(annotations))

        for count, aln in enumerate(track, start=1):
            if not count % 100000:
                logger.info("Processed %d alignments.", count)

            match = parser_re.match(aln.qname)

            if "cellular" in annotations:
                aln.tags += [('XC', match.group('CB'))]
            if "molecular" in annotations:
                aln.tags += [('RX', match.group('MB'))]
            if "sample" in annotations:
                aln.tags += [('XS', match.group('SB'))]

            out_file.write(aln)

    total_time = time.time() - start_time
    logger.info('BAM tag conversion done - {:.3}s, {:,} alns/min'.format(
        total_time, int(60. * count / total_time)))
    logger.info("Processed %d alignments.", count)
def get_genome_size_from_bam(file):
    """
    Read the chromosome sizes stored in a bam-file.

    Parameters
    ----------
    file : str
        bam-file

    Returns
    -------
    dict
        Maps each chromosome name to its length.
    """
    handle = AlignmentFile(file, 'rb')
    # Pair every reference name with the length at the same index.
    sizes = dict(zip(handle.references, handle.lengths))
    handle.close()
    return sizes
def getFragmentSizesFromChunkList(chunks, bamfile, lower, upper, atac=1):
    """
    Histogram the sizes of fragments centred inside the given chunks.

    For each chunk, reads are fetched from ``chunk.start - upper`` (clamped
    at 0) to ``chunk.end + upper``. Only forward-strand reads flagged as
    proper pairs are used. With ``atac`` truthy the left position is shifted
    by 4 and the size reduced by 8. A fragment is counted when its size is
    in ``[lower, upper)`` and its centre lies in ``[chunk.start, chunk.end)``.

    Returns a float array of length ``upper - lower``; index ``i`` holds the
    count for size ``lower + i``.
    """
    # ``np.float`` was removed from NumPy; the builtin gives the same float64.
    sizes = np.zeros(upper - lower, dtype=float)
    # loop over samfile; ``with`` closes the handle on errors too
    with AlignmentFile(bamfile) as bamHandle:
        for chunk in chunks:
            for read in bamHandle.fetch(chunk.chrom,
                                        max(0, chunk.start - upper),
                                        chunk.end + upper):
                if read.is_proper_pair and not read.is_reverse:
                    if atac:
                        # get left position
                        l_pos = read.pos + 4
                        # get insert size, corrected by 8 base pairs to be
                        # insertion to insertion
                        ilen = abs(read.template_length) - 8
                    else:
                        l_pos = read.pos
                        ilen = abs(read.template_length)
                    center = l_pos + (ilen - 1) // 2
                    if (lower <= ilen < upper
                            and chunk.start <= center < chunk.end):
                        sizes[ilen - lower] += 1
    return sizes
def count_mapped_bp(args, tempdir, genes):
    """
    Count number of bp mapped to each gene across pangenomes.

    Reads ``{tempdir}/pangenomes.bam`` and updates the ``aligned_reads``,
    ``mapped_reads`` and ``depth`` fields of the entries in ``genes`` in
    place.

    Returns ``(num_covered_genes, mean_coverage, covered_genes)``: the number
    of covered genes and the average gene depth per species, plus the genes
    that received at least one kept read, keyed by gene id. The first two
    contain only covered species but, being defaultdicts, yield 0 for any
    uncovered species, which is appropriate.
    """
    bam_path = f"{tempdir}/pangenomes.bam"
    covered_genes = {}

    # loop over alignments, sum values per gene; ``with`` closes the BAM
    with AlignmentFile(bam_path, "rb") as bamfile:
        for aln in bamfile.fetch(until_eof=True):
            # get_reference_name replaces the deprecated getrname alias
            gene_id = bamfile.get_reference_name(aln.reference_id)
            gene = genes[gene_id]
            gene["aligned_reads"] += 1
            if keep_read(aln, args.aln_mapid, args.aln_readq, args.aln_mapq,
                         args.aln_cov):
                gene["mapped_reads"] += 1
                gene["depth"] += len(
                    aln.query_alignment_sequence) / float(gene["length"])
                covered_genes[gene_id] = gene

    tsprint("Pangenome count_mapped_bp:  total aligned reads: %s" %
            sum(g["aligned_reads"] for g in genes.values()))
    tsprint("Pangenome count_mapped_bp:  total mapped reads: %s" %
            sum(g["mapped_reads"] for g in genes.values()))

    # Filter to genes with non-zero depth, then group by species
    nonzero_gene_depths = defaultdict(list)
    for g in covered_genes.values():
        gene_depth = g["depth"]
        # This should always pass, because args.aln_cov is always >0.
        if gene_depth > 0:
            nonzero_gene_depths[g["species_id"]].append(gene_depth)

    # Compute number of covered genes per species, and average gene depth.
    num_covered_genes = defaultdict(int)
    mean_coverage = defaultdict(float)
    for species_id, non_zero_depths in nonzero_gene_depths.items():
        num_covered_genes[species_id] = len(non_zero_depths)
        mean_coverage[species_id] = np.mean(non_zero_depths)

    return num_covered_genes, mean_coverage, covered_genes
def _extract_sites(self, sample):
    """
    Loop through all positions and get pileup information.

    For each entry of ``self.sites`` the pileup is taken from
    ``sample.sample_bam`` and annotated by ``_get_genotype_info``. The
    combined table, restricted to a fixed column order with the count
    columns cast to int, is stored on ``sample.pileup``.

    Returns ``sample``; it is returned untouched when there are no sites.
    """
    if not self.sites:
        return sample

    columns = [
        'chrom', 'pos', 'ref', 'alt', 'reads_all', 'matches', 'mismatches',
        'A', 'C', 'T', 'G', 'N', 'minor_allele_freq', 'genotype_class',
        'genotype'
    ]
    int_columns = [
        'pos', 'A', 'C', 'T', 'G', 'N', 'matches', 'mismatches', 'reads_all'
    ]

    # get the pileup; ``with`` closes the BAM once all sites are read
    site_frames = []
    with AlignmentFile(sample.sample_bam) as bam:
        for site in self.sites:
            pileup_site = self._pileup(bam, site)
            pileup_site = self._get_genotype_info(pileup_site,
                                                  site['ref_allele'],
                                                  site['alt_allele'])
            # DataFrame.append accepted frames, lists of rows, or a single
            # dict/Series row; normalise to a frame so concat behaves alike.
            if not isinstance(pileup_site, (pd.DataFrame, list)):
                pileup_site = [pileup_site]
            site_frames.append(pd.DataFrame(pileup_site))

    # One concat replaces the per-site DataFrame.append, which was removed
    # in pandas 2.0 and copied the whole table on every iteration.
    pileup = pd.concat(site_frames, ignore_index=True)
    pileup = pileup[columns].astype({col: int for col in int_columns})

    sample.pileup = pileup
    return sample
def fragmentlength_from_bam(bamfile, regions, mapq, maxlen):
    """
    Compute fragment length per region from a bam-file.

    Fragments are taken from read1 of every proper pair whose mapping
    quality is at least ``mapq`` and then handed to
    ``fragmentlength_from_bed``.

    Parameters
    ----------
    bamfile : str
        bam-file
    regions : str, BedTool
        Bed-file or BedTool object containing the regions.
    mapq : int
        Minimum mapping quality.
    maxlen : int
        Maximum fragment length.

    Returns
    -------
    scipy.sparse.coo_matrix
        Sparse regions by maxlen matrix containing the fragment counts.
    """
    chroms = []
    starts = []
    ends = []

    # ``with`` closes the file, which the previous version never did.
    with AlignmentFile(bamfile, "rb") as afile:
        for aln in afile.fetch():
            if aln.mapping_quality < mapq:
                continue
            if aln.is_proper_pair and aln.is_read1:
                start = min(aln.reference_start, aln.next_reference_start)
                # The end coordinate is start plus the template length;
                # storing the bare length produced intervals with end < start.
                end = start + abs(aln.tlen)
                chroms.append(aln.reference_name)
                starts.append(start)
                ends.append(end)

    df = pd.DataFrame({'chrom': chroms, 'start': starts, 'end': ends})
    fragments = BedTool.from_dataframe(df)
    return fragmentlength_from_bed(fragments, regions, maxlen)