def __encrypt_unmapped(alignment: pysam.AlignedSegment, secret: bytes):
    """
    Apply the stream cipher to an unmapped alignment, modifying it in place.

    The same call is used for both directions:
        alignment + secret => encrypted_alignment
        encrypted_alignment + secret => alignment

    Mapped alignments are left untouched.

    :param alignment: alignment whose query sequence is (de)ciphered
    :param secret: secret key bytes; hashed together with the query name
    :return: None, the alignment is updated in place
    :raises ValueError: if the alignment is unmapped and secret is None
    """
    if not alignment.is_unmapped:
        return

    if secret is None:
        raise ValueError(
            'Secret key must be present when unmapped alignments are iterated.'
        )

    # SHA-512 gives a 64B long digest (encrypts 256 bases)
    key_digest = hashlib.sha512(secret + alignment.query_name.encode()).digest()
    ciphered_seq = cmn.stream_cipher(alignment.query_sequence, key_digest)

    # Read the qualities before replacing the sequence and write them back
    # afterwards so that they are preserved.
    # TODO: maybe something else with the quality?
    saved_qualities = alignment.query_qualities
    alignment.query_sequence = ciphered_seq
    alignment.query_qualities = saved_qualities
def check_read_quality(sam_record: pysam.AlignedSegment, run_info):
    """Process an individual SAM read and return its quality attributes.

    The result is always a 7-element list
    ``[dataset, read_ID, passed_qc, flag_ok, read_length, coverage, identity]``:

    * ``dataset`` is the value of the read's RG tag (fetched unguarded, so a
      missing RG tag propagates the error raised by ``get_tag``).
    * ``flag_ok`` is 0 when the SAM flag is anything other than 0 or 16,
      otherwise 1.
    * ``passed_qc`` is 1 only when the read passes the flag, length,
      coverage and identity checks.
    * ``coverage`` and ``identity`` are "NA" when the read was rejected
      before they were computed.

    Raises:
        ValueError: if the read passes the flag and length checks but has
            no MD tag.
    """
    read_ID = sam_record.query_name
    flag = sam_record.flag
    cigar = sam_record.cigarstring
    seq = sam_record.query
    read_length = sam_record.query_length
    dataset = sam_record.get_tag('RG')

    def build_row(passed_qc, flag_ok, coverage="NA", identity="NA"):
        # Every outcome shares the same leading/trailing columns.
        return [dataset, read_ID, passed_qc, flag_ok, read_length,
                coverage, identity]

    # Only use uniquely mapped transcripts
    if flag not in (0, 16):
        return build_row(0, 0)

    # Only use reads that are greater than or equal to length threshold
    if read_length < run_info.min_length:
        return build_row(0, 1)

    # The MD field is required to compute the alignment identity
    try:
        md_tag = sam_record.get_tag('MD')
    except KeyError:
        raise ValueError("SAM transcript %s lacks an MD tag" % read_ID)

    # Only use reads where alignment coverage and identity exceed cutoffs
    coverage = compute_alignment_coverage(cigar)
    identity = compute_alignment_identity(md_tag, seq)
    below_cutoffs = (coverage < run_info.min_coverage
                     or identity < run_info.min_identity)

    # 1 means the read has passed the quality control
    return build_row(0 if below_cutoffs else 1, 1, coverage, identity)
def parse_chromium_bamread_metadata(alignment: pysam.AlignedSegment):
    """
    Return the read name, error-corrected cell barcode and error-corrected
    UMI of an alignment, according to:
    'https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/output/bam'

    The cell barcode is read from the 'CB' tag and the UMI from the 'UB'
    tag; both lookups are unguarded, so a missing tag propagates the error
    raised by ``get_tag``.

    :return: tuple ``(readname, cellbarcode, umi)``
    """
    return (
        alignment.query_name,
        alignment.get_tag('CB'),
        alignment.get_tag('UB'),
    )
def _new_rec(self, name: str, chrom: str, start: int,
             attrs: Optional[Dict[str, Any]]) -> AlignedSegment:
    """Generates a new AlignedSegment.

    Sets the segment up with the correct header and adds the RG attribute
    if not contained in attrs. The caller's ``attrs`` dictionary is copied
    and never modified.

    Args:
        name: the name of the read/template
        chrom: the chromosome to which the read is mapped
        start: the start position of the read on the chromosome
        attrs: an optional dictionary of SAM attributes with two-char keys

    Returns:
        AlignedSegment: an aligned segment with name, chrom, pos, attributes
        the read group, and the unmapped flag all set appropriately.

    Raises:
        ValueError: if ``chrom`` is neither ``sam.NO_REF_NAME`` nor a
            sequence known to this builder.
    """
    # Compare by value: an identity test ("is not") on strings only works
    # when both sides happen to be the very same object.
    if chrom != sam.NO_REF_NAME and chrom not in self._seq_lookup:
        raise ValueError(
            f"{chrom} is not a valid chromosome name in this builder.")

    rec = AlignedSegment(header=self._samheader)
    rec.query_name = name
    rec.reference_name = chrom
    rec.reference_start = start
    rec.mapping_quality = self.mapping_quality

    if chrom == sam.NO_REF_NAME or start == sam.NO_REF_POS:
        rec.is_unmapped = True

    # Work on a copy so that adding the default RG never leaks into the
    # dictionary owned by the caller.
    tags = dict(attrs) if attrs else {}
    if "RG" not in tags:
        tags["RG"] = self.rg_id()
    rec.set_tags(list(tags.items()))
    return rec
def test_sam_alignment_to_padded_alignment():
    """A one-base deletion shows up as a gap in the padded query and as a
    blank in the match line, while the padded reference is unchanged."""
    read = AlignedSegment()
    read.reference_start = 0
    read.query_sequence = 'AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG'
    # CIGAR ops: 10 x op 0, then 1 x op 2, then 25 x op 0
    read.cigartuples = ((0, 10), (2, 1), (0, 25))
    reference = Reference('test', 'AGCTTAGCTAAGCTACCTATATCTTGGTCTTGGCCG')

    padded = sam_alignment_to_padded_alignment(read, reference)
    (pad_ref, pad_match, pad_query) = padded

    assert pad_ref == 'AGCTTAGCTAAGCTACCTATATCTTGGTCTTGGCCG'
    assert pad_match == '|||||||||| |||||||||||||||||||||||||'
    assert pad_query == 'AGCTTAGCTA-GCTACCTATATCTTGGTCTTGGCCG'
def set_qc_fail(rec: pysam.AlignedSegment, tool: Callable[..., Any],
                reason: str) -> None:
    """Marks a record as QC failed and records which tool failed it and why.

    The QC fail flag is set, the tool's ``__name__`` is stored under
    ``QcFailToolTag`` and the reason under ``QcFailReasonTag``.

    Args:
        rec: the record to fail
        tool: the tool (as a callable) that failed this record
        reason: the reason for failing; must not contain tab characters
    """
    assert '\t' not in reason, f"Reason may not contain tabs: {reason}"

    rec.is_qcfail = True
    for tag, value in ((QcFailToolTag, tool.__name__),
                       (QcFailReasonTag, reason)):
        rec.set_tag(tag, value)
def from_aligned_segment(cls, align: pysam.AlignedSegment) -> "AlignmentRecord":
    """Extract information from a pysam AlignedSegment.

    The query name must have the form ``<read_name>:<read_idx>:<align_idx>``
    (exactly three colon-separated fields, the last two being integers).

    Unmapped segments get placeholder coordinates ("NULL", 0, 0) and an
    alignment score of 0. Mapped segments must carry an "AS" tag. The
    optional "HP", "PS" and "PC" tags are forwarded as ``haplotype``,
    ``phase_set`` and ``phase_qual`` when present.

    When the segment has no base qualities the mean base q-score is
    reported as 0, for mapped and unmapped segments alike.
    """
    read_name, read_idx, align_idx = align.query_name.split(":")
    read_idx, align_idx = int(read_idx), int(align_idx)

    if align.is_unmapped:
        align_cat = "unmapped"
        chrom, start, end, align_score = "NULL", 0, 0, 0
        read_length = align.query_length
        quals = align.query_qualities
    else:
        chrom, start, end = (align.reference_name, align.reference_start,
                             align.reference_end)
        read_length = align.infer_read_length()
        align_score = align.get_tag("AS")
        quals = align.query_alignment_qualities
        if align.is_secondary:
            align_cat = "secondary"
        elif align.is_supplementary:
            align_cat = "supplementary"
        else:
            align_cat = "primary"

    # Qualities may be absent; fetch them once and fall back to 0 instead of
    # handing None to mean_qscore.
    if quals is None:
        align_base_qscore = 0
    else:
        align_base_qscore = mean_qscore(np.array(quals))

    optional = {}
    for key, tag in [("haplotype", "HP"), ("phase_set", "PS"),
                     ("phase_qual", "PC")]:
        if align.has_tag(tag):
            optional[key] = int(align.get_tag(tag))

    return cls(
        read_idx=read_idx,
        align_idx=align_idx,
        align_type=align_cat,
        chrom=chrom,
        start=start,
        end=end,
        strand=not align.is_reverse,
        read_name=read_name,
        read_length=read_length,
        read_start=align.query_alignment_start,
        read_end=align.query_alignment_end,
        mapping_quality=align.mapq,
        align_score=align_score,
        align_base_qscore=np.rint(align_base_qscore),
        **optional,
    )
def breakpoint_pos(read: pysam.AlignedSegment, orient: str = ORIENT.NS) -> int:
    """
    Assumes the breakpoint is the position following soft-clipping on the
    side with more soft-clipping (unless an orientation has been specified).

    Args:
        read: the read object
        orient: the orientation

    Returns:
        the position of the breakpoint in the input read:
        ``read.reference_start`` when the read is soft-clipped on the left
        (right orientation), otherwise ``read.reference_end - 1``

    Raises:
        AttributeError: if the read has no soft-clipping at either end, or
            if the requested orientation needs soft-clipping on an end that
            is not soft-clipped
    """
    first_op, first_len = read.cigar[0]
    last_op, last_len = read.cigar[-1]

    ORIENT.enforce(orient)

    clipped_left = first_op == CIGAR.S
    clipped_right = last_op == CIGAR.S

    if not clipped_left and not clipped_right:
        raise AttributeError(
            'cannot compute breakpoint for a read without soft-clipping',
            read.cigar
        )

    if orient == ORIENT.NS:
        # Left clipping wins when it is the only clipping or the longer one;
        # in every other case the right-hand clipping is used.
        if clipped_left and (not clipped_right or first_len > last_len):
            orient = ORIENT.RIGHT  # soft clipped to the left
        else:
            orient = ORIENT.LEFT  # soft clipped to the right

    if orient == ORIENT.RIGHT:
        if not clipped_left:
            raise AttributeError(
                'soft clipping doesn\'t support input orientation for a breakpoint',
                repr(orient),
                read.cigar,
                read.get_tags(),
            )
        return read.reference_start

    if not clipped_right:
        raise AttributeError(
            'soft clipping doesn\'t support input orientation for a breakpoint',
            orient,
            read.cigar,
            read.get_tags(),
        )
    return read.reference_end - 1
def get_qc_fail(rec: pysam.AlignedSegment) -> Optional[Tuple[str, str]]:
    """Gets the tool and reason for why the QC fail flag is set, otherwise
    None if not set.

    If the QC fail flag is set, but the tool and filter reason SAM tags are
    not both set, None will be returned. Use the
    ``pysam.AlignedSegment.is_qcfail`` attribute to check if the record is
    simply QC failed.

    Args:
        rec: the record to inspect

    Returns:
        A ``(tool, reason)`` tuple, or None.
    """
    if not rec.is_qcfail:
        return None

    # Both tags are required; checking only the tool tag would let a record
    # without a reason tag blow up in get_tag instead of returning None.
    if not (rec.has_tag(QcFailToolTag) and rec.has_tag(QcFailReasonTag)):
        return None

    return (rec.get_tag(QcFailToolTag), rec.get_tag(QcFailReasonTag))
def softclip_end_of_alignment_by_query(
    rec: AlignedSegment,
    bases_to_clip: int,
    clipped_base_quality: Optional[int] = None,
    tags_to_invalidate: Iterable[str] = TAGS_TO_INVALIDATE,
) -> ClippingInfo:
    """
    Adds soft-clipping to the end of a read's alignment.

    Clipping is applied before any existing hard or soft clipping. E.g. a
    read with cigar 100M5S that is clipped with bases_to_clip=10 will yield
    a cigar of 90M15S.

    If the read is unmapped or bases_to_clip < 1 then nothing is done.

    If bases_to_clip is greater than or equal to the number of aligned query
    bases, the whole read is clipped via ``_clip_whole_read`` (which is
    expected to leave the read unmapped).

    Args:
        rec: the BAM record to clip
        bases_to_clip: the number of additional bases of clipping desired in
            the read/query
        clipped_base_quality: if not None, set bases in the clipped region
            to this quality
        tags_to_invalidate: the set of extended attributes to remove upon
            clipping

    Returns:
        ClippingInfo: a named tuple containing the number of query/read
        bases and the number of target/reference bases clipped.
    """
    if rec.is_unmapped or bases_to_clip < 1:
        return ClippingInfo(0, 0)

    if bases_to_clip >= rec.query_alignment_length:
        return _clip_whole_read(rec, tags_to_invalidate)

    # Clipping is implemented from the start of the read, so flip the cigar
    # and the qualities first ...
    flipped_cigar = Cigar.from_cigartuples(rec.cigartuples).reversed()
    quals = rec.query_qualities
    quals.reverse()
    clipped_cigar, clipping_info = _clip(
        flipped_cigar, quals, bases_to_clip, clipped_base_quality)

    # ... and flip both back before storing them on the record.
    quals.reverse()
    rec.query_qualities = quals
    rec.cigarstring = str(clipped_cigar.reversed())

    _cleanup(rec, tags_to_invalidate)
    return clipping_info
def get_barcode_for_alignment(alignment: pysam.AlignedSegment,
                              tags: List[str],
                              raise_missing: bool) -> str:
    """
    Get the barcode for an Alignment

    :param alignment: pysam.AlignedSegment
        An Alignment from pysam.
    :param tags: List[str]
        Tags in the bam that might contain barcodes. They are tried in
        order and the contents of the first tag present is returned.
    :param raise_missing: bool
        Raise an error if no barcodes can be found.
    :return: str
        A barcode for the alignment, or None if one is not found and
        raise_missing is False.
    :raises RuntimeError: if no barcode is found and raise_missing is True
    """
    barcode = None
    for candidate_tag in tags:
        # A missing barcode is the exceptional case, so asking forgiveness
        # is cheaper than checking for the tag first.
        try:
            barcode = alignment.get_tag(candidate_tag)
        except KeyError:
            continue  # not present, try the next candidate
        else:
            break  # found one, no need to look any further

    if barcode is None and raise_missing:
        raise RuntimeError(
            "Alignment encountered that is missing {} tag(s).".format(tags))

    return barcode
def aggregate(self, record: pysam.AlignedSegment):
    """Fold one record into this object's per-column operation tallies.

    Tracks the highest mapping quality seen, walks the record with a
    CigarIterator (after skipping clipping) and, for every step, updates
    the column at the matching offset in ``self.cols``. Each column is a
    dict keyed by ``(op, base)``. Finally the record's query name is
    appended to ``self.members``.

    The starting column is the offset between the record's 'OS' tag (or its
    reference start when the tag is absent) and its reference start.
    """
    if record.mapping_quality > self.maxMapQ:
        self.maxMapQ = record.mapping_quality

    try:
        start_pos = record.get_tag('OS')  # type: int
    except KeyError:
        start_pos = record.reference_start

    cigar_iter = CigarIterator(record)
    cigar_iter.skipClipped()

    # NOTE(review): a negative offset would index self.cols from the end,
    # and an offset beyond len(self.cols) is appended at the end rather
    # than at that offset - confirm callers never produce either.
    col = start_pos - record.reference_start
    while cigar_iter.valid:
        if col >= len(self.cols):
            # No column here yet: start one with this observation.
            entry = self.Op(cigar_iter.op, cigar_iter.seqBase)
            entry += cigar_iter.baseQual or 0
            self.cols.append({(entry.op, entry.allele): entry})
        else:
            column = self.cols[col]
            key = (cigar_iter.op, cigar_iter.seqBase)
            entry = column.get(key)
            if entry:
                entry += cigar_iter.baseQual or 0
            else:
                # NOTE(review): unlike a brand-new column, the base quality
                # is not added to an entry created here - confirm intended.
                column[key] = self.Op(cigar_iter.op, cigar_iter.seqBase)
        col += 1
        cigar_iter.next()

    self.members.append(record.query_name)
def get_introns(sam_record: pysam.AlignedSegment, start, cigar):
    """ Locates the jI tag of a SAM record, or computes it from the CIGAR
        string and start position if it isn't found. Note that positions
        refer to start and endpoints of introns, not exons, so adjustments
        are needed to avoid an off-by-one error if you want exons.

        Example jI strings:
            no introns: jI:B:i,-1
            two introns: jI:B:i,167936516,167951806,167951862,167966628

        Args:
            sam_record: a pysam AlignedSegment
            start: The start position of the transcript with respect to the
                forward strand
            cigar: SAM CIGAR string describing match operations to the
                reference genome

        Returns:
            intron_list: intron starts and ends in a list (sorted order).
            An empty list is returned when there are no introns, i.e. when
            the jI values are empty or start with -1.
    """
    try:
        intron_list = sam_record.get_tag("jI").tolist()
    except KeyError:
        jI = compute_jI(start, cigar)
        # Drop the leading "jI:B:i" field and keep the numeric values
        intron_list = [int(x) for x in jI.split(",")[1:]]

    # An empty list must be checked first: indexing it would raise IndexError
    if not intron_list or intron_list[0] == -1:
        return []
    return intron_list
def test_recordShouldBeFilteredOut_maskSpannedByRecordReturnsTrue(self, *mock):
    """The masker must report the record as filtered out.

    The record is a blank AlignedSegment; presumably the patched
    collaborators passed in via ``*mock`` decide the overlap outcome -
    verify against the decorators on this test.
    """
    expected = True
    mask_tree = IntervalTree([Interval(10, 20, "chrom1")])
    masker = Masker(tree=mask_tree)

    actual = masker.record_should_be_filtered_out(AlignedSegment())

    assert actual == expected
def test_recordShouldBeFilteredOut_recordDoesNotOverlapReturnsFalse(self, *mock):
    """The masker must report the record as not filtered out.

    The record is a blank AlignedSegment; presumably the patched
    collaborators passed in via ``*mock`` decide the overlap outcome -
    verify against the decorators on this test.
    """
    expected = False
    mask_tree = IntervalTree([Interval(10, 20, "chrom1")])
    masker = Masker(tree=mask_tree)

    actual = masker.record_should_be_filtered_out(AlignedSegment())

    assert actual == expected
def assess_alignment(alignment: pysam.AlignedSegment, alignment_info: Dict):
    """Compare an alignment against a reference alignment.

    Args:
        alignment: the alignment to assess
        alignment_info: description of the reference alignment; the keys
            'chrom', 'start', 'end' and 'cigar' are read

    Returns:
        A tuple ``(chrom_match, matching_prop)`` where ``chrom_match`` tells
        whether the reference names agree and ``matching_prop`` is the number
        of aligned reference positions inside ``[start, end]`` (inclusive)
        divided by ``len(alignment_info['cigar'])``.
    """
    chrom_match = alignment.reference_name == alignment_info['chrom']

    # Count the aligned reference positions that fall inside the expected
    # interval (both bounds inclusive).
    aligned_positions = np.array(
        alignment.get_reference_positions(full_length=False))
    lower_ok = aligned_positions >= alignment_info['start']
    upper_ok = aligned_positions <= alignment_info['end']
    inside_expected = lower_ok & upper_ok

    # NOTE(review): the denominator is the length of the 'cigar' entry as
    # stored in alignment_info - confirm what that entry holds.
    matching_prop = sum(inside_expected) / len(alignment_info['cigar'])

    return chrom_match, matching_prop
def is_snp_called_correctly(record: pysam.AlignedSegment,
                            snp_ref_pos: int = 100) -> bool:
    """Tell whether the reference base reported at the SNP position is
    upper-case.

    The aligned pairs are requested with ``with_seq=True`` and the first
    pair whose reference position equals ``snp_ref_pos`` decides the result:
    a lower-case reference base yields False, anything else True.

    Args:
        record: the alignment to inspect
        snp_ref_pos: reference position of the SNP (defaults to 100)

    Returns:
        True if the reference base at ``snp_ref_pos`` is not lower-case,
        False if it is lower-case or if the record has no aligned pair at
        that reference position.
    """
    for _query_pos, ref_pos, ref_base in record.get_aligned_pairs(
            with_seq=True):
        if ref_pos == snp_ref_pos:
            return not ref_base.islower()

    # Position not covered by the record: answer explicitly rather than
    # falling off the end and returning None from a function typed as bool.
    return False
def merge_annotated_clusters(
        biggest: pysam.AlignedSegment,
        other: pysam.AlignedSegment) -> pysam.AlignedSegment:
    """Merges 2 annotated clusters together.

    Merges 2 annotated aligned segments, each representing a cluster, by
    folding the smaller into the larger. The larger cluster's id gets a
    trailing "+" (added only once) and its read number is increased by the
    read number of the smaller cluster. ``biggest`` is modified in place.

    Args:
        biggest: The larger of the 2 clusters, with a higher read number.
        other: The smaller of the 2 clusters, with a lower read number.

    Returns:
        The annotated aligned segment representing the merged cluster
        (the same object as ``biggest``).
    """
    cluster_id = biggest.get_tag(CLUSTER_ID_TAG)
    merged_id = cluster_id if cluster_id.endswith("+") else cluster_id + "+"
    biggest.set_tag(CLUSTER_ID_TAG, merged_id, "Z")

    own_reads = biggest.get_tag(NUM_READS_TAG)
    absorbed_reads = other.get_tag(NUM_READS_TAG)
    biggest.set_tag(NUM_READS_TAG, own_reads + absorbed_reads, "i")

    return biggest
def _set_length_dependent_fields(
    self,
    rec: pysam.AlignedSegment,
    length: int,
    bases: Optional[str] = None,
    quals: Optional[List[int]] = None,
    cigar: Optional[str] = None,
) -> None:
    """Fills in bases, quals and cigar on a record.

    If any of bases, quals or cigar are defined, they must all have the
    same length/query length. If none are defined then the length parameter
    is used. Undefined values are synthesized at the inferred length. The
    cigar is only written when the record is mapped.

    Args:
        rec: a SAM record
        length: the length to use if all of bases/quals/cigar are None
        bases: an optional string of bases for the read
        quals: an optional list of qualities for the read
        cigar: an optional cigar string for the read

    Raises:
        ValueError: if the provided values disagree on the length
    """
    # Collect the distinct lengths implied by whatever was provided
    implied_lengths = {len(value) for value in (bases, quals)
                       if value is not None}
    if cigar is not None:
        parsed = sam.Cigar.from_cigarstring(cigar)
        implied_lengths.add(
            sum(elem.length_on_query for elem in parsed.elements))

    if not implied_lengths:
        implied_lengths.add(length)
    if len(implied_lengths) != 1:
        raise ValueError(
            "Provided bases/quals/cigar are not length compatible.")

    # Exactly one length is left; synthesize whatever was not provided
    (length,) = implied_lengths
    rec.query_sequence = bases if bases else self._bases(length)
    rec.query_qualities = quals if quals else [self.base_quality] * length
    if not rec.is_unmapped:
        rec.cigarstring = cigar if cigar else f"{length}M"
def record_contains_expected_snp(record: pysam.AlignedSegment) -> bool:
    """Check the reference base paired with the query position
    ``REF_PANEL_FLANK_WIDTH`` against the base the record is named after.

    The expected base is the last character of the record's query name.
    Returns False when no aligned pair has that query position.
    """
    expected_base = record.query_name[-1]
    aligned_pairs = record.get_aligned_pairs(with_seq=True)

    # Only the first pair at the SNP query position is considered
    ref_base_at_snp = next(
        (ref_base
         for query_pos, _ref_pos, ref_base in aligned_pairs
         if query_pos == REF_PANEL_FLANK_WIDTH),
        None,
    )
    return expected_base == ref_base_at_snp
def get_tag_or_default(alignment: pysam.AlignedSegment,
                       tag_key: str,
                       default: Optional[str] = None) -> Optional[str]:
    """Looks up `tag_key` on `alignment` and returns its value, falling back
    to `default` when the lookup raises KeyError (tag not present)."""
    try:
        value = alignment.get_tag(tag_key)
    except KeyError:
        value = default
    return value
def set_pair_info(r1: AlignedSegment, r2: AlignedSegment,
                  proper_pair: bool = True) -> None:
    """Resets mate pair information between reads in a pair.

    Requires that both r1 and r2 are mapped. Can be handed reads that
    already have pairing flags set up, or independent R1 and R2 records that
    are currently flagged as SE reads. Both records are modified in place:
    pairing flags, mate reference/position/strand, the "MC" tag (the mate's
    cigar string) and the template length are all rewritten.

    Args:
        r1: read 1
        r2: read 2 with the same queryname as r1
        proper_pair: value to store in the proper-pair flag of both reads
    """
    assert not r1.is_unmapped, f"Cannot process unmapped mate {r1.query_name}/1"
    assert not r2.is_unmapped, f"Cannot process unmapped mate {r2.query_name}/2"
    assert r1.query_name == r2.query_name, "Attempting to pair reads with different qnames."

    # Pairing flags
    for read, is_first in ((r1, True), (r2, False)):
        read.is_paired = True
        read.is_proper_pair = proper_pair
        read.is_read1 = is_first
        read.is_read2 = not is_first

    # Each read describes its mate
    for read, mate in ((r2, r1), (r1, r2)):
        read.next_reference_id = mate.reference_id
        read.next_reference_start = mate.reference_start
        read.mate_is_reverse = mate.is_reverse
        read.mate_is_unmapped = False
        read.set_tag("MC", mate.cigarstring)

    # The template length is signed: r1 gets the value as computed by
    # isize(r1, r2), r2 the negated value.
    insert_size = isize(r1, r2)
    r1.template_length = insert_size
    r2.template_length = -insert_size
def select_snps_from_single_read(
    read: pysam.AlignedSegment, snp_positions: Set[int], region_start: int
) -> Tuple[List[int], List[str]]:
    """Collect the read's bases at the known SNP positions.

    Only matched (aligned) pairs are inspected. Reference positions are
    shifted by ``region_start`` before being looked up in ``snp_positions``,
    so ``snp_positions`` and the returned positions are region-relative.

    Returns:
        Two parallel lists: the region-relative SNP positions covered by the
        read, and the corresponding read bases after ``SNP.process_nucl``.
    """
    # TODO use indels
    seq = read.query_sequence
    covered = [
        (ref_pos - region_start, read_pos)
        for read_pos, ref_pos in read.get_aligned_pairs(matches_only=True)
        if ref_pos - region_start in snp_positions
    ]
    positions = [rel_pos for rel_pos, _ in covered]
    nucls = [SNP.process_nucl(seq[read_pos]) for _, read_pos in covered]
    return positions, nucls
def sam_string_to_aligned_segment(sam_string, header=None):
    """Convert a correctly formatted SAM string into a pysam AlignedSegment
    object.

    When no header is supplied, a minimal one is built containing a single
    reference: the name found in the third tab-separated field of the SAM
    string, with a length of 100000000.

    :param sam_string: correctly formatted SAM string
    :param header: AlignmentHeader object
    :return AlignedSegment
    """
    if not header:
        reference_name = sam_string.split("\t")[2]
        header = AlignmentHeader.from_references([reference_name], [100000000])
    return AlignedSegment.fromstring(sam_string, header)
def _set_flags(self, rec: pysam.AlignedSegment, is_r1: bool, strand: str) -> None:
    """Appropriately sets most flag fields on the given read.

    The read is flagged as paired, as read 1 or read 2, and the QC-fail,
    duplicate, secondary and supplementary flags are cleared. The reverse
    strand flag is only touched when the read is mapped.

    Args:
        rec: the read to set the flags on
        is_r1: True if the read is a R1, False if it is an R2
        strand: Either "+" or "-" to indicate strand of the read
    """
    rec.is_paired = True
    rec.is_read1 = is_r1
    rec.is_read2 = not is_r1

    for cleared_flag in ("is_qcfail", "is_duplicate",
                         "is_secondary", "is_supplementary"):
        setattr(rec, cleared_flag, False)

    if not rec.is_unmapped:
        rec.is_reverse = strand != "+"
def _read_pos_at_ref_pos(rec: AlignedSegment,
                         ref_pos: int,
                         previous: Optional[bool] = None) -> Optional[int]:
    """
    Returns the read or query position at the reference position.

    If the reference position is not within the span of reference positions
    to which the read is aligned an exception will be raised. If the
    reference position is within the span but is not aligned (i.e. it is
    deleted in the read) behavior is controlled by the "previous" argument.

    Args:
        rec: the AlignedSegment within which to find the read position
        ref_pos: the reference position to be found
        previous: Controls behavior when the reference position is not
            aligned to any read position. True indicates to return the
            previous read position, False indicates to return the next read
            position and None indicates to return None.

    Returns:
        The read position at the reference position, or None.

    Raises:
        ValueError: if ref_pos lies outside
            [rec.reference_start, rec.reference_end)
    """
    if not (rec.reference_start <= ref_pos < rec.reference_end):
        raise ValueError(
            f"{ref_pos} is not within the reference span for read {rec.query_name}"
        )

    pairs = rec.get_aligned_pairs()

    # Locate the first pair at ref_pos; fall back to "past the end" if absent
    index, read_pos = len(pairs), None
    for pair_index, (query, ref) in enumerate(pairs):
        if ref == ref_pos:
            index, read_pos = pair_index, query
            break

    if read_pos is None and previous is not None:
        if previous:
            # Closest pair before the hit that has a read position
            neighbours = reversed(pairs[:index])
        else:
            # Closest pair from the hit onwards that has a read position
            neighbours = iter(pairs[index:])
        read_pos = next(
            (query for query, _ref in neighbours if query is not None), None)

    return read_pos
def run(self):
    """Consume ``self.iterator`` and write every non-empty result.

    For each ``(read, res)`` pair the sequence is written either as FASTQ
    (when ``self.mode == 'wfq'``) or as a SAM record to ``self.output``, a
    row is appended to the tab-separated summary, and ``(read_id, samples)``
    is appended to ``self.log``. Results with an empty sequence are skipped
    with a warning.

    In duplex mode ``read`` is indexed as a pair and the read id is the two
    ids joined by ';'.
    """
    with CSVLogger(summary_file(), sep='\t') as summary:
        for read, res in self.iterator:
            seq = res['sequence']
            qstring = res.get('qstring', '*')
            mapping = res.get('mapping', False)
            mods_tags = res.get('mods', [])

            if self.duplex:
                samples = len(read[0].signal) + len(read[1].signal)
                read_id = '%s;%s' % (read[0].read_id, read[1].read_id)
            else:
                samples = len(read.signal)
                read_id = read.read_id

            # Nothing to write: skip before doing any further work
            if not len(seq):
                logger.warning("> skipping empty sequence %s", read_id)
                continue

            # Only derive the mean q-score from the quality string when the
            # result does not already provide one.
            if 'mean_qscore' in res:
                mean_qscore = res['mean_qscore']
            else:
                mean_qscore = mean_qscore_from_qstring(qstring)

            # NOTE(review): in duplex mode `read` is indexed like a pair
            # above, yet `read.run_id` / `read.tagdata()` are accessed on it
            # directly here - confirm the duplex read type provides both.
            tags = [
                f'RG:Z:{read.run_id}_{self.group_key}',
                f'qs:i:{round(mean_qscore)}',
                *read.tagdata(),
                *mods_tags,
            ]

            if self.mode == 'wfq':
                write_fastq(read_id, seq, qstring, fd=self.fd, tags=tags)
            else:
                self.output.write(
                    AlignedSegment.fromstring(
                        sam_record(read_id, seq, qstring, mapping, tags=tags),
                        self.output.header
                    )
                )

            if self.duplex:
                summary.append(duplex_summary_row(
                    read[0], read[1], len(seq), mean_qscore, alignment=mapping))
            else:
                summary.append(summary_row(
                    read, len(seq), mean_qscore, alignment=mapping))

            self.log.append((read_id, samples))
def _ref_pos2seq_pos(alignment: pysam.AlignedSegment, ref_pos: int) -> int:
    """
    Retrieve the base position in the sequence string at a reference
    position. Alignment and ref_pos are assumed to be of the same reference.

    :param alignment: pysam.AlignedSegment
    :param ref_pos: reference position of base
    :return: AlignedSegment.query_sequence position matched to ref_pos.
        None is returned if matching position is not found.
    """
    # TODO optimize: (try matches_only=True)
    # TODO optimize: case when alignment is full matched based on CIGAR (e.g. 30M)
    aligned_pairs = alignment.get_aligned_pairs(matches_only=False,
                                                with_seq=False)
    # Take the sequence position of the first pair at the requested
    # reference position, or None when there is no such pair.
    return next(
        (seq_pos
         for seq_pos, pair_ref_pos in aligned_pairs
         if pair_ref_pos == ref_pos),
        None,
    )
def _cleanup(rec: AlignedSegment, tags_to_invalidate: Iterable[str]) -> None:
    """Removes extended tags from a record that may have become invalid
    after clipping.

    Each tag is cleared by setting it to None on the record.

    Args:
        rec: the record to clean up
        tags_to_invalidate: the tags to clear
    """
    for stale_tag in tags_to_invalidate:
        rec.set_tag(stale_tag, None)
def _make_read_unmapped(rec: AlignedSegment) -> None:
    """Removes mapping information from a read.

    A read on the reverse strand is first restored to its original
    orientation (sequence reverse-complemented, qualities reversed). Then
    the reference, position, cigar, mapping quality, template length and
    the duplicate/secondary/supplementary/proper-pair flags are cleared and
    the read is flagged as unmapped.
    """
    if rec.is_reverse:
        # Fetch the qualities before replacing the sequence and write them
        # back afterwards so that they are preserved.
        quals = rec.query_qualities
        if quals is not None:  # reads may carry no qualities at all
            quals.reverse()
        rec.query_sequence = dnautils.reverse_complement(rec.query_sequence)
        rec.query_qualities = quals
        rec.is_reverse = False

    rec.reference_id = sam.NO_REF_INDEX
    rec.reference_start = sam.NO_REF_POS
    rec.cigar = None
    rec.mapping_quality = 0
    rec.template_length = 0
    rec.is_duplicate = False
    rec.is_secondary = False
    rec.is_supplementary = False
    rec.is_proper_pair = False
    rec.is_unmapped = True
AEQ(BZ.hqRegion, VZ.hqRegion) AEQ(BZ.insertRegions, VZ.insertRegions) def testRead(self): BZR = self.BZ.read() VZR = self.VZ.read() EQ(BZR.basecalls(), VZR.basecalls()) # Mockup some bam records reflecting the internal "pulse BAM" spec from mock import Mock from pysam import AlignedSegment from pbcore.io import BamAlignment from PRmm.io.ZmwReadStitcherIO import StitchedZmw, FeatureDesc pulsePeer = AlignedSegment() pulsePeer.is_unmapped=True pulsePeer.seq = "GATTACAGATTACA" pulsePeer.qname = "FakePulseRead" tags = dict( RG="00000000", np=1, qs=0, qe=14, rq=0.80, sn=[2.0, 3.0, 5.0, 6.0], ip=[15]*14, pw=[16]*14, zm=42, cx=2, # Now, the pulse stuff