def __encrypt_unmapped(alignment: pysam.AlignedSegment, secret: bytes):
    """
    Apply the stream cipher to the bases of an unmapped alignment, in place.

    The same call is used in both directions:
    alignment + secret => encrypted_alignment
    encrypted_alignment + secret => alignment

    Mapped alignments are left untouched. The base qualities of an unmapped
    alignment are kept as they were; only the sequence changes.

    :param alignment: alignment whose sequence is rewritten when it is unmapped
    :param secret: key bytes mixed with the query name to derive the cipher key
    :return: None, the alignment is modified in place
    :raises ValueError: when the alignment is unmapped and no secret was given
    """
    if not alignment.is_unmapped:
        return

    if secret is None:
        raise ValueError(
            'Secret key must be present when unmapped alignments are iterated.'
        )

    # SHA-512 gives a 64B long key (encrypts 256 bases); the query name makes
    # the key specific to this read.
    cipher_key = hashlib.sha512(secret + alignment.query_name.encode()).digest()
    encrypted_bases = cmn.stream_cipher(alignment.query_sequence, cipher_key)

    # Remember the qualities before touching the sequence so they can be put
    # back unchanged afterwards.
    # TODO: maybe something else with the quality?
    saved_qualities = alignment.query_qualities
    alignment.query_sequence = encrypted_bases
    alignment.query_qualities = saved_qualities
def softclip_end_of_alignment_by_query(
    rec: AlignedSegment,
    bases_to_clip: int,
    clipped_base_quality: Optional[int] = None,
    tags_to_invalidate: Iterable[str] = TAGS_TO_INVALIDATE,
) -> ClippingInfo:
    """Soft-clips additional bases at the end of a read's alignment.

    The new clipping is placed before any hard or soft clipping already present at the end of
    the read, e.g. clipping a read with cigar 100M5S by bases_to_clip=10 gives 90M15S.

    Nothing is done, and ClippingInfo(0, 0) is returned, when the read is unmapped or when
    bases_to_clip < 1. When bases_to_clip is at least the number of aligned query bases the
    whole read is handed to `_clip_whole_read` instead.

    Args:
        rec: the BAM record to clip; it is modified in place
        bases_to_clip: the number of additional read/query bases to clip
        clipped_base_quality: if not None, the quality to assign to bases in the clipped region
        tags_to_invalidate: the extended attributes to remove once the read has been clipped

    Returns:
        ClippingInfo: a named tuple containing the number of query/read bases and the number
            of target/reference bases clipped.
    """
    if rec.is_unmapped:
        return ClippingInfo(0, 0)
    if bases_to_clip < 1:
        return ClippingInfo(0, 0)

    # Not enough aligned bases to leave anything behind: clip the entire read.
    if rec.query_alignment_length <= bases_to_clip:
        return _clip_whole_read(rec, tags_to_invalidate)

    # The clipping helper works from the start of the read, so flip the cigar and the
    # qualities (in place), clip, and then flip both back.
    flipped_cigar = Cigar.from_cigartuples(rec.cigartuples).reversed()
    base_quals = rec.query_qualities
    base_quals.reverse()

    clipped_cigar, result = _clip(
        flipped_cigar, base_quals, bases_to_clip, clipped_base_quality
    )

    base_quals.reverse()
    rec.query_qualities = base_quals
    rec.cigarstring = str(clipped_cigar.reversed())

    _cleanup(rec, tags_to_invalidate)
    return result
def _set_length_dependent_fields(
    self,
    rec: pysam.AlignedSegment,
    length: int,
    bases: Optional[str] = None,
    quals: Optional[List[int]] = None,
    cigar: Optional[str] = None,
) -> None:
    """Populates the bases, qualities and cigar of a record.

    Every one of bases, quals and cigar that is supplied must imply the same query length.
    When none is supplied the `length` parameter is used instead. Values that are not
    supplied are synthesized at the resulting length. The cigar is only written when the
    record is not flagged as unmapped.

    Args:
        rec: a SAM record, modified in place
        length: the length to use if all of bases/quals/cigar are None
        bases: an optional string of bases for the read
        quals: an optional list of qualities for the read
        cigar: an optional cigar string for the read

    Raises:
        ValueError: if the supplied bases/quals/cigar imply different query lengths
    """
    # Collect the distinct query lengths implied by whatever the caller passed in.
    implied = {len(given) for given in (bases, quals) if given is not None}
    if cigar is not None:
        parsed = sam.Cigar.from_cigarstring(cigar)
        implied.add(sum(elem.length_on_query for elem in parsed.elements))

    if len(implied) > 1:
        raise ValueError(
            "Provided bases/quals/cigar are not length compatible.")

    # Fall back to the explicit length only when nothing else pins it down.
    query_length = implied.pop() if implied else length

    # Use the supplied value where there is one, otherwise synthesize it.
    rec.query_sequence = bases or self._bases(query_length)
    rec.query_qualities = quals or [self.base_quality] * query_length

    if rec.is_unmapped:
        return
    rec.cigarstring = cigar or f"{query_length}M"
def _make_read_unmapped(rec: AlignedSegment) -> None:
    """Removes mapping information from a read, in place.

    If the read is flagged as reverse strand, its bases are reverse-complemented and its
    qualities reversed, and the reverse flag is cleared. All alignment fields (reference,
    position, cigar, mapping quality, template length) are then reset, the duplicate,
    secondary, supplementary and proper-pair flags are cleared, and the read is flagged as
    unmapped.

    Args:
        rec: the record to unmap; it is modified in place
    """
    if rec.is_reverse:
        # Fetch the qualities before replacing the sequence: pysam invalidates the stored
        # qualities whenever query_sequence is assigned.
        quals = rec.query_qualities
        # A record may carry no qualities at all, in which case there is nothing to reverse.
        if quals is not None:
            quals.reverse()
        rec.query_sequence = dnautils.reverse_complement(rec.query_sequence)
        rec.query_qualities = quals
        rec.is_reverse = False

    rec.reference_id = sam.NO_REF_INDEX
    rec.reference_start = sam.NO_REF_POS
    # `cigartuples` is the supported name; `cigar` is a deprecated alias for it in pysam.
    rec.cigartuples = None
    rec.mapping_quality = 0
    rec.template_length = 0
    rec.is_duplicate = False
    rec.is_secondary = False
    rec.is_supplementary = False
    rec.is_proper_pair = False
    rec.is_unmapped = True