def WASP(chrom, gpos, genotype, snps, tagval, read, debug=False):
    """Do the WASP-style allele swap.

    Builds a FASTQ record for ``read`` in which every position with both
    alleles known in ``snps`` is swapped to the *other* haplotype's allele:
    a read tagged haplotype 1 (``tagval == 1``) gets allele '2' substituted,
    and a haplotype-2 read gets allele '1'.

    Args:
        chrom: Chromosome name; a leading "chr"/"Chr" prefix is stripped
            before SNP lookup.
        gpos: Per-base genomic positions for the read sequence (entries may
            be None).
        genotype: Sequence aligned with ``gpos``; only its length bounds the
            scan.
        snps: Mapping of ``(chrom, pos, allele_tag)`` -> base, where
            ``allele_tag`` is '1' or '2'.
        tagval: Haplotype tag of the read (1 or 2); any other value leaves
            the sequence untouched.
        read: pysam AlignedSegment to swap.
        debug: Unused; kept for interface compatibility.

    Returns:
        A four-line FASTQ record (with trailing newline) for the swapped
        read, with the anchoring position encoded in the read name.
    """
    pos = read.query_alignment_start
    # Encode chrom:position in the name so mates can be re-paired after
    # the swapped reads are remapped.
    name = read.query_name + "_-_" + chrom + ":" + str(gpos[pos])
    # Read attributes
    q = pysam.array_to_qualitystring(read.query_qualities)
    r = read.query_sequence
    chrn = re.sub("^[Cc]hr", "", chrom)
    if tagval in (1, 2):
        # Swap to the opposite haplotype's allele.
        swap_allele = '2' if tagval == 1 else '1'
        for i in range(len(genotype)):
            if gpos[i] is None:
                continue
            key1 = (str(chrn), int(gpos[i]), '1')
            key2 = (str(chrn), int(gpos[i]), '2')
            # Only swap when both alleles are known at this site.
            if key1 in snps and key2 in snps:
                r = replace_str_index(
                    r, i, snps[(str(chrn), int(gpos[i]), swap_allele)])
    return "@" + name + "\n" + r + "\n+\n" + q + "\n"
def out2fq(reads, fout):
    """Write one aligned read to *fout* as a four-line FASTQ record.

    Reverse-strand reads are emitted in their original (forward)
    orientation via pysam's get_forward_* accessors; forward-strand reads
    are written exactly as stored.
    """
    if reads.is_reverse:
        seq = reads.get_forward_sequence()
        qual = pysam.array_to_qualitystring(reads.get_forward_qualities())
    else:
        seq = reads.query
        qual = reads.qqual
    fout.write("@{0}\n{1}\n+\n{2}\n".format(reads.qname, seq, qual))
def Clip(self, readStart, readEnd):
    """Clip this read to [readStart, readEnd) and return a copy as a new
    unmapped pysam AlignedSegment.

    Assumes that self.bamRecord.peer is an unmapped AlignedSegment.

    Raises:
        ValueError: if [readStart, readEnd) is not contained within
            [self.readStart, self.readEnd], or a per-base QV tag's length
            disagrees with this read's length.
    """
    new_query_name = "%s/%s/%d_%d" % (self.movieName, self.holeNumber,
                                      readStart, readEnd)
    if not (readStart >= self.readStart and readStart <= readEnd
            and readEnd <= self.readEnd):
        raise ValueError("Unable to clip subread %s from read %s." %
                         (new_query_name, self.readName))
    # Convert absolute read coordinates to offsets into the stored sequence.
    s, e = readStart - self.readStart, readEnd - self.readStart
    # Per-base QV tags that must be clipped alongside the sequence.
    QV_TAGS = ["iq", "dq", "dt", "st", "sq", "mq", "ip", "pw"]
    # Create an unaligned pysam.AlignedSegment object.
    ret = pysam.AlignedSegment()
    ret.query_name = new_query_name
    peer = self.bamRecord.peer
    ret.query_sequence = peer.query_sequence[s:e]
    ret.flag = peer.flag
    # The peer must be an unmapped, unpaired record.
    assert peer.reference_id == -1
    assert peer.reference_start == -1
    assert peer.cigartuples is None
    assert peer.next_reference_id == -1
    assert peer.next_reference_start == -1
    assert peer.template_length == 0
    ret.reference_id = peer.reference_id
    ret.reference_start = peer.reference_start
    ret.cigar = []
    ret.next_reference_id = peer.next_reference_id
    ret.next_reference_start = peer.next_reference_start
    ret.template_length = peer.template_length
    ret.mapping_quality = peer.mapping_quality
    if peer.query_qualities is None:
        ret.query_qualities = None
    else:
        # BUGFIX: slice the numeric quality array directly.  The original
        # converted to a phred *string* first and assigned the string slice,
        # which stores each quality offset by +33 (ASCII codes rather than
        # scores).
        ret.query_qualities = peer.query_qualities[s:e]
    tags = peer.tags[::]
    for index, (tag_name, tag_val) in enumerate(tags):
        if tag_name in QV_TAGS:
            # Per-base tags must span the whole read before clipping.
            if self.__len__() != len(tag_val):
                raise ValueError("%s's %s length %d ! = sequence length %d" %
                                 (peer.query_name, tag_name, len(tag_val),
                                  self.__len__()))
            tags[index] = (tag_name, tag_val[s:e])
        elif tag_name == 'qs':
            tags[index] = (tag_name, int(readStart))
        elif tag_name == 'qe':
            tags[index] = (tag_name, int(readEnd))
    ret.tags = tags
    return ret
def test_slideseq2(self):
    """Round-trip check: slideseq2 FASTQs converted to an unmapped BAM keep
    read names, biological sequences, qualities and per-read tags."""
    bam_fp = pipeline.convert_fastqs_to_unmapped_bam(
        self.fastq_slideseq2_fps,
        "slideseq2",
        tempfile.mkdtemp(),
        name="test",
    )
    with pysam.AlignmentFile(bam_fp, "rb", check_sq=False) as f:
        alignments = list(f.fetch(until_eof=True))
    self.assertEqual(2, len(alignments))
    expected_names = [
        "NB501583:801:H7JLTBGXH:1:11101:20912:1050",
        "NB501583:801:H7JLTBGXH:1:11101:8670:1050",
    ]
    self.assertEqual(expected_names, [al.query_name for al in alignments])
    # The second FASTQ holds the biological read for slideseq2.
    biological = list(ngs.fastq.Fastq(self.fastq_slideseq2_fps[1]))
    self.assertEqual(
        [read.sequence for read in biological],
        [al.query_sequence for al in alignments],
    )
    self.assertEqual(
        [read.qualities.string for read in biological],
        [
            pysam.array_to_qualitystring(al.query_qualities)
            for al in alignments
        ],
    )
    expected_tags = [
        {
            ("UR", "TTTTTTTTT"),
            ("UY", "EEEEEEEEE"),
            ("CR", "CTTTGNTCAATGTT"),
            ("CY", "AAAAA#EEAEEEEE"),
            ("RG", "test"),
        },
        {
            ("UR", "AGTGTCTCA"),
            ("UY", "EAEAEAEEE"),
            ("CR", "CTCTTNATCCTCAT"),
            ("CY", "AAAAA#EEE/EAE/"),
            ("RG", "test"),
        },
    ]
    for al, tags in zip(alignments, expected_tags):
        self.assertEqual(tags, set(al.get_tags()))
def test_indropsv3(self):
    # Round-trip test: convert indropsv3 FASTQs to an unmapped BAM and
    # verify read names, sequences, qualities and extracted tags.
    bam_fp = pipeline.convert_fastqs_to_unmapped_bam(
        self.fastq_indropsv3_fps,
        "indropsv3",
        tempfile.mkdtemp(),
        name="test",
    )
    with pysam.AlignmentFile(bam_fp, "rb", check_sq=False) as f:
        alignments = list(f.fetch(until_eof=True))
        self.assertEqual(2, len(alignments))
        self.assertEqual(
            [
                "M03718:773:000000000-JKHP3:1:1101:18272:1693",
                "M03718:773:000000000-JKHP3:1:1101:17963:1710",
            ],
            [al.query_name for al in alignments],
        )
        # NOTE(review): the two comparisons below read self.fastq_10xv3_fps,
        # not self.fastq_indropsv3_fps — this looks like a copy-paste from a
        # 10xv3 test.  Confirm which fixture (and which file index holds the
        # biological read for indropsv3) before changing.
        self.assertEqual(
            [
                read.sequence
                for read in ngs.fastq.Fastq(self.fastq_10xv3_fps[1])
            ],
            [al.query_sequence for al in alignments],
        )
        self.assertEqual(
            [
                read.qualities.string
                for read in ngs.fastq.Fastq(self.fastq_10xv3_fps[1])
            ],
            [
                pysam.array_to_qualitystring(al.query_qualities)
                for al in alignments
            ],
        )
        # Expected UMI/barcode tags (UR/UY, CR/CY) plus the read group.
        self.assertEqual(
            {
                ("UR", "CCAAAA"),
                ("UY", "FFBFGG"),
                ("CR", "TACGTCATCTCCTACG"),
                ("CY", "1111AFAF1111AFAF"),
                ("RG", "test"),
            },
            set(alignments[0].get_tags()),
        )
        self.assertEqual(
            {
                ("UR", "TTAGAA"),
                ("UY", "FAAAFF"),
                ("CR", "TTAGATCGTTAGATCG"),
                ("CY", "1>>11DFA1>>11DFA"),
                ("RG", "test"),
            },
            set(alignments[1].get_tags()),
        )
def read_bam(file):
    """Yield (name, sequence, quality-string) tuples from a BAM/SAM file.

    Args:
        file: Path ending in ".bam" (opened binary) or ".sam" (opened text).

    Yields:
        (qname, seq, qualitystring) per record.

    Raises:
        Exception: if the path has neither a .bam nor a .sam suffix
            (raised lazily, on first iteration, since this is a generator).
    """
    if file.endswith(".bam"):
        fh = pysam.AlignmentFile(file, "rb", check_sq=False)
    elif file.endswith(".sam"):
        fh = pysam.AlignmentFile(file, 'r')
    else:
        raise Exception("%r file format error" % file)
    # BUGFIX: close the file even when the generator is abandoned before
    # exhaustion — the context manager's __exit__ runs on GeneratorExit,
    # whereas the original's trailing fh.close() only ran after a full scan.
    with fh:
        for line in fh:
            yield line.qname, line.seq, pysam.array_to_qualitystring(
                line.query_qualities)
def convert_bam_to_df(data_fp: str) -> pd.DataFrame:
    """Converts a BAM file to a Pandas dataframe.

    Each record's query name is expected to be four underscore-separated
    fields: cellBC_UMI_readCount_grpFlag.

    Args:
        data_fp: The input filepath for the BAM file to be converted.

    Returns:
        A Pandas dataframe containing the BAM information.
    """
    column_names = [
        "cellBC",
        "UMI",
        "readCount",
        "grpFlag",
        "seq",
        "qual",
        "readName",
    ]
    rows = []
    with pysam.AlignmentFile(
            data_fp, ignore_truncation=True, check_sq=False) as bam_fh:
        for al in bam_fh:
            cell_bc, umi, read_count, grp_flag = al.query_name.split("_")
            rows.append([
                cell_bc,
                umi,
                int(read_count),
                grp_flag,
                al.query_sequence,
                pysam.array_to_qualitystring(al.query_qualities),
                al.query_name,
            ])
    return pd.DataFrame(rows, columns=column_names)
def Clip(self, readStart, readEnd):
    """Clip this read to [readStart, readEnd) and return a copy as a new
    unmapped pysam AlignedSegment.

    Assumes that self.bamRecord.peer is an unmapped AlignedSegment.

    Raises:
        ValueError: if [readStart, readEnd) is not contained within
            [self.readStart, self.readEnd], or a per-base QV tag's length
            disagrees with this read's length.
    """
    new_query_name = "%s/%s/%d_%d" % (self.movieName, self.holeNumber,
                                      readStart, readEnd)
    if not (readStart >= self.readStart and readStart <= readEnd
            and readEnd <= self.readEnd):
        raise ValueError("Unable to clip subread %s from read %s." %
                         (new_query_name, self.readName))
    # Convert absolute read coordinates to offsets into the stored sequence.
    s, e = readStart - self.readStart, readEnd - self.readStart
    # Per-base QV tags that must be clipped alongside the sequence.
    QV_TAGS = ["iq", "dq", "dt", "st", "sq", "mq", "ip", "pw"]
    # Create an unaligned pysam.AlignedSegment object.
    ret = pysam.AlignedSegment()
    ret.query_name = new_query_name
    peer = self.bamRecord.peer
    ret.query_sequence = peer.query_sequence[s:e]
    ret.flag = peer.flag
    # The peer must be an unmapped, unpaired record.
    assert peer.reference_id == -1
    assert peer.reference_start == -1
    assert peer.cigartuples is None
    assert peer.next_reference_id == -1
    assert peer.next_reference_start == -1
    assert peer.template_length == 0
    ret.reference_id = peer.reference_id
    ret.reference_start = peer.reference_start
    ret.cigar = []
    ret.next_reference_id = peer.next_reference_id
    ret.next_reference_start = peer.next_reference_start
    ret.template_length = peer.template_length
    ret.mapping_quality = peer.mapping_quality
    if peer.query_qualities is None:
        ret.query_qualities = None
    else:
        # BUGFIX: slice the numeric quality array directly.  The original
        # converted to a phred *string* first and assigned the string slice,
        # which stores each quality offset by +33 (ASCII codes rather than
        # scores).
        ret.query_qualities = peer.query_qualities[s:e]
    tags = peer.tags[::]
    for index, (tag_name, tag_val) in enumerate(tags):
        if tag_name in QV_TAGS:
            # Per-base tags must span the whole read before clipping.
            if self.__len__() != len(tag_val):
                raise ValueError(
                    "%s's %s length %d ! = sequence length %d" %
                    (peer.query_name, tag_name, len(tag_val),
                     self.__len__()))
            tags[index] = (tag_name, tag_val[s:e])
        elif tag_name == 'qs':
            tags[index] = (tag_name, int(readStart))
        elif tag_name == 'qe':
            tags[index] = (tag_name, int(readEnd))
    ret.tags = tags
    return ret
def reverse_converter(input_file, output_dir, verbose=False, quiet=False, progress=False, **kwargs):
    """CRAM to Fast5 reverse conversion tool"""
    try:
        # Define logger with appropriate verbosity
        logger = get_logger(name="ont2cram_reverse_converter", verbose=verbose, quiet=quiet)
        logger.debug("Check input files")
        readable_file(input_file)
        #writable_dir(output_dir)
        check_destination_exists(input_file, output_dir)

        class Attribute:
            """One HDF attribute/column description parsed from a CRAM
            header ATR:/COL: comment line."""
            path = ''       # HDF path template ("read_XXXXX"/"Read_YYY" placeholders)
            type = ''       # attribute dtype string
            value = None    # constant value stored in the header, if any
            is_col = False  # True for dataset columns (COL:), False for attributes (ATR:)

        attr_dict = {}
        with pysam.AlignmentFile(input_file, "rc", check_sq=False) as samfile:
            logger.debug("Read CRAM header")
            # Build tag -> Attribute mapping from the CO header comments.
            for comment in samfile.header["CO"]:
                if not comment.startswith(("ATR:", "COL:")):
                    continue
                parts = shlex.split(comment)
                part0 = parts[0].split(':')
                a = Attribute()
                a.path = part0[1]
                a.type = part0[2]
                # A third shlex token carries a constant value (prefix stripped).
                a.value = parts[2][3:] if len(parts) == 3 else None
                a.is_col = comment.startswith("COL:")
                tag = parts[1][3:]
                attr_dict[tag] = a

            def is_hex_str(obj, expected_len, is_null_term=False):
                # Check that obj is a hex-digit bytestring of the expected
                # length, optionally null-terminated.
                # NOTE(review): `obj[-1:]==0` compares a bytes slice to an
                # int and is always False — presumably `obj[-1:] == b'\x00'`
                # was intended; left untouched here.  Helper appears unused
                # in this visible chunk.
                bytes_to_remove_end = 1 if is_null_term else 0
                return isinstance(obj, bytes) and len(obj) == expected_len and (not is_null_term or obj[-1:] == 0) \
                    and STR_HEX_PATTERN.match(obj[:expected_len - bytes_to_remove_end].decode())

            def get_path(hdf_path, read_number_long, read_number_short):
                # Substitute the per-read number into the HDF path template.
                if read_number_short:
                    hdf_path = hdf_path.replace("Read_YYY", read_number_short)
                if read_number_long:
                    hdf_path = hdf_path.replace("read_XXXXX", read_number_long)
                return hdf_path

            def get_path_from_dummy(hdf_path, read_number_long, read_number_short):
                # Same as get_path, but strips the "/dummy_attr" placeholder
                # used to mark empty groups.
                p = get_path(hdf_path, read_number_long, read_number_short)
                return p.replace("/dummy_attr", "")

            def write_hdf_attr(hdf5_file, attr_path, attr_value, attr_type):
                # Write one attribute at attr_path, creating the group on
                # demand; fall back to a converted assignment when h5py
                # rejects the (value, dtype) pair.
                #print(f"path={attr_path}, val={attr_value}, type={attr_type}")
                group_name, _, attr_name = attr_path.rpartition('/')
                if attr_name == "noname":
                    # NOTE(review): bare `raise` outside an except block
                    # produces a RuntimeError; left as-is.
                    raise
                try:
                    group = hdf5_file[group_name]
                except KeyError:
                    group = hdf5_file.create_group(group_name)
                try:
                    group.attrs.create(attr_name, attr_value, dtype=attr_type)
                except (TypeError, ValueError):
                    group.attrs[attr_name] = convert_type(
                        attr_value, attr_type)

            logger.debug("CRAM header")
            # One Fast5 (re)write per CRAM record.
            for read in tqdm.tqdm(samfile.fetch(until_eof=True),
                                  unit=" Reads",
                                  unit_scale=True,
                                  disable=not progress):
                COUNTER["Reads written"] += 1
                fast5_filename = read.get_tag(FILENAME_TAG)
                # Read numbers may be stored under a long and/or short tag.
                read_number_long = None
                try:
                    read_number_long = "read_" + str(
                        read.get_tag(READ_NUM_TAG_LONG))
                except KeyError:
                    pass
                read_number_short = None
                try:
                    read_number_short = "Read_" + str(
                        read.get_tag(READ_NUM_TAG_SHORT))
                except KeyError:
                    pass
                output_file = os.path.join(output_dir, fast5_filename)
                dir = os.path.dirname(output_file)
                if not os.path.exists(dir) and len(dir) > 0:
                    os.makedirs(dir)
                with h5py.File(output_file, "a") as f:
                    # "nofastq" query names mark records without basecalls.
                    if read.query_name != "nofastq":
                        fastq_lines = np.string_("\n".join([
                            read.query_name, read.query_sequence, '+',
                            pysam.array_to_qualitystring(read.query_qualities) + '\n'
                        ]))
                        f.create_dataset(
                            "/Analyses/Basecall_1D_000/BaseCalled_template/Fastq",
                            data=fastq_lines)
                    # First pass over tags: collect dataset columns.
                    DSETS = {}
                    for tag_name, tag_val in read.get_tags():
                        if tag_name in RESERVED_TAGS:
                            continue
                        a = attr_dict[tag_name]
                        if a.is_col:
                            dset_name, _, col_name = get_path(
                                a.path, read_number_long,
                                read_number_short).rpartition('/')
                            if dset_name.endswith(
                                    "Fastq") and col_name == "noname":
                                continue
                            if dset_name not in DSETS:
                                DSETS[dset_name] = []
                            dset = DSETS[dset_name]
                            if col_name == "noname":
                                # Unnamed column: raw value becomes the dataset.
                                dset.append(tag_val)
                            else:
                                # String columns were joined with \x03 on the
                                # forward conversion; split them back apart.
                                dset.append(
                                    np.array(list(tag_val.split('\x03')) if a.type.startswith(
                                        ('S', 'U')) else tag_val,
                                        dtype=[(col_name, a.type)]))
                    for dset_name, columns in DSETS.items():
                        # Merge multi-column datasets into one record array.
                        d = columns[0] if len(
                            columns) == 1 else rfn.merge_arrays(
                                columns, flatten=True, usemask=False)
                        f.create_dataset(dset_name, data=d)
                    # write constant values stored in cram header
                    for a in attr_dict.values():
                        if a.is_col:
                            continue
                        if a.value:
                            write_hdf_attr(
                                f,
                                get_path(a.path, read_number_long,
                                         read_number_short), a.value, a.type)
                    # write tags stored in cram records
                    for tag_name, tag_val in read.get_tags():
                        if tag_name in RESERVED_TAGS:
                            continue
                        a = attr_dict[tag_name]
                        if is_empty_hdf_node(a.path):
                            # Placeholder tag: just (re)create the empty group.
                            f.create_group(
                                get_path_from_dummy(a.path, read_number_long,
                                                    read_number_short))
                            continue
                        if a.is_col:
                            continue
                        # Only write per-read values that differ from the
                        # header constant.
                        if a.value != tag_val:
                            write_hdf_attr(
                                f,
                                get_path(a.path, read_number_long,
                                         read_number_short), tag_val, a.type)
    finally:
        # NOTE(review): if get_logger itself raises, `logger` is unbound
        # here and this line raises NameError — confirm acceptable.
        logger.info(dict_to_str(COUNTER))
for aligned_segment in alignment_file: """ @type aligned_segment: pysam.libcalignedsegment.AlignedSegment """ if vendor_filter and aligned_segment.is_qcfail: continue # Assign the AlignedSegment to its ReadGroup-specific GzipFile. if aligned_segment.is_read1: fifo_queue = fifo_queue_dict[aligned_segment.get_tag('RG')][0] else: fifo_queue = fifo_queue_dict[aligned_segment.get_tag('RG')][1] fifo_queue.put( '@' + aligned_segment.query_name + '\n' + aligned_segment.query_sequence + '\n' + '+\n' + pysam.array_to_qualitystring(aligned_segment.query_qualities) + '\n' ) if aligned_segment.has_tag('BC'): # Assign the AlignedSegment to its ReadGroup-specific GzipFile. fifo_queue = fifo_queue_dict[aligned_segment.get_tag('RG')][2] fifo_queue.put( '@' + aligned_segment.query_name + '\n' + aligned_segment.get_tag('BC') + '\n' + '+\n' + aligned_segment.get_tag('QT') + '\n' ) for read_group_id in fifo_queue_dict.iterkeys(): for fifo_queue in fifo_queue_dict[read_group_id]:
def retrieve_reads_contig_wise(sam_merged, contig_data, output_dir):
    """Split a merged, name-grouped BAM into per-core FASTQ files.

    Reads contig->core assignments from ``contig_data`` (whitespace-separated
    lines; field 0 is the contig/node name, field -3 the core name), then
    groups alignments by query name.  Paired reads (two distinct
    sequence/quality tuples) are written to ``<core>.end1``/``<core>.end2``;
    single reads go to ``<core>.end``.  Reads mapping to unknown contigs are
    filed under "unknown".

    Args:
        sam_merged: Path to the coordinate-merged, name-grouped BAM file.
        contig_data: Path to the contig-to-core assignment table.
        output_dir: Directory receiving the per-core FASTQ files.
    """
    contig_dict = dict()
    opened_files = dict()

    def close_all_files():
        for my_file in opened_files.values():
            my_file.close()

    def get_output_handle(filename):
        # Open each output file once and cache the handle.
        # BUGFIX: the original attached `except IOError` to the same try
        # whose `except KeyError` handler performed the open(), so open()
        # failures were never caught by it; here the open is guarded directly.
        try:
            return opened_files[filename]
        except KeyError:
            pass
        try:
            handle = open(filename, "w")
        except IOError:
            logger.error(
                "Error when trying to handle"
                "%s. Maybe there are too many opened"
                "files at once: %s",
                filename,
                len(opened_files),
            )
            raise
        opened_files[filename] = handle
        return handle

    # BUGFIX: use the contig_data argument instead of the hard-coded
    # "contig_data.txt" that the parameter previously shadowed.
    with open(contig_data) as contig_table:
        for line in contig_table:
            fields = line.split()
            node = fields[0]
            core = fields[-3]
            contig_dict[node] = core

    query_getter = operator.attrgetter("query_name")
    with pysam.AlignmentFile(sam_merged, "rb") as sam_handle:
        for (my_read_name, alignment_pool) in itertools.groupby(
                sam_handle, query_getter):
            my_read_set = dict()
            my_core_set = set()
            for my_alignment in alignment_pool:
                is_reverse = my_alignment.is_reverse
                my_seq = my_alignment.query_sequence
                my_qual = my_alignment.query_qualities
                my_qual_string = pysam.array_to_qualitystring(my_qual)
                # Restore the original read orientation for reverse hits.
                if is_reverse:
                    my_seq_string = str(Seq(my_seq).reverse_complement())
                    my_qual_string = my_qual_string[::-1]
                else:
                    my_seq_string = my_seq
                my_seq_tuple = (my_seq_string, my_qual_string)
                if len(my_read_set) > 2:
                    logger.warning(
                        "Something's gone wrong with read set %s, as "
                        "there are %s of them",
                        my_read_name,
                        len(my_read_set),
                    )
                elif len(my_read_set) == 0:
                    # First distinct sequence seen for this name = mate 1.
                    my_read_set[my_seq_tuple] = "forward"
                elif my_seq_tuple not in my_read_set:
                    # Second distinct sequence = mate 2.
                    my_read_set[my_seq_tuple] = "reverse"
                try:
                    ref = contig_dict[my_alignment.reference_name]
                    my_core_set.add(ref)
                except KeyError:
                    my_core_set.add("unknown")
            if len(my_read_set) == 2:
                for core_name in my_core_set:
                    for my_tuple, file_id in my_read_set.items():
                        if file_id == "forward":
                            file_end = ".end1"
                        elif file_id == "reverse":
                            file_end = ".end2"
                        # BUGFIX: the original passed positional arguments to
                        # named {core_name}/{file_end} fields, which raises
                        # KeyError at runtime.
                        basename = "{core_name}{file_end}".format(
                            core_name=core_name, file_end=file_end)
                        filename = os.path.join(output_dir, basename)
                        file_to_write = get_output_handle(filename)
                        seq, qual = my_tuple
                        record = "@{}\n{}\n+\n{}\n".format(
                            my_read_name, seq, qual)
                        file_to_write.write(record)
            elif len(my_read_set) == 1:
                for core_name in my_core_set:
                    basename = "{}{}".format(core_name, ".end")
                    filename = os.path.join(output_dir, basename)
                    file_to_write = get_output_handle(filename)
                    # BUGFIX: dict views are not indexable in Python 3
                    # (`my_read_set.keys()[0]` raised TypeError).
                    seq, qual = next(iter(my_read_set))
                    record = "@{}\n{}\n+\n{}\n".format(
                        my_read_name, seq, qual)
                    file_to_write.write(record)
            else:
                logger.warning(
                    "Something's gone wrong with read set %s, as "
                    "there are %s of them",
                    my_read_name,
                    len(my_read_set),
                )
    close_all_files()