def test_get_blocked_alignment():
    """Check ``get_blocked_alignment`` for a plain read and a clipped read.

    The first read is a 50M alignment tested in both orientations, with and
    without ``is_rf``. The second read is soft-clipped and carries an ``SA``
    tag pointing into a later block.
    """
    blocks = [
        GenomeInterval('1', 0, 100),
        GenomeInterval('1', 110, 210),
        GenomeInterval('1', 210, 2000),
    ]
    # NOTE(review): absolute path to a developer's BAM; the test cannot run
    # on any machine that lacks this file.
    bam_path = '/home/jgarthur/sv/analysis/alignments/bwa_mem/short-reads/jun_jul.mdup.merge.mdup.bam'

    # The context manager closes the BAM handle even if an assertion fails.
    with pysam.AlignmentFile(bam_path, 'rb') as bam:
        aln = pysam.AlignedSegment()
        aln.pos = 0
        aln.cigarstring = '50M'
        aln.seq = 'A' * 50

        aln.is_reverse = False
        print(get_blocked_alignment(aln, blocks, 0, bam))
        assert get_blocked_alignment(aln, blocks, 0, bam) == ([1], 0)
        assert get_blocked_alignment(aln, blocks, 0, bam, is_rf=True) == ([0], 50)

        aln.is_reverse = True
        print(get_blocked_alignment(aln, blocks, 0, bam))
        assert get_blocked_alignment(aln, blocks, 0, bam) == ([0], 50)
        assert get_blocked_alignment(aln, blocks, 0, bam, is_rf=True) == ([1], 0)

        aln = pysam.AlignedSegment()
        aln.rname = 0
        aln.pos = 90
        aln.seq = 'A' * 40
        aln.cigarstring = '20M20S'
        aln.set_tag('SA', '1,191,-,20M20S,60,0;', 'Z')
        print(get_blocked_alignment(aln, blocks, 0, bam))
        assert get_blocked_alignment(aln, blocks, 0, bam) == ([1, 2], -90)
        assert get_blocked_alignment(aln, blocks, 0, bam, is_rf=True) == ([3, 0], -80)
def pair_up(read_pair):
    """Return copies of two reads whose mate fields point at each other.

    :param read_pair: sequence whose first two items are the reads to pair.
        Both are deep-copied; the inputs are not modified.
    :returns: tuple ``(r1_cp, r2_cp)`` holding the updated copies.

    Exits the process with status 1 (message on stderr) when the two query
    names differ. A flag pair that is absent from ``flag_table_proper``
    propagates whatever error that lookup raises.
    """
    r1_cp = copy.deepcopy(read_pair[0])
    r2_cp = copy.deepcopy(read_pair[1])

    if r1_cp.query_name != r2_cp.query_name:
        # sys.exit with a string prints it to stderr and exits with code 1.
        sys.exit("Error: read name unmatched.")

    # change flag
    new_flags = flag_table_proper[(r1_cp.flag, r2_cp.flag)]
    r1_cp.flag = new_flags[0]
    r2_cp.flag = new_flags[1]

    # RNEXT: "=" when both reads sit on the same reference, otherwise each
    # read names the reference of the other.
    if r1_cp.reference_name == r2_cp.reference_name:
        r1_cp.next_reference_name = r2_cp.next_reference_name = "="
    else:
        r1_cp.next_reference_name, r2_cp.next_reference_name = (
            r2_cp.reference_name,
            r1_cp.reference_name,
        )

    # PNEXT and TLEN are computed the same way in both cases.
    r1_cp.next_reference_start = r2_cp.reference_start
    r2_cp.next_reference_start = r1_cp.reference_start
    r1_cp.template_length = r1_cp.next_reference_start - r1_cp.reference_start
    r2_cp.template_length = -r1_cp.template_length

    return (r1_cp, r2_cp)
def setUp(self):
    """Create the reference string, filter settings and three 5M reads."""

    def make_read(name, start, bases):
        # All three fixtures share MAPQ 30, base quality 30 and a 5M CIGAR.
        read = pysam.AlignedSegment()
        read.reference_start = start
        read.query_name = name
        read.mapping_quality = 30
        read.query_sequence = bases
        read.query_qualities = [30] * 5
        read.cigarstring = '5M'
        return read

    self.ref = 'AAAAAAAAAAAAAAAAAAAA'

    # Empty namespace that the thresholds are attached to by hand.
    self.args = argparse.ArgumentParser().parse_args([])
    self.args.min_mq = 30
    self.args.min_bq = 30
    self.args.cons_cov = 2

    self.aln1 = make_read('read1', 10, "AAAAA")
    self.aln2 = make_read('read2', 13, "AAAAA")
    self.aln3 = make_read('read3', 15, "TAAAA")
def setUp(self):
    """Create quality thresholds and the three reads stored in ``self.alns``."""
    self.mq = 30
    self.bq = 30

    # read1: 16 matched bases, uniform base quality 30, MAPQ 30.
    first = pysam.AlignedSegment()
    first.reference_start = 10
    first.query_name = 'read1'
    first.mapping_quality = 30
    first.query_sequence = "AAAAATAAAATAAAAT"
    first.query_qualities = [30] * 16
    first.cigarstring = '16M'

    # read2: MAPQ 20, a 2-base deletion, and one base (index 3) at quality 20.
    second = pysam.AlignedSegment()
    second.reference_start = 12
    second.query_name = 'read2'
    second.mapping_quality = 20
    second.query_sequence = "AAAGAAGAAAAG"
    second.query_qualities = [20 if idx == 3 else 33 for idx in range(12)]
    second.cigarstring = '5M2D7M'

    # read3: only a name and MAPQ 0 are set.
    third = pysam.AlignedSegment()
    third.mapping_quality = 0
    third.query_name = 'read3'

    self.alns = [first, second, third]
def simulate_read_pair(sequence, start, length=150, isize=400, flip=False):
    """Build a ``ReadPair`` from two slices of ``sequence``.

    The first read is ``sequence[start:start + length]``; the second is the
    ``reverse_comp`` of the ``length`` bases ending at ``start + isize``.
    With ``flip`` set the two reads swap places in the pair.
    """
    mate_end = start + isize

    left = pysam.AlignedSegment()
    left.query_sequence = sequence[start:start + length]

    right = pysam.AlignedSegment()
    right.query_sequence = reverse_comp(sequence[mate_end - length:mate_end])

    first, second = (right, left) if flip else (left, right)
    return ReadPair(Alignment(first), Alignment(second), read_stats)
def test_iterable_molecule_iter(self):
    """Feed three in-memory reads to ``MoleculeIterator`` and check grouping.

    Two reads share sample ``CELL_1`` and one belongs to ``CELL_2``. The test
    expects two molecules (sizes 2 and 1) and a ``TF`` tag of 2 on the first
    molecule after ``write_tags``.
    """
    from singlecellmultiomics.molecule import MoleculeIterator
    from singlecellmultiomics.fragment import Fragment

    def make_read(header, sample, sequence):
        read = pysam.AlignedSegment(header)
        read.set_tag('SM', sample)
        read.set_tag('RX', 'CAT')
        read.reference_name = 'chr1'
        read.reference_start = 100
        read.query_sequence = sequence
        read.cigarstring = f'{len(sequence)}M'
        read.mapping_quality = 60
        return read

    try:
        with pysam.AlignmentFile('test.sam', 'w',
                                 reference_names=['chr1', 'chr2'],
                                 reference_lengths=[1000, 1000]) as test_sam:
            reads = [
                make_read(test_sam.header, 'CELL_1', 'ATCGGG'),
                make_read(test_sam.header, 'CELL_1', 'ATCGG'),
                make_read(test_sam.header, 'CELL_2', 'ATCGG'),
            ]
            molecules = list(MoleculeIterator(reads, yield_invalid=True))

            self.assertEqual(len(molecules), 2)
            self.assertEqual(max((len(m) for m in molecules)), 2)
            self.assertEqual(min((len(m) for m in molecules)), 1)

            # Test tags:
            a = molecules[0]
            a.write_tags()
            self.assertEqual(a[0][0].get_tag('TF'), 2)
    finally:
        # Remove the scratch file even when an assertion above fails.
        if os.path.exists('test.sam'):
            os.remove('test.sam')
def initialise_alignment(query_name, reference_id, reference_start,
                         query_sequence, cigarstring, flag,
                         mapping_quality=60, query_qualities=None,
                         tags=None, header=None):
    """Create a `pysam.AlignedSegment` object.

    :param query_name: name of the query sequence
    :param reference_id: index to the reference name
    :param reference_start: 0-based index of first leftmost reference
        coordinate
    :param query_sequence: read sequence bases, including those soft clipped
    :param cigarstring: cigar string representing the alignment of query
        and reference
    :param flag: bitwise flag representing some properties of the alignment
        (see SAM format)
    :param mapping_quality: optional quality of the mapping or query to
        reference
    :param query_qualities: optional base qualities of the query, including
        soft-clipped ones! Left unset when ``None``.
    :param tags: optional mapping of tag name to tag value; each entry is
        applied with ``set_tag``
    :param header: optional `pysam.AlignmentHeader` object, enabling use of
        the reference_name attr of the returned `pysam.AlignedSegment` obj.
    :returns: `pysam.AlignedSegment` object
    """
    if header is not None:
        segment = pysam.AlignedSegment(header)
    else:
        segment = pysam.AlignedSegment()

    segment.query_name = query_name
    segment.reference_id = reference_id
    segment.reference_start = reference_start
    segment.query_sequence = query_sequence
    segment.cigarstring = cigarstring
    segment.flag = flag
    segment.mapping_quality = mapping_quality
    if query_qualities is not None:
        segment.query_qualities = query_qualities

    tag_map = dict() if tags is None else tags
    for tag_name, tag_value in tag_map.items():
        segment.set_tag(tag_name, tag_value)
    return segment
def package(self):
    """
    Convert ``self.rec_1`` and ``self.rec_2`` from ``pysam.AlignedSegment``
    to ``str``

    The user may want to implement multiprocessing to decrease the amount of
    time to classify all reads in a SAM/BAM file. ``self.rec_1`` and
    ``self.rec_2`` and all ``pysam.AlignedSegment`` objects are not
    pickleable and cannot be passed through a ``multiprocessing.Queue``.
    Instead of directly handling BAM/SAM strings, users can choose to create
    a ``Pair``, call ``package`` to convert the records to strings using the
    ``to_string()`` function from ``pysam`` and pass the ``Pair`` object
    through a ``Queue``.

    Nothing happens unless both records are ``pysam.AlignedSegment``
    instances, so calling ``package`` twice is harmless.
    """
    # isinstance avoids constructing two throw-away segments per call just
    # to obtain their type.
    if (isinstance(self.rec_1, pysam.AlignedSegment)
            and isinstance(self.rec_2, pysam.AlignedSegment)):
        self.rec_1 = self.rec_1.to_string()
        self.rec_2 = self.rec_2.to_string()
def build_read(query_name="read_28833_29006_6945",
               query_sequence="AGCTTAGCTA",
               flag=99,
               reference_id=0,
               reference_start=32,
               mapping_quality=20,
               cigar=None,
               next_reference_id=0,
               next_reference_start=199,
               template_length=167,
               query_qualities=None):
    #pylint: disable=no-member,too-many-arguments
    """Build a ``pysam.AlignedSegment`` wrapped in a ``MicroMock``.

    :param cigar: CIGAR tuples; defaults to one match (op 0) spanning the
        whole ``query_sequence``.
    :param query_qualities: base qualities; defaults to 27 for every base.
    :returns: ``MicroMock`` whose ``aligned_segment`` is the new segment.

    The remaining parameters are copied onto the segment attribute of the
    same name.
    """
    a = pysam.AlignedSegment()
    a.query_name = query_name
    a.query_sequence = query_sequence
    a.flag = flag
    a.reference_id = reference_id
    a.reference_start = reference_start
    a.mapping_quality = mapping_quality
    if cigar is None:
        a.cigar = ((0, len(query_sequence)), )
    else:
        a.cigar = cigar
    a.next_reference_id = next_reference_id
    a.next_reference_start = next_reference_start
    a.template_length = template_length
    if query_qualities is None:
        a.query_qualities = [27] * len(query_sequence)
    else:
        # Previously a caller-supplied value was silently ignored.
        a.query_qualities = query_qualities
    return MicroMock(aligned_segment=a)
def print_as_BAM(linked, header, path):
    """Write each group of linked introns to a BAM file, one record per intron.

    :param linked: iterable of intron groups; every intron unpacks as
        ``(chrom, start, end, strand)``.
    :param header: header passed to ``pysam.AlignmentFile``.
    :param path: output BAM path.

    Records of group ``n`` share the query name ``'linked<n>'``. Each record
    points at the next intron of its group; the last one points back at the
    first and gets the negated template length.
    """
    with pysam.AlignmentFile(path, 'wb', header=header) as bam_out:
        for group_idx, group in enumerate(linked):
            ordered = sort_by_pos(group)
            n_introns = len(ordered)

            # calculate the span from the first start to the last end
            tlen = ordered[-1][2] - ordered[0][1] + 1 if n_introns > 1 else 0

            # print out each intron as a separate BAM entry
            for idx, (chrom, start, end, strand) in enumerate(ordered):
                length = end - start + 1
                if idx < n_introns - 1:
                    next_ref = ordered[idx + 1][1]
                else:
                    next_ref = ordered[0][1]
                    tlen = -tlen

                rec = pysam.AlignedSegment()
                rec.query_name = 'linked' + str(group_idx)
                rec.query_sequence = 'N' * length
                rec.flag = 0
                rec.reference_id = chrom
                rec.reference_start = start
                rec.mapping_quality = 60  # 60 = uniquely mapped for HISAT2
                rec.cigartuples = [(0, length)]
                rec.next_reference_id = chrom
                rec.next_reference_start = next_ref
                rec.template_length = tlen
                rec.query_qualities = pysam.qualitystring_to_array('/' * length)
                rec.tags = [('XN', next_ref + 1), ('XI', n_introns)]
                bam_out.write(rec)
def block_parser_handle_hanging(opts, aln, bam, g, blocks, block_ends,
                                insert_ranges, cached_dist, map_models,
                                block_idx):
    """Build a placeholder mate for ``aln`` and hand both to the pair handler.

    The mate takes its mapped/unmapped state from ``aln.mate_is_unmapped``,
    gets MAPQ 0 and an all-``A`` sequence. Read length and mean quality come
    from the ``ZR``/``ZQ`` tags when ``opts['use_mate_tags']`` is set.
    """
    mate = pysam.AlignedSegment()
    mate.is_unmapped = aln.mate_is_unmapped

    # make sure we don't end up with mate.rname == -1 if mate is unmapped
    if mate.is_unmapped:
        mate_ref, mate_pos = aln.rname, aln.pos
    else:
        mate_ref, mate_pos = aln.mrnm, aln.mpos
    mate.rname = mate_ref
    mate.pos = mate_pos
    mate.mapq = 0

    if opts['use_mate_tags']:
        mate_rlen = aln.get_tag('ZR')
        mate_qmean = aln.get_tag('ZQ')
    else:
        # values don't matter in this case since we won't condition on
        # qmean/rlen
        mate_rlen = aln.query_length
        mate_qmean = 0
    mate.query_sequence = 'A' * mate_rlen

    block_parser_handle_pair(opts, aln, mate, bam, g, blocks, block_ends,
                             insert_ranges, cached_dist, map_models,
                             block_idx1=block_idx, qmean2=mate_qmean)
def get_chic_read(header, qname, contig='chr1', start=100, sequence='ATCGGG',
                  cigar=None, umi='CAT', sample='CELL_1', is_reverse=False,
                  read1=True, paired=False, proper_pair=True):
    """Create a read tagged with sample (``SM``), UMI (``RX``) and ``MX='scCHIC'``.

    ``cigar`` defaults to a full-length match of ``sequence``. ``read1``
    selects which of the read1/read2 flags is set. ``proper_pair`` is applied
    only when ``paired`` is true.
    """
    segment = pysam.AlignedSegment(header)

    tag_values = (
        ('SM', sample),    # the sample the read belongs to
        ('RX', umi),       # the UMI
        ('MX', 'scCHIC'),
    )
    for tag, value in tag_values:
        segment.set_tag(tag, value)

    # By default the molecule assignment is done based on the mapping
    # location of read 1:
    segment.reference_name = contig
    segment.reference_start = start
    segment.query_name = qname
    segment.query_sequence = sequence
    segment.is_reverse = is_reverse
    segment.cigarstring = cigar if cigar is not None else f'{len(sequence)}M'

    is_first = bool(read1)
    segment.is_read1 = is_first
    segment.is_read2 = not is_first

    if paired:
        segment.is_paired = True
        segment.is_proper_pair = proper_pair
    return segment
def bamFile(tmpdir_factory):
    """Write an indexed three-read BAM into a temp directory and return its path.

    One segment object is reused for all three records, so each record
    inherits every field that is not reassigned before its ``write`` call.
    """
    header = {'HD': {'VN': '1.0'}, 'SQ': [{'LN': 1000, 'SN': 'ref'}]}
    p = tmpdir_factory.mktemp('test').join('test.bam')

    # The context manager closes the BAM even when a write raises, so no
    # handle leaks and pysam.index below always sees a finished file.
    with pysam.AlignmentFile(str(p), "wb", header=header) as outFile:
        a = pysam.AlignedSegment()
        a.query_name = "read3"
        a.query_sequence = "GGGGAAAAAT"
        a.reference_start = 28
        a.reference_id = 0
        a.mapping_quality = 20
        a.cigar = ((0, 10), )
        a.flag = 16
        outFile.write(a)

        a.query_name = "read2"
        a.reference_start = 32
        a.query_sequence = "AAAAATTTTT"
        a.flag = 0
        outFile.write(a)

        a.query_name = "read1"
        a.query_sequence = "TTAAAAACCCCCGGC"
        a.cigar = ((5, 5), (4, 2), (0, 10), (2, 2), (0, 1), (1, 1), (0, 1))
        outFile.write(a)

    pysam.index(str(p))
    return (p)
def test_simple(self):
    """Crop a 100M alignment to reference interval 150-159."""
    source = pysam.AlignedSegment()
    source.reference_start = 100
    source.cigar = [(MATCH, 100)]
    source.seq = 'A' * 100

    expected = pysam.AlignedSegment()
    expected.reference_start = 150
    expected.cigar = [(SOFT_CLIP, 50), (MATCH, 10), (SOFT_CLIP, 40)]
    expected.seq = 'A' * 100

    self.assertEqual(sam.crop_al_to_ref_int(source, 150, 159), expected)
    # An interval one base shorter must give a different result.
    self.assertNotEqual(sam.crop_al_to_ref_int(source, 150, 158), expected)
def test_starts_just_before_deletion(self):
    """Crop so that one matched base remains ahead of the deletion."""
    source = pysam.AlignedSegment()
    source.reference_start = 100
    source.cigar = [(MATCH, 50), (DEL, 10), (MATCH, 50)]
    source.seq = 'A' * 100

    expected = pysam.AlignedSegment()
    expected.reference_start = 149
    expected.cigar = [(SOFT_CLIP, 49), (MATCH, 1), (DEL, 10), (MATCH, 50)]
    expected.seq = 'A' * 100

    self.assertEqual(sam.crop_al_to_ref_int(source, 149, 210), expected)
    # Starting one base later must give a different result.
    self.assertNotEqual(sam.crop_al_to_ref_int(source, 150, 210), expected)
def create_Bam(alignments, outbam):
    """Write ``alignments`` to ``outbam``, then sort and index the result.

    :param alignments: iterable of records exposing ``contig``, ``pos``,
        ``flag``, ``Rname``, ``seq``, ``basequal`` and, for mapped records,
        ``cigar``, ``mapq``, ``MDtag`` and ``cstag``.
    :param outbam: path of the unsorted BAM to write.

    The header is built from the contigs of ``chr1.fa``. The sorted copy is
    always written to ``test.srt.bam``.
    """
    fa = pyfaidx.Fasta('chr1.fa')
    dict_fa = {'HD': {'VN': 1.6, 'SO': 'coordinate'},
               'SQ': [{'SN': x, 'LN': len(fa[x])} for x in fa.keys()]}
    alignmentsSorted = sorted(alignments, key=attrgetter('contig', 'pos'))

    # The context manager closes the BAM even when a record is rejected.
    with pysam.AlignmentFile(outbam, mode="wb", header=dict_fa) as fh:
        for subreads in alignmentsSorted:
            s = pysam.AlignedSegment(fh.header)
            if subreads.flag == 4:
                s.is_unmapped = True
                s.query_name = subreads.Rname
                s.query_sequence = subreads.seq
            else:
                s.is_unmapped = False
                s.reference_name = subreads.contig
                s.query_name = subreads.Rname
                s.query_sequence = subreads.seq
                s.reference_start = subreads.pos
                s.cigarstring = subreads.cigar
                s.is_reverse = subreads.flag == 16
                s.mapping_quality = subreads.mapq
                s.set_tags([("MD", subreads.MDtag, "Z"),
                            ("cs", subreads.cstag, "Z")])
            # Qualities are assigned last in both cases, after the sequence.
            s.query_qualities = np.array(
                [ord(x) - 33 for x in subreads.basequal])
            fh.write(s)

    # Sort the file just written; the input was previously hard-coded to
    # "test.bam" regardless of ``outbam``.
    pysam.sort("-o", "test.srt.bam", outbam)
    pysam.index("test.srt.bam")
def test_process_reads_read_obs_paired_end_overlap_1bad_base_qual(self):
    """Add a second 'read1' segment whose first base has quality 5."""
    mate = pysam.AlignedSegment()
    mate.reference_start = 20
    mate.query_name = 'read1'
    mate.mapping_quality = 20
    mate.query_sequence = "AAAAATAAAACAAAAC"
    # First base at quality 5, the other fifteen at 30.
    mate.query_qualities = [5] + [30] * 15
    mate.cigarstring = '16M'
    self.alns.append(mate)

    expected = {
        'read1': {15: 'T', 20: 'T', 25: 'T', 35: 'C'},
        'read2': {15: 'G', 20: 'G', 25: 'G'},
    }
    observed = preprocess.process_reads(self.alns, [15, 20, 25, 35], 20, 10)
    self.assertEqual(observed, expected)
def test_pysam():
    """Write a one-read BAM from scratch and sort it with ``pysam.sort``."""
    import pysam

    # Create BAM file from scratch
    # Code stolen from https://pysam.readthedocs.io/en/latest/usage.html#creating-bam-cram-sam-files-from-scratch
    header = {
        'HD': {'VN': '1.0'},
        'SQ': [{'LN': 1575, 'SN': 'chr1'}, {'LN': 1584, 'SN': 'chr2'}],
    }
    file_name = "out.bam"

    segment = pysam.AlignedSegment()
    segment.query_name = "read_28833_29006_6945"
    segment.query_sequence = "AGCTTAGCTAGCTACCTATATCTTGGTCTTGGCCG"
    segment.flag = 99
    segment.reference_id = 0
    segment.reference_start = 32
    segment.mapping_quality = 20
    segment.cigar = ((0, 10), (2, 1), (0, 25))
    segment.next_reference_id = 0
    segment.next_reference_start = 199
    segment.template_length = 167
    segment.query_qualities = pysam.qualitystring_to_array(
        "<<<<<<<<<<<<<<<<<<<<<:<9/,&,22;;<<<")
    segment.tags = (("NM", 1), ("RG", "L1"))

    with pysam.AlignmentFile(file_name, "wb", header=header) as outf:
        outf.write(segment)

    # Verify output file exists
    assert os.path.isfile(file_name)

    # Call samtools to sort the file
    # This will fail if the file is not a valid BAM file
    pysam.sort("-o", "sorted.bam", file_name)
    assert os.path.isfile("sorted.bam")
def make_unmapped_mate(mate, template, add_tag=True):
    """
    Create mate for read using sequence and quality from template

    :param mate: read that supplies the reference id, reference start,
        ``is_secondary`` and the ``YT`` tag
    :param template: read that supplies the query name, flag, sequence and
        base qualities
    :param add_tag: when true, also set the ``ZT`` tag to ``"MP"``
    :return: new unmapped, paired, non-proper-pair ``pysam.AlignedSegment``
    """
    a = pysam.AlignedSegment()
    a.query_name = template.query_name
    a.flag = template.flag
    a.reference_id = mate.reference_id
    a.reference_start = mate.reference_start
    a.mapping_quality = 0
    # CIGAR and template length are left unset.
    a.next_reference_id = mate.reference_id
    a.next_reference_start = mate.reference_start
    a.query_sequence = template.query_sequence
    a.query_qualities = template.query_qualities

    # These flag bits are applied after the bulk flag copy above, so they
    # override whatever the template carried.
    a.is_secondary = mate.is_secondary
    a.is_paired = True
    a.is_proper_pair = False
    a.is_unmapped = True

    # set_tag replaces the deprecated setTag and matches the get_tag call.
    a.set_tag('YT', mate.get_tag('YT'))
    if add_tag:
        # NOTE(review): "MP" presumably marks the segment as a mock pair.
        a.set_tag('ZT', "MP")
    return a
def convert_to_AlignedSegment(header, sequence, quality, barcode_sequence,
                              umi_sequence):
    """
    Build an unaligned ``pysam.AlignedSegment`` from FASTQ-style fields.

    The barcode and UMI are stored as tags:

        Tag     Value
        "B0"    barcode_sequence
        "B3"    umi_sequence

    and ``RG`` is set to ``'0'``.

    :param header: string with the header information; only the part before
        the first whitespace becomes the query name
    :param sequence: string with the DNA/RNA sequence
    :param quality: string with the base calling quality values
    :param barcode_sequence: string with the barcode sequence
    :param umi_sequence: string with the unique molecular identifier sequence
    """
    record = pysam.AlignedSegment()

    # Header must not contain empty spaces
    record.query_name = header.split()[0]
    record.query_sequence = sequence
    record.query_qualities = pysam.qualitystring_to_array(quality)

    # setting the flag to un_mapped
    record.flag = record.flag | pysam.FUNMAP

    for tag, value in (('B0', barcode_sequence),
                       ('B3', umi_sequence),
                       ('RG', '0')):
        record.set_tag(tag, value)
    return record
def SPARKcreateBam(DataFrame, outbam):
    """Write the rows of ``DataFrame`` to ``outbam``, then sort and index it.

    :param DataFrame: object providing ``count()`` and ``take(n)``; each row
        exposes ``flag``, ``Rname``, ``seq``, ``QUAL`` and, for mapped rows,
        ``contig``, ``pos``, ``cigar``, ``mapq``, ``MDtag`` and ``cstag``.
    :param outbam: path of the unsorted BAM to write.

    The header is built from the contigs of ``chr1.fa``. The sorted copy is
    always written to ``test.srt.bam``.
    """
    fa = pyfaidx.Fasta('chr1.fa')
    dict_fa = {'HD': {'VN': 1.6, 'SO': 'coordinate'},
               'SQ': [{'SN': x, 'LN': len(fa[x])} for x in fa.keys()]}

    # Fetch the rows once and iterate over them directly; a second count()
    # call is no longer used to drive the loop.
    dictSorted = DataFrame.take(DataFrame.count())

    # The context manager closes the BAM even when a record is rejected.
    with pysam.AlignmentFile(outbam, mode="wb", header=dict_fa) as fh:
        for row in dictSorted:
            s = pysam.AlignedSegment(fh.header)
            if row.flag == 4:
                s.is_unmapped = True
                s.query_name = row.Rname
                s.query_sequence = row.seq
            else:
                s.is_unmapped = False
                s.reference_name = row.contig
                s.query_name = row.Rname
                s.query_sequence = row.seq
                s.reference_start = row.pos
                s.cigarstring = row.cigar
                s.is_reverse = row.flag == 16
                s.mapping_quality = row.mapq
                s.set_tags([("MD", row.MDtag, "Z"), ("cs", row.cstag, "Z")])
            # Qualities are assigned last in both cases, after the sequence.
            s.query_qualities = np.array([ord(x) - 33 for x in row.QUAL])
            fh.write(s)

    # Sort the file just written; the input was previously hard-coded to
    # "test.bam" regardless of ``outbam``.
    pysam.sort("-o", "test.srt.bam", outbam)
    pysam.index("test.srt.bam")
def write_reads(sam_path, gene, alleles, reads, out_path):
    """Write shifted copies of ``reads`` plus the CNV-region reads to a BAM.

    Each value of ``reads`` is a pair whose first item is the read to write.
    Its start is shifted by ``alleles[reference_name][0][0]`` and it is placed
    on ``chr22``. Reads fetched from ``sam_path`` over ``gene.cnv_region``
    (widened by 500 before and 1 after) are appended, then the output is
    indexed with ``samtools index``.
    """

    def shifted_start(read):
        return alleles[read.reference_name][0][0] + read.reference_start

    ordered = sorted(
        reads.values(),
        key=lambda pair: (shifted_start(pair[0]), pair[0].query_name),
    )

    with pysam.AlignmentFile(sam_path) as sam, \
            pysam.AlignmentFile(out_path, "wb", template=sam) as out:
        for x, _ in ordered:
            rec = pysam.AlignedSegment()
            rec.query_name = x.query_name.split("/")[0]
            rec.query_sequence = x.query_sequence
            rec.flag = x.flag
            rec.reference_id = sam.get_tid("chr22")
            rec.reference_start = x.reference_start + alleles[x.reference_name][0][0]
            rec.mapping_quality = x.mapping_quality
            rec.cigar = x.cigar
            rec.next_reference_start = x.next_reference_start
            rec.template_length = x.template_length
            rec.query_qualities = x.query_qualities
            rec.tags = x.tags
            out.write(rec)

        cnv_chromosome, cnv_start, cnv_end = gene.cnv_region
        region = "chr{}:{}-{}".format(cnv_chromosome, cnv_start - 500, cnv_end + 1)
        for read in sam.fetch(region=region):
            out.write(read)

    # Both files are closed at this point, so the BAM is complete.
    cmd("samtools index {}".format(out_path))
def generate_read(self, read_length, query_name, cb, ub):
    """Return a perfectly matching read drawn from a random chromosome.

    The chromosome index and start offset both come from
    ``np.random.randint``. The ``CB`` and ``UB`` tags are set to ``cb`` and
    ``ub``.
    """
    reference_id = np.random.randint(len(self.chromosome2length))
    contigs = list(self.chromosome2length.items())
    chromosome, chr_length = contigs[reference_id]
    start = np.random.randint(0, chr_length - read_length)
    bases = self.chromosome2sequence[chromosome][start:start + read_length]

    # straight mapping
    read = pysam.AlignedSegment()
    read.query_name = query_name
    read.query_sequence = ''.join(bases)
    # flag taken from pysam example, did not analyze
    read.flag = 99
    read.reference_id = reference_id
    read.reference_start = start
    read.mapping_quality = 255
    read.cigar = ((0, read_length), )
    # mate reference fields are intentionally left unset
    read.template_length = read_length
    read.query_qualities = pysam.qualitystring_to_array("<" * read_length)
    read.tags = (
        ("NM", 1),
        ("RG", "L1"),
        ("NH", 1),
        # normally should also add number of mutations compared to reference
        ("AS", read_length - 2),
        ("CB", cb),
        ("UB", ub),
    )
    return read
def write_alignment(read_id, q_seq, chrm, strand, r_st, q_st, q_en, cigar):
    """Write one alignment to ``map_fp`` and a stats line to ``summ_fp``.

    ``cigar`` is a sequence of ``(length, op)`` pairs. Ops 4 and 5 are
    excluded from the aligned length; ops 0/7 count as matches, 2/3 as
    deletions and 1 as insertions. The stats line holds the read id, percent
    identity, and the four counts, tab-separated.
    """
    forward = strand == 1
    aligned_seq = q_seq[q_st:q_en]

    rec = pysam.AlignedSegment()
    rec.query_name = read_id
    rec.query_sequence = aligned_seq if forward else mh.revcomp(aligned_seq)
    rec.flag = 0 if forward else 16
    rec.reference_id = map_fp.get_tid(chrm)
    rec.reference_start = r_st
    # pysam expects (op, length); the input carries (length, op).
    rec.cigartuples = [(op, op_len) for op_len, op in cigar]
    rec.template_length = q_en - q_st
    map_fp.write(rec)

    # compute alignment stats
    n_aligned = n_matched = n_deleted = n_inserted = 0
    for op_len, op in cigar:
        if op not in (4, 5):
            n_aligned += op_len
        if op in (0, 7):
            n_matched += op_len
        elif op in (2, 3):
            n_deleted += op_len
        elif op == 1:
            n_inserted += op_len

    identity = 100 * n_matched / float(n_aligned)
    summ_fp.write('{}\t{:.2f}\t{}\t{}\t{}\t{}\n'.format(
        read_id, identity, n_aligned, n_matched, n_deleted, n_inserted))
    summ_fp.flush()
    return
def test_ends_in_deletion(self):
    """Crop with the interval end falling inside the deletion."""
    source = pysam.AlignedSegment()
    source.reference_start = 100
    source.cigar = [(MATCH, 50), (DEL, 10), (MATCH, 50)]
    source.seq = 'A' * 100

    expected = pysam.AlignedSegment()
    expected.reference_start = 100
    expected.cigar = [(MATCH, 50), (SOFT_CLIP, 50)]
    expected.seq = 'A' * 100

    self.assertEqual(sam.crop_al_to_ref_int(source, 100, 155), expected)
    # Ending at 160 instead must give a different result.
    self.assertNotEqual(sam.crop_al_to_ref_int(source, 100, 160), expected)
def sam_to_bam(sam_file, bam_file, check_sq=False):
    """
    Convert sam to bam file

    Records are streamed one at a time, and both files are closed even when
    reading or writing fails.

    @sam_file: Input sam filename
    @bam_file: Output bam filename
    @check_sq: Passed through to ``pysam.AlignmentFile`` for the input file
    """
    with pysam.AlignmentFile(sam_file, 'r', check_sq=check_sq) as in_f, \
            pysam.AlignmentFile(bam_file, 'wb', header=in_f.header) as out_f:
        for seg in in_f.fetch(until_eof=True):
            # Copy field by field into a fresh segment, as before.
            a = pysam.AlignedSegment()
            a.query_name = seg.query_name
            a.query_sequence = seg.query_sequence
            a.flag = seg.flag
            a.reference_id = seg.reference_id
            a.reference_start = seg.reference_start
            a.mapping_quality = seg.mapping_quality
            a.cigar = seg.cigar
            a.next_reference_id = seg.next_reference_id
            a.next_reference_start = seg.next_reference_start
            a.template_length = seg.template_length
            a.query_qualities = seg.query_qualities
            a.tags = seg.tags
            out_f.write(a)
def test_ends_just_before_insertion(self):
    """Crop with the interval ending on the last base before the insertion."""
    source = pysam.AlignedSegment()
    source.reference_start = 100
    source.cigar = [(MATCH, 50), (INS, 10), (MATCH, 50)]
    source.seq = 'A' * 110

    expected = pysam.AlignedSegment()
    expected.reference_start = 100
    expected.cigar = [(MATCH, 50), (SOFT_CLIP, 60)]
    expected.seq = 'A' * 110

    self.assertEqual(sam.crop_al_to_ref_int(source, 100, 149), expected)
    # Ending one base later must give a different result.
    self.assertNotEqual(sam.crop_al_to_ref_int(source, 100, 150), expected)
def make_unaligned(read):
    """Return an unmapped ``pysam.AlignedSegment`` built from ``read``.

    ``read`` must expose ``name``, ``seq`` and ``qual``. The quality string
    is decoded with ``fastq.decode_sanger``.
    """
    segment = pysam.AlignedSegment()
    segment.query_name = read.name
    segment.is_unmapped = True
    segment.query_sequence = read.seq
    decoded_quals = fastq.decode_sanger(read.qual)
    segment.query_qualities = decoded_quals
    return segment
def test_goodFiles(tmpdir, bamFile):
    """Check ``getStartsInFile`` on a one-read BAM and on the shared fixture.

    The fixture BAM is read twice: once with ``maxGaps=10`` (three reads
    expected) and once with default arguments (two reads expected).
    """
    d = tmpdir.mkdir('dir')
    p = d.join('test.bam')
    header = {'HD': {'VN': '1.0'}, 'SQ': [{'LN': 1000, 'SN': 'ref'}]}

    # The context manager closes the BAM even when the write raises.
    with pysam.AlignmentFile(str(p), "wb", header=header) as outFile:
        a = pysam.AlignedSegment()
        a.query_name = "read1"
        a.query_sequence = "AAAAATTTTT"
        a.reference_id = 0
        a.reference_start = 32
        a.mapping_quality = 20
        a.cigar = ((0, 10), )
        outFile.write(a)
    pysam.index(str(p))

    out = next(getstartends.getStartsInFile(str(p)))
    assert out['start'] == 33
    assert out['end'] == 42
    assert out['strand'] == '+'
    assert out['ref'] == 'ref'

    # (extra keyword arguments, expected starts, strands, ends)
    cases = [
        ({'maxGaps': 10}, [29, 33, 33], ['-', '+', '+'], [38, 42, 46]),
        ({}, [29, 33], ['-', '+'], [38, 42]),
    ]
    for kwargs, starts, strands, ends in cases:
        found = getstartends.getStartsInFile(str(bamFile), **kwargs)
        for read, start, strand, end in zip(found, starts, strands, ends):
            assert read['start'] == start
            assert read['strand'] == strand
            assert read['end'] == end
def toAlignedSegment(cls, read, targetIds):
    """Convert ``read`` into a ``pysam.AlignedSegment``.

    ``targetIds`` maps reference names to reference ids. Flag, CIGAR and tags
    come from ``cls.toSamFlag``, ``cls.toCigar`` and ``cls.toTags``. Name and
    sequence are encoded with ``cls._encoding``.
    """
    segment = pysam.AlignedSegment()

    # QNAME, SEQ, FLAG
    segment.query_name = read.fragmentName.encode(cls._encoding)
    segment.query_sequence = read.alignedSequence.encode(cls._encoding)
    segment.flag = cls.toSamFlag(read)

    # RNAME, POS, MAPQ, CIGAR
    position = read.alignment.position
    segment.reference_id = targetIds[position.referenceName]
    segment.reference_start = int(position.position)
    segment.mapping_quality = read.alignment.mappingQuality
    segment.cigar = cls.toCigar(read)

    # RNEXT, PNEXT, TLEN
    mate_position = read.nextMatePosition
    segment.next_reference_id = targetIds[mate_position.referenceName]
    segment.next_reference_start = int(mate_position.position)
    segment.template_length = read.fragmentLength

    # QUAL and optional tags
    segment.query_qualities = read.alignedQuality
    segment.tags = cls.toTags(read)
    return segment