def setUp(self):
    """Write the two small BAM fixtures used by the tests.

    ``test/test_bam1.bam`` receives four reads spread over the three
    references declared in the header; ``test/test_bam2.bam`` receives two
    reads, both on ``ref2|seq1``. Both files share the same header.

    Each SAM-formatted record is passed through ``read_to_dict`` (defined
    elsewhere) and the result is handed to
    ``pysam.AlignedSegment.from_dict`` together with the header.
    """
    header = {
        "HD": {"VN": "1.0", "SO": "unsorted"},
        "SQ": [
            {"SN": "ref1|seq1", "LN": 1000000},
            {"SN": "ref1|seq2", "LN": 1000000},
            {"SN": "ref2|seq1", "LN": 1000000},
        ],
    }
    bam1_reads = [
        "read1 0 ref1|seq1 24975 42 80M * 0 0 TGGGCCAGAAAAAATGACTTCTCCATCTCGCTGCCGGTAGACCGACTCTCTTTTCTGCTGGCGGTTGCCACGCTGAGCGG AAAAAF.A.FFAFFFFFAFFFFFFFFFFFFFF<FFFFAFFFFFFA.FFFFA<7FFFFFFFF<FFFFFF))<FFFFF.FFF AS:i:-3 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:76A3 YT:Z:UU",
        "read2 0 ref1|seq1 20984 42 80M * 0 0 GTTTAAACAGTTGTTGTTGTTCTTCCTGCGATACTCCACTTCCAGAAGCCATAATCGTCATTTTGATAACAGCGTGGTTG AAAAA.<FFAFFFFFFF<FFAFF)FFFFF<FFF.FFA)FFAF<F<F<.FF<F.FFAFFF7FAFFF.AF.<)F7FFAAFFF AS:i:-6 XN:i:0 XM:i:2 XO:i:0 XG:i:0 NM:i:2 MD:Z:33A13T32 YT:Z:UU",
        "read3 0 ref2|seq1 3210 42 80M * 0 0 ACCTACCACTTCACCGACATATTCATGGCCCACGACCATCGGCACCGGGATGGATTTTTGCGACCACTCATCCCAGTTAT AAAA7FAFFFFF.FFFFF<FFFAA7FFFFFF7FFFFFFFA<FF7FFAF<F.FF.FFF7FFFFAF<FFFFAFFFFA77FFF AS:i:-3 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:53T26 YT:Z:UU",
        "read4 0 ref1|seq2 9298 23 79M * 0 0 CAGCATCGCTTCCAAAAATAGTAGTGCAGTTGATCGGAGTAGGAGCGTAATGGATTGCCTGCGTGATTGGCTATCTGGC AAAAAF.A.FFAFFFFFAFFFFFFFFFFFFFF<FFFFAFFFFFFA.FFFFA<7FFFFFFFF<FFFFFF))<FFFFF.FF AS:i:-23 XN:i:0 XM:i:6 XO:i:0 XG:i:0 NM:i:6 MD:Z:19T8A0C2T4T10T30 YT:Z:UU",
    ]
    bam2_reads = [
        "read1 0 ref2|seq1 24975 50 80M * 0 0 TGGGCCAGAAAAAATGACTTCTCCATCTCGCTGCCGGTAGACCGACTCTCTTTTCTGCTGGCGGTTGCCACGCTGAGCGG AAAAAF.A.FFAFFFFFAFFFFFFFFFFFFFF<FFFFAFFFFFFA.FFFFA<7FFFFFFFF<FFFFFF))<FFFFF.FFF AS:i:0 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:76A3 YT:Z:UU",
        "read2 0 ref2|seq1 20984 30 80M * 0 0 GTTTAAACAGTTGTTGTTGTTCTTCCTGCGATACTCCACTTCCAGAAGCCATAATCGTCATTTTGATAACAGCGTGGTTG AAAAA.<FFAFFFFFFF<FFAFF)FFFFF<FFF.FFA)FFAF<F<F<.FF<F.FFAFFF7FAFFF.AF.<)F7FFAAFFF AS:i:-12 XN:i:0 XM:i:2 XO:i:0 XG:i:0 NM:i:2 MD:Z:33A13T32 YT:Z:UU",
    ]

    # from_dict is a classmethod on both pysam types, so no throw-away
    # instances are needed to call it.
    aln_header = pysam.AlignmentHeader.from_dict(header)

    fixtures = (
        ("test/test_bam1.bam", bam1_reads),
        ("test/test_bam2.bam", bam2_reads),
    )
    for bam_path, sam_records in fixtures:
        # The context manager closes the BAM even when a record fails to
        # convert, so a broken fixture does not leave an open handle behind.
        with pysam.AlignmentFile(bam_path, "wb", header=header) as bam_out:
            for sam_record in sam_records:
                segment = pysam.AlignedSegment.from_dict(
                    read_to_dict(sam_record), aln_header)
                bam_out.write(segment)
def build_read(self):
    '''Return a freshly built example read placed on reference 0 ("chr1").'''
    example_header = pysam.AlignmentHeader(
        reference_names=["chr1", "chr2"],
        reference_lengths=[10000000, 10000000])
    segment = pysam.AlignedSegment(example_header)

    # Attributes are applied strictly in the listed order; the qualities are
    # assigned last, after the sequence they belong to.
    field_values = (
        ("query_name", "read_12345"),
        ("query_sequence", "ATGC" * 10),
        ("flag", 0),
        ("reference_id", 0),
        ("reference_start", 20),
        ("mapping_quality", 20),
        ("cigartuples", ((0, 10), (2, 1), (0, 9), (1, 1), (0, 20))),
        ("next_reference_id", 0),
        ("next_reference_start", 200),
        ("template_length", 167),
        ("query_qualities", pysam.qualitystring_to_array("1234") * 10),
    )
    for attribute, value in field_values:
        setattr(segment, attribute, value)
    return segment
def make_read(ref_id, name, start, cigar):
    """Build an aligned read at a given reference position.

    The read is attached to a fixed two-contig header ('chr1' and 'chr2',
    each declared with length 1000000).

    Args:
        ref_id: value assigned to ``reference_id`` (0 selects 'chr1',
            1 selects 'chr2' in the header built here).
        name: value assigned to ``query_name``.
        start: value assigned to ``reference_start``.
        cigar: value assigned to the read's CIGAR tuples.

    Returns:
        A ``pysam.AlignedSegment`` carrying the tag ``FL`` set to 1.
    """
    # LN is written as an int: the SAM specification defines @SQ LN as an
    # integer, and a float literal such as 1e6 would be rendered with a
    # fractional part when the header is serialised.
    header = pysam.AlignmentHeader.from_dict({
        'HD': {'VN': '1.0'},
        'SQ': [
            {'LN': 1000000, 'SN': 'chr1'},
            {'LN': 1000000, 'SN': 'chr2'},
        ],
    })
    tgs_read = pysam.AlignedSegment(header)
    tgs_read.query_name = name
    tgs_read.reference_id = ref_id
    tgs_read.reference_start = start
    # cigartuples / set_tag are the current names for the deprecated
    # cigar / setTag aliases.
    tgs_read.cigartuples = cigar
    tgs_read.set_tag('FL', 1)
    return tgs_read
def vcf_from_fasta(args):
    """Entry point for calling variants by consensus sequence alignment.

    Alignments are read from ``args.bam`` when it is given. Otherwise they
    are produced by ``edlib_chunked_align_fastas`` from ``args.consensus``
    and ``args.ref_fasta``, and every alignment is also written to
    ``<out_prefix>.bam``, which is indexed once the run has finished.

    Variants yielded for each alignment are written to ``<out_prefix>.vcf``
    (variants whose reference allele contains 'N' are skipped). The merged
    aligned intervals and their complement are written to
    ``<out_prefix>_coverage.bed`` and ``<out_prefix>_coverage_gaps.bed``.

    :param args: namespace providing ``ref_fasta``, ``bam`` and
        ``out_prefix``; when ``bam`` is None it must also provide
        ``consensus``, ``regions``, ``chunk_size``, ``pad`` and ``mode``.
    """
    logger = medaka.common.get_named_logger('CONS2VCF')

    with pysam.FastaFile(args.ref_fasta) as fasta:
        ref_seqs = {name: fasta.fetch(name) for name in fasta.references}
        contig_lengths = dict(zip(fasta.references, fasta.lengths))
        total_bp = sum(fasta.lengths)
        ref_contigs = fasta.references
        h = pysam.AlignmentHeader.from_references(
            fasta.references, fasta.lengths)

    # Both handles start as None so the finally-clause below can close
    # whichever one was actually opened, also when an error interrupts the
    # run part-way through.
    user_bam = None
    out_bam = None
    try:
        if args.bam is not None:
            user_bam = pysam.AlignmentFile(args.bam)
            alns = user_bam
        else:
            out_bam = pysam.AlignmentFile(
                args.out_prefix + '.bam', 'wb', header=h)
            if args.regions is not None:
                contigs = [r.ref_name for r in args.regions]
            else:
                contigs = None
            alns = edlib_chunked_align_fastas(
                args.consensus, args.ref_fasta, contigs,
                chunk_size=args.chunk_size, pad=args.pad,
                mode=args.mode, header=h)

        vcf_fp = args.out_prefix + '.vcf'
        trees = collections.defaultdict(intervaltree.IntervalTree)
        t_log = now()
        log_interval = 5
        msg = 'Processed {:.2%} of reference.'
        bp_done = collections.Counter()
        header_contigs = [
            '{},length={}'.format(c, contig_lengths[c])
            for c in ref_contigs
        ]

        with medaka.vcf.VCFWriter(vcf_fp, contigs=header_contigs) as writer:
            for aln in alns:
                # reference_start is 0 based, reference_end points to one
                # past the last aligned residue, i.e. same as bed file
                ref = aln.reference_name
                rstart, rend = aln.reference_start, aln.reference_end
                if trees[ref].overlaps(rstart, rend) and args.bam is not None:
                    # We expect edlib alignments to overlap by 1 match so
                    # only apply this check for a user-provided bam.
                    logger.warning(
                        ('WARNING: alignment {}:{}-{} overlaps another '
                         + 'alignment, which could cause overlapping variants.'
                         + '\nCheck output bam and vcf for details.').format(
                             ref, rstart, rend))
                trees[ref].add(intervaltree.Interval(rstart, rend))
                for v in yield_variants_from_aln(aln, ref_seqs[ref]):
                    if 'N' in v.ref:
                        continue
                    writer.write_variant(v)
                    if now() - t_log > log_interval:
                        # NOTE(review): bp_done is keyed per contig but the
                        # fraction is taken over total_bp, so with several
                        # contigs the reported progress restarts for each
                        # one - confirm whether that is intended.
                        done = bp_done[ref] + v.pos - rstart
                        logger.info(msg.format(done / total_bp))
                        t_log = now()
                bp_done[ref] += rend - rstart
                if out_bam is not None:
                    out_bam.write(aln)
    finally:
        if user_bam is not None:
            user_bam.close()
        if out_bam is not None:
            out_bam.close()

    # Index only after a successful run, once the BAM has been closed.
    if out_bam is not None:
        pysam.index(out_bam.filename)

    bed_fp = args.out_prefix + '_coverage.bed'
    gap_bed_fp = args.out_prefix + '_coverage_gaps.bed'
    for tree in trees.values():
        # strict=False to merge abutting alignments.
        tree.merge_overlaps(strict=False)
    medaka.common.write_intervaltrees_to_bed(trees, bed_fp)
    gap_trees = medaka.common.complement_intervaltrees(trees, contig_lengths)
    medaka.common.write_intervaltrees_to_bed(gap_trees, gap_bed_fp)

    # loop over contigs for which we have alignments checking for gaps
    for contig in trees:
        if len(gap_trees[contig]):
            logger.info(('WARNING: There are alignment gaps for ref contig'
                         + ' {}, see bed files for details.').format(contig))
    if len(ref_contigs) != len(trees):
        logger.info('WARNING: Some contigs have no alignments, see bed files'
                    + ' for details.')

    # bp_done calculated above does not take account of overlapping
    # alignments hence recalculate here based on merged alignment intervals.
    aligned_bp = sum((i.length() for tree in trees.values() for i in tree))
    msg = 'Alignments spanned {:%} of the reference.'
    logger.info(msg.format(aligned_bp / total_bp))
    msg = 'Check bed files {} and {} for alignment coverage and gaps.'
    logger.info(msg.format(bed_fp, gap_bed_fp))
    logger.info('All done. VCF written to {}.'.format(vcf_fp))
return clusters def align_clusters(first, second): al = sw.global_alignment(first.query_sequence, second.query_sequence) num_hq_mismatches = 0 for q_i, t_i in al['mismatches']: if (first.query_qualities[q_i] > 20) and (second.query_qualities[t_i] > 20): num_hq_mismatches += 1 return al['XO'], num_hq_mismatches cell_key = lambda al: al.get_tag(CELL_BC_TAG) UMI_key = lambda al: al.get_tag(UMI_TAG) loc_key = lambda al: (al.get_tag(LOC_TAG)) empty_header = pysam.AlignmentHeader() def sort_cellranger_bam(bam_fn, sorted_fn, sort_key, filter_func, show_progress=False): Path(sorted_fn).parent.mkdir(exist_ok=True) bam_fh = pysam.AlignmentFile(str(bam_fn)) als = bam_fh relevant = list(filter(filter_func, als)) max_read_length = 0 total_reads_out = 0 chunk_fns = []
"""Test cases for the bam2fastq.py script""" import unittest import pysam import bam2fastq from io import StringIO HEADER = pysam.AlignmentHeader().from_text("""@HD VN:1.0 SO:coordinate @SQ SN:R00000042 LN:5231428 AS:gi|26111730|gb|AE014075.1| SP:Ecol @RG ID:824f45e8-37f3-4cb9-8a05-63f0b7c9b959 PL:ILLUMINA PU:160129_D00417_0381_AHJ2VGBCXX_2 LB:VAU2662A45 DT:2016-01-31T00:00:00+0000 SM:H125100459 CN:WTCHG @RG ID:7f568ff7-e0f6-4a55-ad17-6fe778ed8f83 PL:ILLUMINA PU:160129_D00417_0381_AHJ2VGBCXX_1 LB:VAU2662A45 DT:2016-01-31T00:00:00+0000 SM:H125100459 CN:WTCHG @CO ID:stampy TM:Mon, 07 Mar 2016 17:58:14 GMT WD:/tmp/usecase3938954872417714817dir HN:gel-pipeline3 UN:compass @CO ID:stampy TM:Mon, 07 Mar 2016 20:24:40 GMT WD:/tmp/usecase3938954872417714817dir HN:gel-pipeline3 UN:compass @CO PN:stampy ID:stampy VN:1.0.23_(r2059) CL:--substitutionrate=0.01 -g /tmp/usecase3938954872417714817dir/references/R00000042/R00000042 -h /tmp/usecase3938954872417714817dir/references/R00000042/R00000042 -M bam -o /tmp/usecase3938954872417714817dir/0564a575-a6f5-40bc-8898-b0b5e944c4d9.sam --logfile=/tmp/usecase3938954872417714817dir/0564a575-a6f5-40bc-8898-b0b5e944c4d9.sam.log --readgroup=ID:824f45e8-37f3-4cb9-8a05-63f0b7c9b959 --outputformat=sam -v 3 @CO PN:stampy ID:stampy VN:1.0.23_(r2059) CL:--substitutionrate=0.01 -g /tmp/usecase3938954872417714817dir/references/R00000042/R00000042 -h /tmp/usecase3938954872417714817dir/references/R00000042/R00000042 -M bam -o /tmp/usecase3938954872417714817dir/570a7dde-6c04-419c-a898-87f872dd4eda.sam --logfile=/tmp/usecase3938954872417714817dir/570a7dde-6c04-419c-a898-87f872dd4eda.sam.log --readgroup=ID:7f568ff7-e0f6-4a55-ad17-6fe778ed8f83 --outputformat=sam -v 3 @CO PN:stampy ID:stampy VN:1.0.23_(r2059) CL:--substitutionrate=0.01 -t 8 -g /tmp/R00000042 -h /tmp/R00000042 --readgroup=ID:WTCHG_246141_245101,SM:7c2f06_45,PL:ILLUMINA,PU:160129_D00417_0381_AHJ2VGBCXX_1,LB:VAU2662A45,DT:2016-01-31,CN:WTCHG --comment=@MISC/WTCHG_246141_245101.comments.txt 
-M FASTQ/WTCHG_246141_245101_1.fastq.gz,FASTQ/WTCHG_246141_245101_2.fastq.gz @CO ID:stampy TM:Sun, 31 Jan 2016 13:04:28 GMT WD:/data1/GA-DATA/160129_D00417_0381_AHJ2VGBCXX/Data/Intensities/BaseCallsHN:comp03.mgmt.cluster2 UN:johnb @CO PN:stampy ID:stampy VN:1.0.23_(r2059) CL:--substitutionrate=0.01 -t 8 -g /tmp/R00000042 -h /tmp/R00000042 --readgroup=ID:WTCHG_246142_245101,SM:7c2f06_45,PL:ILLUMINA,PU:160129_D00417_0381_AHJ2VGBCXX_2,LB:VAU2662A45,DT:2016-01-31,CN:WTCHG --comment=@MISC/WTCHG_246142_245101.comments.txt -M FASTQ/WTCHG_246142_245101_1.fastq.gz,FASTQ/WTCHG_246142_245101_2.fastq.gz @CO ID:stampy TM:Mon, 07 Mar 2016 21:33:54 GMT WD:/tmp/usecase3938954872417714817dir HN:gel-pipeline3 UN:compass @CO PN:stampy ID:stampy VN:1.0.23_(r2059) CL:--substitutionrate=0.01 -g /tmp/usecase3938954872417714817dir/references/R00000042/R00000042 -h /tmp/usecase3938954872417714817dir/references/R00000042/R00000042 -M bam -o /tmp/usecase3938954872417714817dir/a00a7733-2cfc-46b3-a685-3657fdee6848.sam --logfile=/tmp/usecase3938954872417714817dir/a00a7733-2cfc-46b3-a685-3657fdee6848.sam.log --readgroup=ID:824f45e8-37f3-4cb9-8a05-63f0b7c9b959 --outputformat=sam -v 3 @CO ID:stampy TM:Sun, 31 Jan 2016 15:02:32 GMT WD:/data1/GA-DATA/160129_D00417_0381_AHJ2VGBCXX/Data/Intensities/BaseCallsHN:comp01.mgmt.cluster2 UN:johnb @CO CMD:/home/compass/PIPELINE/mmmPipeline/compass/g4_stampy.py -b bam -r R00000042 -o output -ss seqstats -fs flagstats -g e865d957-12e5-479a-9a08-131dfa0e9a5e""") reads_string = """HISEQ2500-09:381:HJ2VGBCXX:2:1101:10005:7635 99 R00000042 1619836 99 151M = 1620303 618 CCAGAACAGGCGCGGGAAATGTGCGATACCGCGCGCAAACTGGGCAAGGTGCTGGCCTACGACTTTCACCATCGTTTTGCGCTCGATACGCAACAGCTGCGTGAACAGGTGACCAACGGCGTTTTGGGAGAGATTTACGTTACCACCGCCC DDDDDIIIIIIIIIIIIIIIHIHIIIIIIIIGIIIIIIIIGIIIHHIIIIGIIIIIIIIIIHIIIIIIIIIIIIIIIIIHIICGHIIIIHGIIIIHIIIGIGHIIIIIIIIIHHIIIIGIICHHHHIIHEHIIIIIIIIHHIIIIIHHIII PQ:i:205 SM:i:96 UQ:i:78 MQ:i:96 XQ:i:270 NM:i:2 RG:Z:824f45e8-37f3-4cb9-8a05-63f0b7c9b959 
HISEQ2500-09:381:HJ2VGBCXX:2:1101:10005:7635 99 R00000042 1619836 99 151M = 1620303 618 CCAGAACAGGCGCGGGAAATGTGCGATACCGCGCGCAAACTGGGCAAGGTGCTGGCCTACGACTTTCACCATCGTTTTGCGCTCGATACGCAACAGCTGCGTGAACAGGTGACCAACGGCGTTTTGGGAGAGATTTACGTTACCACCGCCC DDDDDIIIIIIIIIIIIIIIHIHIIIIIIIIGIIIIIIIIGIIIHHIIIIGIIIIIIIIIIHIIIIIIIIIIIIIIIIIHIICGHIIIIHGIIIIHIIIGIGHIIIIIIIIIHHIIIIGIICHHHHIIHEHIIIIIIIIHHIIIIIHHIII PQ:i:205 SM:i:96 UQ:i:78 MQ:i:96 XQ:i:270 NM:i:2 RG:Z:test-test HISEQ2500-09:381:HJ2VGBCXX:2:1101:10005:7635 1123 R00000042 1619836 99 151M = 1620303 618 CCAGAACAGGCGCGGGAAATGTGCGATACCGCGCGCAAACTGGGCAAGGTGCTGGCCTACGACTTTCACCATCGTTTTGCGCTCGATACGCAACAGCTGCGTGAACAGGTGACCAACGGCGTTTTGGGAGAGATTTACGTTACCACCGCCC DDDDDIIIIIIIIIIIIIIIHIHIIIIIIIIGIIIIIIIIGIIIHHIIIIGIIIIIIIIIIHIIIIIIIIIIIIIIIIIHIICGHIIIIHGIIIIHIIIGIGHIIIIIIIIIHHIIIIGIICHHHHIIHEHIIIIIIIIHHIIIIIHHIII PQ:i:205 SM:i:96 UQ:i:78 MQ:i:96 XQ:i:270 NM:i:2 RG:Z:824f45e8-37f3-4cb9-8a05-63f0b7c9b959 HISEQ2500-09:381:HJ2VGBCXX:2:1101:10005:7635 1123 R00000042 1619836 99 151M = 1620303 618 CCAGAACAGGCGCGGGAAATGTGCGATACCGCGCGCAAACTGGGCAAGGTGCTGGCCTACGACTTTCACCATCGTTTTGCGCTCGATACGCAACAGCTGCGTGAACAGGTGACCAACGGCGTTTTGGGAGAGATTTACGTTACCACCGCCC DDDDDIIIIIIIIIIIIIIIHIHIIIIIIIIGIIIIIIIIGIIIHHIIIIGIIIIIIIIIIHIIIIIIIIIIIIIIIIIHIICGHIIIIHGIIIIHIIIGIGHIIIIIIIIIHHIIIIGIICHHHHIIHEHIIIIIIIIHHIIIIIHHIII PQ:i:205 SM:i:96 UQ:i:78 MQ:i:96 XQ:i:270 NM:i:2 RG:Z:824f45e8-37f3-4cb9-8a05-63f0b7c9b959 HISEQ2500-09:381:HJ2VGBCXX:2:1101:10005:7635 147 R00000042 1620303 99 151M = 1619836 -618 TATGAACGTCAGCTTTTGTGGTGATAAAGCTGGTGCGACGCTGTTTCCAGCACATATCTACACCGATAACAACGGTGAATTAATGACGCTGATGCAACGGGAAATGGCAGACGACAACCGCCATTTGCGCAGCATGGAAGCCTTTATCAAT [email protected]@@.@HG?EHHGCEEIHHEIIHIIHHIHHDHC@CFHIH@F70HEHEHCIIHHGHHHEIIIIIHIHGIIIIIIIIIIIHHHIIIIIFIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIDDDDD PQ:i:205 SM:i:96 UQ:i:270 MQ:i:96 XQ:i:78 NM:i:7 RG:Z:824f45e8-37f3-4cb9-8a05-63f0b7c9b959 HISEQ2500-09:381:HJ2VGBCXX:2:1101:10005:7635 147 R00000042 1620303 99 151M = 
1619836 -618 TATGAACGTCAGCTTTTGTGGTGATAAAGCTGGTGCGACGCTGTTTCCAGCACATATCTACACCGATAACAACGGTGAATTAATGACGCTGATGCAACGGGAAATGGCAGACGACAACCGCCATTTGCGCAGCATGGAAGCCTTTATCAAT [email protected]@@.@HG?EHHGCEEIHHEIIHIIHHIHHDHC@CFHIH@F70HEHEHCIIHHGHHHEIIIIIHIHGIIIIIIIIIIIHHHIIIIIFIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIDDDDD PQ:i:205 SM:i:96 UQ:i:270 MQ:i:96 XQ:i:78 NM:i:7 RG:Z:824f45e8-37f3-4cb9-8a05-63f0b7c9b959 HISEQ2500-09:381:HJ2VGBCXX:2:1101:10005:7635 1171 R00000042 1620303 99 151M = 1619836 -618 TATGAACGTCAGCTTTTGTGGTGATAAAGCTGGTGCGACGCTGTTTCCAGCACATATCTACACCGATAACAACGGTGAATTAATGACGCTGATGCAACGGGAAATGGCAGACGACAACCGCCATTTGCGCAGCATGGAAGCCTTTATCAAT [email protected]@@.@HG?EHHGCEEIHHEIIHIIHHIHHDHC@CFHIH@F70HEHEHCIIHHGHHHEIIIIIHIHGIIIIIIIIIIIHHHIIIIIFIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIDDDDD PQ:i:205 SM:i:96 UQ:i:270 MQ:i:96 XQ:i:78 NM:i:7 RG:Z:824f45e8-37f3-4cb9-8a05-63f0b7c9b959 HISEQ2500-09:381:HJ2VGBCXX:2:1101:10005:7635 1171 R00000042 1620303 99 151M = 1619836 -618 TATGAACGTCAGCTTTTGTGGTGATAAAGCTGGTGCGACGCTGTTTCCAGCACATATCTACACCGATAACAACGGTGAATTAATGACGCTGATGCAACGGGAAATGGCAGACGACAACCGCCATTTGCGCAGCATGGAAGCCTTTATCAAT [email protected]@@.@HG?EHHGCEEIHHEIIHIIHHIHHDHC@CFHIH@F70HEHEHCIIHHGHHHEIIIIIHIHGIIIIIIIIIIIHHHIIIIIFIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIDDDDD PQ:i:205 SM:i:96 UQ:i:270 MQ:i:96 XQ:i:78 NM:i:7 RG:Z:824f45e8-37f3-4cb9-8a05-63f0b7c9b959 HISEQ2500-09:381:HJ2VGBCXX:2:2201:15073:80781 1121 R00000042 1 70 1M7I143M = 47901254790275 ATTTTTCAGCTTTTCATTCTGACTGCAATGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTCTCTGACAGCAGCTTCTGAACTGGTTACCTGCCGTGAGTAAATTAAAATTTTATTGACTTAGGTCACTAAATACTTTAACCAATATA DDDDDHEHIHIIIIGHIIIIHHIIIIGIIHHFHHIHIFECFHHIGGHIIIFHEHIIIIIIIIHIHEHHIIIIIHHIIHIGIGH?HEHHHIIFGHHFHHHIHEFHIIIIIIHIIIIGHHHHHEHFHHHHHHHHHHEHHGHIFHHIGCHGHHH PQ:i:375 SM:i:70 UQ:i:217 MQ:i:96XQ:i:186 NM:i:10 RG:Z:824f45e8-37f3-4cb9-8a05-63f0b7c9b959"""