def test_somatic_variant_with_2_supporting_rna_reads():
    """
    Count the reads supporting the SNV 14:105849746 G>A in three BAM files.

    Expected numbers of supporting reads: 0 in the normal sample, 8 in the
    tumor sample and 2 in the RNA sample. Each RNA read must also carry one
    of two known read names.
    """
    data_dir = "data/somatic-variant-with-2-supporting-rna-reads/"
    snv = Variant("14", 105849746, "G", "A")

    normal_bam = load_bam(
        data_dir + "normal.14.105849746.G.A.no-alt.sorted.bam")
    tumor_bam = load_bam(
        data_dir + "tumor.14.105849746.G.A.many-alt.sorted.bam")
    rna_bam = load_bam(
        data_dir + "rna.14.105849746.G.A.2-alt.sorted.bam")

    # Normal sample: no read should support the variant.
    normal_hits = reads_supporting_variant(samfile=normal_bam, variant=snv)
    eq_(len(normal_hits), 0)
    print(normal_hits)

    # Tumor sample.
    tumor_hits = reads_supporting_variant(samfile=tumor_bam, variant=snv)
    print(tumor_hits)
    eq_(len(tumor_hits), 8)

    # RNA sample.
    rna_hits = reads_supporting_variant(samfile=rna_bam, variant=snv)
    print(rna_hits)
    eq_(len(rna_hits), 2)

    # These exact read names were pulled out by hand (by Arun) in IGV.
    known_rna_read_names = {
        "K00193:50:H5NKVBBXX:5:2202:6421:24964",
        "K00193:50:H5NKVBBXX:5:2119:30908:1138",
    }
    for rna_hit in rna_hits:
        assert rna_hit.name in known_rna_read_names
def test_somatic_variant_with_2_supporting_rna_reads():
    """
    Verify supporting-read counts for the SNV 14:105849746 G>A.

    The normal BAM is expected to yield 0 supporting reads, the tumor BAM 8
    and the RNA BAM 2; the RNA reads are additionally checked against a
    hand-curated set of read names.
    """
    variant = Variant("14", 105849746, "G", "A")

    def supporting_reads(bam):
        # Every sample is queried with the same variant.
        return reads_supporting_variant(variant=variant, samfile=bam)

    prefix = "data/somatic-variant-with-2-supporting-rna-reads/"
    normal_bam = load_bam(prefix + "normal.14.105849746.G.A.no-alt.sorted.bam")
    tumor_bam = load_bam(prefix + "tumor.14.105849746.G.A.many-alt.sorted.bam")
    rna_bam = load_bam(prefix + "rna.14.105849746.G.A.2-alt.sorted.bam")

    from_normal = supporting_reads(normal_bam)
    eq_(len(from_normal), 0)
    print(from_normal)

    from_tumor = supporting_reads(tumor_bam)
    print(from_tumor)
    eq_(len(from_tumor), 8)

    from_rna = supporting_reads(rna_bam)
    print(from_rna)
    eq_(len(from_rna), 2)

    # Arun went to the trouble of looking up these exact read names in IGV.
    curated_names = {
        "K00193:50:H5NKVBBXX:5:2202:6421:24964",
        "K00193:50:H5NKVBBXX:5:2119:30908:1138",
    }
    for read in from_rna:
        assert read.name in curated_names
def test_assemble_transcript_fragments_snv():
    """
    Assemble the reads supporting the chr12:65857041 G>C SNV and check the
    resulting sequences.

    Checks performed:
      * at least one sequence is produced;
      * every sequence carries the alt nucleotide;
      * every sequence supported by more than one read is longer than the
        longest supporting read.
    """
    samfile = load_bam("data/cancer-wgs-primary.chr12.bam")
    chromosome = "chr12"
    base1_location = 65857041
    ref = "G"
    alt = "C"
    variant = Variant(
        contig=chromosome,
        start=base1_location,
        ref=ref,
        alt=alt,
        ensembl=ensembl_grch38)
    variant_reads = reads_supporting_variant(
        variant=variant,
        samfile=samfile,
        chromosome=chromosome)

    sequences = iterative_overlap_assembly(
        initial_variant_sequences_from_reads(variant_reads),
        min_overlap_size=30)
    assert len(sequences) > 0

    max_read_length = max(len(r) for r in variant_reads)
    for s in sequences:
        print("%s%s%s weight=%d length=%d" % (
            s.prefix,
            s.alt,
            s.suffix,
            len(s.reads),
            len(s.sequence)))
        eq_(s.alt, alt)
        # Only sequences supported by more than one read are expected to
        # exceed the read length; a sequence backed by a single read is
        # exempt from the length requirement.
        if len(s.reads) > 1:
            assert len(s) > max_read_length, \
                "Expected assembled sequences to be longer than read length (%d)" % (
                    max_read_length,)
def test_group_unique_sequences():
    """
    Group the reads supporting the chr12:65857041 G>C SNV with
    group_unique_sequences (prefix and suffix capped at 30) and check that
    there are fewer groups than reads.
    """
    contig = "chr12"
    bam = load_bam("data/cancer-wgs-primary.chr12.bam")
    snv = Variant(
        contig=contig,
        start=65857041,
        ref="G",
        alt="C",
        ensembl=ensembl_grch38)

    supporting = reads_supporting_variant(
        samfile=bam,
        chromosome=contig,
        variant=snv)
    print("%d variant reads: %s" % (len(supporting), supporting))

    unique_groups = group_unique_sequences(
        supporting,
        max_prefix_size=30,
        max_suffix_size=30)
    print("%d unique sequences: %s" % (len(unique_groups), unique_groups))

    # Some of the reads are redundant, so grouping must collapse them into
    # fewer unique entries than there are reads.
    assert len(supporting) > len(unique_groups)
def test_most_common_nucleotides_for_chr12_deletion():
    """
    Run most_common_nucleotides on the reads supporting the deletion of
    "TTGTAGATGCTGCCTCTCC" at chr12:70091490.

    Checks that both count arrays are as long as the consensus sequence,
    that the counts of the non-chosen nucleotides sum to less than the
    counts of the chosen ones, and that more than half of the reads
    (prefix + allele + suffix) occur as a substring of the consensus.
    """
    contig = "chr12"
    bam = load_bam("data/cancer-wgs-primary.chr12.bam")
    deletion = Variant(
        contig,
        70091490,
        "TTGTAGATGCTGCCTCTCC",
        "",
        ensembl=ensembl_grch38)
    supporting = reads_supporting_variant(
        samfile=bam,
        chromosome=contig,
        variant=deletion)

    consensus, chosen, others = most_common_nucleotides(supporting)
    print(chosen)
    print(others)

    eq_(len(chosen), len(consensus))
    eq_(len(others), len(consensus))
    assert others.sum() < chosen.sum(), \
        "Counts for alternate nucleotides should not exceed the chosen sequence"

    # A read "matches" when its full sequence is contained in the consensus.
    n_matching = sum(
        (read.prefix + read.allele + read.suffix) in consensus
        for read in supporting)
    n_total = len(supporting)
    matching_fraction = n_matching / float(n_total)
    print("Fraction matching reads is %d/%d = %f" % (
        n_matching, n_total, matching_fraction))
    assert matching_fraction > 0.5, \
        "Expected majority of reads to match consensus sequence"
def test_sequence_counts_snv():
    """
    Build variant sequences of preferred length 61 for the chr12:65857041
    G>C SNV and expect exactly one result: alt "C" flanked by a 30-character
    prefix and a 30-character suffix that together spell out `.sequence`.
    """
    contig = "chr12"
    alt_nucleotide = "C"
    bam = load_bam("data/cancer-wgs-primary.chr12.bam")
    snv = Variant(contig, 65857041, "G", alt_nucleotide)

    supporting = reads_supporting_variant(
        samfile=bam,
        chromosome=contig,
        variant=snv)
    results = reads_to_variant_sequences(
        variant=snv,
        reads=supporting,
        preferred_sequence_length=61)
    assert len(results) == 1

    for result in results:
        print(result)
        eq_(result.alt, alt_nucleotide)
        eq_(len(result.prefix), 30)
        eq_(len(result.suffix), 30)
        # The three parts must concatenate to the full sequence.
        joined = result.prefix + result.alt + result.suffix
        eq_(joined, result.sequence)
def test_partitioned_read_sequences_deletion():
    """
    Check how a single read is partitioned around the deletion TT>T at
    position 4, where the reference sequence is assumed to be "ACCTTG".

    The read "ACCTG" (cigar 4M1D1M, MD tag 4^T1) is expected to come back as
    one AlleleRead with prefix "ACCT", an empty allele and suffix "G".
    """
    contig = "chromosome"
    deletion = Variant(contig, 4, "TT", "T", normalize_contig_name=False)

    single_read = make_read(seq="ACCTG", cigar="4M1D1M", mdtag="4^T1")
    fake_samfile = DummySamFile(reads=[single_read])

    found = reads_supporting_variant(
        samfile=fake_samfile,
        chromosome=contig,
        variant=deletion)
    print(found)
    assert len(found) == 1

    wanted = AlleleRead(
        name=single_read.qname,
        prefix="ACCT",
        allele="",
        suffix="G")
    eq_(found[0], wanted)
def test_partitioned_read_sequences_deletion():
    """
    A read spanning the deletion TT>T at position 4 must be split into
    prefix "ACCT", empty allele and suffix "G".

    The reference sequence is assumed to be "ACCTTG"; the read itself is
    "ACCTG" with cigar 4M1D1M and MD tag 4^T1.
    """
    read = make_read(seq="ACCTG", cigar="4M1D1M", mdtag="4^T1")
    samfile = DummySamFile(reads=[read])

    contig_name = "chromosome"
    variant = Variant(
        contig_name,
        4,
        "TT",
        "T",
        normalize_contig_name=False)

    allele_reads = reads_supporting_variant(
        variant=variant,
        chromosome=contig_name,
        samfile=samfile)
    print(allele_reads)
    assert len(allele_reads) == 1

    actual = allele_reads[0]
    eq_(
        actual,
        AlleleRead(name=read.qname, prefix="ACCT", allele="", suffix="G"))
def test_assemble_transcript_fragments_snv():
    """
    Assemble the reads supporting the chr12:65857041 G>C SNV with
    iterative_overlap_assembly (min_overlap_size=30).

    At least one sequence must be produced and each must carry alt "C".
    Sequences supported by more than one read must also be longer than the
    longest supporting read.
    """
    contig = "chr12"
    alt_nucleotide = "C"
    bam = load_bam("data/cancer-wgs-primary.chr12.bam")
    snv = Variant(
        contig=contig,
        start=65857041,
        ref="G",
        alt=alt_nucleotide,
        ensembl=ensembl_grch38)
    supporting = reads_supporting_variant(
        variant=snv,
        samfile=bam,
        chromosome=contig)

    initial_sequences = initial_variant_sequences_from_reads(supporting)
    assembled = iterative_overlap_assembly(
        initial_sequences,
        min_overlap_size=30)
    assert len(assembled) > 0

    longest_read = max(len(read) for read in supporting)
    for seq in assembled:
        print("%s%s%s weight=%d length=%d" % (
            seq.prefix,
            seq.alt,
            seq.suffix,
            len(seq.reads),
            len(seq.sequence)))
        eq_(seq.alt, alt_nucleotide)
        if len(seq.read_names) <= 1:
            # Only sequences supported by more than one read are expected
            # to be greater than the read length.
            continue
        assert len(seq) > longest_read, \
            "Expected assembled sequences to be longer than read length (%d)" % (
                longest_read,)
def test_partition_variant_reads_deletion():
    """
    Collect the reads supporting the deletion of "TTGTAGATGCTGCCTCTCC" at
    chr12:70091490; more than one read must be found and each must have an
    empty allele.
    """
    contig = "chr12"
    empty_alt = ""
    bam = load_bam("data/cancer-wgs-primary.chr12.bam")
    deletion = Variant(
        contig=contig,
        start=70091490,
        ref="TTGTAGATGCTGCCTCTCC",
        alt=empty_alt,
        ensembl=ensembl_grch38)

    supporting = reads_supporting_variant(
        samfile=bam,
        chromosome=contig,
        variant=deletion)
    assert len(supporting) > 1

    for read in supporting:
        eq_(read.allele, empty_alt)
def test_somatic_variant_with_0_supporting_rna_reads():
    """
    Count the reads supporting the SNV 6:90411765 G>A in three BAM files.

    Expected numbers of supporting reads: 0 in the normal sample, 5 in the
    tumor sample and 0 in the RNA sample.
    """
    data_dir = "data/somatic-variant-with-0-supporting-rna-reads/"
    snv = Variant("6", 90411765, "G", "A")

    normal_bam = load_bam(data_dir + "normal.6.90411765.G.A.sorted.bam")
    tumor_bam = load_bam(data_dir + "tumor.6.90411765.G.A.sorted.bam")
    rna_bam = load_bam(data_dir + "rna.6.90411765.G.A.sorted.bam")

    # Normal sample: the variant must be absent.
    normal_hits = reads_supporting_variant(samfile=normal_bam, variant=snv)
    eq_(len(normal_hits), 0)
    print(normal_hits)

    # Tumor sample.
    tumor_hits = reads_supporting_variant(samfile=tumor_bam, variant=snv)
    print(tumor_hits)
    eq_(len(tumor_hits), 5)

    # RNA sample: no supporting reads are expected.
    rna_hits = reads_supporting_variant(samfile=rna_bam, variant=snv)
    print(rna_hits)
    eq_(len(rna_hits), 0)
def test_partition_variant_reads_deletion():
    """
    Reads supporting the chr12:70091490 deletion (ref
    "TTGTAGATGCTGCCTCTCC", empty alt) must number more than one, and the
    allele of every such read must equal the empty alt string.
    """
    samfile = load_bam("data/cancer-wgs-primary.chr12.bam")
    contig_name = "chr12"
    deleted_bases = "TTGTAGATGCTGCCTCTCC"
    alt_allele = ""

    variant = Variant(
        contig=contig_name,
        start=70091490,
        ref=deleted_bases,
        alt=alt_allele,
        ensembl=ensembl_grch38)
    allele_reads = reads_supporting_variant(
        variant=variant,
        chromosome=contig_name,
        samfile=samfile)

    assert len(allele_reads) > 1
    for allele_read in allele_reads:
        eq_(allele_read.allele, alt_allele)
def test_sequence_counts_snv():
    """
    For the chr12:65857041 G>C SNV, reads_to_variant_sequences with
    preferred_sequence_length=61 must return a single variant sequence whose
    alt is "C", whose prefix and suffix are 30 characters each, and whose
    `.sequence` equals prefix + alt + suffix.
    """
    samfile = load_bam("data/cancer-wgs-primary.chr12.bam")
    contig_name = "chr12"
    position = 65857041
    ref_base = "G"
    alt_base = "C"
    variant = Variant(contig_name, position, ref_base, alt_base)

    allele_reads = reads_supporting_variant(
        variant=variant,
        chromosome=contig_name,
        samfile=samfile)
    sequences = reads_to_variant_sequences(
        reads=allele_reads,
        variant=variant,
        preferred_sequence_length=61)
    assert len(sequences) == 1

    for seq in sequences:
        print(seq)
        eq_(seq.alt, alt_base)
        eq_(len(seq.prefix), 30)
        eq_(len(seq.suffix), 30)
        eq_(seq.prefix + seq.alt + seq.suffix, seq.sequence)