def test_assembly_of_many_subsequences():
    """Assemble 100 trimmed copies of one sequence together with a decoy.

    The trimmed copies are expected to assemble into the full-length
    sequence supported by all 100 reads, while the decoy (whose first
    nucleotide differs) is expected to come back as a separate result.
    """
    original_prefix = "ACTGAACCTTGGAAACCCTTTGGG"
    original_allele = "CCCTTT"
    original_suffix = "GGAAGGAAGGAATTTTTTTT"

    # every combination of 0-9 characters trimmed from the start of the
    # prefix and 0-9 characters trimmed from the end of the suffix
    subsequences = []
    for n_trim_left in range(10):
        for n_trim_right in range(10):
            # use an explicit end index since [:-0] would be empty
            suffix_end = len(original_suffix) - n_trim_right
            read_name = "_".join((str(n_trim_left), str(n_trim_right)))
            subsequences.append(
                VariantSequence(
                    prefix=original_prefix[n_trim_left:],
                    alt=original_allele,
                    suffix=original_suffix[:suffix_end],
                    reads={read_name}))
    eq_(100, len(subsequences))

    # one extra sequence which doesn't match any of the others
    decoy = VariantSequence(
        prefix="G" + original_prefix[1:],
        alt=original_allele,
        suffix=original_suffix,
        reads={"decoy"})

    results = iterative_overlap_assembly(
        subsequences + [decoy],
        min_overlap_size=len(original_allele))
    eq_(len(results), 2)

    assembled = results[0]
    eq_(assembled.prefix, original_prefix)
    eq_(assembled.alt, original_allele)
    eq_(assembled.suffix, original_suffix)
    eq_(len(assembled.reads), len(subsequences))

    assembled_decoy = results[1]
    eq_(assembled_decoy.sequence, decoy.sequence)
def test_variant_sequence_overlaps():
    """Check left_overlaps between AAA|GG|TT and AA|GG|TT.

    The longer sequence is expected to overlap the shorter one from the
    left for minimum overlap sizes 1 through 6, but not the other way
    around, and not when 7 nucleotides of overlap are required.
    """
    def make_sequence(prefix):
        # one supporting read which spans the whole sequence
        supporting_read = AlleleRead(
            prefix=prefix, allele="GG", suffix="TT", name="1")
        return VariantSequence(
            prefix=prefix, alt="GG", suffix="TT", reads=[supporting_read])

    # AAA|GG|TT
    three_a = make_sequence("AAA")
    # AA|GG|TT
    two_a = make_sequence("AA")

    for required_overlap in range(1, 7):
        assert three_a.left_overlaps(two_a, min_overlap_size=required_overlap), \
            "Expected %s to overlap %s from left (min overlap size=%d)" % (
                three_a, two_a, required_overlap)
        assert not two_a.left_overlaps(three_a, min_overlap_size=required_overlap), \
            "Expected %s to not overlap %s from left (min overlap size=%d)" % (
                two_a, three_a, required_overlap)

    assert not three_a.left_overlaps(two_a, min_overlap_size=7), \
        "Unexpected overlap between %s and %s for min_overlap_size=7" % (
            three_a, two_a)
def test_variant_sequence_add_reads():
    """Adding reads repeatedly should accumulate each distinct read once.

    Reads are passed to add_reads as sets, matching how the method is
    called in test_collapse_substrings. Passing a bare string such as "2"
    would only work when the read name is a single character.
    """
    vs = VariantSequence(prefix="A", alt="C", suffix="G", reads={"1"})

    # adding reads '2' and '3', sometimes multiple times, as well as the
    # read '1' which is already present
    vs_result = vs
    for extra_reads in ({"2"}, {"1"}, {"2"}, {"3"}):
        vs_result = vs_result.add_reads(extra_reads)

    expected = VariantSequence(
        prefix="A", alt="C", suffix="G", reads={"1", "2", "3"})
    eq_(vs_result, expected)
def test_variant_sequence_combine():
    """Combining A|C|GG with AA|C|GG should give AA|C|GG with both reads."""
    shorter = VariantSequence(prefix="A", alt="C", suffix="GG", reads={"1"})
    longer = VariantSequence(prefix="AA", alt="C", suffix="GG", reads={"2"})
    expected = VariantSequence(
        prefix="AA", alt="C", suffix="GG", reads={"1", "2"})

    eq_(shorter.combine(longer), expected)
    # it shouldn't matter which sequence is the receiver and which one is
    # the argument of combine
    eq_(longer.combine(shorter), expected)
def test_variant_sequence_min_coverage():
    """min_coverage of AA|C|TT with three supporting reads should be 2."""
    # (prefix, suffix, name) of each read; all of them carry the allele C
    #   1: AA|C|TT
    #   2: AA|C|T
    #   3:  A|C|TT
    read_specs = [("AA", "TT", "1"), ("AA", "T", "2"), ("A", "TT", "3")]
    reads = [
        AlleleRead(prefix=prefix, allele="C", suffix=suffix, name=name)
        for prefix, suffix, name in read_specs
    ]
    vs = VariantSequence(prefix="AA", alt="C", suffix="TT", reads=reads)
    eq_(vs.min_coverage(), 2)
def test_variant_sequence_trim_by_coverage():
    """Trim AA|C|T, supported by reads AA|C|T and A|C|T, at coverage 1 and 2."""
    reads = [
        AlleleRead(prefix=prefix, allele="C", suffix="T", name=name)
        for prefix, name in [("AA", "1"), ("A", "2")]
    ]
    untrimmed = VariantSequence(prefix="AA", alt="C", suffix="T", reads=reads)

    # every nucleotide is spanned by one read, so nothing should be trimmed
    eq_(untrimmed.trim_by_coverage(1), untrimmed)

    # requiring two reads should drop the first nucleotide of the prefix
    expected_trim_by_2 = VariantSequence(
        prefix="A", alt="C", suffix="T", reads=reads)
    eq_(untrimmed.trim_by_coverage(2), expected_trim_by_2)
def test_variant_sequence_mean_coverage():
    """mean_coverage of AA|C|TT should average the per-base read counts."""
    # (prefix, suffix, name) of each read; all of them carry the allele C
    #   1: AA|C|TT
    #   2: AA|C|T
    #   3:  A|C|TT
    read_specs = [("AA", "TT", "1"), ("AA", "T", "2"), ("A", "TT", "3")]
    reads = [
        AlleleRead(prefix=prefix, allele="C", suffix=suffix, name=name)
        for prefix, suffix, name in read_specs
    ]
    vs = VariantSequence(prefix="AA", alt="C", suffix="TT", reads=reads)

    # number of reads containing each of the five nucleotides of the
    # sequence, from left to right
    per_base_coverage = [2, 3, 3, 3, 2]
    expected_mean_coverage = sum(per_base_coverage) / len(per_base_coverage)
    eq_(vs.mean_coverage(), expected_mean_coverage)
def test_assembly_time():
    """Assembling 400 overlapping sequences should take less than 100ms."""
    original_prefix = "ACTGAACCTTGGAAACCCTTTGGG"
    original_allele = "CCCTTT"
    original_suffix = "GGAAGGAAGGAATTTTTTTTGGCC"

    # every combination of 0-19 characters trimmed from the start of the
    # prefix and 0-19 characters trimmed from the end of the suffix
    subsequences = []
    for n_trim_left in range(20):
        for n_trim_right in range(20):
            # use an explicit end index since [:-0] would be empty
            suffix_end = len(original_suffix) - n_trim_right
            read_name = "_".join((str(n_trim_left), str(n_trim_right)))
            subsequences.append(
                VariantSequence(
                    prefix=original_prefix[n_trim_left:],
                    alt=original_allele,
                    suffix=original_suffix[:suffix_end],
                    reads={read_name}))
    eq_(len(subsequences), 400)

    # only the assembly call itself is timed
    started_at = time()
    results = iterative_overlap_assembly(
        subsequences,
        min_overlap_size=len(original_allele))
    elapsed_seconds = time() - started_at

    eq_(len(results), 1)
    assembled = results[0]
    eq_(assembled.prefix, original_prefix)
    eq_(assembled.suffix, original_suffix)

    assert elapsed_seconds < 0.1, \
        "Expected assembly of 400 sequences to take less than 100ms: %0.4fms" % (
            elapsed_seconds * 1000,)
def test_variant_sequence_min_coverage():
    """Expect a minimum coverage of 2 for AA|C|TT backed by three reads."""
    allele = "C"
    # 1: AA|C|TT
    spanning_read = AlleleRead(
        prefix="AA", allele=allele, suffix="TT", name="1")
    # 2: AA|C|T
    short_suffix_read = AlleleRead(
        prefix="AA", allele=allele, suffix="T", name="2")
    # 3: A|C|TT
    short_prefix_read = AlleleRead(
        prefix="A", allele=allele, suffix="TT", name="3")

    vs = VariantSequence(
        prefix="AA",
        alt=allele,
        suffix="TT",
        reads=[spanning_read, short_suffix_read, short_prefix_read])
    eq_(vs.min_coverage(), 2)
def test_collapse_substrings():
    """Collapse a sequence into the longer sequence which contains it.

    AAA|C|GG is a substring of AAA|C|GGG, so the two are expected to
    collapse into the longer sequence carrying both reads, while the
    sequence TAA|C|GG is expected to be returned unchanged.
    """
    # AAA|C|GGG
    vs_longer = VariantSequence(
        prefix="AAA", alt="C", suffix="GGG", reads={"1"})
    # AAA|C|GG
    vs_shorter = VariantSequence(
        prefix="AAA", alt="C", suffix="GG", reads={"2"})
    # TAA|C|GG, differs from the other two in its first nucleotide
    vs_unrelated = VariantSequence(
        prefix="TAA", alt="C", suffix="GG", reads={"3"})

    results = collapse_substrings([vs_longer, vs_shorter, vs_unrelated])

    # the failure message has to be an argument of eq_; writing it after
    # the call only builds a tuple which is thrown away
    eq_(
        len(results),
        2,
        "Expected two sequences, got %d: %s" % (len(results), results))

    vs_combined = vs_longer.add_reads({"2"})
    assert vs_combined in results, \
        "Expected %s to be in %s" % (vs_combined, results)
    assert vs_unrelated in results, \
        "Expected %s to be in %s" % (vs_unrelated, results)
def test_variant_sequence_trim_by_coverage():
    """Check trim_by_coverage on AA|C|T for coverage thresholds 1 and 2."""
    # 1: AA|C|T
    full_read = AlleleRead(prefix="AA", allele="C", suffix="T", name="1")
    # 2: A|C|T
    shorter_read = AlleleRead(prefix="A", allele="C", suffix="T", name="2")
    reads = [full_read, shorter_read]

    vs = VariantSequence(prefix="AA", alt="C", suffix="T", reads=reads)
    vs_expected_trim_by_2 = VariantSequence(
        prefix="A", alt="C", suffix="T", reads=reads)

    # every nucleotide is spanned by one read, so a threshold of 1 should
    # leave the sequence unchanged
    eq_(vs.trim_by_coverage(1), vs)
    # a threshold of 2 should remove the leading nucleotide
    eq_(vs.trim_by_coverage(2), vs_expected_trim_by_2)
def test_variant_sequence_mean_coverage():
    """Expect mean_coverage of AA|C|TT to equal the mean per-base read count."""
    allele = "C"
    # 1: AA|C|TT
    spanning_read = AlleleRead(
        prefix="AA", allele=allele, suffix="TT", name="1")
    # 2: AA|C|T
    short_suffix_read = AlleleRead(
        prefix="AA", allele=allele, suffix="T", name="2")
    # 3: A|C|TT
    short_prefix_read = AlleleRead(
        prefix="A", allele=allele, suffix="TT", name="3")

    vs = VariantSequence(
        prefix="AA",
        alt=allele,
        suffix="TT",
        reads=[spanning_read, short_suffix_read, short_prefix_read])

    # count the number of times each nucleotide of the sequence above is
    # contained in a read: 2, 3, 3, 3 and 2 from left to right
    total_coverage = 2 + 3 + 3 + 3 + 2
    expected_mean_coverage = total_coverage / 5
    eq_(vs.mean_coverage(), expected_mean_coverage)
def test_assembly_of_simple_sequence_from_mock_reads():
    """Assemble four compatible mock reads into a single variant sequence.

    Both greedy_merge and iterative_overlap_assembly are expected to
    return GAAAAA|CC|TTTTTG supported by all four reads.
    """
    # Read sequences:
    #    AAAAA|CC|TTTTT    (dup1)
    #    AAAAA|CC|TTTTT    (dup2)
    #   GAAAAA|CC|TTTTTG   (longer)
    #     AAAA|CC|TTTT     (shorter)
    five_a = "A" * 5
    five_t = "T" * 5
    longest_prefix = "G" + five_a
    longest_suffix = five_t + "G"
    reads = [
        # two identical reads
        AlleleRead(prefix=five_a, allele="CC", suffix=five_t, name="dup1"),
        AlleleRead(prefix=five_a, allele="CC", suffix=five_t, name="dup2"),
        # longer sequence
        AlleleRead(
            prefix=longest_prefix,
            allele="CC",
            suffix=longest_suffix,
            name="longer"),
        # shorter sequence
        AlleleRead(
            prefix="A" * 4, allele="CC", suffix="T" * 4, name="shorter"),
    ]
    expected_variant_sequence = VariantSequence(
        prefix=longest_prefix,
        alt="CC",
        suffix=longest_suffix,
        reads=reads)

    initial_variant_sequences = initial_variant_sequences_from_reads(reads)
    # one fewer sequence than reads since two of the reads are duplicates
    eq_(len(initial_variant_sequences), len(reads) - 1)

    # 2 bases with 1/4 reads, 2 bases with 3/4 reads, remaining 10 bases
    # with all 4/4 reads
    expected_mean_coverage = (2 * 1 + 2 * 3 + 10 * 4) / 14

    # NOTE(review): the original indentation of this test was lost; every
    # check below is assumed to belong to the loop body - confirm
    #
    # both assembly functions should give the same results
    for assemble in (greedy_merge, iterative_overlap_assembly):
        assembled_variant_sequences = assemble(
            initial_variant_sequences,
            min_overlap_size=1)

        # no reads contradict each other, so we should get back a single
        # assembled sequence
        eq_(
            len(assembled_variant_sequences),
            1,
            "Unexpected number of variant sequences: %s" % (
                assembled_variant_sequences,))

        assembled = assembled_variant_sequences[0]
        eq_(assembled, expected_variant_sequence)
        eq_(len(assembled.reads), len(reads))
        eq_(assembled.min_coverage(), 1)
        eq_(assembled.mean_coverage(), expected_mean_coverage)
def test_variant_sequence_read_names():
    """read_names should be the set of names of the supporting reads."""
    supporting_reads = [
        AlleleRead(prefix="A", allele="C", suffix="T", name=read_name)
        for read_name in ("1", "2")
    ]
    vs = VariantSequence(
        prefix="A", alt="C", suffix="T", reads=supporting_reads)
    eq_(vs.read_names, {"1", "2"})
def test_assembly_unrelated_sequences():
    """Assemble five sequences of which only the first two overlap.

    Four results are expected: one built from reads "1" and "2" and three
    sequences which still have a single read each.
    """
    # (prefix, suffix, read name) of each input; all have the allele T.
    # 2 overlapping sequences, 1 with a different suffix,
    # and 2 totally unrelated sequences
    sequence_specs = [
        ("CCC", "GGG", "1"),
        ("TCCC", "G", "2"),
        ("CCC", "AAA", "3"),
        ("AGG", "CGG", "4"),
        ("CAC", "TTT", "5"),
    ]
    variant_sequences = [
        VariantSequence(prefix=prefix, alt="T", suffix=suffix, reads={name})
        for prefix, suffix, name in sequence_specs
    ]

    results = iterative_overlap_assembly(
        variant_sequences, min_overlap_size=1)
    eq_(len(results), 4)

    # split the results by whether more than one read supports them
    merged = [result for result in results if len(result.reads) > 1]
    singletons = [result for result in results if not len(result.reads) > 1]

    # the first two sequences were overlapping
    for result in merged:
        eq_(result.reads, {"1", "2"})

    # all but one result are singletons
    eq_(3, len(singletons))
    eq_(1, len(merged))
def test_variant_sequence_overlaps():
    """Exercise left_overlaps on two sequences differing by one prefix base.

    AAA|GG|TT is expected to overlap AA|GG|TT from the left whenever at
    most 6 nucleotides of overlap are required, and the reverse direction
    is expected to never count as a left overlap.
    """
    allele = "GG"
    suffix = "TT"

    # AAA|GG|TT
    longer = VariantSequence(
        prefix="AAA",
        alt=allele,
        suffix=suffix,
        reads=[
            AlleleRead(prefix="AAA", allele=allele, suffix=suffix, name="1")
        ])
    # AA|GG|TT
    shorter = VariantSequence(
        prefix="AA",
        alt=allele,
        suffix=suffix,
        reads=[
            AlleleRead(prefix="AA", allele=allele, suffix=suffix, name="1")
        ])

    for size in (1, 2, 3, 4, 5, 6):
        forward = longer.left_overlaps(shorter, min_overlap_size=size)
        assert forward, \
            "Expected %s to overlap %s from left (min overlap size=%d)" % (
                longer, shorter, size)
        backward = shorter.left_overlaps(longer, min_overlap_size=size)
        assert not backward, \
            "Expected %s to not overlap %s from left (min overlap size=%d)" % (
                shorter, longer, size)

    too_large = longer.left_overlaps(shorter, min_overlap_size=7)
    assert not too_large, \
        "Unexpected overlap between %s and %s for min_overlap_size=7" % (
            longer, shorter)
def make_inputs_for_tp53_201_variant(
        cdna_prefix="ATG",
        cdna_suffix="AGGAGCCGCAGTCAGAT",
        n_bad_nucleotides_at_start=0,
        mismatches_before_variant=0,
        mismatches_after_variant=14,  # the read is that much longer than the reference (17 vs 3)
        reference_context_size=3):
    """
    Build the inputs and the expected result for matching a variant
    sequence against a reference context, using the variant
    chr17 7,676,591 C>T on the transcript TP53-201.

    Parameters
    ----------
    cdna_prefix : str
        Transcript nucleotides before the variant that we're pretending
        got detected from RNA-seq reads.

    cdna_suffix : str
        Transcript nucleotides after the variant that we're pretending
        got detected from RNA-seq reads.

    n_bad_nucleotides_at_start : int
        Number of nucleotides we expect to get trimmed from the
        beginning of the variant sequence while matching to a reference
        context.

    mismatches_before_variant : int
        Expected number of nucleotide mismatches in the result before
        the variant locus.

    mismatches_after_variant : int
        Expected number of nucleotide mismatches in the result after
        the variant locus.

    reference_context_size : int
        Number of nucleotides before the variant locus to try matching
        against a reference transcript.

    Returns a tuple of (VariantSequence, ReferenceContext,
    VariantSequenceInReadingFrame), where the last element is the
    expected result.
    """
    # TP53-201 is an isoform of TP53 which seems to lack untranslated
    # regions so the sequence is:
    # First exon: chr17 7,676,594 - 7,676,521
    # ATG|GAG|GAG|CCG|CAG|TCA|GAT...
    # -M-|-E-|-E-|-P-|-Q-|-S-|-D-
    # we're assuming a variant
    # chr17. 7,676,591 C>T which changes GAG (E) > AAG (K)
    variant = Variant("chr17", 7676591, "C", "T", "GRCh38")

    # TP53-201
    transcript = variant.ensembl.transcripts_by_name("TP53-201")[0]

    # sanity check that the variant has the effect described above
    effect = variant.effect_on_transcript(transcript)
    eq_(effect.__class__.__name__, "Substitution")
    eq_(effect.aa_ref, "E")
    eq_(effect.aa_alt, "K")

    cdna_alt = "A"

    # genomic DNA is the reverse complement of the cDNA
    # for TP53-201 since it's on the negative strand
    gdna_prefix = reverse_complement_dna(cdna_suffix)
    gdna_alt = reverse_complement_dna(cdna_alt)
    gdna_suffix = reverse_complement_dna(cdna_prefix)

    # variant sequence supported by two reads
    # one fully spanning the variant sequence
    # and another missing the last nucleotide
    fully_overlapping_read = AlleleRead(
        prefix=gdna_prefix,
        allele=gdna_alt,
        suffix=gdna_suffix,
        name="full-overlap")

    # testing the prefix and allele to make sure they have the expected
    # TP53-201 sequence but the suffix might change depending on what's
    # passed in as cdna_prefix
    if cdna_suffix == "AGGAGCCGCAGTCAGAT":
        eq_(fully_overlapping_read.prefix, "ATCTGACTGCGGCTCCT")
    eq_(fully_overlapping_read.allele, "T")

    partially_overlapping_read = AlleleRead(
        prefix=gdna_prefix,
        allele=gdna_alt,
        suffix=gdna_suffix[:-1],
        name="partial-overlap")
    if cdna_suffix == "AGGAGCCGCAGTCAGAT":
        eq_(partially_overlapping_read.prefix, "ATCTGACTGCGGCTCCT")
    eq_(partially_overlapping_read.allele, "T")

    variant_sequence = VariantSequence(
        prefix=gdna_prefix,
        alt=gdna_alt,
        suffix=gdna_suffix,
        reads=[fully_overlapping_read, partially_overlapping_read])
    assert isinstance(variant_sequence, VariantSequence)

    # number of prefix nucleotides expected to remain after trimming
    prefix_length = len(cdna_prefix) - n_bad_nucleotides_at_start

    # s[-0:] is the whole string rather than the empty string, so the
    # case where no prefix nucleotides remain has to be handled explicitly
    if prefix_length > 0:
        expected_cdna_prefix = cdna_prefix[-prefix_length:]
        expected_reference_prefix = "ATG"[-prefix_length:]
    else:
        expected_cdna_prefix = ""
        expected_reference_prefix = ""

    reference_coding_sequence_key = \
        ReferenceCodingSequenceKey.from_variant_and_transcript(
            variant=variant,
            transcript=transcript,
            context_size=reference_context_size)
    assert isinstance(
        reference_coding_sequence_key, ReferenceCodingSequenceKey)

    reference_context = ReferenceContext.from_reference_coding_sequence_key(
        key=reference_coding_sequence_key,
        variant=variant,
        transcripts=[transcript])
    assert isinstance(reference_context, ReferenceContext)

    expected = VariantSequenceInReadingFrame(
        cdna_sequence=expected_cdna_prefix + cdna_alt + cdna_suffix,
        offset_to_first_complete_codon=prefix_length % 3,
        variant_cdna_interval_start=prefix_length,
        variant_cdna_interval_end=prefix_length + 1,
        reference_cdna_sequence_before_variant=expected_reference_prefix,
        reference_cdna_sequence_after_variant=(
            "AGGAGCCGCAGTCAGAT"[:reference_context_size]),
        number_mismatches_before_variant=mismatches_before_variant,
        number_mismatches_after_variant=mismatches_after_variant)
    assert isinstance(expected, VariantSequenceInReadingFrame)

    return variant_sequence, reference_context, expected
def test_variant_sequence_contains():
    """Check which of four small variant sequences contain one another.

    AA|C|T and A|C|TT are both expected to contain A|C|T and nothing
    else, and no sequence with the allele C is expected to contain, or be
    contained in, the sequence A|G|T.
    """
    def make_sequence(prefix, alt, suffix, read_name):
        # a sequence supported by one read which spans all of it
        supporting_read = AlleleRead(
            prefix=prefix, allele=alt, suffix=suffix, name=read_name)
        return VariantSequence(
            prefix=prefix, alt=alt, suffix=suffix, reads=[supporting_read])

    # AA|C|T
    vs_longer_prefix = make_sequence("AA", "C", "T", "longer_prefix")
    # A|C|TT
    vs_longer_suffix = make_sequence("A", "C", "TT", "longer_suffix")
    # A|C|T
    vs_short = make_sequence("A", "C", "T", "short")

    # two longer sequences contain the shorter subsequence
    for container in (vs_longer_prefix, vs_longer_suffix):
        assert container.contains(vs_short), \
            "Expected %s to contain %s" % (container, vs_short)

    # other pairs do not contain each other
    unrelated_pairs = [
        (vs_longer_prefix, vs_longer_suffix),
        (vs_longer_suffix, vs_longer_prefix),
        (vs_short, vs_longer_prefix),
        (vs_short, vs_longer_suffix),
    ]
    for outer, inner in unrelated_pairs:
        assert not outer.contains(inner), \
            "Expected %s to not contain %s" % (outer, inner)

    # Sequences above has 'C' allele whereas this one has 'G'
    # A|G|T
    vs_different_allele = make_sequence("A", "G", "T", "short")
    for vs in (vs_longer_suffix, vs_longer_prefix, vs_short):
        assert not vs.contains(vs_different_allele), \
            "Expected %s to not contain %s" % (vs, vs_different_allele)
        assert not vs_different_allele.contains(vs), \
            "Expected %s to not contain %s" % (vs_different_allele, vs)
def test_variant_sequence_len():
    """len() of the sequence AA|C|TT without any reads should be 5."""
    prefix, alt, suffix = "AA", "C", "TT"
    vs = VariantSequence(prefix=prefix, alt=alt, suffix=suffix, reads=[])
    eq_(len(vs), 5)
def test_assembly_1_sequence():
    """Assembling a single sequence should return only that sequence."""
    only_sequence = VariantSequence(
        prefix="CCC", alt="T", suffix="GGG", reads={"1"})
    assembled = iterative_overlap_assembly([only_sequence])
    eq_(assembled, [only_sequence])
def test_variant_sequence_contains():
    """Verify VariantSequence.contains on sequences around A|C|T.

    Only the two sequences which extend A|C|T by one nucleotide are
    expected to contain it; every other pairing, including any pairing
    with the different-allele sequence A|G|T, is expected to fail.
    """
    # (prefix, alt, suffix) of each sequence, keyed by the name of the one
    # read which supports it
    #   longer_prefix: AA|C|T
    #   longer_suffix: A|C|TT
    #   short:         A|C|T
    specs = {
        "longer_prefix": ("AA", "C", "T"),
        "longer_suffix": ("A", "C", "TT"),
        "short": ("A", "C", "T"),
    }
    sequences = {}
    for read_name, (prefix, alt, suffix) in specs.items():
        sequences[read_name] = VariantSequence(
            prefix=prefix,
            alt=alt,
            suffix=suffix,
            reads=[
                AlleleRead(
                    prefix=prefix, allele=alt, suffix=suffix, name=read_name)
            ])
    vs_longer_prefix = sequences["longer_prefix"]
    vs_longer_suffix = sequences["longer_suffix"]
    vs_short = sequences["short"]

    # two longer sequences contain the shorter subsequence
    assert vs_longer_prefix.contains(vs_short), \
        "Expected %s to contain %s" % (vs_longer_prefix, vs_short)
    assert vs_longer_suffix.contains(vs_short), \
        "Expected %s to contain %s" % (vs_longer_suffix, vs_short)

    # other pairs do not contain each other
    for outer, inner in [
            (vs_longer_prefix, vs_longer_suffix),
            (vs_longer_suffix, vs_longer_prefix),
            (vs_short, vs_longer_prefix),
            (vs_short, vs_longer_suffix)]:
        assert not outer.contains(inner), \
            "Expected %s to not contain %s" % (outer, inner)

    # Sequences above has 'C' allele whereas this one has 'G'
    # A|G|T
    vs_different_allele = VariantSequence(
        prefix="A",
        alt="G",
        suffix="T",
        reads=[AlleleRead(prefix="A", allele="G", suffix="T", name="short")])

    for vs in [vs_longer_suffix, vs_longer_prefix, vs_short]:
        assert not vs.contains(vs_different_allele), \
            "Expected %s to not contain %s" % (vs, vs_different_allele)
        assert not vs_different_allele.contains(vs), \
            "Expected %s to not contain %s" % (vs_different_allele, vs)