예제 #1
0
def test_allele_count_dataframe():
    """
    test_allele_count_dataframe : A single (variant, read evidence) pair
    should produce a one-row DataFrame whose read-count columns match the
    number of ref / alt / other reads in the evidence.
    """
    variant = Variant("test_contig", 50, "C", "G")

    # two reads supporting the reference allele "C"
    reads_supporting_ref = [
        AlleleRead(prefix="AAA", allele="C", suffix="TTT", name="C1"),
        AlleleRead(prefix="AAC", allele="C", suffix="TTA", name="C2"),
    ]
    # one read supporting the alternate allele "G"
    reads_supporting_alt = [
        AlleleRead(prefix="AAA", allele="G", suffix="TTT", name="G1"),
    ]
    read_evidence = ReadEvidence(
        trimmed_base1_start=50,
        trimmed_ref="C",
        trimmed_alt="G",
        ref_reads=reads_supporting_ref,
        alt_reads=reads_supporting_alt,
        other_reads=[])

    df = allele_counts_dataframe([(variant, read_evidence)])
    assert len(df) == 1, "Wrong number of rows in DataFrame: %s" % (df, )

    # compare each count column of the only row against the fixture above
    only_row = df.iloc[0]
    expected_counts = [
        ("num_ref_reads", 2),
        ("num_alt_reads", 1),
        ("num_other_reads", 0),
    ]
    for column_name, expected_count in expected_counts:
        eq_(getattr(only_row, column_name), expected_count)
def test_variant_sequence_overlaps():
    """
    test_variant_sequence_overlaps : AAA|GG|TT should overlap AA|GG|TT from
    the left for every minimum overlap size from 1 to 6, the reverse
    direction should never overlap, and a minimum overlap size of 7 should
    rule out the overlap.
    """
    # AAA|GG|TT
    vs_3A = VariantSequence(
        prefix="AAA", alt="GG", suffix="TT",
        reads=[AlleleRead(prefix="AAA", allele="GG", suffix="TT", name="1")])
    # AA|GG|TT
    vs_2A = VariantSequence(
        prefix="AA", alt="GG", suffix="TT",
        reads=[AlleleRead(prefix="AA", allele="GG", suffix="TT", name="1")])

    for min_overlap_size in range(1, 7):
        # the longer-prefix sequence overlaps the shorter one from the left
        overlaps_forward = vs_3A.left_overlaps(
            vs_2A, min_overlap_size=min_overlap_size)
        assert overlaps_forward, \
            "Expected %s to overlap %s from left (min overlap size=%d)" % (
                vs_3A, vs_2A, min_overlap_size)

        # ...but the relationship is not symmetric
        overlaps_backward = vs_2A.left_overlaps(
            vs_3A, min_overlap_size=min_overlap_size)
        assert not overlaps_backward, \
            "Expected %s to not overlap %s from left (min overlap size=%d)" % (
                vs_2A, vs_3A, min_overlap_size)

    # one more than the largest overlap size accepted in the loop above
    overlaps_with_7 = vs_3A.left_overlaps(vs_2A, min_overlap_size=7)
    assert not overlaps_with_7, \
        "Unexpected overlap between %s and %s for min_overlap_size=7" % (
            vs_3A, vs_2A)
def test_variant_sequence_read_names():
    """
    test_variant_sequence_read_names : VariantSequence.read_names should
    equal the set of names of the reads it was constructed from.
    """
    names = ["1", "2"]
    # two reads with identical sequence content A|C|T, differing only in name
    supporting_reads = [
        AlleleRead(prefix="A", allele="C", suffix="T", name=read_name)
        for read_name in names
    ]
    vs = VariantSequence(
        prefix="A", alt="C", suffix="T", reads=supporting_reads)
    eq_(vs.read_names, set(names))
def test_variant_sequence_contains():
    """
    test_variant_sequence_contains : A sequence with a longer prefix or a
    longer suffix should contain the shorter sequence with the same allele,
    while no other pairing (including any pairing with a different allele)
    should count as containment.
    """
    def single_read_sequence(prefix, alt, suffix, name):
        # VariantSequence supported by exactly one read with the same content
        return VariantSequence(
            prefix=prefix,
            alt=alt,
            suffix=suffix,
            reads=[
                AlleleRead(
                    prefix=prefix, allele=alt, suffix=suffix, name=name)])

    # AA|C|T
    vs_longer_prefix = single_read_sequence("AA", "C", "T", "longer_prefix")
    # A|C|TT
    vs_longer_suffix = single_read_sequence("A", "C", "TT", "longer_suffix")
    # A|C|T
    vs_short = single_read_sequence("A", "C", "T", "short")

    # both longer sequences contain the shorter subsequence
    for container in (vs_longer_prefix, vs_longer_suffix):
        assert container.contains(vs_short), \
            "Expected %s to contain %s" % (container, vs_short)

    # none of the remaining ordered pairs is a containment
    non_containing_pairs = [
        (vs_longer_prefix, vs_longer_suffix),
        (vs_longer_suffix, vs_longer_prefix),
        (vs_short, vs_longer_prefix),
        (vs_short, vs_longer_suffix),
    ]
    for outer, inner in non_containing_pairs:
        assert not outer.contains(inner), \
            "Expected %s to not contain %s" % (outer, inner)

    # The sequences above have the 'C' allele whereas this one has 'G'
    # A|G|T
    vs_different_allele = single_read_sequence("A", "G", "T", "short")

    # a different allele rules out containment in either direction
    for vs in (vs_longer_suffix, vs_longer_prefix, vs_short):
        assert not vs.contains(vs_different_allele), \
            "Expected %s to not contain %s" % (vs, vs_different_allele)
        assert not vs_different_allele.contains(vs), \
            "Expected %s to not contain %s" % (vs_different_allele, vs)
def test_variant_sequence_min_coverage():
    """
    test_variant_sequence_min_coverage : The outermost nucleotides of
    AA|C|TT are each spanned by only two of the three reads, so the minimum
    coverage should be 2.
    """
    # 1: AA|C|TT
    # 2: AA|C|T
    # 3:  A|C|TT
    read_parts = [
        ("AA", "TT", "1"),
        ("AA", "T", "2"),
        ("A", "TT", "3"),
    ]
    reads = [
        AlleleRead(prefix=prefix, allele="C", suffix=suffix, name=name)
        for (prefix, suffix, name) in read_parts
    ]
    vs = VariantSequence(prefix="AA", alt="C", suffix="TT", reads=reads)
    eq_(vs.min_coverage(), 2)
예제 #6
0
def test_assembly_of_simple_sequence_from_mock_reads():
    """
    test_assembly_of_simple_sequence_from_mock_reads : Four mutually
    consistent reads (two of them identical) should be assembled into one
    variant sequence equal to the longest read, whichever assembly function
    is used.
    """
    # Read sequences:
    #    AAAAA|CC|TTTTT    (dup1)
    #    AAAAA|CC|TTTTT    (dup2)
    #   GAAAAA|CC|TTTTTG   (longer)
    #     AAAA|CC|TTTT     (shorter)
    duplicate_read_1 = AlleleRead(
        prefix="AAAAA", allele="CC", suffix="TTTTT", name="dup1")
    duplicate_read_2 = AlleleRead(
        prefix="AAAAA", allele="CC", suffix="TTTTT", name="dup2")
    longer_read = AlleleRead(
        prefix="GAAAAA", allele="CC", suffix="TTTTTG", name="longer")
    shorter_read = AlleleRead(
        prefix="AAAA", allele="CC", suffix="TTTT", name="shorter")
    reads = [duplicate_read_1, duplicate_read_2, longer_read, shorter_read]

    # the assembly should span the longest read and be supported by all reads
    expected_variant_sequence = VariantSequence(
        prefix="GAAAAA",
        alt="CC",
        suffix="TTTTTG",
        reads=reads)

    initial_variant_sequences = initial_variant_sequences_from_reads(reads)
    # the two duplicate reads are expected to share one initial sequence,
    # leaving one fewer sequence than reads
    eq_(len(initial_variant_sequences), len(reads) - 1)

    # 2 bases with 1/4 reads, 2 bases with 3/4 reads, remaining 10 bases with
    # all 4/4 reads
    expected_mean_coverage = (2 * 1 + 2 * 3 + 10 * 4) / 14

    # both assembly entry points should give the same result
    for assemble in (greedy_merge, iterative_overlap_assembly):
        assembled = assemble(initial_variant_sequences, min_overlap_size=1)

        # no reads contradict each other, so exactly one assembled sequence
        # should come back
        eq_(
            len(assembled),
            1,
            "Unexpected number of variant sequences: %s" % (assembled, ))

        result = assembled[0]
        eq_(result, expected_variant_sequence)
        eq_(len(result.reads), len(reads))
        eq_(result.min_coverage(), 1)
        eq_(result.mean_coverage(), expected_mean_coverage)
def test_variant_sequence_mean_coverage():
    """
    test_variant_sequence_mean_coverage : Mean coverage of AA|C|TT should be
    the average, over its five nucleotides, of the number of reads that
    contain each nucleotide.
    """
    # 1: AA|C|TT
    # 2: AA|C|T
    # 3:  A|C|TT
    full_read = AlleleRead(prefix="AA", allele="C", suffix="TT", name="1")
    short_suffix_read = AlleleRead(
        prefix="AA", allele="C", suffix="T", name="2")
    short_prefix_read = AlleleRead(
        prefix="A", allele="C", suffix="TT", name="3")
    reads = [full_read, short_suffix_read, short_prefix_read]

    vs = VariantSequence(prefix="AA", alt="C", suffix="TT", reads=reads)

    # number of reads containing each nucleotide of AA|C|TT, left to right
    reads_per_nucleotide = [2, 3, 3, 3, 2]
    expected_mean_coverage = (
        sum(reads_per_nucleotide) / len(reads_per_nucleotide))
    eq_(vs.mean_coverage(), expected_mean_coverage)
def test_variant_sequence_trim_by_coverage():
    """
    test_variant_sequence_trim_by_coverage : Trimming AA|C|T to a minimum
    coverage of 1 should leave it unchanged, while a minimum coverage of 2
    should drop the leading nucleotide that only one read spans.
    """
    # 1: AA|C|T
    # 2:  A|C|T
    reads = [
        AlleleRead(prefix=read_prefix, allele="C", suffix="T", name=read_name)
        for (read_prefix, read_name) in [("AA", "1"), ("A", "2")]
    ]
    vs = VariantSequence(prefix="AA", alt="C", suffix="T", reads=reads)

    # every nucleotide is spanned by at least one read
    untrimmed = vs.trim_by_coverage(1)
    eq_(untrimmed, vs)

    # only A|C|T is spanned by both reads
    vs_expected_trim_by_2 = VariantSequence(
        prefix="A", alt="C", suffix="T", reads=reads)
    trimmed = vs.trim_by_coverage(2)
    eq_(trimmed, vs_expected_trim_by_2)
def test_partitioned_read_sequences_deletion():
    """
    test_partitioned_read_sequences_deletion : A read carrying the deletion
    TT>T at position 4 (reference sequence assumed to be "ACCTTG") should be
    partitioned into prefix "ACCT", an empty allele and suffix "G".
    """
    # assumed reference sequence of the contig: "ACCTTG"
    contig_name = "chromosome"
    variant = Variant(
        contig_name, 4, "TT", "T", grch38, normalize_contig_name=False)

    # aligned read: 4 matching bases, 1 deleted reference base ("T"),
    # then 1 matching base
    read = make_pysam_read(seq="ACCTG", cigar="4M1D1M", mdtag="4^T1")
    samfile = MockAlignmentFile(references=(contig_name,), reads=[read])

    collector = ReadCollector()
    supporting_reads = collector.allele_reads_supporting_variant(
        alignment_file=samfile,
        variant=variant)
    print(supporting_reads)
    assert len(supporting_reads) == 1

    observed = supporting_reads[0]
    # a deletion leaves no nucleotides in the allele slot
    expected = AlleleRead(
        name=read.qname, prefix="ACCT", allele="", suffix="G")
    eq_(observed, expected)
예제 #10
0
def make_inputs_for_tp53_201_variant(
    cdna_prefix="ATG",
    cdna_suffix="AGGAGCCGCAGTCAGAT",
    n_bad_nucleotides_at_start=0,
    mismatches_before_variant=0,
    mismatches_after_variant=14,  # the read is that much longer than the reference (17 vs 3)
    reference_context_size=3):
    """
    Build the inputs and the expected result for tests which match a
    variant sequence against the reference context of the TP53-201
    transcript at chr17:7,676,591 C>T.

    Parameters
    ----------
    cdna_prefix : str
        Transcript nucleotides before the variant that we're pretending
        got detected from RNA-seq reads.

    cdna_suffix : str
        Transcript nucleotides after the variant that we're pretending
        got detected from RNA-seq reads.

    n_bad_nucleotides_at_start : int
        Number of nucleotides we expect to get trimmed from the
        beginning of the variant sequence while matching to a reference context.

    mismatches_before_variant : int
        Expected number of nucleotide mismatches in the result before
        the variant locus.

    mismatches_after_variant : int
        Expected number of nucleotide mismatches in the result after
        the variant locus.

    reference_context_size : int
        Number of nucleotides before the variant locus to try matching
        against a reference transcript.

    Returns
    -------
    Tuple of (VariantSequence, ReferenceContext, VariantORF) where the last
    element is the expected result of matching the first two.
    """
    # TP53-201 is an isoform of TP53 which seems to lack untranslated
    # regions so the sequence is:
    # First exon: chr17 7,676,594 - 7,676,521
    # ATG|GAG|GAG|CCG|CAG|TCA|GAT...
    # -M-|-E-|-E-|-P-|-Q-|-S-|-D-

    # we're assuming a variant
    # chr17. 7,676,591 C>T which changes GAG (E) > AAG (K)
    variant = Variant("chr17", 7676591, "C", "T", "GRCh38")

    # TP53-201
    transcript = variant.ensembl.transcripts_by_name("TP53-201")[0]

    effect = variant.effect_on_transcript(transcript)

    # sanity check that the annotation agrees with the E>K change above
    eq_(effect.__class__.__name__, "Substitution")
    eq_(effect.aa_ref, "E")
    eq_(effect.aa_alt, "K")

    cdna_alt = "A"

    # genomic DNA is the reverse complement of the cDNA
    # for TP53-201 since it's on the negative strand
    gdna_prefix = reverse_complement_dna(cdna_suffix)
    gdna_alt = reverse_complement_dna(cdna_alt)
    gdna_suffix = reverse_complement_dna(cdna_prefix)

    # variant sequence supported by two reads
    # one fully spanning the variant sequence
    # and another missing the last nucleotide
    fully_overlapping_read = AlleleRead(prefix=gdna_prefix,
                                        allele=gdna_alt,
                                        suffix=gdna_suffix,
                                        name="full-overlap")
    # testing the prefix and allele to make sure they have the expected
    # TP53-201 sequence but the suffix might change depending on what's
    # passed in as cdna_prefix
    if cdna_suffix == "AGGAGCCGCAGTCAGAT":
        eq_(fully_overlapping_read.prefix, "ATCTGACTGCGGCTCCT")
    eq_(fully_overlapping_read.allele, "T")

    partially_overlapping_read = AlleleRead(prefix=gdna_prefix,
                                            allele=gdna_alt,
                                            suffix=gdna_suffix[:-1],
                                            name="partial-overlap")
    if cdna_suffix == "AGGAGCCGCAGTCAGAT":
        eq_(partially_overlapping_read.prefix, "ATCTGACTGCGGCTCCT")
    eq_(partially_overlapping_read.allele, "T")

    variant_sequence = VariantSequence(
        prefix=gdna_prefix,
        alt=gdna_alt,
        suffix=gdna_suffix,
        reads=[fully_overlapping_read, partially_overlapping_read])
    assert isinstance(variant_sequence, VariantSequence)

    prefix_length = len(cdna_prefix) - n_bad_nucleotides_at_start

    # Keep only the last `prefix_length` nucleotides before the variant.
    # A plain [-prefix_length:] slice returns the *whole* string when
    # prefix_length is 0 (since -0 == 0), which would contradict
    # variant_cdna_interval_start=0 below, so a fully trimmed prefix is
    # handled explicitly.
    if prefix_length == 0:
        kept_cdna_prefix = ""
        kept_reference_prefix = ""
    else:
        kept_cdna_prefix = cdna_prefix[-prefix_length:]
        kept_reference_prefix = "ATG"[-prefix_length:]

    reference_coding_sequence_key = ReferenceCodingSequenceKey.from_variant_and_transcript(
        variant=variant,
        transcript=transcript,
        context_size=reference_context_size)
    assert isinstance(reference_coding_sequence_key,
                      ReferenceCodingSequenceKey)

    reference_context = ReferenceContext.from_reference_coding_sequence_key(
        key=reference_coding_sequence_key,
        variant=variant,
        transcripts=[transcript])
    assert isinstance(reference_context, ReferenceContext)

    expected = VariantORF(
        cdna_sequence=kept_cdna_prefix + cdna_alt + cdna_suffix,
        offset_to_first_complete_codon=prefix_length % 3,
        variant_cdna_interval_start=prefix_length,
        variant_cdna_interval_end=prefix_length + 1,
        reference_cdna_sequence_before_variant=kept_reference_prefix,
        reference_cdna_sequence_after_variant=
        "AGGAGCCGCAGTCAGAT"[:reference_context_size],
        num_mismatches_before_variant=mismatches_before_variant,
        num_mismatches_after_variant=mismatches_after_variant)
    assert isinstance(expected, VariantORF)

    return variant_sequence, reference_context, expected
예제 #11
0
def test_allele_read_from_single_read_at_locus_trim_N_nucleotides():
    """
    test_allele_read_from_single_read_at_locus_trim_N_nucleotides : An
    AlleleRead built from a locus read whose prefix ends in "N" and whose
    suffix has an "N" right after its first base should come out with an
    empty prefix and the suffix cut down to "T".
    """
    # prefix "NCCN" ends in N; suffix "TNNA" has an N right after the "T"
    locus_read = make_read_at_locus(prefix="NCCN", alt="A", suffix="TNNA")
    observed = AlleleRead.from_locus_read(locus_read)
    print(observed)
    # name "dummy" is presumably the default assigned by make_read_at_locus
    # -- confirm against that helper
    eq_(observed,
        AlleleRead(prefix="", allele="A", suffix="T", name="dummy"))