Пример #1
0
def test_with_skipped_region_for_plus_strand_clv():
    """
    Plus-strand suffix contig whose alignment skips two reference bases.

    The mocked reference returns ``TT`` for the skipped region, and the test
    checks both the extracted sequence and the coordinates passed to
    ``ref_fa.fetch``.

               AAA             <-tail of suffix contig
       ACGG--GC┘||             <-suffix contig with skip
       0123  456789            <-contig coord
              |  1
       ctg_clv^  ^init_ctg_idx <-contig coord
    ...ACGGTTGCGGT...          <-genome
       789012345678            <-genome coord
          1   |  |
       ref_clv^  ^init_ref_idx
    """
    contig = MagicMock()
    contig.reference_name = 'chr1'
    contig.query_sequence = 'ACGGGCAAA'
    contig.cigartuples = (
        (S.BAM_CMATCH, 4),
        (S.BAM_CREF_SKIP, 2),
        (S.BAM_CMATCH, 2),
        (S.BAM_CSOFT_CLIP, 3),
    )

    genome = MagicMock()
    genome.get_reference_length.return_value = 100
    genome.fetch = MagicMock(return_value='TT')

    params = {
        'contig': contig,
        'strand': '+',
        'ref_clv': 14,
        'ref_fa': genome,
        'ctg_clv': 5,
    }
    assert extract_seq(**params) == 'ACGGTTGC'
    assert extract_seq(window=3, **params) == 'TGC'
    genome.fetch.assert_called_with('chr1', 11, 13)
Пример #2
0
def test_hardclip_minus_strand(mock_apautils):
    """
    Minus-strand contig with a leading soft clip and a trailing hard clip.

    The patched ``infer_query_sequence`` is configured to return the full
    sequence, including the hard-clipped bases.

      TT
      |└CGCACCG       <-suffix contig with hardclip
      | |   ///       <-hardclip mask
      01234567890      <-contig coord
   icb^ ^cc     1
      XXCGCACCG...    <-genome
      3456789012      <-genome coord
      | |    1
   irb^ ^rc
    """
    contig = MagicMock()
    contig.reference_name = 'chr2'
    mock_apautils.infer_query_sequence.return_value = 'TTCGCACCG'
    contig.cigartuples = (
        (S.BAM_CSOFT_CLIP, 2),
        (S.BAM_CMATCH, 4),
        (S.BAM_CHARD_CLIP, 3),
    )

    genome = MagicMock()
    genome.get_reference_length.return_value = 100

    params = {
        'contig': contig,
        'strand': '-',
        'ref_clv': 5,
        'ref_fa': genome,
        'ctg_clv': 2,
    }
    assert extract_seq(**params) == 'CGCACCG'
    for width, expected in ((1, 'C'), (3, 'CGC')):
        assert extract_seq(window=width, **params) == expected
def test_with_hardclip_after_clv(mock_apautils):
    """
    Minus-strand bridge contig whose trailing two bases are hard-clipped.

    A window larger than the available sequence (5) is expected to yield the
    same four bases as the unbounded call.

             TT
             |└GTGA     <-bridge read
           ATTCGTGA     <-bridge contig (hardclipped), chimeric
                 //     <-hardclip mask
           012345678    <-contig coord
        icb^   ^cc
        ...ATTCGTGA...
           567890123    <-genome coordinate
           |   |1
        irb^   ^rc
    """
    contig = MagicMock()
    contig.reference_name = 'chr2'
    mock_apautils.infer_query_sequence.return_value = 'ATTCGTGA'
    contig.cigartuples = ((S.BAM_CMATCH, 6), (S.BAM_CHARD_CLIP, 2))

    genome = MagicMock()
    genome.get_reference_length.return_value = 100

    params = {
        'contig': contig,
        'strand': '-',
        'ref_clv': 9,
        'ref_fa': genome,
        'ctg_clv': 4,
    }
    assert extract_seq(**params) == 'GTGA'
    assert extract_seq(window=5, **params) == 'GTGA'
Пример #4
0
def test_with_skipped_region_and_insertions_mismatches_for_plus_strand_clv():
    """
    Plus-strand suffix contig with a 1-base insertion and a 2-base skip.

    The mocked reference returns ``CC`` for the skipped region; the fetch
    coordinates are verified right after the first (unbounded) extraction.

        G
        ┬       AAA             <-tail of suffix contig
       A TA--GCG┘||             <-suffix contig with skip
       0 23  456789            <-contig coord
       | |x  | |
       ctg_clv ^  ^init_ctg_idx <-contig coord
    ...A TTCCGCGXXX...          <-genome
       7 8901234567            <-genome coord
           1   |  |
        ref_clv^  ^init_ref_idx
    """
    contig = MagicMock()
    contig.reference_name = 'chr3'
    contig.query_sequence = 'AGTAGCGAAA'
    contig.cigartuples = (
        (S.BAM_CMATCH, 1), (S.BAM_CINS, 1), (S.BAM_CMATCH, 2),
        (S.BAM_CREF_SKIP, 2), (S.BAM_CMATCH, 3), (S.BAM_CSOFT_CLIP, 3),
    )

    genome = MagicMock()
    genome.get_reference_length.return_value = 100
    genome.fetch.return_value = 'CC'

    params = {
        'contig': contig,
        'strand': '+',
        'ref_clv': 14,
        'ref_fa': genome,
        'ctg_clv': 6,
    }
    assert extract_seq(**params) == 'AGTACCGCG'
    # Must be checked before the windowed calls below touch the mock again.
    genome.fetch.assert_called_once_with('chr3', 10, 12)

    for width, expected in ((1, 'G'), (3, 'GCG'), (8, 'GTACCGCG')):
        assert extract_seq(window=width, **params) == expected
Пример #5
0
def test_with_skipped_and_deleted_regions_for_plus_strand_clv():
    """
    Plus-strand suffix contig with a 1-base deletion and a 2-base skip.

    Only the skipped region is expected to be fetched from the reference
    (a single ``fetch`` call returning ``AA``).

               AAA             <-tail of suffix contig
       A_TT--GC┘||             <-suffix contig with skip
       0 12  345678            <-contig coord
              |
       ctg_clv^  ^init_ctg_idx <-contig coord
    ...ACTTAAGCGGT...          <-genome
       789012345678            <-genome coord
          1   |  |
       ref_clv^  ^init_ref_idx
    """
    contig = MagicMock()
    contig.reference_name = 'chr3'
    contig.query_sequence = 'ATTGCAAA'
    contig.cigartuples = (
        (S.BAM_CMATCH, 1), (S.BAM_CDEL, 1), (S.BAM_CMATCH, 2),
        (S.BAM_CREF_SKIP, 2), (S.BAM_CMATCH, 2), (S.BAM_CSOFT_CLIP, 3),
    )

    genome = MagicMock()
    genome.get_reference_length.return_value = 100
    genome.fetch.return_value = 'AA'

    params = {
        'contig': contig,
        'strand': '+',
        'ref_clv': 14,
        'ref_fa': genome,
        'ctg_clv': 4,
    }
    assert extract_seq(**params) == 'ATTAAGC'
    genome.fetch.assert_called_once_with('chr3', 11, 13)
    assert extract_seq(window=5, **params) == 'TAAGC'
Пример #6
0
def test_with_5_base_inserted_region_for_plus_strand_clv():
    """
    Plus-strand suffix contig carrying a 5-base insertion (``AATCC``).

    The inserted bases are expected to appear in the extracted sequence.

         AATCC
           ┬   AA              <-tail of suffix contig
       ACGG GCG┘|              <-suffix contig with skip
       0123 9012            <-contig coord
          | |1| |
       ctg_clv^ ^init_ctg_idx <-contig coord
    ...ACGG GCGXXX...           <-genome
       7890 1234567            <-genome coord
         1    | |
       ref_clv^  ^init_ref_idx
    """
    contig = MagicMock()
    contig.reference_name = 'chr1'
    contig.query_sequence = 'ACGGAATCCGCGAA'
    contig.cigartuples = (
        (S.BAM_CMATCH, 4), (S.BAM_CINS, 5),
        (S.BAM_CMATCH, 3), (S.BAM_CSOFT_CLIP, 2),
    )

    genome = MagicMock()
    genome.get_reference_length.return_value = 100

    params = {
        'contig': contig,
        'strand': '+',
        'ref_clv': 13,
        'ref_fa': genome,
        'ctg_clv': 11,
    }
    assert extract_seq(**params) == 'ACGGAATCCGCG'
    assert extract_seq(window=5, **params) == 'CCGCG'
Пример #7
0
def test_hardclip_plust_strand(mock_apautils):
    r"""
    Plus-strand suffix contig with a leading hard clip and a trailing soft clip.

    The patched ``infer_query_sequence`` is configured to return the full
    sequence, including the three hard-clipped bases.

    The docstring is raw so that the backslashes of the hardclip mask are
    rendered literally and do not form invalid escape sequences.

           AAA
    CGCACCG┘ |       <-suffix contig with hardclip
    \\\|  |  |       <-hardclip mask
    01234567890      <-contig coord
       |cc^   ^ice
 ...XXXACCGTCG...    <-genome
    234567890123     <-genome coord
          | 1 |
        rc^   ^ire
    """
    ctg = MagicMock()
    ctg.reference_name = 'chr2'
    mock_apautils.infer_query_sequence.return_value = 'CGCACCGAAA'
    ctg.cigartuples = (
        (S.BAM_CHARD_CLIP, 3),
        (S.BAM_CMATCH, 4),
        (S.BAM_CSOFT_CLIP, 3),
    )

    ref_fa = MagicMock()
    ref_fa.get_reference_length.return_value = 100
    kw = dict(contig=ctg, strand='+', ref_clv=8, ref_fa=ref_fa, ctg_clv=6)
    assert extract_seq(**kw) == 'CGCACCG'
    assert extract_seq(window=1, **kw) == 'G'
    assert extract_seq(window=3, **kw) == 'CCG'
Пример #8
0
def test_with_indel_and_skipped_regions_and_mismatches_for_plus_strand_clv():
    """
    Plus-strand suffix contig mixing a 3-base skip, a 2-base insertion and a
    2-base deletion.

    Only the skipped region is expected to be fetched from the reference
    (a single ``fetch`` call returning ``CTG``).

             TC
             ┬       AA             <-tail of suffix contig
       A---CC GTA__GC┘|             <-suffix contig with skip
       0   12 567  8901             <-contig coord
           x   x    |1
           x ctg_clv^ ^init_ctg_idx <-contig coord
    ...ACTGTC GAATTGC...            <-genome
       789012 345678901             <-genome coord
          1         | |
             ref_clv^ ^init_ref_idx
    """
    contig = MagicMock()
    contig.reference_name = 'chr3'
    contig.query_sequence = 'ACCTCGTAGCAA'
    contig.cigartuples = (
        (S.BAM_CMATCH, 1), (S.BAM_CREF_SKIP, 3),
        (S.BAM_CMATCH, 2), (S.BAM_CINS, 2),
        (S.BAM_CMATCH, 3), (S.BAM_CDEL, 2),
        (S.BAM_CMATCH, 2), (S.BAM_CSOFT_CLIP, 2),
    )

    genome = MagicMock()
    genome.get_reference_length.return_value = 100
    genome.fetch.return_value = 'CTG'

    params = {
        'contig': contig,
        'strand': '+',
        'ref_clv': 19,
        'ref_fa': genome,
        'ctg_clv': 9,
    }
    assert extract_seq(**params) == 'ACTGCCTCGTAGC'
    genome.fetch.assert_called_once_with('chr3', 8, 11)

    assert extract_seq(window=10, **params) == 'GCCTCGTAGC'
Пример #9
0
def test_extract_seq_for_minus_strand_clv_supported_by_suffix():
    """
    Minus-strand suffix contig: a 3-base soft-clipped tail followed by a
    plain match; windows of growing size return growing prefixes.

       TTT         <-tail of suffix contig
         └ACATC    <-suffix contig
       012345678    <-contig coord
    icb^  ^ctg_clv
    ...567890129... <-genome coord
    irb^  ^ref_clv
    """
    suffix_contig = MagicMock()
    suffix_contig.query_sequence = 'TTTACATCG'
    suffix_contig.cigartuples = ((S.BAM_CSOFT_CLIP, 3), (S.BAM_CMATCH, 6))

    # Positional order: contig, strand, ref_clv, ref_fa, ctg_clv.
    positional = (suffix_contig, '-', 8, MagicMock(), 3)

    assert extract_seq(*positional) == 'ACATCG'
    for width, expected in ((1, 'A'), (2, 'AC'), (3, 'ACA'), (4, 'ACAT')):
        assert extract_seq(*positional, window=width) == expected
Пример #10
0
def test_with_2_base_insertion():
    """
    Minus-strand suffix contig with a 2-base insertion (``GA``) after the
    cleavage site; the inserted bases are expected in the output.

              GA
         TTT  ┬       <-tail of suffix contig
         ||└AC TCG    <-suffix contig
         01234 5678   <-contig coord
      ici^  ^ctg_clv
         XXXXX XXX    <-genome
         45678 9012... <-genome coord
            |  1
            ^ref_clv/iri
    """
    contig = MagicMock()
    contig.reference_name = 'chr1'
    contig.query_sequence = 'TTTACGATCG'
    contig.cigartuples = (
        (S.BAM_CSOFT_CLIP, 3), (S.BAM_CMATCH, 2),
        (S.BAM_CINS, 2), (S.BAM_CMATCH, 3),
    )

    genome = MagicMock()
    genome.get_reference_length.return_value = 100

    params = {
        'contig': contig,
        'strand': '-',
        'ref_clv': 7,
        'ref_fa': genome,
        'ctg_clv': 3,
    }
    assert extract_seq(**params) == 'ACGATCG'
    for width, expected in ((2, 'AC'), (3, 'ACG')):
        assert extract_seq(window=width, **params) == expected
Пример #11
0
def test_with_skipped_region():
    """
    Minus-strand suffix contig with a 2-base skip after the cleavage site.

    The mocked reference returns ``TG`` for the skipped region, and the
    fetch coordinates are verified.

                TTT             <-tail of suffix contig
                ||└GT--C        <-suffix contig with skip
                01234  56      <-contig coord
             icb^  ^ctg_clv     <-contig coord
             ...XXXGTTGC...    <-genome
                5678901234      <-genome coord
                |  | 1
             irb^  ^ref_clv/iri
    """
    contig = MagicMock()
    contig.reference_name = 'chr2'
    contig.query_sequence = 'TTTGTC'
    contig.cigartuples = (
        (S.BAM_CSOFT_CLIP, 3),
        (S.BAM_CMATCH, 2),
        (S.BAM_CREF_SKIP, 2),
        (S.BAM_CMATCH, 1),
    )

    genome = MagicMock()
    genome.get_reference_length.return_value = 100
    genome.fetch = MagicMock(return_value='TG')

    params = {
        'contig': contig,
        'strand': '-',
        'ref_clv': 8,
        'ref_fa': genome,
        'ctg_clv': 3,
    }
    assert extract_seq(**params) == 'GTTGC'
    genome.fetch.assert_called_with('chr2', 10, 12)

    # The ** unpacking has to come after window= for py34 syntax.
    for width, expected in ((1, 'G'), (3, 'GTT'), (4, 'GTTG')):
        assert extract_seq(window=width, **params) == expected
Пример #12
0
def test_extract_seq_for_bridge_with_deletion():
    """
    Plus-strand bridge contig with a 2-base deletion before the cleavage
    site; the expected output contains contig bases only.

               AA
             CG┘      <-bridge read
       GAC__TCGTC     <-bridge contig
       012  345678    <-contig coord
          | ||x
          | |^cc ^ice
    ...GACGGTCCTC...  <-genome
       56789012345    <-genome coord
            1|   |
             ^rc ^rce
    """
    contig = MagicMock()
    contig.reference_name = 'chr2'
    contig.query_sequence = 'GACTCGTC'
    contig.cigartuples = ((S.BAM_CMATCH, 3), (S.BAM_CDEL, 2), (S.BAM_CMATCH, 5))

    genome = MagicMock()
    genome.get_reference_length.return_value = 100

    extracted = extract_seq(
        contig=contig, strand='+', ref_clv=15, ref_fa=genome, ctg_clv=5)
    assert extracted == 'GACTCG'
Пример #13
0
def test_extract_seq_for_bridge_with_insertion():
    """
    Plus-strand bridge contig with a 2-base insertion (``AG``) before the
    cleavage site; the inserted bases are expected in the output.

         AG   AA      <-inserted bases
         ┬  GT┘       <-bridge read
       GA CGGTCGC     <-bridge contig
       01 45678901    <-contig coord
        x   1|   |
        x  cc^   ^ice
    ...GT CGGTCGC...  <-genome
       56 78901234    <-genome coord
             |   |
           rc^   ^ire
    """
    contig = MagicMock()
    contig.reference_name = 'chr2'
    contig.query_sequence = 'GAAGCGGTCGC'
    contig.cigartuples = ((S.BAM_CMATCH, 2), (S.BAM_CINS, 2), (S.BAM_CMATCH, 7))

    genome = MagicMock()
    genome.get_reference_length.return_value = 100

    extracted = extract_seq(
        contig=contig, strand='+', ref_clv=10, ref_fa=genome, ctg_clv=7)
    assert extracted == 'GAAGCGGT'
Пример #14
0
def test_extract_seq_with_skip_before_and_after_ctg_clv():
    """
    Plus-strand bridge contig with one skip before and one after the
    cleavage site; exactly one reference fetch (the skip before) is expected.

             AA
           GT┘       <-bridge read
       G--AGT-GC     <-bridge contig
       0  123 456    <-contig coord
     ctg_clv^   ^ice
    ...GACAGTTGC...  <-genome
       5678901234    <-genome coord
            1   |
     ref_clv^   ^ire
    """
    contig = MagicMock()
    contig.reference_name = 'chr2'
    contig.query_sequence = 'GAGTGC'
    contig.cigartuples = (
        (S.BAM_CMATCH, 1), (S.BAM_CREF_SKIP, 2), (S.BAM_CMATCH, 3),
        (S.BAM_CREF_SKIP, 1), (S.BAM_CMATCH, 2),
    )

    genome = MagicMock()
    genome.get_reference_length.return_value = 100
    genome.fetch.return_value = 'AC'

    extracted = extract_seq(
        contig=contig, strand='+', ref_clv=10, ref_fa=genome, ctg_clv=3)
    assert extracted == 'GACAGT'
    genome.fetch.assert_called_once_with('chr2', 6, 8)
def test_hardclip_spanning_clv_from_after_edgecase_3(mock_apautils):
    """
    Plus-strand bridge contig whose trailing hard clip covers the cleavage
    site, with a 1-base skip near the start.

    The patched ``infer_query_sequence`` is configured to return the full
    sequence, and the mocked reference returns ``C`` for the skipped base.

             AAA
          GTT┘           <-bridge read
       A-GGTTGCA         <-bridge contig
       | | /////         <-hardclip mask
       0 12345678        <-contig coord
          cc^   ^ice
    ...ACGGTTGCA...      <-genome
       7890123456        <-genome coord
          1 |   |
          rc^   ^ie
    """
    contig = MagicMock()
    contig.reference_name = 'chr1'
    mock_apautils.infer_query_sequence.return_value = 'AGGTTGCA'
    contig.cigartuples = (
        (S.BAM_CMATCH, 1), (S.BAM_CREF_SKIP, 1),
        (S.BAM_CMATCH, 2), (S.BAM_CHARD_CLIP, 5),
    )

    genome = MagicMock()
    genome.get_reference_length.return_value = 100
    genome.fetch = MagicMock(return_value='C')

    extracted = extract_seq(
        contig=contig, strand='+', ref_clv=12, ref_fa=genome, ctg_clv=4)
    assert extracted == 'ACGGTT'
Пример #16
0
def test_extract_seq_for_bridge_with_multiple_skips_before_clv():
    """
    Plus-strand bridge contig with two skipped regions before the cleavage
    site, each filled from consecutive ``ref_fa.fetch`` return values.

               AA
             TA┘       <-bridge read
       G-C--CTAGC      <-bridge contig
       0 1  234567     <-contig coord
        ||| x ^cc^ice
    ...GACTGGTAGC...   <-genome
       56789012345     <-genome coord
            1 |  |
            rc^  ^ire
    """
    contig = MagicMock()
    contig.reference_name = 'chr2'
    contig.query_sequence = 'GCCTAGC'
    contig.cigartuples = (
        (S.BAM_CMATCH, 1), (S.BAM_CREF_SKIP, 1), (S.BAM_CMATCH, 1),
        (S.BAM_CREF_SKIP, 2), (S.BAM_CMATCH, 5),
    )

    genome = MagicMock()
    genome.get_reference_length.return_value = 100
    # Return values are listed from right to left: the right-hand skip
    # ('TG') is handed out first, then the left-hand one ('A').
    genome.fetch.side_effect = ['TG', 'A']

    extracted = extract_seq(
        contig=contig, strand='+', ref_clv=12, ref_fa=genome, ctg_clv=4)
    assert extracted == 'GACTGCTA'
Пример #17
0
def test_with_two_skipped_regions_and_a_mismatch():
    """
    Minus-strand suffix contig with two skipped regions after the cleavage
    site; both are expected to be fetched, in left-to-right order.

                TTT              <-tail of suffix contig
                ||└GT--CAG-AC    <-suffix contig with skip
                01234  567 890   <-contig coord
    init_ctg_clv^  ^cc  x        <-contig coord
             ...XXXGTTGCGGCAC... <-genome
                56789012345678   <-genome coord
                   | 1
                   ^ref_clv/iri
    """
    contig = MagicMock()
    contig.reference_name = 'chr2'
    contig.query_sequence = 'TTTGTCAGAC'
    contig.cigartuples = (
        (S.BAM_CSOFT_CLIP, 3), (S.BAM_CMATCH, 2), (S.BAM_CREF_SKIP, 2),
        (S.BAM_CMATCH, 3), (S.BAM_CREF_SKIP, 1), (S.BAM_CMATCH, 2),
    )

    genome = MagicMock()
    genome.get_reference_length.return_value = 100
    genome.fetch.side_effect = ['TG', 'C']

    extracted = extract_seq(
        contig=contig, strand='-', ref_clv=8, ref_fa=genome, ctg_clv=3)
    assert extracted == 'GTTGCAGCAC'
    assert genome.fetch.call_count == 2
    expected_fetches = [call('chr2', 10, 12), call('chr2', 15, 16)]
    genome.fetch.assert_has_calls(expected_fetches)
def test_with_hardclip_spanning_clv_from_before_edgecase_3(mock_apautils):
    """
    Minus-strand bridge contig whose leading hard clip covers the cleavage
    site at contig/genome coordinate 0.

    The patched ``infer_query_sequence`` is configured to return the full
    sequence, including the hard-clipped base.

             TT
              └GTGA     <-bridge read
               GTGA     <-bridge contig (hardclipped), chimeric
               /        <-hardclip mask
               012345678    <-contig coord
               ^icb/cc
            ...GTGA...
               0123    <-genome coordinate
               |1
               ^irb/rc
    """
    contig = MagicMock()
    contig.reference_name = 'chr2'
    mock_apautils.infer_query_sequence.return_value = 'GTGA'
    contig.cigartuples = ((S.BAM_CHARD_CLIP, 1), (S.BAM_CMATCH, 3))

    genome = MagicMock()
    genome.get_reference_length.return_value = 100

    extracted = extract_seq(
        contig=contig, strand='-', ref_clv=0, ref_fa=genome, ctg_clv=0)
    assert extracted == 'GTGA'
Пример #19
0
def test_with_three_skipped_region_and_mismatches_for_plus_strand_clv():
    """
    Plus-strand suffix contig with three skipped regions.

    The unbounded extraction is expected to fetch all three skips, from
    right to left; with ``window=9`` only two fetches are expected.

                     AA             <-tail of suffix contig
       A---CC-GTA--GC┘|             <-suffix contig with skip
       0|||12|345||678              <-contig coord
        |||x | x || |
        |||x ctg_clv^ ^init_ctg_idx <-contig coord
    ...ACTGTCAGAATTGCX...           <-genome
       789012345678901              <-genome coord
          1         |2|
             ref_clv^ ^init_ref_idx
    """
    contig = MagicMock()
    contig.reference_name = 'chr3'
    contig.query_sequence = 'ACCGTAGCAA'
    contig.cigartuples = (
        (S.BAM_CMATCH, 1), (S.BAM_CREF_SKIP, 3),
        (S.BAM_CMATCH, 2), (S.BAM_CREF_SKIP, 1),
        (S.BAM_CMATCH, 3), (S.BAM_CREF_SKIP, 2),
        (S.BAM_CMATCH, 2), (S.BAM_CSOFT_CLIP, 2),
    )

    full_genome = MagicMock()
    full_genome.get_reference_length.return_value = 100
    full_genome.fetch.side_effect = ['TT', 'A', 'CTG']

    params = {
        'contig': contig,
        'strand': '+',
        'ref_clv': 20,
        'ref_fa': full_genome,
        'ctg_clv': 7,
    }
    assert extract_seq(**params) == 'ACTGCCAGTATTGC'
    assert full_genome.fetch.call_count == 3
    full_genome.fetch.assert_has_calls([
        call('chr3', 17, 19),
        call('chr3', 13, 14),
        call('chr3', 8, 11),
    ])

    # Fresh mock for the windowed run so the call count starts from zero.
    windowed_genome = MagicMock()
    windowed_genome.get_reference_length.return_value = 100
    windowed_genome.fetch.side_effect = ['TT', 'A']
    params['ref_fa'] = windowed_genome
    assert extract_seq(window=9, **params) == 'CAGTATTGC'
    assert windowed_genome.fetch.call_count == 2
Пример #20
0
def test_extract_seq_for_minus_strand_clv_supported_by_link():
    """
    Minus-strand link contig (plain match, no clipping) with the cleavage
    site at contig position 0; windows return growing prefixes.

       ACATC    <-link contig
       01234    <-contig coord
    ...89012... <-genome coord
       ^ref_clv
    """
    link_contig = MagicMock()
    link_contig.query_sequence = 'ACATCG'
    link_contig.cigartuples = ((S.BAM_CMATCH, 6), )

    # Positional order: contig, strand, ref_clv, ref_fa, ctg_clv.
    positional = (link_contig, '-', 8, MagicMock(), 0)

    assert extract_seq(*positional) == 'ACATCG'
    for width, expected in ((1, 'A'), (2, 'AC'), (3, 'ACA'), (4, 'ACAT')):
        assert extract_seq(*positional, window=width) == expected
def test_hardclip_before_clv(mock_apautils):
    r"""
    Plus-strand bridge contig with three hard-clipped bases before the
    cleavage site.

    The patched ``infer_query_sequence`` is configured to return the full
    sequence, including the hard-clipped bases.

    The docstring is raw so that the backslashes of the hardclip mask are
    rendered literally and do not form invalid escape sequences.

           AA
         TC┘         <-bridge read
    CGCATTCGTCG      <-bridge contig (hardclipped, could be chimeric https://www.biostars.org/p/109333/)
    \\\|  |          <-hardclip mask
    012345678901      <-contig coord
       |cc^    ^ice
 ...XXXATTCGTCG...   <-genome
    234567890123     <-genome coord
          | 1  |
        rc^    ^ire
    """
    ctg = MagicMock()
    ctg.reference_name = 'chr2'
    mock_apautils.infer_query_sequence.return_value = 'CGCATTCGTCG'
    ctg.cigartuples = ((S.BAM_CHARD_CLIP, 3), (S.BAM_CMATCH, 8))

    ref_fa = MagicMock()
    ref_fa.get_reference_length.return_value = 100
    kw = dict(contig=ctg, strand='+', ref_clv=8, ref_fa=ref_fa, ctg_clv=6)
    assert extract_seq(**kw) == 'CGCATTC'
    assert extract_seq(window=1, **kw) == 'C'
    assert extract_seq(window=2, **kw) == 'TC'
    assert extract_seq(window=3, **kw) == 'TTC'
    assert extract_seq(window=4, **kw) == 'ATTC'
    assert extract_seq(window=5, **kw) == 'CATTC'
def test_hardclip_after_clv(mock_apautils):
    """
    Plus-strand bridge contig with a 1-base skip near the start and three
    hard-clipped bases after the cleavage site.

    The patched ``infer_query_sequence`` is configured to return the full
    sequence, and the mocked reference returns ``C`` for the skipped base.

             AAA
          GTT┘                 <-bridge read
       A-GGTTGCAGA             <-bridge contig
       | |  | |///             <-hardclip mask
       0 1234567890            <-contig coord
     ctg_clv^     ^ice <-contig coord
    ...ACGGTTGCAGA...          <-genome
       789012345678            <-genome coord
          1 |     |
     ref_clv^     ^init_fe
    """
    contig = MagicMock()
    contig.reference_name = 'chr1'
    mock_apautils.infer_query_sequence.return_value = 'AGGTTGCAGA'
    contig.cigartuples = (
        (S.BAM_CMATCH, 1),
        (S.BAM_CREF_SKIP, 1),
        (S.BAM_CMATCH, 6),
        (S.BAM_CHARD_CLIP, 3),
    )

    genome = MagicMock()
    genome.get_reference_length.return_value = 100
    genome.fetch = MagicMock(return_value='C')

    params = {
        'contig': contig,
        'strand': '+',
        'ref_clv': 12,
        'ref_fa': genome,
        'ctg_clv': 4,
    }
    assert extract_seq(**params) == 'ACGGTT'
    genome.fetch.assert_called_with('chr1', 8, 9)

    windowed = ((1, 'T'), (2, 'TT'), (3, 'GTT'), (4, 'GGTT'), (5, 'CGGTT'))
    for width, expected in windowed:
        assert extract_seq(window=width, **params) == expected
Пример #23
0
def test_extract_seq_for_plus_strand_clv_supported_by_link():
    """
    Plus-strand link contig (plain match, no clipping) with the cleavage
    site at the last contig base; windows return growing suffixes.

       ATCGAC    <-link contig
       012345    <-contig coord
            ^ctg_clv
    ...789012... <-genome coord
          1 ^ref_clv
    """
    link_contig = MagicMock()
    link_contig.query_sequence = 'ATCGAC'
    link_contig.cigartuples = ((S.BAM_CMATCH, 6), )

    # Positional order: contig, strand, ref_clv, ref_fa, ctg_clv.
    positional = (link_contig, '+', 12, MagicMock(), 5)

    assert extract_seq(*positional) == 'ATCGAC'
    for width, expected in ((1, 'C'), (2, 'AC'), (3, 'GAC'), (4, 'CGAC')):
        assert extract_seq(*positional, window=width) == expected
Пример #24
0
def test_with_two_skipped_region_for_plus_strand_clv():
    """
    Plus-strand suffix contig with two skipped regions.

    The unbounded extraction is expected to fetch both skips, right one
    first; with ``window=3`` only a single fetch is expected.

               AAA             <-tail of suffix contig
       A-TT--GC┘||             <-suffix contig with skip
       0 12  345678            <-contig coord
              |
       ctg_clv^  ^init_ctg_idx <-contig coord
    ...ACTTAAGCGGT...          <-genome
       789012345678            <-genome coord
          1   |  |
       ref_clv^  ^init_ref_idx
    """
    contig = MagicMock()
    contig.reference_name = 'chr3'
    contig.query_sequence = 'ATTGCAAA'
    contig.cigartuples = (
        (S.BAM_CMATCH, 1), (S.BAM_CREF_SKIP, 1), (S.BAM_CMATCH, 2),
        (S.BAM_CREF_SKIP, 2), (S.BAM_CMATCH, 2), (S.BAM_CSOFT_CLIP, 3),
    )

    full_genome = MagicMock()
    full_genome.get_reference_length.return_value = 100
    full_genome.fetch.side_effect = ['AA', 'C']

    params = {
        'contig': contig,
        'strand': '+',
        'ref_clv': 14,
        'ref_fa': full_genome,
        'ctg_clv': 4,
    }
    assert extract_seq(**params) == 'ACTTAAGC'
    assert full_genome.fetch.call_count == 2
    expected_fetches = [call('chr3', 11, 13), call('chr3', 8, 9)]
    full_genome.fetch.assert_has_calls(expected_fetches)

    # A brand-new mock is used here; ref_fa.reset_mock() could not be made
    # to work for this purpose.
    windowed_genome = MagicMock()
    windowed_genome.get_reference_length.return_value = 100
    windowed_genome.fetch.return_value = 'AA'
    params['ref_fa'] = windowed_genome
    assert extract_seq(window=3, **params) == 'AGC'
    assert windowed_genome.fetch.call_count == 1
Пример #25
0
def test_extract_seq_for_plus_strand_clv_supported_by_suffix():
    """
    Plus-strand suffix contig: a plain match followed by a 2-base
    soft-clipped tail; windows return growing suffixes of the matched part.

             AA  <-tail of suffix contig
       ATCGAC┘   <-suffix contig
       012345    <-contig coord
    ...789012... <-genome coord
            ^ref_clv
    """
    suffix_contig = MagicMock()
    suffix_contig.query_sequence = 'ATCGACAA'
    suffix_contig.cigartuples = ((S.BAM_CMATCH, 6), (S.BAM_CSOFT_CLIP, 2))

    # Positional order: contig, strand, ref_clv, ref_fa, ctg_clv.
    positional = (suffix_contig, '+', 12, MagicMock(), 5)

    assert extract_seq(*positional) == 'ATCGAC'
    for width, expected in ((1, 'C'), (2, 'AC'), (3, 'GAC'), (4, 'CGAC')):
        assert extract_seq(*positional, window=width) == expected
Пример #26
0
def gen_hex_tuple(contig, strand, ref_clv, ref_fa, ctg_clv, dd_bridge):
    """Search for a hexamer tuple for a cleavage site not yet recorded.

    The cleavage-site key is built from the contig's reference name, the
    strand and both cleavage coordinates. The search only runs when
    ``dd_bridge['hexamer_tuple'][clv_key]`` is ``None``.

    Returns:
        ``None`` when an entry is already recorded for the key; otherwise
        the result of ``search`` on the extracted sequence, or the
        placeholder ``('NA', -1, -1)`` when ``search`` returns ``None``.
    """
    # TODO: the returning of None is pretty ugly, to refactor
    chrom = contig.reference_name
    clv_key = apautils.gen_clv_key_tuple_with_ctg_clv(
        chrom, strand, ref_clv, ctg_clv)

    # An entry already exists for this cleavage site: nothing to search.
    if dd_bridge['hexamer_tuple'][clv_key] is not None:
        return None

    source_seq = extract_seq(contig, strand, ref_clv, ref_fa, ctg_clv)
    found = search(strand, ref_clv, source_seq)
    return ('NA', -1, -1) if found is None else found
Пример #27
0
 def test_extract_seq_with_ending_softclip(self):
     """
     Plus-strand contig ending in a 2-base soft clip: only the matched
     ``GGG`` is expected back.

        AA
     GGG┘|
     012345   <-contig coord
     234567 <-genome coord
     """
     clipped_contig = MagicMock()
     clipped_contig.query_sequence = 'GGGAA'
     clipped_contig.cigartuples = (
         (S.BAM_CMATCH, 3),
         (S.BAM_CSOFT_CLIP, 2),
     )
     extracted = extract_seq(
         clipped_contig, strand='+', ref_clv=7, ref_fa=MagicMock(), ctg_clv=2)
     assert extracted == 'GGG'
Пример #28
0
 def test_extract_seq_with_ending_softclip(self, mock_apautils):
     """
     Plus-strand contig ending in a 2-base hard clip: only the matched
     ``GGG`` is expected back.

     The patched ``infer_query_sequence`` is configured to return the full
     sequence, including the hard-clipped bases.

        AA
     GGG┘|
     012345   <-contig coord
     234567 <-genome coord
     """
     clipped_contig = MagicMock()
     mock_apautils.infer_query_sequence.return_value = 'GGGAA'
     clipped_contig.cigartuples = (
         (S.BAM_CMATCH, 3),
         (S.BAM_CHARD_CLIP, 2),
     )
     extracted = extract_seq(
         clipped_contig, strand='+', ref_clv=7, ref_fa=MagicMock(), ctg_clv=2)
     assert extracted == 'GGG'
Пример #29
0
 def test_extract_seq_with_starting_softclip(self):
     """
     Minus-strand contig starting with a 2-base soft clip: only the
     remaining ``CCA`` is expected back.

     TT
      └CCA
     012345 <-contig coord
     678901 <-genome coord
     """
     clipped_contig = MagicMock()
     clipped_contig.query_sequence = 'TTCCA'
     clipped_contig.cigartuples = (
         (S.BAM_CSOFT_CLIP, 2),
         (S.BAM_CMATCH, 4),
     )
     # Strand is passed positionally, the remaining arguments by keyword.
     extracted = extract_seq(
         clipped_contig, '-', ref_clv=8, ref_fa=MagicMock(), ctg_clv=2)
     assert extracted == 'CCA'
Пример #30
0
 def test_extract_seq_with_starting_softclip(self, mock_apautils):
     """
     Minus-strand contig starting with a 2-base hard clip: only the
     remaining ``CCA`` is expected back.

     The patched ``infer_query_sequence`` is configured to return the full
     sequence, including the hard-clipped bases.

     TT
      └CCA
     012345 <-contig coord
     678901 <-genome coord
     """
     clipped_contig = MagicMock()
     mock_apautils.infer_query_sequence.return_value = 'TTCCA'
     clipped_contig.cigartuples = (
         (S.BAM_CHARD_CLIP, 2),
         (S.BAM_CMATCH, 4),
     )
     # Strand is passed positionally, the remaining arguments by keyword.
     extracted = extract_seq(
         clipped_contig, '-', ref_clv=8, ref_fa=MagicMock(), ctg_clv=2)
     assert extracted == 'CCA'