示例#1
0
def test_for_skip_check_sizes():
    """
    A left-tailed clv on the last matched base before a skip: the expected
    offset depends on whether `skip_check_size` is given.

         TT
          └A--T       <-bridge read
         GTA--AT      <-bridge contig
         012--345     <-contig coord
           ^ctg_clv
        5432--10      <-rev contig coord
           |  |
        76543210      <-rev genome offset coord
         01234567     <-genome offset coord
              ^gnm_offset
    """
    skipped_cigar = [
        (S.BAM_CMATCH, 3),
        (S.BAM_CREF_SKIP, 2),
        (S.BAM_CMATCH, 2),
    ]

    # explicit one-base skip check
    offset_with_check = calc_genome_offset(
        skipped_cigar, ctg_clv=2, tail_side='left', skip_check_size=1)
    assert offset_with_check == 5

    # skip_check_size left at its default
    offset_default = calc_genome_offset(
        skipped_cigar, ctg_clv=2, tail_side='left')
    assert offset_default == 2
示例#2
0
def test_for_a_case_derived_from_a_real_one_E1_L_4362_chr16_plus_strand():
    """
    Right-tailed clv on the first matched base after a 5-base skip, with a
    trailing soft clip.

    Mostly just the bases around the clv are copied from the real case; the
    coordinates are arbitrary.

               AA
      AAG-----G┘  AAAA     <-bridge read
      AAG-----GCTT┘        <-bridge contig
      012     34567        <-contig coord
              ^ctg_clv
      AAGXXXXXGCTT
      0123456789012        <-genome offset coord
              ^gnm_offset

    """
    real_case_cigar = [
        (S.BAM_CMATCH, 3),
        (S.BAM_CREF_SKIP, 5),
        (S.BAM_CMATCH, 4),
        (S.BAM_CSOFT_CLIP, 4),
    ]

    # skip_check_size left at its default
    offset_default = calc_genome_offset(
        real_case_cigar, ctg_clv=3, tail_side='right')
    assert offset_default == 8

    # explicit one-base skip check
    offset_with_check = calc_genome_offset(
        real_case_cigar, ctg_clv=3, tail_side='right', skip_check_size=1)
    assert offset_with_check == 2
示例#3
0
def analyze_bridge(contig, read, ref_fa, dd_bridge, bridge_skip_check_size):
    """
    Collect bridge evidence for a contig/read pair.

    :param contig: aligned contig; its `reference_name`, `cigartuples` and
    `reference_start` attributes are read here
    :param read: read handed to `do_bridge` together with the contig
    :param ref_fa: reference handle, forwarded to `gen_hex_tuple`
    :param dd_bridge: holds bridge_evidence for a given contig, here it's just
    used to check if hexamer_search has already been done for a given ref_clv
    :param bridge_skip_check_size: forwarded to `calc_genome_offset` as its
    `skip_check_size` argument
    :return: `(seqname, strand, ref_clv, ctg_clv, tail_len, ctg_hex_tuple)`,
    or None when `do_bridge` yields no support or the computed offset is
    negative
    """
    bdg_support = do_bridge(contig, read)
    if bdg_support is None:  # likely a chimeric contig
        return None

    strand, ctg_clv, tail_len, tail_direction = bdg_support

    # honour the caller-supplied check size instead of a hard-coded constant
    offset = calc_genome_offset(contig.cigartuples,
                                ctg_clv,
                                tail_direction,
                                skip_check_size=bridge_skip_check_size)

    if offset < 0:  # meaning the clv is on soft/hard clipped region
        return None

    seqname = contig.reference_name
    ref_clv = contig.reference_start + offset

    ctg_hex_tuple = gen_hex_tuple(contig, strand, ref_clv, ref_fa, ctg_clv,
                                  dd_bridge)

    return seqname, strand, ref_clv, ctg_clv, tail_len, ctg_hex_tuple
示例#4
0
def test_for_nonskipped_contig(ctg_cigartuples, ctg_clv, gnm_offset):
    """
    Left-tailed clv on a contig without skips; the concrete cases come in
    through the test parameters.

     TT
      └AC     <-bridge read
      AACG    <-bridge contig
      01234   <-contig offset coord: different from "contig coord", it doesn't consider clipped regions
       ^ctg_clv
      01234   <-genome offset coord
       ^gnm_offset
    """
    observed = calc_genome_offset(ctg_cigartuples, ctg_clv, 'left')
    assert observed == gnm_offset
示例#5
0
def test_for_skipped_with_clv_before_a_skip(ctg_cigartuples, ctg_clv,
                                            gnm_offset):
    """
    Left-tailed clv located before the skip; the concrete cases come in
    through the test parameters.

     TT
      └AC            <-bridge read
      AACGTA--ATCG    <-bridge contig
      012345  67890   <-contig offset coord
        ^
      0123456789012   <-genome offset coord
        ^ctg_clv/gnm_offset
    """
    observed = calc_genome_offset(ctg_cigartuples, ctg_clv, 'left')
    assert observed == gnm_offset
示例#6
0
def test_for_skipped_contig_with_clv_after_a_skip(ctg_cigartuples, ctg_clv,
                                                  gnm_offset):
    """
    Left-tailed clv located after the skip; the concrete cases come in
    through the test parameters.

          TT
          |└AC  <-bridge read
      CG--ATCGAT    <-bridge contig
      01  2345678   <-contig offset coord
            ^ctg_clv
      01234567890   <-genome offset coord
            ^gnm_offset
    """
    observed = calc_genome_offset(ctg_cigartuples, ctg_clv, 'left')
    assert observed == gnm_offset
示例#7
0
def test_for_contig_with_hardclip(ctg_clv, expected_gnm_offset):
    r"""
    Left-tailed clv on a contig whose first two bases are hard-clipped; the
    concrete cases come in through the test parameters.

    The docstring is raw so that each backslash in the diagram stands for
    exactly one hard-clipped base.

    TT
     └ACG
    \\ACGT     <-contig
    0123456    <-contig offset coord
      ^ctg_clv
      0123     <-genome offset coord
      ^gnm_offset
    """
    ctg_cigartuples = ((S.BAM_CHARD_CLIP, 2), (S.BAM_CMATCH, 3))
    assert calc_genome_offset(ctg_cigartuples, ctg_clv,
                              'left') == expected_gnm_offset
示例#8
0
def test_for_contig_with_softclip(ctg_clv, expected_gnm_offset):
    """
    Left-tailed clv on a contig with three leading soft-clipped bases; the
    concrete cases come in through the test parameters.

    TTT
    012       <-contig offset coord for tail
      └XXX    <-contig
       345    <-contig offset coord
       ^ctg_clv
       012    <-genome offset coord
       ^gnm_offset
    """
    softclipped_cigar = ((S.BAM_CSOFT_CLIP, 3), (S.BAM_CMATCH, 3))
    observed = calc_genome_offset(softclipped_cigar, ctg_clv, 'left')
    assert observed == expected_gnm_offset
示例#9
0
def test_for_a_long_contig_with_deletion(ctg_cigartuples, ctg_clv, gnm_offset):
    """
    Left-tailed clv at several positions on a long contig; the concrete cases
    (three are sketched below) come in through the test parameters.

       TT                       TT TT
       |└TC                      └C └TC                                            <-bridge read
    ATCGATCGATCGATCGATCGATCGATCGATC__ATCGATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG  <-bridge contig
    0123456789012345678901234567890  123456789012345678901234567890123456789012345 <-bridge offset coord
         |    1         2         3  |        4         5         6         7
         ^ctg_clv          (case2)^  ^ctg_clv   (case3)
    012345678901234567890123456789012345678901234567890123456789012345678901234567 <-genome offset coord
         |    1         2         3  |      4         5         6         7
         ^gnm_offset(case1)          ^gnm_offset(case3)
    """
    observed = calc_genome_offset(ctg_cigartuples, ctg_clv, 'left')
    assert observed == gnm_offset
示例#10
0
def test_for_contig_with_two_skips_and_soft_clip_edge_case_2():
    """
    Right-tailed clv on the first matched base after the second skip of a
    contig that ends in a soft clip.

                AA
            C--G┘    AAAA    <-bridge read
      CG--ATC--GAT┘   <-bridge contig
      01  234  56789012   <-contig offset coord
               ^ctg_clv
      0123456789012   <-genome offset coord
               ^gnm_offset(check the passing for introns)
    """
    two_skip_cigar = [
        (S.BAM_CMATCH, 2),
        (S.BAM_CREF_SKIP, 2),
        (S.BAM_CMATCH, 3),
        (S.BAM_CREF_SKIP, 2),
        (S.BAM_CMATCH, 3),
        (S.BAM_CSOFT_CLIP, 4),
    ]

    # skip_check_size left at its default
    assert calc_genome_offset(
        two_skip_cigar, ctg_clv=5, tail_side='right') == 9

    # explicit skip check sizes, paired with the offset expected for each
    for check_size, expected in ((1, 6), (2, 6)):
        assert calc_genome_offset(two_skip_cigar,
                                  ctg_clv=5,
                                  tail_side='right',
                                  skip_check_size=check_size) == expected
示例#11
0
def test_for_contig_with_two_skips_and_soft_clip_with_clv_before_the_skip():
    """
    Right-tailed clv in the middle of the matched segment between the two
    skips of a contig that ends in a soft clip.

            AA
          AT┘    AAAA    <-bridge read
      CG--ATC--GAT┘   <-bridge contig
      01  234  5678901   <-contig offset coord
           ^ctg_clv
      0123456789012   <-genome offset coord
           ^gnm_offset
    """
    two_skip_cigar = [
        (S.BAM_CMATCH, 2),
        (S.BAM_CREF_SKIP, 2),
        (S.BAM_CMATCH, 3),
        (S.BAM_CREF_SKIP, 2),
        (S.BAM_CMATCH, 3),
        (S.BAM_CSOFT_CLIP, 4),
    ]

    # skip_check_size left at its default
    assert calc_genome_offset(
        two_skip_cigar, ctg_clv=3, tail_side='right') == 5

    # explicit skip check sizes, paired with the offset expected for each
    for check_size, expected in ((1, 5), (2, 1)):
        assert calc_genome_offset(two_skip_cigar,
                                  ctg_clv=3,
                                  tail_side='right',
                                  skip_check_size=check_size) == expected
示例#12
0
def test_for_a_case_derived_from_a_real_one_E1_L_4362_chr16_plus_strand_v2():
    """
    Basically the same as the E1_L_4362 chr16 plus-strand case, but a few
    more bases are included on both sides.

                   AA
      AATAAAG-----G┘     AAAA     <-bridge read
      AATAAAG-----GCTTGGA┘        <-bridge contig
      0123456     78901234        <-contig coord
            |     ^ctg_clv
      AATAAAGXXXXXGCTTGGA
      01234567890123456789        <-genome offset coord
                1 ^gnm_offset

    """
    real_case_cigar = [
        (S.BAM_CMATCH, 7),
        (S.BAM_CREF_SKIP, 5),
        (S.BAM_CMATCH, 7),
        (S.BAM_CSOFT_CLIP, 4),
    ]

    # skip_check_size left at its default
    offset_default = calc_genome_offset(
        real_case_cigar, ctg_clv=7, tail_side='right')
    assert offset_default == 12

    # explicit one-base skip check
    offset_with_check = calc_genome_offset(
        real_case_cigar, ctg_clv=7, tail_side='right', skip_check_size=1)
    assert offset_with_check == 6
示例#13
0
def test_for_contig_with_clv_in_hardclip_spanning_clv():
    r"""
    Left-tailed clv that falls inside a three-base leading hard clip: the
    expected genome offset is negative.

    The docstring is raw so that each backslash in the hardclip mask stands
    for exactly one clipped base (and no invalid escape sequence is formed).

    TT
     └ACG
    CGACGTA      <-contig
    \\\   |      <-hardclip mask
    01234567     <-contig coord
   76543210      <-rev contig coord
      ^ctg_clv
      |01234     <-genome offset coord
   76543210      <-rev genome offset coord
      ^gnm_offset
    """
    ctg_cigartuples = (
        (S.BAM_CHARD_CLIP, 3),
        (S.BAM_CMATCH, 4),
    )
    expected_gnm_offset = -1
    assert calc_genome_offset(ctg_cigartuples, ctg_clv=2,
                              tail_side='left') == expected_gnm_offset

    # just shift ctg_clv to the left a bit more
    assert calc_genome_offset(ctg_cigartuples, ctg_clv=1,
                              tail_side='left') == -2
示例#14
0
def init_ref_beg(ref_clv, cigartuples, ctg_clv):
    """
    Compute the beginning index in genome coordinate.

    The offset of the clv from the left is obtained from
    `calc_genome_offset`, extended by the length of a leading soft/hard clip
    when there is one, and subtracted from `ref_clv`.
    """
    # TODO: left may not matter in such case
    clv_offset = calc_genome_offset(cigartuples, ctg_clv, 'left')

    # ref_clv = contig.reference_start + offset, hence
    # ref_clv - offset = contig.reference_start; a leading clipped region is
    # not part of reference_start, so it has to be subtracted as well
    leading = cigartuples[0]
    if leading[0] in (S.BAM_CSOFT_CLIP, S.BAM_CHARD_CLIP):
        clipped_len = leading[1]
    else:
        clipped_len = 0
    return ref_clv - (clv_offset + clipped_len)
示例#15
0
def test_for_skipped_contig_with_clv_right_before_the_skip():
    """
    Right-tailed clv on the last matched base before the skip.

          AA
        AT┘         <-bridge read
      CGAT--CGAT    <-bridge contig
      0123  45678   <-contig offset coord
         ^ctg_clv
      01234567890   <-genome offset coord
         ^gnm_offset
    """
    skipped_cigar = [
        (S.BAM_CMATCH, 4),
        (S.BAM_CREF_SKIP, 2),
        (S.BAM_CMATCH, 4),
    ]
    observed = calc_genome_offset(skipped_cigar, ctg_clv=3, tail_side='right')
    assert observed == 3
示例#16
0
def test_for_bridge_support_on_suffix_contig_edgecase():
    """
    Right-tailed clv whose contig coordinate lies in the trailing soft clip
    of a suffix contig.

                 AA
           CGTACT┘|      <-bridge read
           012345678
           |||AAA  |
       ATGACGT┘ |  |     <-suffix contig
       0123456789012     <-contig offset coord
           |    ^ctg_clv
       0123456789012     <-genome offset coord
                ^gnm_offset
    """
    suffix_cigar = ((S.BAM_CMATCH, 7), (S.BAM_CSOFT_CLIP, 3))

    observed = calc_genome_offset(suffix_cigar, ctg_clv=9, tail_side='right')
    assert observed == 9
示例#17
0
def test_skip_check():
    """
    Right-tailed clv on the first matched base after a skip, evaluated with
    a one-base skip check.

           AA
       G--A┘         <-bridge read
      CG--ATC    <-bridge contig
      01  2345   <-contig offset coord
          ^ctg_clv
      01234567   <-genome offset coord
       ^gnm_offset
    """
    skipped_cigar = [
        (S.BAM_CMATCH, 2),
        (S.BAM_CREF_SKIP, 2),
        (S.BAM_CMATCH, 3),
    ]
    observed = calc_genome_offset(
        skipped_cigar, ctg_clv=2, tail_side='right', skip_check_size=1)
    assert observed == 1
示例#18
0
def test_for_contig_with_clv_after_insertion_so_tail_side_does_not_matter(
        ctg_clv, tail_side, expected_gnm_offset):
    """
    clv located after a 3-base insertion, checked for the tail sides supplied
    through the test parameters.

            TT AA      <-bridge read tail
             └A┘       <-bridge read, for visual convenience two cases for different tail sides are merged with only one base shown
              |        # blank line to separate the bridge read the insertion
              AGC      <-inserted sequence
              456      <-contig offset coord for inserted sequence
               ┬
           ATCG GT    <-contig
           0123 789   <-contig offset coord
                ^ctg_clv
           0123 456   <-genome offset coord
                ^gnm_offset
    """
    insertion_cigar = ((S.BAM_CMATCH, 4), (S.BAM_CINS, 3), (S.BAM_CMATCH, 2))
    observed = calc_genome_offset(insertion_cigar, ctg_clv, tail_side)
    assert observed == expected_gnm_offset
示例#19
0
def test_for_contig_with_clv_before_insertion_so_tail_side_does_not_matter(
        ctg_clv, tail_side, expected_gnm_offset):
    """
    clv located before a 3-base insertion, checked for the tail sides
    supplied through the test parameters.

    TT AA      <-bridge read tail
     └C┘       <-bridge read
      |        # blank line to separate the bridge read the insertion
      |AGC     <-inserted sequence
      |456     <-contig offset coord for inserted sequence
      | ┬
    ATCG GT    <-contig
    0123 789   <-contig offset coord
      ^ctg_clv
    0123 456   <-genome offset coord
      ^gnm_offset
    """
    insertion_cigar = ((S.BAM_CMATCH, 4), (S.BAM_CINS, 3), (S.BAM_CMATCH, 2))
    observed = calc_genome_offset(insertion_cigar, ctg_clv, tail_side)
    assert observed == expected_gnm_offset
示例#20
0
def test_for_contig_with_3_base_insertion_with_clv_after_the_insertion(
        ctg_clv, expected_gnm_offset):
    """
    Left-tailed clv located after a 3-base insertion; the concrete cases come
    in through the test parameters.

       TT      <-bridge read tail
        └GT    <-bridge read
         |     # blank line to separate the bridge read the insertion
      AGC|     <-inserted sequence
      456|     <-contig offset coord for inserted sequence
       ┬ |
    ATCG GT    <-contig
    0123 78    <-contig offset coord
         ^ctg_clv
    0123 45    <-genome offset coord
         ^gnm_offset
    """
    # NOTE(review): the diagram shows four matched bases before the insertion
    # while the cigar below declares three -- confirm which one is intended
    insertion_cigar = ((S.BAM_CMATCH, 3), (S.BAM_CINS, 3), (S.BAM_CMATCH, 2))
    observed = calc_genome_offset(insertion_cigar, ctg_clv, 'left')
    assert observed == expected_gnm_offset
示例#21
0
def test_for_contig_with_3_base_insertion_with_clv_inside_the_insertion_and_clv_is_in_the_middle_of_insertion(
        ctg_clv, expected_gnm_offset):
    """
    Left-tailed clv located inside a 3-base insertion; the concrete cases
    come in through the test parameters.

             TT        <-bridge read tail
              └G       <-bridge read
               |       # blank line to separate the bridge read the insertion
              AGC      <-inserted sequence
              456      <-contig offset coord for inserted sequence
       ctg_clv^|
              |┬
           ATCG GT     <-contig
           0123 789    <-contig offset coord
           0123 456    <-genome offset coord
              ^gnm_offset
    """
    insertion_cigar = ((S.BAM_CMATCH, 4), (S.BAM_CINS, 3), (S.BAM_CMATCH, 2))
    observed = calc_genome_offset(insertion_cigar, ctg_clv, 'left')
    assert observed == expected_gnm_offset
示例#22
0
def test_for_contig_with_1_base_insertion_with_clv_after_insertion(
        ctg_clv, expected_gnm_offset):
    """
    Left-tailed clv located after a 1-base insertion.

       TT
        └AC    <-bridge read
         |     # blank line to separate the bridge read the insertion
        G|     <-inserted sequence
        4|     <-contig offset coord for inserted sequence
        ┬|
    ATCG AC    <-contig
    0123 56    <-contig offset coord
         ^ctg_clv
    0123 45    <-genome offset coord
         ^gnm_offset
    see parameters in the decorator for various ctg_clv
    """
    insertion_cigar = ((S.BAM_CMATCH, 4), (S.BAM_CINS, 1), (S.BAM_CMATCH, 2))
    observed = calc_genome_offset(insertion_cigar, ctg_clv, 'left')
    assert observed == expected_gnm_offset
示例#23
0
def test_for_contig_with_one_base_insertion_with_clv_before_insertion(
        ctg_clv, tail_side, expected_gnm_offset):
    """
    clv located before a 1-base insertion, checked for the tail sides
    supplied through the test parameters.

    TT AA
     └C┘       <-bridge read, for visual convenience two cases for different tail sides are merged with only one base shown
      |        # blank line to separate the bridge read the insertion
      | G      <-inserted sequence
      | 4      <-contig offset coord for inserted sequence
      | ┬
    ATCG AC    <-contig
    0123 56    <-contig offset coord
      ^ctg_clv
    0123 45    <-genome offset coord
      ^gnm_offset
    see parameters in the decorator for various ctg_clv
    """
    insertion_cigar = ((S.BAM_CMATCH, 4), (S.BAM_CINS, 1), (S.BAM_CMATCH, 2))
    observed = calc_genome_offset(insertion_cigar, ctg_clv, tail_side)
    assert observed == expected_gnm_offset
示例#24
0
def test_clv_inside_insertion(ctg_clv, tail_side, skip_check_size,
                              expected_gnm_offset):
    """
    clv located inside a 3-base insertion on a contig that also carries a
    1-base skip; tail side and skip check size come in through the test
    parameters.

            TT AA      <-bridge read tail
             └A┘       <-bridge read, for visual convenience two cases for different tail sides are merged with only one base shown
              |        # blank line to separate the bridge read the insertion
              AGC      <-inserted sequence
              345      <-contig offset coord for inserted sequence
       ctg_clv^|
              |┬
           AT-G GT     <-contig
           01 2 678    <-contig offset coord
           0123 456    <-genome offset coord
              ^gnm_offset
    """
    skip_then_insertion_cigar = (
        (S.BAM_CMATCH, 2),
        (S.BAM_CREF_SKIP, 1),
        (S.BAM_CMATCH, 1),
        (S.BAM_CINS, 3),
        (S.BAM_CMATCH, 2),
    )
    observed = calc_genome_offset(skip_then_insertion_cigar, ctg_clv,
                                  tail_side, skip_check_size)
    assert observed == expected_gnm_offset
示例#25
0
def test_for_contig_with_two_skips_with_clv_right_before_the_skip():
    """
    Right-tailed clv on the last matched base before the second skip.

             AA
           TC┘         <-bridge read
      CG--ATC--GAT    <-bridge contig
      01  234  5678   <-contig offset coord
            ^ctg_clv
      0123456789012   <-genome offset coord
            ^gnm_offset
    """
    two_skip_cigar = [
        (S.BAM_CMATCH, 2),
        (S.BAM_CREF_SKIP, 2),
        (S.BAM_CMATCH, 3),
        (S.BAM_CREF_SKIP, 2),
        (S.BAM_CMATCH, 3),
    ]
    observed = calc_genome_offset(two_skip_cigar, ctg_clv=4, tail_side='right')
    assert observed == 6
示例#26
0
def test_for_contig_with_two_skips_and_soft_clip_edge_case_1():
    """
    Right-tailed clv on the last matched base before the second skip of a
    contig that ends in a soft clip.

             AA
           TC┘    AAAA    <-bridge read
      CG--ATC--GAT┘   <-bridge contig
      01  234  56789012   <-contig offset coord
            ^ctg_clv
      0123456789012   <-genome offset coord
            ^gnm_offset
    """
    two_skip_cigar = [
        (S.BAM_CMATCH, 2),
        (S.BAM_CREF_SKIP, 2),
        (S.BAM_CMATCH, 3),
        (S.BAM_CREF_SKIP, 2),
        (S.BAM_CMATCH, 3),
        (S.BAM_CSOFT_CLIP, 4),
    ]
    observed = calc_genome_offset(two_skip_cigar, ctg_clv=4, tail_side='right')
    assert observed == 6
示例#27
0
文件: xseq_plus.py 项目: zyxue/kleat
def init_ref_end(ref_clv, cigartuples, ctg_clv, ctg_seq):
    """
    Compute the end index in genome coordinate.

    The cigar is reversed and `ctg_clv` is re-expressed relative to the right
    end of `ctg_seq`, so that `calc_genome_offset` yields the offset counted
    from the right. That offset, extended by the length of a trailing
    soft/hard clip when there is one, is added to `ref_clv`.

    Compared to the function handling the beginning index, one additional
    argument is needed, i.e. `ctg_seq`.

    :param ctg_seq: should include soft/hardclipped region if it's clipped
    """
    rev_cigartuples = list(reversed(cigartuples))
    clv_from_right = len(ctg_seq) - ctg_clv

    # TODO: left may not matter in such case
    clv_offset = calc_genome_offset(rev_cigartuples, clv_from_right, 'left')

    # note ref_clv = contig.reference_start + offset

    # the first entry of the reversed cigar is the last entry of the original
    trailing = rev_cigartuples[0]
    if trailing[0] in (S.BAM_CSOFT_CLIP, S.BAM_CHARD_CLIP):
        clipped_len = trailing[1]
    else:
        clipped_len = 0
    return ref_clv + (clv_offset + clipped_len)