Exemplo n.º 1
0
def guess_amplicons(fastq_r1,fastq_r2,number_of_reads_to_consider,flash_command,max_paired_end_reads_overlap,min_paired_end_reads_overlap,aln_matrix,needleman_wunsch_gap_open,needleman_wunsch_gap_extend,min_freq_to_consider=0.2,amplicon_similarity_cutoff=0.95):
    """
    Guess the amplicons used in an experiment by examining the most frequent reads
    (giant caveat -- the most frequent read should be unmodified).

    The first line returned by get_most_frequent_reads is always accepted as an amplicon.
    Each following read is aligned (forward and reverse-complemented) against every amplicon
    accepted so far and is added only if its best length-normalised score stays below
    amplicon_similarity_cutoff. The scan stops at the first read that fails the frequency test.

    input:
    fastq_r1: path to fastq r1 (can be gzipped)
    fastq_r2: path to fastq r2 (can be gzipped)
    number_of_reads_to_consider: number of reads from the top of the file to examine
    flash_command: command to call flash
    min_paired_end_reads_overlap: min overlap in bp for flashing (merging) r1 and r2
    max_paired_end_reads_overlap: max overlap in bp for flashing (merging) r1 and r2
    aln_matrix: substitution matrix handed to CRISPResso2Align.global_align
    needleman_wunsch_gap_open: alignment penalty assignment used to determine similarity of two sequences
    needleman_wunsch_gap_extend: alignment penalty assignment used to determine similarity of two sequences
    min_freq_to_consider: selected amplicon must be frequent at least at this fraction in the population
    amplicon_similarity_cutoff: if the current amplicon has similarity of greater than this cutoff to any other existing amplicons, it won't be added

    returns:
    list of putative amplicons (empty if no reads were recovered)
    """
    #each line is expected to hold "<count> <sequence>"; presumably ordered most-frequent first -- TODO confirm
    seq_lines = get_most_frequent_reads(fastq_r1,fastq_r2,number_of_reads_to_consider,flash_command,max_paired_end_reads_overlap,min_paired_end_reads_overlap)

    #without any reads there is nothing to guess from
    if len(seq_lines) == 0:
        return []

    #add most frequent amplicon to the list
    last_count,last_seq = seq_lines[0].strip().split()
    amplicon_seq_arr = [last_seq]

    #for the remainder of the amplicons, test them before adding
    for i in range(1,len(seq_lines)):
        count,seq = seq_lines[i].strip().split()

        #NOTE(review): the frequency test uses the PREVIOUS read's count, as the original code did -- confirm this is intended
        if float(last_count)/float(number_of_reads_to_consider) <= min_freq_to_consider:
            break

        #both values are the same for every amplicon this read is compared with
        seq_revcomp = reverse_complement(seq)
        #NOTE(review): normalises by the previous read's length rather than the compared amplicon's -- confirm this is intended
        min_len = min(len(last_seq),len(seq))

        this_amplicon_max_pct = 0 #keep track of similarity to most-similar already-found amplicons
        for amp_seq in amplicon_seq_arr:
            #builtin int replaces the np.int alias, which was removed in NumPy 1.24
            ref_incentive = np.zeros(len(amp_seq)+1,dtype=int)
            fws1,fws2,fwscore=CRISPResso2Align.global_align(seq,amp_seq,matrix=aln_matrix,gap_incentive=ref_incentive,gap_open=needleman_wunsch_gap_open,gap_extend=needleman_wunsch_gap_extend,)
            rvs1,rvs2,rvscore=CRISPResso2Align.global_align(seq_revcomp,amp_seq,matrix=aln_matrix,gap_incentive=ref_incentive,gap_open=needleman_wunsch_gap_open,gap_extend=needleman_wunsch_gap_extend,)
            #remember the best match in either orientation
            this_amplicon_max_pct = max(this_amplicon_max_pct,max(fwscore,rvscore)/float(min_len))

        #if this amplicon was maximally-similar to all other chosen amplicons by less than amplicon_similarity_cutoff, add to the list
        if this_amplicon_max_pct < amplicon_similarity_cutoff:
            amplicon_seq_arr.append(seq)

        last_count,last_seq = count,seq

    return amplicon_seq_arr
Exemplo n.º 2
0
 def RunCRISPResso2(self, strQuerySeqAfterBarcode, strRefSeqAfterBarcode,
                    npGapIncentive):
     """Globally align the upper-cased query against the upper-cased reference.

     The alignment matrix and the gap open/extend values come from the
     instance (``self.npAlnMatrix``, ``self.floOg``, ``self.floOe``);
     ``npGapIncentive`` is forwarded unchanged as ``gap_incentive``.

     Returns whatever ``CRISPResso2Align.global_align`` returns.
     """
     strQueryUpper = strQuerySeqAfterBarcode.upper()
     strRefUpper = strRefSeqAfterBarcode.upper()
     dictAlignOptions = {
         'matrix': self.npAlnMatrix,
         'gap_open': self.floOg,
         'gap_extend': self.floOe,
         'gap_incentive': npGapIncentive,
     }
     return CRISPResso2Align.global_align(strQueryUpper, strRefUpper,
                                          **dictAlignOptions)
def test_global_align():
    """Two identical sequences must come back unchanged with a score of 100."""
    sequence = 'ATTA'
    # one incentive entry per reference position plus one, all neutral
    no_incentive = np.array([0, 0, 0, 0, 0], dtype=int)

    alignment = CRISPResso2Align.global_align(sequence,
                                              'ATTA',
                                              matrix=ALN_MATRIX,
                                              gap_incentive=no_incentive)
    aligned_query, aligned_ref, score = alignment

    assert aligned_query == 'ATTA'
    assert aligned_ref == 'ATTA'
    assert score == 100
Exemplo n.º 4
0
 def __init__(self, strEDNAFULL='', floOg='', floOe=''):
     """Load the alignment matrix and remember the two gap values.

     strEDNAFULL: argument handed to ``CRISPResso2Align.read_matrix``
         (presumably the path of an EDNAFULL matrix file -- TODO confirm).
     floOg: stored as ``self.floOg``.
     floOe: stored as ``self.floOe``.
     """
     npLoadedMatrix = CRISPResso2Align.read_matrix(strEDNAFULL)
     self.npAlnMatrix = npLoadedMatrix
     self.floOg, self.floOe = floOg, floOe
Exemplo n.º 5
0
def guess_guides(amplicon_sequence,
                 fastq_r1,
                 fastq_r2,
                 number_of_reads_to_consider,
                 flash_command,
                 max_paired_end_reads_overlap,
                 min_paired_end_reads_overlap,
                 aln_matrix,
                 needleman_wunsch_gap_open,
                 needleman_wunsch_gap_extend,
                 min_edit_freq_to_consider=0.1,
                 pam_seq="NGG",
                 min_pct_subs_in_base_editor_win=0.8):
    """
    Guess the guide used in an experiment by identifying the most-frequently edited position, editing types, and PAM sites.

    Every read is aligned to the amplicon and its indel / substitution positions are tallied,
    weighted by the read count. A PAM is then searched for at fixed distances around the position
    with the most indels, first on the 3' side ("forward") and then on the 5' side ("reverse"),
    over a small set of offsets. The first window whose PAM matches determines the result.

    input:
    amplicon_sequence - amplicon to analyze
    fastq_r1: path to fastq r1 (can be gzipped)
    fastq_r2: path to fastq r2 (can be gzipped)
    number_of_reads_to_consider: number of reads from the top of the file to examine
    flash_command: command to call flash
    min_paired_end_reads_overlap: min overlap in bp for flashing (merging) r1 and r2
    max_paired_end_reads_overlap: max overlap in bp for flashing (merging) r1 and r2
    aln_matrix: substitution matrix handed to CRISPResso2Align.global_align
    needleman_wunsch_gap_open: alignment penalty assignment used to determine similarity of two sequences
    needleman_wunsch_gap_extend: alignment penalty assignment used to determine similarity of two sequences
    min_edit_freq_to_consider: edits must be at least this frequency for consideration
    pam_seq: pam sequence to look for (upper-cased, then degenerate base codes are expanded to regex character classes)
    min_pct_subs_in_base_editor_win: if more than this fraction of substitutions happen in the predicted base editor window, return base editor flag

    returns:
    tuple of (putative guide, boolean is_base_editor)
    or (None, None) if there are no reads, the most-edited position is below
    min_edit_freq_to_consider, or no PAM is found
    """
    seq_lines = get_most_frequent_reads(fastq_r1, fastq_r2,
                                        number_of_reads_to_consider,
                                        flash_command,
                                        max_paired_end_reads_overlap,
                                        min_paired_end_reads_overlap)

    amp_len = len(amplicon_sequence)
    # builtin int replaces the np.int alias, which was removed in NumPy 1.24
    gap_incentive = np.zeros(amp_len + 1, dtype=int)
    include_idxs = set(range(0, amp_len))

    all_indel_count_vector = np.zeros(amp_len)
    all_sub_count_vector = np.zeros(amp_len)
    tot_count = 0
    for seq_line in seq_lines:
        # each line is expected to hold "<count> <sequence>"
        count, seq = seq_line.strip().split()
        count = int(count)
        tot_count += count
        fws1, fws2, fwscore = CRISPResso2Align.global_align(
            seq,
            amplicon_sequence,
            matrix=aln_matrix,
            gap_incentive=gap_incentive,
            gap_open=needleman_wunsch_gap_open,
            gap_extend=needleman_wunsch_gap_extend,
        )
        payload = CRISPRessoCOREResources.find_indels_substitutions(
            fws1, fws2, include_idxs)
        all_indel_count_vector[payload['all_insertion_positions']] += count
        all_indel_count_vector[payload['all_deletion_positions']] += count
        all_sub_count_vector[payload['all_substitution_positions']] += count

    # with no reads the frequency below would be 0/0 (NaN), which slips past
    # the threshold test and lets the PAM search run on meaningless data
    if tot_count == 0:
        return (None, None)

    max_loc = np.argmax(all_indel_count_vector)
    max_val = all_indel_count_vector[max_loc]

    #return nothing if the max edit doesn't break threshold
    if max_val / float(tot_count) < min_edit_freq_to_consider:
        return (None, None)

    # expand degenerate base codes into character classes; the order is kept
    # from the original chain of replace() calls
    pam_regex_string = pam_seq.upper()
    for base_code, base_class in (('I', '[ATCG]'), ('N', '[ATCG]'),
                                  ('R', '[AG]'), ('Y', '[CT]'),
                                  ('S', '[GC]'), ('W', '[AT]'),
                                  ('K', '[GT]'), ('M', '[AC]'),
                                  ('B', '[CGT]'), ('D', '[AGT]'),
                                  ('H', '[ACT]'), ('V', '[ACG]')):
        pam_regex_string = pam_regex_string.replace(base_code, base_class)

    total_subs = sum(all_sub_count_vector)

    #offset from expected position
    for offset in (0, +1, -1, +2, +3, +4, -2):
        # (pam_start, pam_end, guide_start, guide_end, base_edit_start, base_edit_end)
        candidate_windows = (
            #forward direction: pam directly after the guide
            (max_loc + 4 + offset, max_loc + 7 + offset,
             max_loc - 16 + offset, max_loc + 4 + offset,
             max_loc - 16 + offset, max_loc - 6 + offset),
            #reverse direction: pam directly before the guide
            # NOTE(review): the pam is matched as-is, without reverse-complementing -- confirm this is intended
            (max_loc - 5 - offset, max_loc - 2 - offset,
             max_loc - 2 - offset, max_loc + 18 - offset,
             max_loc + 8 - offset, max_loc + 18 - offset),
        )
        for (pam_start, pam_end, guide_start, guide_end, base_edit_start,
             base_edit_end) in candidate_windows:
            if not (pam_start > 0 and guide_end < amp_len):
                continue
            # a negative guide_start would wrap around when slicing and yield an
            # empty or wrong guide; a pam window running off the end is truncated
            if guide_start < 0 or pam_end > amp_len:
                continue
            #find pam near max edit loc
            if not re.match(pam_regex_string,
                            amplicon_sequence[pam_start:pam_end]):
                continue

            guide_seq = amplicon_sequence[guide_start:guide_end]
            sum_base_edits = sum(
                all_sub_count_vector[base_edit_start:base_edit_end])
            #if a lot of edits are in the predicted base editor window, set base editor true
            #specifically, if more than min_pct_subs_in_base_editor_win of substitutions happen in the predicted base editor window
            is_base_editor = bool(
                sum_base_edits > min_pct_subs_in_base_editor_win * total_subs)
            return (guide_seq, is_base_editor)

    return (None, None)
from CRISPResso2 import CRISPResso2Align, CRISPRessoShared

# Alignment matrix shared by the tests below, read once at import time.
# The path is relative, so it is resolved against the current working directory.
ALN_MATRIX = CRISPResso2Align.read_matrix('./CRISPResso2/EDNAFULL')


def test_get_mismatches():
    """Check how many entries get_mismatches returns for two sequence pairs."""
    # the two trailing arguments are presumably the gap-open and gap-extend
    # penalties -- verify against CRISPRessoShared.get_mismatches
    penalty_args = (-5, -3)

    # identical sequences: nothing is reported
    identical = CRISPRessoShared.get_mismatches('ATTA', 'ATTA', ALN_MATRIX,
                                                *penalty_args)
    assert len(identical) == 0

    # divergent sequences of different length: six entries are reported
    divergent = CRISPRessoShared.get_mismatches('GCAGTGGGCGCGCTA',
                                                'CCCACTGAAGGCCC', ALN_MATRIX,
                                                *penalty_args)
    assert len(divergent) == 6
def test_global_align_gap_incentive_s1():
    """Test the global_align gap incentives when the first sequence is one base longer.

    In every case below the first returned string equals the query and the
    gap shows up in the second returned string.
    """

    def align(query, reference, incentive):
        # thin wrapper so each case below is a single table row
        return CRISPResso2Align.global_align(query,
                                             reference,
                                             matrix=ALN_MATRIX,
                                             gap_incentive=np.array(
                                                 incentive, dtype=int))

    # baseline: identical sequences, no incentive anywhere
    aligned_query, aligned_ref, score = align('ATTTA', 'ATTTA',
                                              [0, 0, 0, 0, 0, 0])
    assert aligned_query == 'ATTTA'
    assert aligned_ref == 'ATTTA'
    assert score == 100

    # (query, reference, gap incentive, expected aligned reference)
    cases = [
        ('ATTTA', 'ATTA', [1, 0, 0, 0, 0], 'ATT-A'),
        ('ATTTA', 'ATTA', [0, 1, 0, 0, 0], 'A-TTA'),
        ('ATTTA', 'ATTA', [0, 0, 1, 0, 0], 'AT-TA'),
        ('ATTTA', 'ATTA', [0, 0, 0, 1, 0], 'ATT-A'),
        # this case appears twice in the original test and is kept as-is
        ('ATTTA', 'ATTA', [0, 0, 0, 1, 0], 'ATT-A'),
        ('ATTTA', 'ATTA', [0, 0, 0, 0, 1], 'ATT-A'),
        ('TTTTT', 'TTTT', [0, 0, 0, 0, 0], 'TTTT-'),
        ('TTTTT', 'TTTT', [1, 0, 0, 0, 0], '-TTTT'),
        ('TTTTT', 'TTTT', [0, 1, 0, 0, 0], 'T-TTT'),
        ('TTTTT', 'TTTT', [0, 0, 1, 0, 0], 'TT-TT'),
        ('TTTTT', 'TTTT', [0, 0, 0, 1, 0], 'TTT-T'),
        ('TTTTT', 'TTTT', [0, 0, 0, 0, 1], 'TTTT-'),
    ]
    # four of the five aligned columns match
    expected_score = round(100 * 4 / 5.0, 3)
    for query, reference, incentive, expected_ref in cases:
        aligned_query, aligned_ref, score = align(query, reference, incentive)
        assert aligned_query == query
        assert aligned_ref == expected_ref
        assert round(score, 3) == expected_score
def test_global_align_gap_incentive_s2():
    """Test the global_align gap incentives when the second sequence is one base longer.

    In every case below the second returned string equals the reference and
    the gap shows up in the first returned string.
    """

    def align(query, reference, incentive):
        # thin wrapper so each case below is a single table row
        return CRISPResso2Align.global_align(query,
                                             reference,
                                             matrix=ALN_MATRIX,
                                             gap_incentive=np.array(
                                                 incentive, dtype=int))

    # (query, reference, gap incentive, expected aligned query)
    cases = [
        ('ATTA', 'ATTTA', [1, 0, 0, 0, 0, 0], 'ATT-A'),
        ('ATTA', 'ATTTA', [0, 1, 0, 0, 0, 0], 'A-TTA'),
        ('ATTA', 'ATTTA', [0, 0, 1, 0, 0, 0], 'AT-TA'),
        ('ATTA', 'ATTTA', [0, 0, 0, 1, 0, 0], 'ATT-A'),
        ('ATTA', 'ATTTA', [0, 0, 0, 0, 1, 0], 'ATT-A'),
        ('ATTA', 'ATTTA', [0, 0, 0, 0, 0, 1], 'ATT-A'),
        ('TTTT', 'TTTTT', [1, 0, 0, 0, 0, 0], '-TTTT'),
        ('TTTT', 'TTTTT', [0, 1, 0, 0, 0, 0], 'T-TTT'),
        ('TTTT', 'TTTTT', [0, 0, 1, 0, 0, 0], 'TT-TT'),
        ('TTTT', 'TTTTT', [0, 0, 0, 1, 0, 0], 'TTT-T'),
        ('TTTT', 'TTTTT', [0, 0, 0, 0, 1, 0], 'TTTT-'),
        ('TTTT', 'TTTTT', [0, 0, 0, 0, 0, 1], 'TTTT-'),
    ]
    # four of the five aligned columns match
    expected_score = round(100 * 4 / 5.0, 3)
    for query, reference, incentive, expected_query in cases:
        aligned_query, aligned_ref, score = align(query, reference, incentive)
        assert aligned_query == expected_query
        assert aligned_ref == reference
        assert round(score, 3) == expected_score