def updatePam(indel, orig_pam_loc, pam_dir):
    """Return the PAM location shifted to account for an indel and its mutations.

    Args:
        indel: Indel identifier string; it is parsed with tokFullIndel.
        orig_pam_loc: PAM location before any adjustment.
        pam_dir: PAM direction. 'REVERSE' selects the reverse arithmetic
            below; any other value selects the forward arithmetic.

    Returns:
        The adjusted PAM location (orig_pam_loc when nothing applies).
    """
    pam_loc = orig_pam_loc
    itype, isize, details, muts = tokFullIndel(indel)
    if itype != '-':
        # Convert the indel's 'L'/'R' offsets (plus the 'C' offset) into
        # positions on the same axis as pam_loc.
        # presumably 'L'/'R' are relative to the cut site 3 before the PAM -- TODO confirm
        if pam_dir == 'REVERSE':
            left_pos = pam_loc + 2 - (details['R'] - 1) + details['C']
            right_pos = pam_loc + 2 - (details['L'] + 1) + details['C']
        else:
            left_pos = pam_loc - 3 + (details['L'] + 1) + details['C']
            right_pos = pam_loc - 3 + (details['R'] - 1) + details['C']
        # NOTE(review): right_pos is computed but never read below.

        # Net change in length: positive when sequence is lost, negative
        # when sequence is gained.
        if itype == 'D':
            delsize = isize - details['I']
        else:
            delsize = -isize + details['D']

        # Only an indel starting before the PAM moves it; the PAM is never
        # moved further left than the indel's left position.
        if left_pos < pam_loc:
            pam_loc = max(pam_loc - delsize, left_pos)

    for (muttype, mutpos, nucl) in muts:
        # NOTE(review): msize is assigned on 'D'/'I' entries but only
        # consumed on 'S' entries, so an 'S' entry uses the value left by an
        # earlier 'D'/'I' entry. If an 'S' entry before the PAM comes first,
        # msize is unbound and L32-style use raises NameError -- confirm the
        # entry order produced by tokFullIndel.
        if muttype == 'D':
            msize = mutpos
        if muttype == 'I':
            msize = -mutpos
        if muttype != 'S':
            continue
        if pam_dir == 'REVERSE':
            mutidx = pam_loc + 2 - mutpos
        else:
            mutidx = pam_loc - 3 + mutpos
        if mutidx < pam_loc:
            pam_loc = pam_loc - msize

    return pam_loc
示例#2
0
def writeMCISummary(fout, id, p1, stats1, oligo_det, more_indels=False):
    """Write one tab-separated summary row per selected indel of profile p1.

    Args:
        fout: Writable text stream receiving the rows.
        id: Identifier written in the first column.
        p1: Mapping from indel string to read count.
        stats1: Indexable stats; total reads is stats1[0] - stats1[2].
        oligo_det: 3-item sequence (pam_loc, pam_dir, seq), also passed on to
            getSequence.
        more_indels: When False only the indel returned by getHighestIndel
            is written; when True every non-'-' indel from getProfileCounts.
    """
    if more_indels:
        mcis = [entry[1] for entry in getProfileCounts(p1) if entry[1] != '-']
    else:
        mcis = [getHighestIndel(p1)]

    row_fmt = u'%s\t%s\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%s\t%s\n'
    for mci in mcis:
        mci_reads = p1[mci]
        total_reads = stats1[0] - stats1[2]
        itype, isize, details, muts = tokFullIndel(mci)
        pam_loc, pam_dir, seq = oligo_det

        mh_seq = ''
        altered_seq = ''
        # Sequences are only reported for deletions without inserted bases.
        if itype == 'D' and details.get('I', 0) == 0:
            first_removed = details['L'] + 1
            last_removed = details['R'] - 1
            mh_len = details['C']
            if mh_len > 0:
                left_c_seq = getSequence(oligo_det, first_removed,
                                         details['L'] + mh_len)
                right_c_seq = getSequence(oligo_det, details['R'] - mh_len,
                                          last_removed)
                # Only report the microhomology when both flanks agree.
                if left_c_seq == right_c_seq:
                    mh_seq = left_c_seq
            # Note includes MH seq at both ends
            altered_seq = getSequence(oligo_det, first_removed, last_removed)

        fout.write(row_fmt % (id, mci, details['L'], details['R'],
                              details['C'], itype, isize, mci_reads,
                              total_reads, mh_seq, altered_seq))
示例#3
0
def predictMutations(theta_file, target_seq, pam_idx, add_null=True):
    """Predict the indel profile for a target sequence from a stored model.

    Runs the external indel generator (INDELGENTARGET_EXE), computes features
    for the generated indels, and scores them with the thetas read from
    theta_file. Two temporary files are created in the working directory and
    are removed again even if one of the intermediate steps raises.

    Args:
        theta_file: Path handed to readTheta.
        target_seq: Target sequence string.
        pam_idx: PAM index within target_seq (pam_idx - 3 is passed to the
            feature calculation).
        add_null: When True a '-' (null) entry with 1000 reads is added to
            the predicted profile and the representative reads.

    Returns:
        Tuple (p_predict, rep_reads, in_frame_perc).

    Raises:
        Exception: If a feature column stored with the thetas is missing from
            the computed features.
        subprocess.CalledProcessError: If the indel generator fails.
    """
    theta, train_set, theta_feature_columns = readTheta(theta_file)

    left_trim = 0
    tmp_genindels_file = 'tmp_genindels_%s_%d.txt' % (target_seq, random.randint(0,100000))
    tmp_features_file = None
    try:
        #generate indels
        cmd = INDELGENTARGET_EXE + ' %s %d %s' % (target_seq, pam_idx, tmp_genindels_file)
        print(cmd)
        subprocess.check_call(cmd.split())
        rep_reads = fetchRepReads(tmp_genindels_file)
        if len(rep_reads) > 0:
            isize, smallest_indel = min([(tokFullIndel(x)[1], x) for x in rep_reads])
        else:
            isize, smallest_indel = 0, '-'
        if isize > 0:
            left_trim = target_seq.find(rep_reads[smallest_indel][:10])

        #compute features for all generated indels
        tmp_features_file = 'tmp_features_%s_%d.txt' % (target_seq, random.randint(0,100000))
        calculateFeaturesForGenIndelFile(tmp_genindels_file, target_seq, pam_idx-3, tmp_features_file)
        feature_data, feature_columns = readFeaturesData(tmp_features_file)
    finally:
        # Always clean up the temporary files, including on failure; a file
        # may not exist yet if the step that creates it was never reached.
        for tmp_file in (tmp_genindels_file, tmp_features_file):
            if tmp_file is not None and os.path.isfile(tmp_file):
                os.remove(tmp_file)

    if len(set(theta_feature_columns).difference(set(feature_columns))) != 0:
        raise Exception('Stored feature names associated with model thetas are not contained in those computed')

    # Extra computed features are dropped so the data matches the model.
    if len(set(theta_feature_columns).union(set(feature_columns))) != len(theta_feature_columns):
        feature_data = feature_data[['Indel'] + theta_feature_columns]
        feature_columns = theta_feature_columns

    #Predict the profile
    p_predict, _ = computePredictedProfile(feature_data, theta, theta_feature_columns)
    in_frame, out_frame, _ = fetchIndelSizeCounts(p_predict)
    # NOTE(review): raises ZeroDivisionError when both counts are zero.
    in_frame_perc = in_frame*100.0/(in_frame + out_frame)
    if add_null:
        p_predict['-'] = 1000
        rep_reads['-'] = target_seq[left_trim:]
    return p_predict, rep_reads, in_frame_perc
def compileMappedNull(file_prefix, read_lookup, pam_lookup, exp_oligo_lookup):
    """Summarise the mapped reads of a NULL sample into '<prefix>_nullsummary.txt'.

    Reads '<file_prefix>_mappings.txt' (tab separated), counts reads per oligo
    and indel, and writes for every oligo an '@@@<oligo id>' header followed
    by one line per indel: sequence, indel, updated PAM location, PAM
    direction and percentage of that oligo's reads. If the mappings file does
    not exist a message is printed and nothing is written.

    Args:
        file_prefix: Path prefix of the mappings/summary files.
        read_lookup: Mapping from '<oligo id>.<read id>' to read sequence.
        pam_lookup: Mapping from oligo id to (pam_loc, pam_dir).
        exp_oligo_lookup: Mapping from the last '/'-separated component of
            file_prefix to an iterable of (oligo_id, pam_loc, pam_dir, seq).
    """
    mappings_file = file_prefix + '_mappings.txt'
    if not os.path.isfile(mappings_file):
        print('Could not find file', mappings_file)
        return

    read_profiles, indel_seqs = {}, {}

    #Add 5 pseudo reads for the NULL indel for all oligos (in case poorly represented in the NULL measure)
    sample_name = file_prefix.split('/')[-1]
    if sample_name in exp_oligo_lookup:
        for (oligo_id, pam_loc, pam_dir, seq) in exp_oligo_lookup[sample_name]:
            read_profiles[oligo_id] = {'-': 5}
            indel_seqs[oligo_id] = {'-': seq}

    # Context managers guarantee the files are closed even if a lookup or
    # parse below raises.
    with io.open(mappings_file) as f:
        for toks in csv.reader(f, delimiter='\t'):
            oligo_id = toks[1].split('_')[0]
            read_id = oligo_id + '.' + toks[0].split()[0]
            oligo_profile = read_profiles.setdefault(oligo_id, {})
            oligo_seqs = indel_seqs.setdefault(oligo_id, {})
            seq = read_lookup[read_id]
            indel = toks[2] + '_' + toks[3]  #combine mutations with indels
            # Result unused; call kept so a malformed indel string still
            # raises here exactly as before.
            tokFullIndel(indel)
            if indel == '-_-':
                indel = '-'
            if indel not in oligo_profile:
                oligo_profile[indel] = 0
                # The first read seen for an indel is its representative.
                oligo_seqs[indel] = seq
            oligo_profile[indel] += 1

    with io.open(file_prefix + '_nullsummary.txt', 'w') as fout:
        for oligo_id in sorted(read_profiles):
            orig_pam_loc, pam_dir = pam_lookup[oligo_id]
            fout.write(u'@@@%s\n' % oligo_id)
            # Most frequent indel first (ties broken by indel string).
            indel_counts = sorted(
                [(count, indel)
                 for indel, count in read_profiles[oligo_id].items()],
                reverse=True)
            total_counts = sum(count for count, _ in indel_counts)
            for (count, indel) in indel_counts:
                seq = indel_seqs[oligo_id][indel]
                perc = count * 100.0 / total_counts
                pam_loc = updatePam(indel, orig_pam_loc, pam_dir)
                fout.write(u'%s\t%s\t%d\t%s\t%.3f\n' %
                           (seq, indel, pam_loc, pam_dir, perc))
示例#5
0
def fetchIndelSizeCounts(p1):
    """Tally in-frame/out-of-frame reads and reads per net indel size.

    Args:
        p1: Mapping from indel string to read count; the '-' entry is ignored.

    Returns:
        Tuple (inframe, outframe, size_counts) where size_counts maps 'I' and
        'D' to {net size: reads}; sizes 1..20 are always present.
    """
    size_counts = {
        kind: {size: 0 for size in range(1, 21)}
        for kind in ('I', 'D')
    }
    inframe = outframe = 0
    for indel in p1:
        if indel == '-':
            continue
        reads = p1[indel]
        itype, isize, details, muts = tokFullIndel(indel)
        net_isize = isize - details['I'] - details['D']
        # A net size that is a multiple of 3 counts as in frame.
        if net_isize % 3:
            outframe += reads
        else:
            inframe += reads
        per_type = size_counts[itype]
        per_type[net_isize] = per_type.get(net_isize, 0) + reads
    return inframe, outframe, size_counts
示例#6
0
def isAllowableOligoIndel(oligo_indel):
    """Return True if the oligo's own indel/mutations are acceptable.

    Args:
        oligo_indel: Indel string describing the oligo; parsed with
            tokFullIndel.

    Returns:
        False if any of the checks below fails, True otherwise.
    """
    itype, isize, details, muts = tokFullIndel(oligo_indel)

    #Exclude reads from oligos with any mutations in the guide or PAM sequence
    #(positions strictly between -20 and 6), or with more than 5 such mutations
    located_muts = [m for m in muts if m[0] not in ('N', 'I', 'D')]
    in_guide_or_pam = any(-20 < m[1] < 6 for m in located_muts)
    too_many_muts = len(located_muts) > 5

    #Only allow oligo indels if they're size 1 or 2 insertion/deletions outside the guide or PAM sequence
    oversized_mut_indel = any(m[1] > 2 for m in muts if m[0] in ('I', 'D'))
    bad_main_indel = False
    if oligo_indel[0] != '-':
        bad_main_indel = isize > 2 or (details['L'] < 6
                                       and details['R'] > -20)

    return not (in_guide_or_pam or too_many_muts or oversized_mut_indel
                or bad_main_indel)
示例#7
0
文件: view.py 项目: zhaijj/SelfTarget
def padReadForIndel(read_seq, indel, pam_idx):
    """Pad a read for display and return the index sets to highlight.

    Args:
        read_seq: Read sequence string.
        indel: Indel string; parsed with tokFullIndel.
        pam_idx: PAM index used as the reference for the indel offsets.

    Returns:
        Tuple (read_seq, red_idxs, green_idxs). For a deletion ('D') the read
        gets isize spaces inserted; for an insertion ('I') red_idxs covers
        isize positions. In both cases green_idxs covers the details['C']
        positions preceding that point. Any other type returns the read
        unchanged with two empty sets.
    """
    itype, isize, details, muts = tokFullIndel(indel)
    red_idxs, green_idxs = set(), set()
    if itype in ('D', 'I'):
        green_start = pam_idx - 3 + details['L'] + 1
        green_end = green_start + details['C']
        green_idxs = set(range(green_start, green_end))
        if itype == 'D':
            # Open a gap of isize blanks right after the green stretch.
            read_seq = read_seq[:green_end] + ' ' * isize + read_seq[green_end:]
        else:
            red_idxs = set(range(green_end, green_end + isize))
    return read_seq, red_idxs, green_idxs
示例#8
0
def readSummaryToProfile(filename,
                         profile,
                         oligoid=None,
                         noexclude=False,
                         remove_long_indels=False,
                         remove_wt=True,
                         wt_thresh=3.0):
    """Accumulate indel read counts from a summary file into *profile*.

    The file is tab separated; lines starting with '@@@' switch the current
    oligo id, all other lines are (indel, oligo indel, read count, ...). Only
    lines of the oligo equal to *oligoid* are used.

    Args:
        filename: Path of the summary file. A missing file yields (0, 0, 0).
        profile: Dict mapping indel -> reads; updated in place.
        oligoid: Oligo id whose lines are read.
        noexclude: When True no filtering is applied at all.
        remove_long_indels: When True indels with size > 30 are dropped.
        remove_wt: When True indels also present in the matching WT summary
            are only kept if their read fraction exceeds wt_thresh times the
            WT fraction.
        wt_thresh: Multiplier used for the WT comparison.

    Returns:
        Tuple (accepted reads, percent of reads accepted, null ('-') reads
        added to profile by this call).
    """
    if not os.path.isfile(filename):
        return 0, 0, 0

    dirname = '/'.join(filename.split('/')[:-3])
    filename_suffix = '/'.join(filename.split('/')[-3:])
    wt_p, wt_p_wfilter = {}, {}
    if 'WT' not in dirname and dirname != '' and not noexclude and remove_wt:
        wt_filename = getWTDir(dirname) + '/' + filename_suffix
        if not os.path.isfile(wt_filename):
            print('Warning: Could not find', wt_filename)
        else:
            # Unfiltered WT profile: used below to look up WT indel levels.
            readSummaryToProfile(wt_filename,
                                 wt_p,
                                 oligoid=oligoid,
                                 noexclude=True,
                                 remove_wt=False)
            # Filtered WT profile: only its acceptance percentage is used.
            _, wt_acc, _ = readSummaryToProfile(wt_filename,
                                                wt_p_wfilter,
                                                oligoid=oligoid,
                                                noexclude=False,
                                                remove_wt=False)
            if wt_acc < 10.0:
                #Need at least 10% acceptable reads in the wild type
                #(to remove oligos that are really messed up)
                return 0, 0, 0

    total, accepted = 0, 0
    if '-' not in profile:
        profile['-'] = 0
    orig_null = profile['-']
    curr_oligo_id = None
    wt_indels = []
    # The context manager closes the file even if a line fails to parse.
    with io.open(filename) as f:
        for toks in csv.reader(f, delimiter='\t'):
            if toks[0][:3] == '@@@':
                curr_oligo_id = toks[0][3:]
                continue
            if oligoid != curr_oligo_id:
                continue
            indel = toks[0]
            oligo_indel = toks[1]
            # NOTE(security): eval() executes whatever text is in the count
            # column, so summary files must come from a trusted source. Kept
            # for backward compatibility; consider int()/float() instead.
            num_reads = eval(toks[2])
            total += num_reads
            if not noexclude:
                if oligo_indel != '-':
                    if not isAllowableOligoIndel(oligo_indel):
                        continue
                #Only allow indels that span the cut site and which are
                #not present in the corresponding WT sample
                if indel != '-':
                    itype, isize, details, muts = tokFullIndel(indel)
                    if itype != '-' and (details['L'] > 5
                                         or details['R'] < -5):
                        continue
                    if remove_long_indels and isize > 30:
                        continue
                    if indel in wt_p and remove_wt:
                        #Check the levels of the indel in the WT sample,
                        #only include it if present at at least wt_thresh x that level (including NULLS)
                        # - will need to wait til we know total reads to do this
                        wt_indels.append((indel, num_reads))
                        continue
            if indel not in profile:
                profile[indel] = 0
            profile[indel] += num_reads
            accepted += num_reads

    # Deferred WT comparison, now that the total read count is known.
    wt_total = sum(wt_p.values())
    for indel, num_reads in wt_indels:
        if num_reads * 1.0 / total > wt_p[indel] * wt_thresh / wt_total:
            if indel not in profile:
                profile[indel] = 0
            profile[indel] += num_reads
            accepted += num_reads

    if total == 0:
        perc_accepted = 0.0
    else:
        perc_accepted = accepted * 100.0 / total
    return accepted, perc_accepted, profile['-'] - orig_null
def plotMicrohomologyMismatches(all_result_outputs, label=''):
    """Scatter-plot read percentages of mismatched vs. unmodified microhomology deletions.

    Merges the per-sample results, keeps rows whose original indel has a
    non-zero microhomology size ('C') and whose guide is matched, derives
    read-percentage columns, and draws a 2x2 grid of scatter plots (saved as
    'mm_mismatch_all') plus a single-panel copy of the 'Orig Indel' vs
    'All Mut' plot (saved as 'mm_mismatch_one').

    Args:
        all_result_outputs: Passed straight to mergeSamples.
        label: Not used in this function.
    """
    
    mut_hdrs =  ['Left Mut', 'Right Mut','Merged Mut1', 'Merged Mut2']
    cols_to_sum = [x + ' Indel Reads in Mut' for x in mut_hdrs] + ['Orig Indel Reads in Orig', 'Mut Non-Null Reads', 'Orig Non-Null Reads'] 
    common_cols = ['Oligo ID','Mapped Oligo Id','Num Mismatches','Orig MH','Left Mut-MH','Right Mut-MH','Merged Mut 1 MH','Merged Mut 2 MH','Orig Indel','Left Mut-MH Indel','Right Mut-MH Indel','Merge Mut 1 Indel','Merge Mut 2 Indel']
    # NOTE(review): columns named '<col> Sum' are read below; presumably
    # mergeSamples produces them for cols_to_sum -- confirm.
    data =  mergeSamples(all_result_outputs, cols_to_sum, merge_on=common_cols)

    # Accessors for the 'L', 'R' and 'C' entries of a parsed indel string.
    getLeft = lambda indel: tokFullIndel(indel)[2]['L']
    getRight = lambda indel: tokFullIndel(indel)[2]['R']
    getMHSize = lambda indel: tokFullIndel(indel)[2]['C']

    oligo_data = pd.read_csv(getHighDataDir() + '/ST_June_2017/data/self_target_oligos_details_with_pam_details.csv', sep='\t')
    oligo_data['Guide is matched'] = oligo_data.apply(isMatched, axis=1)
    # Oligo id -> True when its PAM direction is 'REVERSE'.
    reverse_lookup = {x: y == 'REVERSE' for (x,y) in zip(oligo_data['ID'],oligo_data['PAM Direction'])}
    is_reverse = lambda x: reverse_lookup[x]

    data = pd.merge(data, oligo_data[['ID','Guide is matched']], left_on='Oligo ID', right_on='ID', how='inner')

    data['MH Size'] = data['Orig Indel'].apply(getMHSize)
    data = data.loc[(data['MH Size'] != 0) & (data['Guide is matched'])]
    data['MH Left Loc'] = data['Orig Indel'].apply(getLeft) + data['MH Size']
    data['MH Right Loc'] = data['Orig Indel'].apply(getRight) - data['MH Size']
    data['Is Reverse'] = data['Oligo ID'].apply(is_reverse)

    # For reverse oligos the two columns of each header pair are swapped;
    # the boolean 'Is Reverse' column acts as a 0/1 selector.
    for hdrL,hdrR in [mut_hdrs[:2], mut_hdrs[2:]]:
        data[hdrL + ' Reads'] = data['Is Reverse']*data[hdrR + ' Indel Reads in Mut Sum'] + (1- data['Is Reverse'])*data[hdrL + ' Indel Reads in Mut Sum']
        data[hdrR + ' Reads'] = data['Is Reverse']*data[hdrL + ' Indel Reads in Mut Sum'] + (1- data['Is Reverse'])*data[hdrR + ' Indel Reads in Mut Sum']
        data[hdrL + ' Reads Ratio'] =  data[hdrL + ' Reads']*100.0/data['Mut Non-Null Reads Sum']
        data[hdrR + ' Reads Ratio'] =  data[hdrR + ' Reads']*100.0/data['Mut Non-Null Reads Sum']
    data['Orig Indel Reads Ratio'] = data['Orig Indel Reads in Orig Sum']*100.0/data['Orig Non-Null Reads Sum']
    data['All Mut Reads Ratio'] = (data[[x + ' Reads' for x in mut_hdrs]].sum(axis=1))*100.0/data['Mut Non-Null Reads Sum']
    data['MH Dist'] = data['MH Right Loc'] - data['MH Left Loc']
    data['1st Mismatch'] = data.apply(getMismatch, axis=1)
    data['Last Mismatch'] = data.apply(getLastMismatch, axis=1)
    data['MH GC Content'] = data.apply(getMhGC, axis=1)

    # (x, y) column prefixes of the four scatter plots.
    mh_indel_types = [('Orig Indel','Left Mut'), ('Orig Indel','Right Mut'), ('Orig Indel','All Mut'),('Left Mut','Right Mut') ]
 
    label_lookup = {'Orig Indel': 'Perc. mutated reads of corresponding microhomology-\nmediated deletion with no sequence mismatches',
                    'Left Mut': 'Perc. mutated reads of mismatched microhomology-\nmediated deletion with retained left sequence',
                    'Right Mut': 'Perc mutated reads of mismatched microhomology-\nmediated deletion with retained right sequence',
                    'All Mut': 'Perc mutated reads of mismatched microhomology-\nmediated deletion (All)'
        }

    fig1 = PL.figure(figsize=(4,4))
    fig_all = PL.figure(figsize=(10,10))
    for i, (mh_typex, mh_typey) in enumerate(mh_indel_types):
        # The third pair (i == 2) is drawn twice: in the grid and on fig1.
        figs = [(fig_all, True), (fig1,False)] if i==2 else [(fig_all, True)]
        for fig, is_all in figs:
            PL.figure(fig.number)
            if is_all: PL.subplot(2,2,i+1)
            # One series per number of mismatches (1 and 2).
            for nm,clr  in zip([1,2],['royalblue','orange']):
                nm_data = data.loc[data['Num Mismatches'] == nm]

                sty, lsty = 'o', '-'
                # Only microhomologies of size 6..15 (inclusive) are plotted.
                sel_data = nm_data.loc[(nm_data['MH Size'] >= 6) & (nm_data['MH Size'] <= 15)]

                PL.plot(sel_data[mh_typex + ' Reads Ratio'], sel_data[mh_typey + ' Reads Ratio'], sty, color=clr, markersize=4, label='No. MH Mismatches=%d' % (nm))
                rx, ry, grad = getRegrLine(sel_data[[mh_typex + ' Reads Ratio']], sel_data[[mh_typey + ' Reads Ratio']])
                if not is_all: print(grad, nm, mh_typex, mh_typey)
                # No regression line on the last ('Left Mut' vs 'Right Mut') plot.
                if i != 3: PL.plot(rx, ry, lsty, color=clr, linewidth=2)

            PL.xlabel(label_lookup[mh_typex])
            PL.ylabel(label_lookup[mh_typey])
            PL.xlim((0,80))
            PL.ylim((0,80))
            # Dashed y = x reference line.
            PL.plot([0,80],[0,80],'k--')
            PL.legend()
            PL.show(block=False)
    # The last figure made current in the loop is fig_all.
    # presumably saveFig saves the current figure -- TODO confirm
    saveFig('mm_mismatch_all')
    PL.figure(fig1.number)
    saveFig('mm_mismatch_one')
示例#10
0
def _plotD2SeqPie(subplot_idx, cat_data, keys, labels, key_matches, title,
                  print_percs):
    """Draw, in cell subplot_idx of a 2x3 grid, a pie of how many gRNAs in cat_data match each key."""
    PL.subplot(2, 3, subplot_idx)
    pie_vals = []
    for key in keys:
        has_key = lambda guide: key_matches(guide, key)
        pie_vals.append(len(cat_data.loc[cat_data['Guide'].apply(has_key)]))
    if print_percs:
        for lbl, cnt in zip(labels, pie_vals):
            print(lbl, cnt * 100 / sum(pie_vals))
    PL.pie(pie_vals,
           labels=labels,
           autopct='%.1f',
           labeldistance=1.1,
           counterclock=False,
           colors=COLORS)
    PL.title('%s\n(%d gRNAs)' % (title, len(cat_data)))
    PL.show(block=False)


def _plotD2SeqBars(subplot_idx, all_data, cat_data, keys, key_matches, title,
                   xlabel, ymax, text_offset, vertical, print_percs):
    """Draw, in cell subplot_idx of a 2x3 grid, the percent of gRNAs matching each key that are in cat_data."""
    PL.subplot(2, 3, subplot_idx)
    cat_ids = set(cat_data['Oligo Id'])
    bar_heights, key_totals, key_hits = [], [], []
    for key in keys:
        has_key = lambda guide: key_matches(guide, key)
        key_data = all_data.loc[all_data['Guide'].apply(has_key)]
        key_hits.append(len(cat_ids.intersection(set(key_data['Oligo Id']))))
        key_totals.append(len(key_data))
        perc = key_hits[-1] * 100.0 / key_totals[-1] if key_totals[-1] > 0 else 0
        bar_heights.append(perc)
        if print_percs:
            print(key, perc)
    PL.bar(range(len(keys)), bar_heights, width=0.8)
    # Rotation is only passed when requested, as in the original calls.
    rot_kwargs = {'rotation': 'vertical'} if vertical else {}
    for i, (hgt, total, hits) in enumerate(
            zip(bar_heights, key_totals, key_hits)):
        PL.text(i - 0.3, hgt + text_offset, '%d/%d' % (hits, total),
                **rot_kwargs)
    PL.xticks(range(len(keys)), keys, **rot_kwargs)
    PL.ylim((0, ymax))
    PL.xlabel(xlabel)
    PL.title(title)
    PL.ylabel(
        'Percent gRNAs with %s deletion\nas most common indel in all 3 replicates'
        % title)
    PL.show(block=False)


def plotD2(all_result_outputs, label=''):
    """Plot how size-2 deletions that are the most common indel are categorised.

    Uses gRNAs whose most common indel is identical in all three replicates
    and has 'MCI Type' == 'D2'. Produces three figures: a pie chart of the
    deletion categories ('pie_chart_D2_indel_cats'), per-category pies of the
    guide nucleotides involved ('D2_nts_per_cat'), and per-category bar
    charts of how often guides with given nucleotides show that category
    ('D2_nts_per_cat_bars').

    Args:
        all_result_outputs: Passed straight to mergeSamples.
        label: Not used in this function.

    Raises:
        AssertionError: If some D2 gRNA falls into none of the categories.
    """
    #Merge replicates
    mci_merged_data = mergeSamples(all_result_outputs, [],
                                   data_label='perOligoMCI')
    mci_merged_data['Equal MCI'] = (
        mci_merged_data['Most Common Indel']
        == mci_merged_data['Most Common Indel 2']) & (
            mci_merged_data['Most Common Indel']
            == mci_merged_data['Most Common Indel 3'])

    oligo_data = pd.read_csv(
        getHighDataDir() +
        '/ST_June_2017/data/self_target_oligos_details_with_pam_details.csv',
        sep='\t')
    oligo_data['Oligo Id'] = oligo_data['ID'].apply(
        lambda x: x.replace('_', ''))
    mci_merged_data_guides = pd.merge(mci_merged_data,
                                      oligo_data[['Oligo Id', 'Guide']],
                                      how='inner',
                                      on='Oligo Id')
    # Filter with the merged frame's own 'Equal MCI' column: the pre-merge
    # column has a different index and is misaligned whenever the inner merge
    # drops or reorders rows.
    mci_common = mci_merged_data_guides.loc[
        mci_merged_data_guides['Equal MCI']]
    dmci_data = mci_common.loc[(
        mci_common['MCI Type'] == 'D2'
    )]  #Note: type check discards equally most common indels

    def indel_details(row):
        return tokFullIndel(row['Most Common Indel'])[2]

    def is_left_rpt(row):
        guide = row['Guide']
        if guide[-5] != guide[-3]:
            return False
        details = indel_details(row)
        return details['R'] >= 1 and details['L'] <= -3

    def is_right_rpt(row):
        guide = row['Guide']
        if guide[-4] != guide[-2]:
            return False
        details = indel_details(row)
        return details['R'] >= 2 and details['L'] <= -2

    is_left_only_rpt = lambda row: is_left_rpt(row) and not is_right_rpt(row)
    is_right_only_rpt = lambda row: is_right_rpt(row) and not is_left_rpt(row)
    is_both_rpt = lambda row: is_right_rpt(row) and is_left_rpt(row)
    is_r0 = lambda row: indel_details(row)['R'] == 0
    is_l1 = lambda row: indel_details(row)['L'] == -1

    lrpt_data = dmci_data.loc[dmci_data.apply(is_left_only_rpt, axis=1)]
    rrpt_data = dmci_data.loc[dmci_data.apply(is_right_only_rpt, axis=1)]
    rpt_data = dmci_data.loc[dmci_data.apply(is_both_rpt, axis=1)]
    ro_data = dmci_data.loc[dmci_data.apply(is_r0, axis=1)]
    l1_data = dmci_data.loc[dmci_data.apply(is_l1, axis=1)]

    pie_labels = ['Y|XY->Y', 'XY|X->X', 'XY|XY->XY', 'Z|XY->Z', 'XY|Z->Z']
    pie_vals = [
        len(lrpt_data),
        len(rrpt_data),
        len(rpt_data),
        len(ro_data),
        len(l1_data)
    ]

    # Every D2 gRNA is expected to fall into at least one category.
    seen_ids = set(rpt_data['Oligo Id']).union(set(ro_data['Oligo Id'])).union(
        set(l1_data['Oligo Id'])).union(set(lrpt_data['Oligo Id'])).union(
            set(rrpt_data['Oligo Id']))
    is_unseen = lambda id: id not in seen_ids
    unseen_data = dmci_data.loc[dmci_data['Oligo Id'].apply(is_unseen)]
    print(unseen_data)
    assert (len(unseen_data) == 0)

    PL.figure(figsize=(4, 4))
    PL.pie(pie_vals,
           labels=pie_labels,
           autopct='%.1f',
           labeldistance=1.1,
           counterclock=False,
           colors=COLORS)
    PL.title(
        'Size 2 deletions that are\n"most common" for their gRNA in all 3 replicates\n(%d gRNAs from %d total)'
        % (len(dmci_data), len(mci_merged_data)))
    PL.show(block=False)
    saveFig('pie_chart_D2_indel_cats')

    dinucleotides = [y + x for x in 'ATGC' for y in 'ATGC']
    nucleotides = list('ATGC')

    # Per-category pies of the guide nucleotides involved.
    PL.figure(figsize=(12, 8))
    #XY|XY->XY
    _plotD2SeqPie(1, rpt_data, dinucleotides, dinucleotides,
                  lambda guide, dnt: guide[-5:-3] == dnt, 'XY|XY->XY', True)
    #__|
    _plotD2SeqPie(2, ro_data, dinucleotides, dinucleotides,
                  lambda guide, dnt: guide[-5:-3] == dnt, 'XY| -> __|', True)
    #|__
    _plotD2SeqPie(3, l1_data, dinucleotides, dinucleotides,
                  lambda guide, dnt: guide[-3:-1] == dnt, '|XY -> |__', True)
    #XY|X->X
    _plotD2SeqPie(4, lrpt_data, nucleotides,
                  ['%sN|%s -> %s' % (nt, nt, nt) for nt in nucleotides],
                  lambda guide, nt: guide[-5] == nt, 'XY|X->X', False)
    #X|YX->X
    _plotD2SeqPie(5, rrpt_data, nucleotides,
                  ['%s|N%s -> %s' % (nt, nt, nt) for nt in nucleotides],
                  lambda guide, nt: guide[-4] == nt, 'X|YX->X', False)
    PL.subplots_adjust(left=0.05,
                       right=0.95,
                       top=0.9,
                       bottom=0.1,
                       hspace=0.3,
                       wspace=0.3)
    saveFig('D2_nts_per_cat')

    # Per-category bars: share of guides with the given nucleotides whose
    # most common indel is in that category.
    PL.figure(figsize=(12, 8))
    #XY|XY->XY
    _plotD2SeqBars(
        1, mci_merged_data_guides, rpt_data, dinucleotides,
        lambda guide, dnt: guide[-5:-3] == dnt and guide[-3:-1] == dnt,
        'XY|XY->XY', 'XY', 90, 15, True, True)
    #__|
    _plotD2SeqBars(2, mci_merged_data_guides, ro_data, dinucleotides,
                   lambda guide, dnt: guide[-5:-3] == dnt, 'XY| -> __|', 'XY',
                   8, 1.5, True, True)
    #|__
    _plotD2SeqBars(3, mci_merged_data_guides, l1_data, dinucleotides,
                   lambda guide, dnt: guide[-3:-1] == dnt, '|XY -> |__', 'XY',
                   8, 1.5, True, True)
    #XY|X->X
    _plotD2SeqBars(4, mci_merged_data_guides, lrpt_data, nucleotides,
                   lambda guide, nt: guide[-3] == nt and guide[-5] == nt,
                   'XY|X->X', 'X', 5, 0.05, False, False)
    #X|YX->X
    _plotD2SeqBars(5, mci_merged_data_guides, rrpt_data, nucleotides,
                   lambda guide, nt: guide[-4] == nt and guide[-2] == nt,
                   'X|YX->X', 'X', 5, 0.05, False, False)
    PL.subplots_adjust(left=0.05,
                       right=0.95,
                       top=0.9,
                       bottom=0.1,
                       hspace=0.3,
                       wspace=0.3)
    saveFig('D2_nts_per_cat_bars')
示例#11
0
def _percentOf(count, total):
    """Return count as a percentage of total, or 0.0 when total is zero."""
    return count * 100.0 / total if total > 0 else 0.0


def _d1SpansCutsite(indel):
    """Return True when the indel's 'L' detail is below -1 and its 'R' detail is above 0."""
    details = tokFullIndel(indel)[2]  # tokenise once rather than once per boundary
    return details['L'] < -1 and details['R'] > 0


def _plotD1Pie(dmci_data, num_guides):
    """Draw and save the pie chart of reproducible size 1 deletions.

    dmci_data:  non-empty frame of rows whose 'MCI Type' is 'D1'; its
                'Altered Sequence' and 'Most Common Indel' columns are read.
    num_guides: total number of merged rows, reported in the title only.
    """
    num_d1 = len(dmci_data)
    altered_seqs = dmci_data['Altered Sequence']
    # This mask does not depend on the nucleotide, so compute it once.
    spans_cutsite = dmci_data['Most Common Indel'].apply(_d1SpansCutsite)

    pie_vals, pie_labels = [], []
    for nt in 'ATGC':
        # Altered sequence is a run (length >= 2) made only of this nucleotide.
        is_mh = lambda alt_seq: len(alt_seq) >= 2 and alt_seq == (len(alt_seq)
                                                                  * nt)
        num_repeat_nt = len(dmci_data.loc[altered_seqs.apply(is_mh)
                                          & spans_cutsite])
        pie_vals.append(_percentOf(num_repeat_nt, num_d1))
        print(num_repeat_nt)
        pie_labels.append('Removal of %s\nfrom %s|%s' % (nt, nt, nt))

    # Everything else: altered sequence is not a single-nucleotide run, or the
    # indel does not pass the cut-site test above.
    is_non_repeat = lambda seq: len(seq) < 2 or seq != (seq[0] * len(seq))
    num_non_repeat = len(dmci_data.loc[altered_seqs.apply(is_non_repeat)
                                       | ~spans_cutsite])
    pie_vals.append(_percentOf(num_non_repeat, num_d1))
    print(num_non_repeat)
    pie_labels.append('Removal from non-repeat')

    PL.figure(figsize=(4, 4))
    PL.pie(pie_vals,
           labels=pie_labels,
           autopct='%.1f',
           labeldistance=1.1,
           counterclock=False,
           colors=OLD_COLORS)
    PL.title(
        'Size 1 deletions that are\n"most common" for their gRNA in all 3 replicates\n(%d gRNAs from %d total)'
        % (num_d1, num_guides))
    PL.show(block=False)
    saveFig('pie_chart_D1')


def _plotD1Bars(merged_mci_data):
    """Draw and save the bar chart of D1 frequency per guide nucleotide pair.

    merged_mci_data: frame with 'Guide', 'Oligo Id', 'Most Common Indel' and
                     'Equal MCI' columns. Guides are grouped by the two
                     characters at positions -4 and -3 of the guide string.
    """
    nts = 'ATGC'
    is_d1 = lambda indel: (indel.split('_')[0] == 'D1')
    nt_dbl_perc_d1, cnt_labels = [], []

    for nt in nts:
        double_nt = lambda row: row['Guide'][-4:-2] == (nt + nt)
        dbl_data = merged_mci_data.loc[merged_mci_data.apply(double_nt,
                                                             axis=1)]
        num_dbl_d1 = sum(
            dbl_data['Most Common Indel'].apply(is_d1) & dbl_data['Equal MCI']
            & (dbl_data['Oligo Id'] != 'Oligo28137')
        )  #Oligo28137: Corner case where a guide has CT|T and loses the C
        # An empty category is reported as 0% (label '0/0') instead of
        # raising ZeroDivisionError.
        nt_dbl_perc_d1.append(_percentOf(num_dbl_d1, len(dbl_data)))
        cnt_labels.append('%d/%d' % (num_dbl_d1, len(dbl_data)))
        print(len(dbl_data))

    # Guides whose two characters at positions -4 and -3 differ.
    non_dbl_nt = lambda row: row['Guide'][-4] != row['Guide'][-3]
    non_dbl_data = merged_mci_data.loc[merged_mci_data.apply(non_dbl_nt,
                                                             axis=1)]
    print(len(non_dbl_data))
    num_non_dbl_d1 = sum(non_dbl_data['Most Common Indel'].apply(is_d1)
                         & non_dbl_data['Equal MCI'])
    nt_dbl_perc_d1.append(_percentOf(num_non_dbl_d1, len(non_dbl_data)))
    cnt_labels.append('%d/%d' % (num_non_dbl_d1, len(non_dbl_data)))

    num_bars = len(nt_dbl_perc_d1)  # four doubled nucleotides plus 'Other'
    PL.figure()
    PL.bar(range(num_bars), nt_dbl_perc_d1, width=0.8)
    for i, cnt in enumerate(cnt_labels):
        PL.text(i - 0.3, nt_dbl_perc_d1[i] + 5.0, cnt)
    PL.xticks(range(num_bars), [nt * 2 for nt in nts] + ['Other'])
    PL.ylim((0, 40))
    PL.xlabel('Nucleotides on either side of cut site')
    PL.ylabel(
        'Percent gRNAs with single nucleotide deletion\nas most common indel in all 3 replicates'
    )
    PL.show(block=False)
    saveFig('D1_bar_3_rep')


def plotD1(all_result_outputs, label=''):
    """Plot how often a size 1 deletion is the most common indel in all 3 replicates.

    Saves two figures through saveFig:
      * 'pie_chart_D1'  - reproducible D1 deletions split by which repeated
                          nucleotide was removed (skipped when there are none).
      * 'D1_bar_3_rep'  - percentage of guides with a reproducible D1
                          deletion, grouped by the guide's nucleotide pair.

    all_result_outputs: passed straight to mergeSamples with
                        data_label='perOligoMCI'.
    label:              accepted for interface compatibility; not used.
    """
    mci_merged_data = mergeSamples(all_result_outputs, [],
                                   data_label='perOligoMCI')
    # A guide is "reproducible" when all three replicates agree on the MCI.
    mci_merged_data['Equal MCI'] = (
        mci_merged_data['Most Common Indel']
        == mci_merged_data['Most Common Indel 2']) & (
            mci_merged_data['Most Common Indel']
            == mci_merged_data['Most Common Indel 3'])
    mci_common = mci_merged_data.loc[mci_merged_data['Equal MCI']]
    dmci_data = mci_common.loc[(
        mci_common['MCI Type'] == 'D1'
    )]  #Note: type check discards equally most common indels

    if len(dmci_data) > 0:
        _plotD1Pie(dmci_data, len(mci_merged_data))
    else:
        # Nothing to split into wedges; previously this case raised
        # ZeroDivisionError before any figure was produced.
        print('No size 1 deletions are most common in all 3 replicates; '
              'skipping pie chart')

    oligo_data = pd.read_csv(
        getHighDataDir() +
        '/ST_June_2017/data/self_target_oligos_details_with_pam_details.csv',
        sep='\t')
    # Oligo ids in the details file contain underscores; strip them so they
    # match the 'Oligo Id' column of the merged data.
    remove_under = lambda x: x.replace('_', '')
    oligo_data['Oligo Id'] = oligo_data['ID'].apply(remove_under)
    merged_mci_data = pd.merge(mci_merged_data,
                               oligo_data[['Oligo Id', 'Guide']],
                               how='inner',
                               on='Oligo Id')
    print(len(merged_mci_data))

    _plotD1Bars(merged_mci_data)
# Example #12
# 0
# File: view.py  Project: zhaijj/SelfTarget
def plotProfiles(profiles,
                 rep_reads,
                 pam_idxs,
                 reverses,
                 labels,
                 title='',
                 max_lines=10):
    """Plot the top indels of one or more profiles as reads plus a bar chart.

    Each displayed indel gets one row: the representative read of every
    profile is drawn letter by letter (via plotSeqLetterwise) and, to the
    right, a horizontal bar per profile shows that indel's percentage.

    profiles:  non-empty list of profiles; each is passed to getProfileCounts.
    rep_reads: per-profile mapping from indel to a representative read string.
    pam_idxs:  per-profile PAM index; mirrored to len(read) - pam for samples
               flagged in `reverses`.
    reverses:  per-profile flags; when truthy the reads are reverse
               complemented before display.
    labels:    per-profile legend labels; the non-null read count is appended.
    title:     text drawn above the bar chart, also used in the saved file name.
    max_lines: line budget; divided by the number of profiles to give the
               maximum indel row index that is drawn.

    Returns the figure created by PL.figure.
    Raises Exception if `profiles` is empty.
    """
    if len(profiles) == 0: raise Exception('Empty list of profiles')

    colors = [
        FORECAST_GREEN, 'C0', 'C2', 'C2', 'C1', 'C1', 'C3', 'C3', 'C4', 'C4',
        'C5', 'C5', 'C6'
    ]

    PL.rcParams['svg.fonttype'] = 'none'
    ocounts = [getProfileCounts(p1) for p1 in profiles]
    counts = [{
        indel: (cnt, indel, perc1a, perc1b)
        for (cnt, indel, perc1a, perc1b) in x
    } for x in ocounts]

    #Count total non-null reads for each sample (to report in labels)
    nonnull_reads = [
        sum(x[indel][0] for indel in x if indel != '-') for x in counts
    ]
    labels = [
        '%s(%d Reads)' % (tit, nn) for (tit, nn) in zip(labels, nonnull_reads)
    ]

    #Fetch the indels to display as union of top N indels across profiles
    num_top = 20
    union_top_indels = set()
    for ocnt in ocounts:
        union_top_indels.update(entry[1] for entry in ocnt[:num_top])

    # Give every profile an entry for every displayed indel so lookups below
    # never miss; absent indels count as zero reads / zero percent.
    for indel in union_top_indels:
        for count in counts:
            if indel not in count:
                count[indel] = (0, indel, 0.0, 0.0)
    union_top_indels = list(union_top_indels)

    #Order indels by decreasing average percentage across profiles
    top_av_percs = [(np.mean([x[indel][-1] for x in counts]), indel)
                    for indel in union_top_indels]
    top_av_percs.sort(reverse=True)
    # Floor division keeps the row count integral. True division (Python 3)
    # yields e.g. 3.33 for three profiles, which shifts every row by a
    # fraction of a line and leaves a partial band at the bottom of the plot.
    max_indels = max_lines // len(profiles)

    #Figure out Trims
    # Use the null ('-') read of each sample when present, otherwise fall back
    # to the first counted indel that has a representative read.
    null_reads = [
        x['-'] if '-' in x else [x[y[1]] for y in ocnt if y[1] in x][0]
        for x, ocnt in zip(rep_reads, ocounts)
    ]
    null_reads = [
        Bio.Seq.reverse_complement(x) if rev else x
        for x, rev in zip(null_reads, reverses)
    ]
    pam_idxs = [
        len(x) - pam if rev else pam
        for x, pam, rev in zip(null_reads, pam_idxs, reverses)
    ]
    # Align everything to the sample with the shortest null read.
    min_null, pam_idx = min([(len(null), pidx)
                             for (null, pidx) in zip(null_reads, pam_idxs)])
    Ls = [x - pam_idx for x in pam_idxs]
    Rs = [L + min_null - len(null) for (L, null) in zip(Ls, null_reads)]

    #Plot
    # NOTE(review): scales bars by field 3 of the second entry of each
    # profile's counts - presumably the largest non-null percentage; confirm
    # against getProfileCounts.
    scale_factor = 10.0 / max([x[1][3] for x in ocounts])
    fig = PL.figure(figsize=(9, 5 * len(labels)))
    fig.patch.set_visible(False)
    ax = PL.gca()
    ax.axis('off')
    N = min(len(union_top_indels), max_indels)
    line_height = 0.8
    min_xloc, max_xloc = MIN_X, MAX_X
    PL.ylim((0, (N + 1.0) * line_height))
    bar_ypos, bar_len = [[] for x in profiles], [[] for x in profiles]
    for i, (av_perc, indel) in enumerate(top_av_percs):
        if i > max_indels: break
        for j, (sample_reads, cnts, rev, L1,
                R1) in enumerate(zip(rep_reads, counts, reverses, Ls, Rs)):
            _, _, _, perc1b = cnts[indel]
            if indel in sample_reads:
                # A right trim of 0 means "no trim"; [L1:0] would be empty.
                if R1 == 0: R1 = len(sample_reads[indel])
                seq = Bio.Seq.reverse_complement(
                    sample_reads[indel]
                )[L1:R1] if rev else sample_reads[indel][L1:R1]
                padded_seq, red_idxs, green_idxs = padReadForIndel(
                    seq, indel, pam_idx)
                min_xloc, max_xloc = plotSeqLetterwise(
                    padded_seq,
                    (N - i + (j + 0.3) * 1.0 / len(profiles)) * line_height,
                    pam_idx,
                    red_idxs=red_idxs,
                    green_idxs=green_idxs)
            if indel != '-':
                bar_ypos[j].append(
                    (N - i + (j + 0.4) * 1.0 / len(profiles)) * line_height)
                bar_len[j].append(perc1b * scale_factor)
    hist_loc = max_xloc + 10
    for bar1_ypos, bar1_len, label1, clr in zip(bar_ypos, bar_len, labels,
                                                colors):
        PL.barh(bar1_ypos,
                bar1_len,
                height=0.8 * line_height / len(profiles),
                left=hist_loc,
                label=label1,
                color=clr)
        for (ypos, blen) in zip(bar1_ypos, bar1_len):
            # Undo the scaling so the printed value is the real percentage.
            PL.text(hist_loc + blen + 1,
                    ypos - 0.5 / len(profiles) * line_height,
                    '%.1f%%' % (blen / scale_factor))
    xlims = (min_xloc - 10, MAX_X + 20 + (min_xloc - MIN_X))
    PL.xlim(xlims)
    for i, (av_perc, indel) in enumerate(top_av_percs):
        if i > max_indels: break
        row_label = 'Target:' if indel == '-' else indel.split('_')[0]
        PL.text(xlims[0], (N - i + 0.4) * line_height,
                row_label,
                fontweight='bold')
        PL.plot([min_xloc - 10, max_xloc + 10],
                [(N - i) * line_height, (N - i) * line_height], 'lightgrey')
    PL.plot([0, 0], [0, (N + 1) * line_height], 'k--')
    PL.plot([min_xloc - 10, hist_loc], [N * line_height, N * line_height], 'k')
    PL.plot([hist_loc, hist_loc], [0, N * line_height], 'k')
    PL.xticks([])
    PL.yticks([])
    if len(labels) > 1: PL.legend(loc='upper right')
    PL.text(hist_loc, (N + 0.5) * line_height, title, fontweight='bold')
    PL.subplots_adjust(left=0.05, right=0.95, top=0.95, bottom=0.05)
    PL.show(block=False)
    PL.axis('off')
    saveFig('%s_%d' % (title.replace(' ', '_'), len(labels)), bbox=False)
    return fig