Example #1
0
def computePieDataWithAmbig(data, label='', norm='I1 Total'):
    """Draw a pie chart of I1 insertion categories (ambiguity-aware).

    Merges per-oligo indel data into *data*, expresses each I1 read
    category as a mean percentage of the *norm* column, renders and saves
    the pie chart, and returns the computed percentages.

    Args:
        data: per-oligo summary table (must contain 'Total reads').
        label: suffix appended to the saved figure's filename.
        norm: column used as the denominator when computing percentages.

    Returns:
        Tuple of (pie_data dict, pie_labels list, median total reads).
    """
    joined = mergeWithIndelData(data)
    pie_labels = [
        'I1_Rpt Left Reads - NonAmb', 'Ambiguous Rpt Reads',
        'I1_Rpt Right Reads - NonAmb', 'I1_NonRpt Reads'
    ]
    labels = [
        'Repeated\nleft nucleotide', 'Ambiguous\n(Left = Right)',
        'Repeated\nright nucleotide', 'Non-repeated\nnucleotide'
    ]
    # Mean (over oligos) percentage of the normalising column per category.
    pie_data = {}
    for category in pie_labels:
        pie_data[category] = (joined[category] * 100.0 / joined[norm]).mean(axis=0)
    PL.figure(figsize=(3, 3))
    wedge_sizes = [pie_data[category] for category in pie_labels]
    PL.pie(wedge_sizes,
           labels=labels,
           autopct='%.1f',
           labeldistance=1.05,
           startangle=120.0,
           counterclock=False)
    PL.title('Single nucleotide insertions (I1)')
    PL.show(block=False)
    saveFig('ambig_pie_%s' % label)
    return pie_data, pie_labels, data['Total reads'].median()
예제 #2
0
def plotSumPie(all_result_outputs, label=''):
    """Pie chart of the average per-gRNA distribution of mutation classes.

    Sums per-oligo category counts across samples, converts each category
    to a percentage of total reads, dumps the per-oligo percentages to
    'data_dump_indel_pie.txt', and plots the mean percentages as a pie.

    Args:
        all_result_outputs: per-sample (data, dirname) results to merge.
        label: unused; kept for interface consistency with sibling plots.
    """
    mapping = {
        'Large D, No MH': 'D>=4,\nno MH',
        'Large D, MH': 'D>=4,\nMH',
        'Small D, No MH': 'D<4, no MH',
        'Small D, MH': 'D<4, MH'
    }
    merged_data = mergeSamples(all_result_outputs,
                               ['Total reads'] + ALL_LABELS,
                               data_label='perOligoCounts')
    # Per-oligo percentage of total reads for each category.
    for category in ALL_LABELS:
        perc = merged_data[category + ' Sum'] * 100.0 / merged_data['Total reads Sum']
        merged_data[category + ' Perc'] = perc
    dump_cols = ['Oligo Id'] + [category + ' Perc' for category in ALL_LABELS]
    merged_data.to_csv('data_dump_indel_pie.txt', sep='\t', columns=dump_cols)
    pie_vals = [merged_data[category + ' Perc'].mean() for category in ALL_LABELS]
    # Use the human-readable wedge label where one is defined.
    wedge_labels = [mapping.get(category, category) for category in ALL_LABELS]
    PL.figure(figsize=(4, 4))
    PL.pie(pie_vals,
           labels=wedge_labels,
           autopct='%.1f',
           labeldistance=1.05,
           startangle=90.0,
           counterclock=False,
           colors=COLORS)
    PL.title('Average distribution\n of mutations\n per gRNA')
    PL.show(block=False)
    saveFig('pie_chart_cats')
예제 #3
0
def plotDominantBars(all_result_outputs, label=''):
    """Bar chart: percent of gRNAs whose dominant indel is an I1 insertion,
    split by the nucleotide at guide position -4 (left of the cut site).

    A gRNA counts as "dominant I1" when the same indel is the most common
    one in all three replicates and that indel's type is 'I1'.
    """
    mci_merged_data = mergeSamples(all_result_outputs, [], data_label='i1IndelData')
    # Same most-common indel across all 3 replicates...
    same_mci = ((mci_merged_data['Most Common Indel'] == mci_merged_data['Most Common Indel 2'])
                & (mci_merged_data['Most Common Indel'] == mci_merged_data['Most Common Indel 3']))
    mci_merged_data['Equal MCI'] = same_mci
    # ...and that indel is a single-nucleotide insertion.
    mci_merged_data['Is Dominant I1'] = same_mci & (mci_merged_data['MCI Type'] == 'I1')

    oligo_data = pd.read_csv(getHighDataDir() + '/ST_June_2017/data/self_target_oligos_details_with_pam_details.csv',sep='\t')
    oligo_data['Oligo Id'] = oligo_data['ID'].apply(lambda oligo_id: oligo_id.replace('_',''))
    merged_mci_data = pd.merge(mci_merged_data, oligo_data[['Oligo Id','Guide']], how='inner',on='Oligo Id')

    nt_perc_i1, cnt_labels = [], []
    nts = 'ATGC'
    for nt in nts:
        # Guides whose -4 position (left of cut site) is this nucleotide.
        nt_data = merged_mci_data.loc[merged_mci_data['Guide'].apply(lambda guide: guide[-4] == nt)]
        num_dominant = sum(nt_data['Is Dominant I1'])
        nt_perc_i1.append(num_dominant * 100.0 / len(nt_data))
        cnt_labels.append('%d/%d' % (num_dominant, len(nt_data)))

    PL.figure()
    PL.bar(range(4), nt_perc_i1, width=0.8)
    for i, cnt in enumerate(cnt_labels):
        PL.text(i - 0.3, nt_perc_i1[i] + 5.0, cnt)
    PL.xticks(range(4), [x for x in nts])
    PL.xlabel('Nucleotide on Left of cut-site')
    PL.ylabel('Percent gRNAs with single nucleotide insertion\nas most common indel in all 3 replicates')
    PL.show(block=False)
    saveFig('I1_bar_3_rep')
예제 #4
0
def plotInFrame(overbeek_inframes, ours_inframes, oof_sel_overbeek_ids,
                pred_results_dir):
    """Scatter of percent in-frame mutations: synthetic replicates against
    each other, and synthetic vs endogenous (Overbeek) sites.

    Args:
        overbeek_inframes: endogenous in-frame percentages.
        ours_inframes: matching synthetic in-frame percentages.
        oof_sel_overbeek_ids: site ids, used to annotate large outliers.
        pred_results_dir: directory containing
            old_new_kl_predicted_summaries.txt.
    """
    PL.figure(figsize=(4.2, 4.2))
    summary = pd.read_csv(pred_results_dir +
                          '/old_new_kl_predicted_summaries.txt',
                          sep='\t').fillna(-1.0)
    col_x, col_y = 'New 2x800x In Frame Perc', 'New 1600x In Frame Perc'
    xvals, yvals = summary[col_x], summary[col_y]
    within_r = pearsonr(xvals, yvals)[0]
    PL.plot(xvals, yvals, '.',
            label='Synthetic between library (R=%.2f)' % within_r,
            color='C0', alpha=0.15)
    cross_r = pearsonr(overbeek_inframes, ours_inframes)[0]
    PL.plot(overbeek_inframes, ours_inframes, '^',
            label='Synthetic vs Endogenous (R=%.2f)' % cross_r,
            color='C1')
    # Annotate points disagreeing by more than 25 percentage points.
    for ob_x, our_y, site_id in zip(overbeek_inframes, ours_inframes,
                                    oof_sel_overbeek_ids):
        if abs(ob_x - our_y) > 25.0:
            PL.text(ob_x, our_y, site_id)
    PL.plot([0, 100], [0, 100], 'k--')  # identity line
    PL.ylabel('Percent In-Frame Mutations')
    PL.xlabel('Percent In-Frame Mutations')
    PL.legend()
    PL.xticks([], [])
    PL.yticks([], [])
    PL.show(block=False)
    saveFig('in_frame_full_scatter')
def plotInFrameCorr(data):
    """Measured vs predicted percent in-frame mutations, with the Shi et
    al. deep-seq frame-shift data overlaid as orange triangles.

    Args:
        data: table containing 'New In Frame Perc' and
            'Predicted In Frame Per' columns.
    """
    shi_data = pd.read_csv(getHighDataDir() + '/shi_deepseq_frame_shifts.txt',
                           sep='\t')

    label1, label2 = 'New In Frame Perc', 'Predicted In Frame Per'
    PL.figure(figsize=(4, 4))

    measured, predicted = data[label1], data[label2]
    PL.plot(measured, predicted, '.', alpha=0.15)
    PL.plot(shi_data['Measured Frame Shift'],
            shi_data['Predicted Frame Shift'],
            '^',
            color='orange')
    # Label Shi points where the prediction undershoots by more than 10.
    shi_triples = zip(shi_data['Measured Frame Shift'],
                      shi_data['Predicted Frame Shift'], shi_data['ID'])
    for meas, pred, site_id in shi_triples:
        if meas - pred > 10:
            PL.text(meas, pred, site_id.split('/')[1][:-21])
    PL.plot([0, 100], [0, 100], 'k--')  # identity line
    PL.title('R=%.3f' % (pearsonr(measured, predicted)[0]))
    PL.xlabel('percent in frame mutations (measured)')
    PL.ylabel('percent in frame mutations (predicted)')
    PL.ylim((0, 80))
    PL.xlim((0, 80))
    PL.show(block=False)
    saveFig('in_frame_corr_%s_%s' %
            (label1.replace(' ', '_'), label2.replace(' ', '_')))
예제 #6
0
def plotMCIPie(all_result_outputs, label=''):
    """Pie chart of the most-frequent mutation class per gRNA.

    Counts only gRNAs whose most common indel agrees across all three
    replicates; the remainder are shown as a single 'Inconsistent between
    replicates' wedge.
    """
    merged = mergeSamples(all_result_outputs,
                          ['MCI Type', 'Most Common Indel'],
                          data_label='perOligoMCI')
    # gRNAs with an identical most-common indel in all three replicates.
    consistent = merged.loc[
        (merged['Most Common Indel'] == merged['Most Common Indel 2']) &
        (merged['Most Common Indel'] == merged['Most Common Indel 3'])]
    pie_vals = [len(consistent.loc[consistent['MCI Type'] == mci_type])
                for mci_type in ALL_LABELS]
    pie_labels = [mci_type for mci_type in ALL_LABELS]
    # Everything else disagrees between replicates.
    pie_vals.append(len(merged) - len(consistent))
    pie_labels.append('Inconsistent\nbetween\nreplicates')

    PL.figure(figsize=(4, 4))
    PL.pie(pie_vals,
           labels=pie_labels,
           autopct='%.1f',
           labeldistance=1.05,
           startangle=90.0,
           counterclock=False,
           colors=COLORS)
    PL.title('Most frequent\nmutation per gRNA')
    PL.show(block=False)
    saveFig('pie_chart_cats_dominant')
def plotKLBoxes(data):
    """Boxplots of selected KL-divergence columns, annotated with medians.

    Excludes 'Class KL', 'Old', 'Conventional' and 'Combined' comparison
    columns; boxes are drawn in reverse column order.
    """
    cols = [
        x for x in data.columns if 'KL' in x and 'Class KL' not in x
        and 'Old' not in x and 'Conventional' not in x and 'Combined' not in x
    ]
    cols.reverse()
    cols_label, max_kl = 'KL', 9
    PL.figure(figsize=(4, 5))

    # Diagnostic output: example oligos within a narrow KL band
    # (printed deliberately; retained from the original analysis).
    pt = data.loc[(data['Combined v Predicted KL'] > 0.75)
                  & (data['Combined v Predicted KL'] < 0.8) &
                  (data['Old v New KL'] > 0.75) & (data['Old v New KL'] < 0.8)]
    print(pt['Old Oligo Id'])

    box_values = [data[col] for col in cols]
    PL.boxplot(box_values,
               positions=range(len(cols)),
               patch_artist=True,
               boxprops=dict(facecolor='C2'),
               medianprops=dict(linewidth=2.5, color='C1'),
               showfliers=False)
    PL.xticks(range(len(cols)), [renameCol(x) for x in cols],
              rotation='vertical')
    # Write each column's median just above its box.
    for idx, col in enumerate(cols):
        med = np.median(data[col])
        PL.text(idx - 0.15, med + 0.02, '%.2f' % med)
    PL.ylabel(cols_label)
    PL.subplots_adjust(left=0.1, right=0.95, top=0.95, bottom=0.5)
    PL.show(block=False)
    saveFig('kl_compare_old_new_predicted_%s' % cols_label.replace(' ', ''))
예제 #8
0
def plotMergedPieDataWithAmbig(all_result_outputs, label='', norm='I1 Total'):
    """Pie chart of I1 insertion categories from cross-sample summed reads.

    Sums per-oligo category reads across samples, expresses each category
    as a mean percentage of the summed *norm* column, and plots the pie.

    Args:
        all_result_outputs: per-sample (data, dirname) results to merge.
        label: unused; kept for interface consistency with sibling plots.
        norm: column (pre-'Sum' suffix) used as the percentage denominator.
    """
    pie_labels = [
        'I1_Rpt Left Reads - NonAmb', 'Ambiguous Rpt Reads',
        'I1_Rpt Right Reads - NonAmb', 'I1_NonRpt Reads'
    ]
    merged_data = mergeSamples(all_result_outputs,
                               pie_labels + [norm],
                               data_label='i1IndelData',
                               merge_on=['Oligo Id'])
    labels = [
        'Repeated\nleft nucleotide', 'Ambiguous\n(Left = Right)',
        'Repeated\nright nucleotide', 'Non-repeated\nnucleotide'
    ]
    # Mean (across oligos) of each summed category as percent of norm.
    pie_data = {}
    for category in pie_labels:
        ratios = merged_data[category + ' Sum'] * 100.0 / merged_data[norm + ' Sum']
        pie_data[category] = ratios.mean(axis=0)
    PL.figure(figsize=(3, 3))
    PL.pie([pie_data[category] for category in pie_labels],
           labels=labels,
           autopct='%.1f',
           labeldistance=1.05,
           startangle=120.0,
           counterclock=False)
    PL.title('Single nucleotide insertions (I1)')
    PL.show(block=False)
    saveFig('ambig_pie')
예제 #9
0
def plotInFrameCorr(data):
    """Scatter of combined measured vs predicted percent in-frame
    mutations, titled with their Pearson correlation."""
    column_pairs = [('Combined in Frame Perc', 'Predicted In Frame Per')]
    for label1, label2 in column_pairs:
        PL.figure(figsize=(4, 4))
        measured = data[label1]
        predicted = data[label2]
        PL.plot(measured, predicted, '.')
        PL.plot([0, 100], [0, 100], 'k--')  # identity line
        PL.title('R=%.3f' % (pearsonr(measured, predicted)[0]))
        PL.xlabel(renameCol(label1))
        PL.ylabel(renameCol(label2))
        PL.show(block=False)
        saveFig('in_frame_corr_%s_%s' % (label1.replace(' ','_'),label2.replace(' ','_')))
예제 #10
0
def compareMHlines(all_result_outputs,
                   label='',
                   y_axis='Percent Non-Null Reads',
                   data_label='RegrLines'):
    """Overlay per-sample regression lines of MH-deletion percent vs MH
    distance for microhomology length 9, coloured by cell line and styled
    by sample coverage.

    Args:
        all_result_outputs: per-sample (results, dirname) pairs.
        label: unused; kept for interface consistency with sibling plots.
        y_axis: unused here; kept for interface consistency.
        data_label: key under which each sample stores its fitted lines.
    """
    # Cell line -> plot colour.
    color_map = {
        'K562': 'b',
        'K562_1600x': 'lightblue',
        'BOB': 'g',
        'RPE1': 'purple',
        'TREX2': 'k',
        'TREX2_2A': 'gray',
        'HAP1': 'r',
        'E14TG2A': 'orange',
        'eCAS9': 'c',
        'WT': 'pink',
        'CHO': 'salmon'
    }
    # Sample coverage -> line style.
    lysty_map = {2: '--', 3: '--', 5: '--', 7: '-', 10: '-.', 16: ':', 20: ':'}

    dirnames = [smpl[1] for smpl in all_result_outputs]
    lystys = [lysty_map[parseSampleName(dirname)[1]] for dirname in dirnames]
    clrs = [color_map[parseSampleName(dirname)[0]] for dirname in dirnames]

    for mh_len in [9]:
        PL.figure()
        regr_lines = [smpl[0][data_label][mh_len] for smpl in all_result_outputs]
        for dirname, regr_line, lysty, clr in zip(dirnames, regr_lines, lystys,
                                                  clrs):
            line_label = '%s (R=%.1f)' % (getSimpleName(dirname), regr_line[2])
            PL.plot(regr_line[0],
                    regr_line[1],
                    label=line_label,
                    linewidth=2,
                    color=clr,
                    linestyle=lysty,
                    alpha=0.5)
        PL.xlabel('Distance between nearest ends of microhomologous sequences',
                  fontsize=14)
        PL.ylabel(
            'Correspondng microhomology-mediated deletion\n as percent of total mutated reads',
            fontsize=14)
        PL.tick_params(labelsize=16)
        PL.legend(loc='upper right')
        PL.ylim((0, 70))
        PL.xlim((0, 20))
        PL.xticks(range(0, 21, 5))
        PL.title('Microhomology Length %d' % mh_len, fontsize=18)
        PL.show(block=False)
        saveFig('mh_regr_lines_all_samples__%d' % mh_len)
예제 #11
0
def plotDominantPieDataWithAmbig(all_result_outputs, label=''):
    """Pie chart of I1 repeat categories among gRNAs whose dominant indel
    (the most common indel in all 3 replicates) is a single-nucleotide
    insertion.

    Args:
        all_result_outputs: per-sample (data, dirname) results to merge.
        label: kept for interface consistency with sibling plot functions
            (not used in the saved figure name).
    """
    pie_labels = ['I1_Rpt Left Reads - NonAmb','Ambiguous Rpt Reads','I1_Rpt Right Reads - NonAmb','I1_NonRpt Reads']
    mci_merged_data = mergeSamples(all_result_outputs, [], data_label='i1IndelData')
    # Same most-common indel in all 3 replicates.
    mci_merged_data['Equal MCI'] = (mci_merged_data['Most Common Indel']==mci_merged_data['Most Common Indel 2']) & (mci_merged_data['Most Common Indel']==mci_merged_data['Most Common Indel 3'])
    # Restrict to gRNAs whose dominant indel is an I1 insertion.
    mci_common_i1 = mci_merged_data.loc[mci_merged_data['Equal MCI'] & (mci_merged_data['MCI Type'] == 'I1')]
    labels = ['Repeated\nleft nucleotide', 'Ambiguous\n(Left = Right)', 'Repeated\nright nucleotide', 'Non-repeated\nnucleotide']
    pie_data = []
    # BUG FIX: the loop variable was previously named 'label', silently
    # clobbering the 'label' parameter; renamed to 'pie_label'.
    for pie_label in pie_labels:
        # A gRNA falls in this category when all its MCI reads are of
        # this category (category read count equals MCI read count).
        is_rpt = lambda row: row['MCI Reads'] == row[pie_label]
        pie_data.append(sum(mci_common_i1.apply(is_rpt,axis=1))*100.0/len(mci_common_i1))
    PL.figure(figsize=(3,3))
    PL.pie(pie_data, labels=labels, autopct='%.1f', labeldistance=1.05, startangle=180.0, counterclock=False)
    PL.title('Dominant single nucleotide insertions (I1)\n%d from %d gRNAs' % (len(mci_common_i1), len(mci_merged_data)))
    PL.show(block=False)
    saveFig('I1_dom_pie_3_rep')
예제 #12
0
def plotPercScatterAnalysis(data, label='test', y_axis = 'Percent Non-Null Reads', plot_scatters=False, plot_regr_lines=False, scatter_mh_lens=[], mh_lens=[9]):
    """Fit (and optionally plot) a linear regression of *y_axis* against
    'MH Dist' for each microhomology length in *mh_lens*.

    Args:
        data: table with 'MH Len', 'MH Dist' and the *y_axis* column.
        label: used in plot titles and in saved-figure filenames.
        y_axis: column regressed against 'MH Dist'.
        plot_scatters: if True, draw a scatter+fit figure for each MH
            length also listed in *scatter_mh_lens*.
        plot_regr_lines: if True, draw all fitted lines (MH len <= 15)
            on a single figure.
        scatter_mh_lens: MH lengths to scatter-plot.
        mh_lens: MH lengths to fit.

    Returns:
        dict mapping MH length -> (x endpoints, predicted y at those
        endpoints, correlation coefficient).

    NOTE(review): the mutable default arguments (scatter_mh_lens=[],
    mh_lens=[9]) are shared across calls; safe only while callers never
    mutate them.
    """
    plot_dir = getPlotDir()
    regr_lines = {}
    for mh_len in mh_lens:
        mh_data = data.loc[data['MH Len'] == mh_len]
        # Restrict the fit to distances in [0, 30 - mh_len).
        mh_rdata = mh_data.loc[(mh_data['MH Dist'] >= 0) & (mh_data['MH Dist'] < (30-mh_len)) ]

        regr = linear_model.LinearRegression()
        rx, ry = mh_rdata[['MH Dist']], mh_rdata[[y_axis]] #np.log(mh_rdata[[y_axis]])
        regr.fit(rx, ry)
        # NOTE(review): pearsonr is given single-column DataFrames here;
        # relies on scipy accepting column-vector input — confirm with the
        # installed scipy version.
        corr = scipy.stats.pearsonr(rx, ry)
        # Positional [0] on the min()/max() Series extracts the scalar.
        min_x, max_x = rx.min()[0], rx.max()[0]
        x_pts = [min_x, max_x]
        # Store endpoints, predicted values at those endpoints, and r.
        regr_lines[mh_len] = (x_pts,[regr.predict(x)[0] for x in x_pts],corr[0])

        if plot_scatters and mh_len in scatter_mh_lens:
            fig = PL.figure(figsize=(5,5))
            PL.plot( mh_data['MH Dist'], mh_data[y_axis], '.', alpha=0.4 )
            PL.plot(regr_lines[mh_len][0],regr_lines[mh_len][1],'dodgerblue',linewidth=3)

            PL.xlabel('Distance between nearest ends of\nmicrohomologous sequences',fontsize=14)
            PL.ylabel('Percent of mutated reads of corresponding\nMH-mediated deletion',fontsize=14)
            PL.tick_params(labelsize=14)
            PL.xlim((0,20))
            PL.title('Microhomology of length %d (r=%.2f)' % (mh_len,corr[0]),fontsize=14)
            PL.show(block=False)
            saveFig('mh_scatter_len%d_%s' % (mh_len,label.split('/')[-1]))

    if plot_regr_lines:
        fig = PL.figure()
        output_data = {}
        for mh_len in mh_lens:
            fit_data = regr_lines[mh_len]
            # Lines for MH lengths above 15 are fitted but not drawn.
            if mh_len > 15:
                continue
            lsty = '--' if mh_len < 9 else '-'
            PL.plot(fit_data[0], fit_data[1], linewidth=2, linestyle=lsty, label='MH length %d (R=%.1f)' % (mh_len, fit_data[2]))
        PL.title(label,fontsize=18)
        PL.xlabel('Distance between nearest ends of\nmicrohomologous sequences',fontsize=14)
        PL.ylabel('Percent of mutated reads of corresponding\nMH-mediated deletion',fontsize=14)

        PL.tick_params(labelsize=18)
        PL.legend()
        PL.ylim((0,100))
        PL.show(block=False)
        saveFig(plot_dir + '/mh_scatter_all_len_%s' % label.split('/')[-1])
    return regr_lines
예제 #13
0
def plotMergedI1Repeats(all_result_outputs, label=''):
    """Bar chart of mean percent of reads that are I1 left-nucleotide
    repeats (summed across samples), split by that nucleotide.
    """
    merged_data = mergeSamples(all_result_outputs, ['I1_Rpt Left Reads - NonAmb','Total reads'], data_label='i1IndelData', merge_on=['Oligo Id','Repeat Nucleotide Left'])
    nts = ['A','T','G','C']
    nt_mean_percs = []
    for nt in nts:
        subset = merged_data.loc[merged_data['Repeat Nucleotide Left'] == nt]
        percs = subset['I1_Rpt Left Reads - NonAmb Sum']*100.0/subset['Total reads Sum']
        nt_mean_percs.append(percs.mean())
    PL.figure(figsize=(3,3))
    PL.bar(range(4), nt_mean_percs)
    # Annotate each bar with its value.
    for idx, perc in enumerate(nt_mean_percs):
        PL.text(idx-0.25, perc+0.8, '%.1f' % perc)
    PL.xticks(range(4), nts)
    PL.ylim((0,26))
    PL.xlabel('PAM distal nucleotide\nadjacent to the cut site')
    PL.ylabel('I1 repeated left nucleotide\nas percent of total mutated reads')
    PL.show(block=False)
    saveFig('i1_rtp_nt')
예제 #14
0
def i1RepeatNucleotides(data, label=''):
    """Bar chart of mean percent of reads that are I1 left-nucleotide
    repeats (single-sample variant), split by that nucleotide.
    """
    merged_data = mergeWithIndelData(data)
    nts = ['A','T','G','C']
    nt_mean_percs = []
    for nt in nts:
        subset = merged_data.loc[merged_data['Repeat Nucleotide Left'] == nt]
        percs = subset['I1_Rpt Left Reads - NonAmb']*100.0/subset['Total reads']
        nt_mean_percs.append(percs.mean())
    PL.figure(figsize=(3,3))
    PL.bar(range(4), nt_mean_percs)
    # Annotate each bar with its value.
    for idx, perc in enumerate(nt_mean_percs):
        PL.text(idx-0.25, perc+0.8, '%.1f' % perc)
    PL.xticks(range(4), nts)
    PL.ylim((0,26))
    PL.xlabel('PAM distal nucleotide\nadjacent to the cut site')
    PL.ylabel('I1 repeated left nucleotide\nas percent of total mutated reads')
    PL.show(block=False)
    saveFig('i1_rtp_nt_%s' % label)
예제 #15
0
def plotHeatMap(data, col='KL without null', label=''):
    """Heatmap of median pairwise KL divergence between samples.

    Selects all columns of *data* whose name contains *col*, takes each
    column's median, orders samples by cell line, fills a symmetric
    matrix of medians and renders it with imshow.

    Args:
        data: table with one pairwise-KL column per sample pair.
        col: substring identifying which KL columns to use.
        label: unused; kept for interface consistency with sibling plots.
    """
    #Compute and collate medians
    sel_cols = [x for x in data.columns if col in x]
    cmp_meds = data[sel_cols].median(axis=0)
    cell_lines = [
        'CHO', 'E14TG2A', 'BOB', 'RPE1', 'HAP1', 'K562', 'eCAS9', 'TREX2'
    ]
    # Order samples by their cell line's position in cell_lines.
    # (BUG FIX: a previous sortSampleNames(...) assignment here was dead
    # code — its result was immediately overwritten by this ordering.)
    sample_idxs = [(cell_lines.index(parseSampleName(x)[0]), x)
                   for x in getUniqueSamples(sel_cols)]
    sample_idxs.sort()
    samples = [x[1] for x in sample_idxs]

    N = len(samples)
    meds = np.zeros((N, N))
    for colname in sel_cols:
        dir1, dir2 = getDirsFromFilename(colname.split('$')[-1])
        idx1, idx2 = samples.index(dir1), samples.index(dir2)
        # The comparison is symmetric: store the median both ways.
        meds[idx1, idx2] = cmp_meds[colname]
        meds[idx2, idx1] = cmp_meds[colname]

    # Diagnostic console dump of the full matrix and column medians.
    for i in range(N):
        print(' '.join(['%.2f' % x for x in meds[i, :]]))
    print(np.median(meds[:, :-4], axis=0))

    #Display in Heatmap
    PL.figure(figsize=(5, 5))
    PL.imshow(meds, cmap='hot_r', vmin=0.0, vmax=3.0, interpolation='nearest')
    PL.colorbar()
    PL.xticks(range(N))
    PL.yticks(range(N))
    PL.title(
        "Median KL"
    )  # between %d mutational profiles (for %s with >%d mutated reads)" % (col, len(data), label, MIN_READS))
    ax1 = PL.gca()
    ax1.set_yticklabels([getSimpleName(x) for x in samples],
                        rotation='horizontal')
    ax1.set_xticklabels([getSimpleName(x) for x in samples],
                        rotation='vertical')
    PL.subplots_adjust(left=0.25, right=0.95, top=0.95, bottom=0.25)
    PL.show(block=False)
    saveFig('median_kl_heatmap_cell_lines')
예제 #16
0
def compareMHK562lines(all_result_outputs, label='', y_axis = 'Percent Non-Null Reads', data_label='RegrLines'):
    """Plot, for each MH length 3-15, the across-sample mean regression
    line of MH-deletion percent vs MH distance (K562 samples).

    Args:
        all_result_outputs: per-sample (results, dirname) pairs.
        label: unused; kept for interface consistency.
        y_axis: unused here; kept for interface consistency.
        data_label: key under which each sample stores its fitted lines.
    """
    dirnames = [x[1] for x in all_result_outputs]
    clrs = ['silver','grey','darkgreen','green','lightgreen','royalblue','dodgerblue','skyblue','mediumpurple','orchid','red','orange','salmon']

    fig = PL.figure(figsize=(6,6))
    leg_handles = []
    # One colour per MH length, 3 through 15 inclusive.
    for mh_len, clr in zip(range(3, 16), clrs):
        regr_lines = [x[0][data_label][mh_len] for x in all_result_outputs]
        # Average the (x, y) endpoints and the correlations across samples.
        mean_line = np.mean([x[:2] for x in regr_lines], axis=0)
        mean_r = np.mean([x[2] for x in regr_lines])
        handle = PL.plot(mean_line[0], mean_line[1],
                         label='MH Len=%d  (R=%.1f)' % (mh_len, mean_r),
                         linewidth=2, color=clr)[0]
        leg_handles.append(handle)
    PL.xlabel('Distance between nearest ends of\nmicrohomologous sequences',fontsize=16)
    PL.ylabel('Correspondng microhomology-mediated deletion\n as percent of total mutated reads',fontsize=16)
    PL.tick_params(labelsize=16)
    # Legend in reverse order so the longest MH appears first.
    PL.legend(handles=[x for x in reversed(leg_handles)], loc='upper right')
    PL.ylim((0,80))
    PL.xlim((0,20))
    PL.xticks(range(0,21,5))
    PL.show(block=False)
    saveFig('mh_regr_lines_K562')
예제 #17
0
def plotGCContent(all_result_outputs, label=''):
    """Boxplots of MH-deletion read percentages grouped by the GC content
    of the microhomologous sequence (MH length 9, distance 0-10).

    Merges indel-level data across samples (outer join on indel identity,
    summing read counts), then plots percent-of-mutated-reads against GC
    content, keeping only GC bins with more than 20 data points.

    Args:
        all_result_outputs: per-sample (results, dirname) pairs; each
            result holds a 'Data' table and fitted 'RegrLines'.
        label: unused; kept for interface consistency with sibling plots.
    """
    #Merge data across samples
    unique_cols = ['Oligo ID','Indel', 'GC Content', 'MH Len', 'MH Dist']
    datas = [x[0]['Data'][unique_cols + ['Indel Reads', 'Non-Null Reads']] for x in all_result_outputs]
    merged_data = datas[0]
    for i, data in enumerate(datas[1:]):
        merged_data = pd.merge(merged_data, data, on=unique_cols, suffixes=('','%d' % (i+2)), how='outer')
    # Column suffixes produced by the merges above: '', '2', '3', ...
    suffix = lambda i: '%d' % (i+1) if i > 0 else ''
    merged_data['Indel Reads Sum'] = merged_data[['Indel Reads' + suffix(i) for i in range(len(datas))]].sum(axis=1)
    merged_data['Non-Null Reads Sum'] = merged_data[['Non-Null Reads' + suffix(i) for i in range(len(datas))]].sum(axis=1)

    #Compute mean regression lines across samples for each MH length
    mean_lines = {}
    for mh_len in range(2,16):
        if mh_len not in all_result_outputs[0][0]['RegrLines']: continue
        regr_lines = [x[0]['RegrLines'][mh_len][:2] for x in all_result_outputs]
        mean_lines[mh_len] = np.mean(regr_lines, axis=0)

    #Restrict to only MH dist in (0,10) and adjust for mh len-dist relationship
    for mh_len in [9]:
        compute_resid = lambda row: row['Perc Reads']# - getRegrValue(row['MH Len'],row['MH Dist'],mean_lines)
        # BUG FIX: operate on an explicit copy — assigning new columns to a
        # .loc slice of merged_data is chained assignment (SettingWithCopy)
        # and is not guaranteed to take effect on the slice.
        sel_data = merged_data.loc[(merged_data['MH Len'] == mh_len) & (merged_data['MH Dist'] >= 0) & (merged_data['MH Dist'] <= 10)].copy()
        sel_data['Perc Reads'] = sel_data['Indel Reads Sum']*100.0/sel_data['Non-Null Reads Sum']
        sel_data['Perc Reads Residual'] = sel_data.apply(compute_resid, axis=1)
        PL.figure(figsize=(4,4))
        gcs = sel_data['GC Content'].unique(); gcs.sort()
        boxdata_lk = {gc: sel_data.loc[sel_data['GC Content'] == gc]['Perc Reads Residual'] for gc in gcs}
        gcs = [gc for gc in gcs if len(boxdata_lk[gc])>20]  #Limit to GC bins with more than 20 data points
        boxdata = [boxdata_lk[gc] for gc in gcs]
        print([len(x) for x in boxdata])
        PL.boxplot(boxdata)
        PL.ylabel('Percent total mutated reads of MH-mediated deletion')
        PL.xlabel('GC content of microhomologous sequence')
        PL.title('Microhomology of length %d\n(at max 10 distance)' % mh_len)
        PL.xticks(range(1,len(gcs)+1),gcs)
        PL.show(block=False)
        saveFig('gc_content_mh%d' % mh_len)
예제 #18
0
def runAnalysis():
    """Paired boxplots of KL divergences within vs between synthetic
    libraries (same gRNA / other gRNA / other gRNA repeat)."""
    data = pd.read_csv(getHighDataDir() + '/old_new_kl_summaries.txt',
                       sep='\t').fillna(-1.0)
    kl_cols = [
        x for x in data.columns
        if 'KL' in x and 'Class KL' not in x and 'Old v Old' not in x
    ]
    max_kl = 9
    PL.figure(figsize=(2.5, 4))
    bps = []
    box_types = [('C2', 'Within Library'), ('C0', 'Between Library')]
    for i, (clr, box_type) in enumerate(box_types):
        # Columns of this comparison type, interleaved with the other type.
        col_box_data = [data[col] for col in kl_cols
                        if renameCol(col) == box_type]
        pos = [2 * x + i + 1 for x in range(len(col_box_data))]
        print('KL', box_type, np.median(col_box_data, axis=1))
        bp = PL.boxplot(col_box_data,
                        positions=pos,
                        patch_artist=True,
                        boxprops=dict(facecolor=clr),
                        showfliers=False)
        bps.append(bp)
    PL.xticks([1.5, 3.5, 5.5],
              ['Same\ngRNA', 'Other\ngRNA', 'Other\ngRNA\n(Rpt)'])
    # Vertical separators between the three comparison groups.
    PL.plot([2.5, 2.5], [0, max_kl], '-', color='silver')
    PL.plot([4.5, 4.5], [0, max_kl], '-', color='silver')
    PL.xlim((0.5, 6.5))
    PL.ylim((0, max_kl))
    PL.ylabel('KL')
    PL.subplots_adjust(left=0.1, right=0.95, top=0.95, bottom=0.25)
    legend_names = [box_type for (_, box_type) in box_types]
    PL.legend([bp["boxes"][0] for bp in bps], legend_names,
              loc='upper left')
    PL.show(block=False)
    saveFig('kl_compare_old_new_KL')
def plotMicrohomologyMismatches(all_result_outputs, label=''):
    """Compare read percentages of MH-mediated deletions with and without
    sequence mismatches in the microhomology.

    Merges mismatch-comparison data across samples, normalises reads for
    reverse-PAM oligos (left/right categories swap), then scatter-plots
    original-MH percentages against each mismatched-MH category with
    fitted regression lines, saving a combined and a single-panel figure.

    Args:
        all_result_outputs: per-sample (results, dirname) pairs to merge.
        label: unused; kept for interface consistency with sibling plots.
    """
    mut_hdrs =  ['Left Mut', 'Right Mut','Merged Mut1', 'Merged Mut2']
    cols_to_sum = [x + ' Indel Reads in Mut' for x in mut_hdrs] + ['Orig Indel Reads in Orig', 'Mut Non-Null Reads', 'Orig Non-Null Reads']
    common_cols = ['Oligo ID','Mapped Oligo Id','Num Mismatches','Orig MH','Left Mut-MH','Right Mut-MH','Merged Mut 1 MH','Merged Mut 2 MH','Orig Indel','Left Mut-MH Indel','Right Mut-MH Indel','Merge Mut 1 Indel','Merge Mut 2 Indel']
    data =  mergeSamples(all_result_outputs, cols_to_sum, merge_on=common_cols)

    # Extract left/right boundaries and MH size from the tokenized indel.
    getLeft = lambda indel: tokFullIndel(indel)[2]['L']
    getRight = lambda indel: tokFullIndel(indel)[2]['R']
    getMHSize = lambda indel: tokFullIndel(indel)[2]['C']

    oligo_data = pd.read_csv(getHighDataDir() + '/ST_June_2017/data/self_target_oligos_details_with_pam_details.csv', sep='\t')
    oligo_data['Guide is matched'] = oligo_data.apply(isMatched, axis=1)
    # Oligo ID -> True when its PAM direction is 'REVERSE'.
    reverse_lookup = {x: y == 'REVERSE' for (x,y) in zip(oligo_data['ID'],oligo_data['PAM Direction'])}
    is_reverse = lambda x: reverse_lookup[x]

    data = pd.merge(data, oligo_data[['ID','Guide is matched']], left_on='Oligo ID', right_on='ID', how='inner')

    # Keep only matched guides with a non-zero microhomology.
    data['MH Size'] = data['Orig Indel'].apply(getMHSize)
    data = data.loc[(data['MH Size'] != 0) & (data['Guide is matched'])]
    data['MH Left Loc'] = data['Orig Indel'].apply(getLeft) + data['MH Size']
    data['MH Right Loc'] = data['Orig Indel'].apply(getRight) - data['MH Size']
    data['Is Reverse'] = data['Oligo ID'].apply(is_reverse)

    # For reverse-PAM oligos, the left and right mutation categories swap;
    # the boolean 'Is Reverse' column selects between the two via
    # arithmetic (b*x + (1-b)*y).
    for hdrL,hdrR in [mut_hdrs[:2], mut_hdrs[2:]]:
        data[hdrL + ' Reads'] = data['Is Reverse']*data[hdrR + ' Indel Reads in Mut Sum'] + (1- data['Is Reverse'])*data[hdrL + ' Indel Reads in Mut Sum']
        data[hdrR + ' Reads'] = data['Is Reverse']*data[hdrL + ' Indel Reads in Mut Sum'] + (1- data['Is Reverse'])*data[hdrR + ' Indel Reads in Mut Sum']
        data[hdrL + ' Reads Ratio'] =  data[hdrL + ' Reads']*100.0/data['Mut Non-Null Reads Sum']
        data[hdrR + ' Reads Ratio'] =  data[hdrR + ' Reads']*100.0/data['Mut Non-Null Reads Sum']
    data['Orig Indel Reads Ratio'] = data['Orig Indel Reads in Orig Sum']*100.0/data['Orig Non-Null Reads Sum']
    data['All Mut Reads Ratio'] = (data[[x + ' Reads' for x in mut_hdrs]].sum(axis=1))*100.0/data['Mut Non-Null Reads Sum']
    data['MH Dist'] = data['MH Right Loc'] - data['MH Left Loc']
    data['1st Mismatch'] = data.apply(getMismatch, axis=1)
    data['Last Mismatch'] = data.apply(getLastMismatch, axis=1)
    data['MH GC Content'] = data.apply(getMhGC, axis=1)

    # (x, y) column pairs to scatter against each other.
    mh_indel_types = [('Orig Indel','Left Mut'), ('Orig Indel','Right Mut'), ('Orig Indel','All Mut'),('Left Mut','Right Mut') ]

    label_lookup = {'Orig Indel': 'Perc. mutated reads of corresponding microhomology-\nmediated deletion with no sequence mismatches',
                    'Left Mut': 'Perc. mutated reads of mismatched microhomology-\nmediated deletion with retained left sequence',
                    'Right Mut': 'Perc mutated reads of mismatched microhomology-\nmediated deletion with retained right sequence',
                    'All Mut': 'Perc mutated reads of mismatched microhomology-\nmediated deletion (All)'
        }

    # fig1 holds only the Orig-vs-All panel; fig_all holds all 4 panels.
    fig1 = PL.figure(figsize=(4,4))
    fig_all = PL.figure(figsize=(10,10))
    for i, (mh_typex, mh_typey) in enumerate(mh_indel_types):
        figs = [(fig_all, True), (fig1,False)] if i==2 else [(fig_all, True)]
        for fig, is_all in figs:
            PL.figure(fig.number)
            if is_all: PL.subplot(2,2,i+1)
            # One series per mismatch count (1 or 2 mismatches).
            for nm,clr  in zip([1,2],['royalblue','orange']):
                nm_data = data.loc[data['Num Mismatches'] == nm]

                sty, lsty = 'o', '-'
                # Restrict to mid-sized microhomologies (6-15 nt).
                sel_data = nm_data.loc[(nm_data['MH Size'] >= 6) & (nm_data['MH Size'] <= 15)]

                PL.plot(sel_data[mh_typex + ' Reads Ratio'], sel_data[mh_typey + ' Reads Ratio'], sty, color=clr, markersize=4, label='No. MH Mismatches=%d' % (nm))
                rx, ry, grad = getRegrLine(sel_data[[mh_typex + ' Reads Ratio']], sel_data[[mh_typey + ' Reads Ratio']])
                if not is_all: print(grad, nm, mh_typex, mh_typey)
                # No regression line on the Left-vs-Right panel.
                if i != 3: PL.plot(rx, ry, lsty, color=clr, linewidth=2)

            PL.xlabel(label_lookup[mh_typex])
            PL.ylabel(label_lookup[mh_typey])
            PL.xlim((0,80))
            PL.ylim((0,80))
            PL.plot([0,80],[0,80],'k--')
            PL.legend()
            PL.show(block=False)
    saveFig('mm_mismatch_all')
    PL.figure(fig1.number)
    saveFig('mm_mismatch_one')
예제 #20
0
File: view.py  Project: zhaijj/SelfTarget
def plotProfiles(profiles,
                 rep_reads,
                 pam_idxs,
                 reverses,
                 labels,
                 title='',
                 max_lines=10):
    """Render aligned mutation profiles for one or more samples.

    For the union of each profile's top-20 indels, draws the padded
    representative read per sample (letter by letter, mismatches
    highlighted) plus horizontal bars of per-indel read percentages, then
    saves the figure.

    Args:
        profiles: per-sample indel profiles (must be non-empty).
        rep_reads: per-sample dict of indel -> representative read.
        pam_idxs: per-sample PAM index within the read.
        reverses: per-sample flags; True reverse-complements the read.
        labels: per-sample labels (non-null read counts get appended).
        title: figure title, also used in the saved filename.
        max_lines: cap on total sequence lines across all profiles.

    Returns:
        The matplotlib figure.

    Raises:
        Exception: if *profiles* is empty.
    """
    if len(profiles) == 0: raise Exception('Empty list of profiles')

    colors = [
        FORECAST_GREEN, 'C0', 'C2', 'C2', 'C1', 'C1', 'C3', 'C3', 'C4', 'C4',
        'C5', 'C5', 'C6'
    ]

    # 'none' keeps text as selectable text in SVG output.
    PL.rcParams['svg.fonttype'] = 'none'
    ocounts = [getProfileCounts(p1) for p1 in profiles]
    # Re-key each sample's counts by indel for O(1) lookup.
    counts = [{
        indel: (cnt, indel, perc1a, perc1b)
        for (cnt, indel, perc1a, perc1b) in x
    } for x in ocounts]

    #Count total non-null reads for each sample (to report in labels)
    nonnull_reads = [
        sum([x[indel][0] for indel in x if indel != '-']) for x in counts
    ]
    labels = [
        '%s(%d Reads)' % (tit, nn) for (tit, nn) in zip(labels, nonnull_reads)
    ]

    #Fetch the indels to display as union of top N indels across profiles
    num_top = 20
    top_indels = [[y[1] for y in x[:num_top]] for x in ocounts]
    union_top_indels = set()
    for x in top_indels:
        union_top_indels = union_top_indels.union(set(x))

    # Ensure every displayed indel has an entry in every sample's counts.
    for indel in union_top_indels:
        for count in counts:
            if indel not in count:
                count[indel] = (0, indel, 0.0, 0.0)
    union_top_indels = [x for x in union_top_indels]
    indel_toks = [tokFullIndel(indel) for indel in union_top_indels]
    max_insert = max([0] + [toks[1] for toks in indel_toks if toks[0] == 'I'])

    #Order indels by decreasing average percentage across profiles
    top_av_percs = [(np.mean([x[indel][-1] for x in counts]), indel)
                    for indel in union_top_indels]
    top_av_percs.sort(reverse=True)
    # NOTE(review): true division — max_indels may be fractional under
    # Python 3; '//' may have been intended (Python 2 heritage). The float
    # only feeds comparisons and min(), so confirm before changing.
    max_indels = max_lines / len(profiles)

    #Figure out Trims
    null_reads = [
        x['-'] if '-' in x else [x[y[1]] for y in ocnt if y[1] in x][0]
        for x, ocnt in zip(rep_reads, ocounts)
    ]
    null_reads = [
        Bio.Seq.reverse_complement(x) if rev else x
        for x, rev in zip(null_reads, reverses)
    ]
    # Mirror the PAM index for reversed reads.
    pam_idxs = [
        len(x) - pam if rev else pam
        for x, pam, rev in zip(null_reads, pam_idxs, reverses)
    ]
    # Align all samples on the sample with the shortest null read.
    min_null, pam_idx = min([(len(null), pidx)
                             for (null, pidx) in zip(null_reads, pam_idxs)])
    Ls = [x - pam_idx for x in pam_idxs]
    Rs = [L + min_null - len(null) for (L, null) in zip(Ls, null_reads)]

    #Plot
    # Scale bars so the largest first-ranked percentage spans 10 units.
    scale_factor = 10.0 / max([x[1][3] for x in ocounts])
    fig = PL.figure(figsize=(9, 5 * len(labels)))
    fig.patch.set_visible(False)
    ax = PL.gca()
    ax.axis('off')
    N = min(len(union_top_indels), max_indels)
    line_height = 0.8
    min_xloc, max_xloc = MIN_X, MAX_X
    PL.ylim((0, (N + 1.0) * line_height))
    bar_ypos, bar_len = [[] for x in profiles], [[] for x in profiles]
    for i, (av_perc, indel) in enumerate(top_av_percs):
        if i > max_indels: break
        # NOTE(review): loop variable 'repr' shadows the builtin of the
        # same name within this loop body.
        for repr, cnts, rev, L1, R1, j in zip(rep_reads, counts, reverses, Ls,
                                              Rs, range(len(Rs))):
            (cnt1, indel1, perc1a, perc1b) = cnts[indel]
            if indel in repr:
                if R1 == 0: R1 = len(repr[indel])
                seq = Bio.Seq.reverse_complement(
                    repr[indel])[L1:R1] if rev else repr[indel][L1:R1]
                padded_seq, red_idxs, green_idxs = padReadForIndel(
                    seq, indel, pam_idx)
                min_xloc, max_xloc = plotSeqLetterwise(
                    padded_seq,
                    (N - i + (j + 0.3) * 1.0 / len(profiles)) * line_height,
                    pam_idx,
                    red_idxs=red_idxs,
                    green_idxs=green_idxs)
            if indel != '-':
                # Record bar geometry; bars are drawn after all sequences.
                bar_ypos[j].append(
                    (N - i + (j + 0.4) * 1.0 / len(profiles)) * line_height)
                bar_len[j].append(perc1b * scale_factor)
    hist_loc = max_xloc + 10
    for bar1_ypos, bar1_len, label1, clr in zip(bar_ypos, bar_len, labels,
                                                colors):
        PL.barh(bar1_ypos,
                bar1_len,
                height=0.8 * line_height / len(profiles),
                left=hist_loc,
                label=label1,
                color=clr)
        # Annotate each bar with its (un-scaled) percentage.
        for (ypos, blen) in zip(bar1_ypos, bar1_len):
            PL.text(hist_loc + blen + 1,
                    ypos - 0.5 / len(profiles) * line_height,
                    '%.1f%%' % (blen / scale_factor))
    xlims = (min_xloc - 10, MAX_X + 20 + (min_xloc - MIN_X))
    PL.xlim(xlims)
    # Row labels: 'Target:' for the null read, otherwise the indel id.
    for i, (av_perc, indel) in enumerate(top_av_percs):
        if i > max_indels: break
        if indel == '-':
            PL.text(xlims[0], (N - i + 0.4) * line_height,
                    'Target:',
                    fontweight='bold')
        else:
            PL.text(xlims[0], (N - i + 0.4) * line_height,
                    indel.split('_')[0],
                    fontweight='bold')
        PL.plot([min_xloc - 10, max_xloc + 10],
                [(N - i) * line_height, (N - i) * line_height], 'lightgrey')
    # Cut-site marker and frame lines around the bar region.
    PL.plot([0, 0], [0, (N + 1) * line_height], 'k--')
    PL.plot([min_xloc - 10, hist_loc], [N * line_height, N * line_height], 'k')
    PL.plot([hist_loc, hist_loc], [0, N * line_height], 'k')
    PL.xticks([])
    PL.yticks([])
    if len(labels) > 1: PL.legend(loc='upper right')
    PL.text(hist_loc, (N + 0.5) * line_height, title, fontweight='bold')
    PL.subplots_adjust(left=0.05, right=0.95, top=0.95, bottom=0.05)
    PL.show(block=False)
    PL.axis('off')
    saveFig('%s_%d' % (title.replace(' ', '_'), len(labels)), bbox=False)
    return fig
예제 #21
0
def plotD1(all_result_outputs, label=''):
    """Characterise single-nucleotide (D1) deletions that are reproducibly 'most common'.

    Produces two figures (saved via saveFig):
      1. A pie chart splitting D1 deletions by whether the deleted nucleotide
         was part of a repeated pair straddling the cut site (one wedge per
         nucleotide A/T/G/C, plus a 'non-repeat' wedge).
      2. A bar chart of the percentage of gRNAs whose most common indel (in
         all 3 replicates) is a D1 deletion, grouped by the nucleotide pair
         flanking the cut site (AA/TT/GG/CC vs 'Other').

    Args:
        all_result_outputs: per-sample result structures consumed by mergeSamples.
        label: unused here; kept for interface consistency with sibling plot functions.
    """
    mci_merged_data = mergeSamples(all_result_outputs, [],
                                   data_label='perOligoMCI')
    # True only when the identical indel is the most common one in all three replicates.
    mci_merged_data['Equal MCI'] = (
        mci_merged_data['Most Common Indel']
        == mci_merged_data['Most Common Indel 2']) & (
            mci_merged_data['Most Common Indel']
            == mci_merged_data['Most Common Indel 3'])
    mci_common = mci_merged_data.loc[mci_merged_data['Equal MCI']]
    pie_vals, pie_labels = [], []
    dmci_data = mci_common.loc[(
        mci_common['MCI Type'] == 'D1'
    )]  #Note: type check discards equally most common indels

    # True if the deletion removes sequence on both sides of the cut site
    # (tokFullIndel(...)[2] holds the 'L'/'R' boundary offsets of the indel).
    spans_cutsite = lambda indel: tokFullIndel(indel)[2][
        'L'] < -1 and tokFullIndel(indel)[2]['R'] > 0
    for nt in 'ATGC':
        # Altered sequence is a homopolymer run of this nucleotide (len >= 2),
        # i.e. the deleted base came from a repeated-nucleotide context.
        # (lambda captures nt late, but it is applied before nt changes.)
        is_mh = lambda alt_seq: len(alt_seq) >= 2 and alt_seq == (len(alt_seq)
                                                                  * nt)
        num_repeat_nt = len(dmci_data.loc[
            dmci_data['Altered Sequence'].apply(is_mh)
            & dmci_data['Most Common Indel'].apply(spans_cutsite)])
        pie_vals.append(num_repeat_nt * 100.0 / len(dmci_data))
        print(num_repeat_nt)
        pie_labels.append('Removal of %s\nfrom %s|%s' % (nt, nt, nt))
    # Complement of all four repeat categories above: not a homopolymer,
    # or the deletion does not span the cut site.
    is_non_repeat = lambda seq: len(seq) < 2 or seq != (seq[0] * len(seq))
    num_non_repeat = len(
        dmci_data.loc[dmci_data['Altered Sequence'].apply(is_non_repeat)
                      | ~dmci_data['Most Common Indel'].apply(spans_cutsite)])
    pie_vals.append(num_non_repeat * 100.0 / len(dmci_data))
    print(num_non_repeat)
    pie_labels.append('Removal from non-repeat')
    PL.figure(figsize=(4, 4))
    PL.pie(pie_vals,
           labels=pie_labels,
           autopct='%.1f',
           labeldistance=1.1,
           counterclock=False,
           colors=OLD_COLORS)
    PL.title(
        'Size 1 deletions that are\n"most common" for their gRNA in all 3 replicates\n(%d gRNAs from %d total)'
        % (len(dmci_data), len(mci_merged_data)))
    PL.show(block=False)
    saveFig('pie_chart_D1')

    # Attach guide sequences so cut-site-flanking nucleotides can be inspected.
    oligo_data = pd.read_csv(
        getHighDataDir() +
        '/ST_June_2017/data/self_target_oligos_details_with_pam_details.csv',
        sep='\t')
    remove_under = lambda x: x.replace('_', '')
    oligo_data['Oligo Id'] = oligo_data['ID'].apply(remove_under)
    merged_mci_data = pd.merge(mci_merged_data,
                               oligo_data[['Oligo Id', 'Guide']],
                               how='inner',
                               on='Oligo Id')
    print(len(merged_mci_data))

    nt_dbl_perc_d1, cnt_labels = [], []
    is_d1 = lambda indel: (indel.split('_')[0] == 'D1')
    # Guide[-4] and Guide[-3] are the nucleotides on either side of the cut
    # site (see the x-axis label of the bar chart below).
    non_dbl_nt = lambda row: row['Guide'][-4] != row['Guide'][-3]
    nts = 'ATGC'
    for nt in nts:
        # Both cut-site-flanking nucleotides equal nt (applied before nt changes).
        double_nt = lambda row: row['Guide'][-4:-2] == (nt + nt)
        dbl_data = merged_mci_data.loc[merged_mci_data.apply(double_nt,
                                                             axis=1)]
        num_dbl_d1 = sum(
            dbl_data['Most Common Indel'].apply(is_d1) & dbl_data['Equal MCI']
            & (dbl_data['Oligo Id'] != 'Oligo28137')
        )  #Oligo28137: Corner case where a guide has CT|T and loses the C
        nt_dbl_perc_d1.append(num_dbl_d1 * 100.0 / len(dbl_data))
        cnt_labels.append('%d/%d' % (num_dbl_d1, len(dbl_data)))
        print(len(dbl_data))
    non_dbl_data = merged_mci_data.loc[merged_mci_data.apply(non_dbl_nt,
                                                             axis=1)]
    print(len(non_dbl_data))
    num_non_dbl_d1 = sum(non_dbl_data['Most Common Indel'].apply(is_d1)
                         & non_dbl_data['Equal MCI'])
    nt_dbl_perc_d1.append(num_non_dbl_d1 * 100.0 / len(non_dbl_data))
    cnt_labels.append('%d/%d' % (num_non_dbl_d1, len(non_dbl_data)))

    PL.figure()
    PL.bar(range(5), nt_dbl_perc_d1, width=0.8)
    for i, cnt in enumerate(cnt_labels):
        PL.text(i - 0.3, nt_dbl_perc_d1[i] + 5.0, cnt)
    # Note: '%s' % x * 2 parses as ('%s' % x) * 2, yielding 'AA', 'TT', etc.
    PL.xticks(range(5), ['%s' % x * 2 for x in nts] + ['Other'])
    PL.ylim((0, 40))
    PL.xlabel('Nucleotides on either side of cut site')
    PL.ylabel(
        'Percent gRNAs with single nucleotide deletion\nas most common indel in all 3 replicates'
    )
    PL.show(block=False)
    saveFig('D1_bar_3_rep')
예제 #22
0
def plotD2(all_result_outputs, label=''):
    """Characterise size-2 (D2) deletions that are reproducibly 'most common'.

    Classifies each gRNA's D2 deletion (most common indel in all 3 replicates)
    into five microhomology-style categories based on the guide sequence around
    the cut site and the indel's left/right boundary offsets, then produces:
      1. A pie chart of the five categories.
      2. A 2x3 grid of pie charts breaking each category down by nucleotide(s).
      3. A 2x3 grid of bar charts giving, per nucleotide context, the percentage
         of all gRNAs with that context whose most common indel falls in the
         category.

    Category notation (X, Y, Z are nucleotides; '|' marks the cut site):
      'Y|XY->Y'   left-side repeat only,   'XY|X->X'  right-side repeat only,
      'XY|XY->XY' repeat on both sides,    'Z|XY->Z'  deletion ends at cut (R==0),
      'XY|Z->Z'   deletion starts at cut (L==-1).

    Args:
        all_result_outputs: per-sample result structures consumed by mergeSamples.
        label: unused here; kept for interface consistency with sibling plot functions.
    """

    #Merge replicates
    mci_merged_data = mergeSamples(all_result_outputs, [],
                                   data_label='perOligoMCI')
    # True only when the identical indel is the most common one in all three replicates.
    mci_merged_data['Equal MCI'] = (
        mci_merged_data['Most Common Indel']
        == mci_merged_data['Most Common Indel 2']) & (
            mci_merged_data['Most Common Indel']
            == mci_merged_data['Most Common Indel 3'])

    # Attach guide sequences so cut-site-flanking nucleotides can be inspected.
    oligo_data = pd.read_csv(
        getHighDataDir() +
        '/ST_June_2017/data/self_target_oligos_details_with_pam_details.csv',
        sep='\t')
    remove_under = lambda x: x.replace('_', '')
    oligo_data['Oligo Id'] = oligo_data['ID'].apply(remove_under)
    mci_merged_data_guides = pd.merge(mci_merged_data,
                                      oligo_data[['Oligo Id', 'Guide']],
                                      how='inner',
                                      on='Oligo Id')
    # NOTE(review): boolean mask is taken from the pre-merge frame
    # (mci_merged_data), not from mci_merged_data_guides — this relies on
    # index alignment surviving the inner merge; verify intent.
    mci_common = mci_merged_data_guides.loc[mci_merged_data['Equal MCI']]
    dmci_data = mci_common.loc[(
        mci_common['MCI Type'] == 'D2'
    )]  #Note: type check discards equally most common indels

    pie_vals, pie_labels = [], []
    # tokFullIndel(...)[2] holds the 'L'/'R' boundary offsets of the indel.
    # Guide[-4]/Guide[-3] flank the cut site, so e.g. Guide[-5]==Guide[-3]
    # means the dinucleotide left of the cut is repeated across the cut.
    is_left_rpt = lambda row: row['Guide'][-5] == row['Guide'][
        -3] and tokFullIndel(row['Most Common Indel'])[2][
            'R'] >= 1 and tokFullIndel(row['Most Common Indel'])[2]['L'] <= -3
    is_right_rpt = lambda row: row['Guide'][-4] == row['Guide'][
        -2] and tokFullIndel(row['Most Common Indel'])[2][
            'R'] >= 2 and tokFullIndel(row['Most Common Indel'])[2]['L'] <= -2
    is_left_only_rpt = lambda row: is_left_rpt(row) and not is_right_rpt(row)
    is_right_only_rpt = lambda row: is_right_rpt(row) and not is_left_rpt(row)
    is_both_rpt = lambda row: is_right_rpt(row) and is_left_rpt(row)

    lrpt_data = dmci_data.loc[dmci_data.apply(is_left_only_rpt, axis=1)]
    pie_labels.append('Y|XY->Y')
    pie_vals.append(len(lrpt_data))
    rrpt_data = dmci_data.loc[dmci_data.apply(is_right_only_rpt, axis=1)]
    pie_labels.append('XY|X->X')
    pie_vals.append(len(rrpt_data))
    rpt_data = dmci_data.loc[dmci_data.apply(is_both_rpt, axis=1)]
    pie_labels.append('XY|XY->XY')
    pie_vals.append(len(rpt_data))

    # Deletion entirely left of the cut site (right boundary at the cut).
    is_r0 = lambda row: tokFullIndel(row['Most Common Indel'])[2]['R'] == 0
    ro_data = dmci_data.loc[dmci_data.apply(is_r0, axis=1)]
    pie_labels.append('Z|XY->Z')
    pie_vals.append(len(ro_data))
    # Deletion entirely right of the cut site (left boundary at the cut).
    is_l1 = lambda row: tokFullIndel(row['Most Common Indel'])[2]['L'] == -1
    l1_data = dmci_data.loc[dmci_data.apply(is_l1, axis=1)]
    pie_labels.append('XY|Z->Z')
    pie_vals.append(len(l1_data))

    # Sanity check: the five categories must cover every D2 gRNA
    # (categories may overlap, but nothing may be left out).
    seen_ids = set(rpt_data['Oligo Id']).union(set(ro_data['Oligo Id'])).union(
        set(l1_data['Oligo Id'])).union(set(lrpt_data['Oligo Id'])).union(
            set(rrpt_data['Oligo Id']))
    is_unseen = lambda id: id not in seen_ids
    unseen_data = dmci_data.loc[dmci_data['Oligo Id'].apply(is_unseen)]
    print(unseen_data)
    assert (len(unseen_data) == 0)
    #pie_labels.append('Other')
    #pie_vals.append(len(unseen_data))

    #pie_labels = [x for x in dmci_data['Most Common Indel'].unique()]
    #pie_vals  = [len(dmci_data.loc[dmci_data['Most Common Indel']==indel]) for indel in pie_labels]
    PL.figure(figsize=(4, 4))
    PL.pie(pie_vals,
           labels=pie_labels,
           autopct='%.1f',
           labeldistance=1.1,
           counterclock=False,
           colors=COLORS)
    PL.title(
        'Size 2 deletions that are\n"most common" for their gRNA in all 3 replicates\n(%d gRNAs from %d total)'
        % (len(dmci_data), len(mci_merged_data)))
    PL.show(block=False)
    saveFig('pie_chart_D2_indel_cats')

    # --- Figure 2: per-category nucleotide breakdown (pie charts) ---
    PL.figure(figsize=(12, 8))

    #XY|XY->XY
    PL.subplot(2, 3, 1)
    pie_vals, pie_labels = [], []
    # All 16 dinucleotides (lambda captures mh_str late but is applied immediately).
    for mh_str in [y + x for x in 'ATGC' for y in 'ATGC']:
        pie_labels.append(mh_str)
        is_mh_str = lambda guide: guide[-5:-3] == mh_str
        pie_vals.append(len(rpt_data.loc[rpt_data['Guide'].apply(is_mh_str)]))
    for dnt, cnt in zip(pie_labels, pie_vals):
        print(dnt, cnt * 100 / sum(pie_vals))
    PL.pie(pie_vals,
           labels=pie_labels,
           autopct='%.1f',
           labeldistance=1.1,
           counterclock=False,
           colors=COLORS)
    PL.title('XY|XY->XY\n(%d gRNAs)' % len(rpt_data))
    PL.show(block=False)

    #__|
    PL.subplot(2, 3, 2)
    pie_vals, pie_labels = [], []
    for mh_str in [y + x for x in 'ATGC' for y in 'ATGC']:
        pie_labels.append(mh_str)
        is_mh_str = lambda guide: guide[-5:-3] == mh_str
        pie_vals.append(len(ro_data.loc[ro_data['Guide'].apply(is_mh_str)]))
    for dnt, cnt in zip(pie_labels, pie_vals):
        print(dnt, cnt * 100 / sum(pie_vals))
    PL.pie(pie_vals,
           labels=pie_labels,
           autopct='%.1f',
           labeldistance=1.1,
           counterclock=False,
           colors=COLORS)
    PL.title('XY| -> __|\n(%d gRNAs)' % len(ro_data))
    PL.show(block=False)

    #|__
    PL.subplot(2, 3, 3)
    pie_vals, pie_labels = [], []
    for mh_str in [y + x for x in 'ATGC' for y in 'ATGC']:
        pie_labels.append(mh_str)
        is_mh_str = lambda guide: guide[-3:-1] == mh_str
        pie_vals.append(len(l1_data.loc[l1_data['Guide'].apply(is_mh_str)]))
    for dnt, cnt in zip(pie_labels, pie_vals):
        print(dnt, cnt * 100 / sum(pie_vals))
    PL.pie(pie_vals,
           labels=pie_labels,
           autopct='%.1f',
           labeldistance=1.1,
           counterclock=False,
           colors=COLORS)
    PL.title('|XY -> |__\n(%d gRNAs)' % len(l1_data))
    PL.show(block=False)

    #XY|X->X
    PL.subplot(2, 3, 4)
    pie_vals, pie_labels = [], []
    for nt in 'ATGC':
        pie_labels.append('%sN|%s -> %s' % (nt, nt, nt))
        is_mh_str = lambda guide: guide[-5] == nt
        pie_vals.append(len(
            lrpt_data.loc[lrpt_data['Guide'].apply(is_mh_str)]))
    PL.pie(pie_vals,
           labels=pie_labels,
           autopct='%.1f',
           labeldistance=1.1,
           counterclock=False,
           colors=COLORS)
    PL.title('XY|X->X\n(%d gRNAs)' % len(lrpt_data))
    PL.show(block=False)

    #X|YX->X
    PL.subplot(2, 3, 5)
    pie_vals, pie_labels = [], []
    for nt in 'ATGC':
        pie_labels.append('%s|N%s -> %s' % (nt, nt, nt))
        is_mh_str = lambda guide: guide[-4] == nt
        pie_vals.append(len(
            rrpt_data.loc[rrpt_data['Guide'].apply(is_mh_str)]))
    PL.pie(pie_vals,
           labels=pie_labels,
           autopct='%.1f',
           labeldistance=1.1,
           counterclock=False,
           colors=COLORS)
    PL.title('X|YX->X\n(%d gRNAs)' % len(rrpt_data))
    PL.show(block=False)
    PL.subplots_adjust(left=0.05,
                       right=0.95,
                       top=0.9,
                       bottom=0.1,
                       hspace=0.3,
                       wspace=0.3)
    saveFig('D2_nts_per_cat')

    # --- Figure 3: per-context prevalence of each category (bar charts) ---
    PL.figure(figsize=(12, 8))

    #XY|XY->XY
    PL.subplot(2, 3, 1)
    bar_heights, bar_labels, d2_dnt_counts, dnt_counts = [], [], [], []
    for dnt in [y + x for x in 'ATGC' for y in 'ATGC']:
        # Guides whose dinucleotide is repeated on both sides of the cut.
        has_dnt = lambda guide: guide[-5:-3] == dnt and guide[-3:-1] == dnt
        dnt_data = mci_merged_data_guides.loc[
            mci_merged_data_guides['Guide'].apply(has_dnt)]
        dnt_counts.append(
            len(
                set(rpt_data['Oligo Id']).intersection(
                    set(dnt_data['Oligo Id']))))
        d2_dnt_counts.append(len(dnt_data))
        # Guard against empty context groups to avoid division by zero.
        bar_heights.append(dnt_counts[-1] * 100.0 /
                           d2_dnt_counts[-1] if d2_dnt_counts[-1] > 0 else 0)
        bar_labels.append(dnt)
        print(
            dnt, dnt_counts[-1] * 100.0 /
            d2_dnt_counts[-1] if d2_dnt_counts[-1] > 0 else 0)
    PL.bar(range(len(bar_labels)), bar_heights, width=0.8)
    for i, (hgt, d2cnt,
            cnt) in enumerate(zip(bar_heights, d2_dnt_counts, dnt_counts)):
        PL.text(i - 0.3, hgt + 15, '%d/%d' % (cnt, d2cnt), rotation='vertical')
    PL.xticks(range(len(bar_labels)), bar_labels, rotation='vertical')
    PL.ylim((0, 90))
    PL.xlabel('XY')
    PL.title('XY|XY->XY')
    PL.ylabel(
        'Percent gRNAs with XY|XY->XY deletion\nas most common indel in all 3 replicates'
    )
    PL.show(block=False)

    #__|
    PL.subplot(2, 3, 2)
    bar_heights, bar_labels, d2_dnt_counts, dnt_counts = [], [], [], []
    for dnt in [y + x for x in 'ATGC' for y in 'ATGC']:
        has_dnt = lambda guide: guide[-5:-3] == dnt
        dnt_data = mci_merged_data_guides.loc[
            mci_merged_data_guides['Guide'].apply(has_dnt)]
        dnt_counts.append(
            len(
                set(ro_data['Oligo Id']).intersection(set(
                    dnt_data['Oligo Id']))))
        d2_dnt_counts.append(len(dnt_data))
        bar_heights.append(dnt_counts[-1] * 100.0 /
                           d2_dnt_counts[-1] if d2_dnt_counts[-1] > 0 else 0)
        bar_labels.append(dnt)
        print(
            dnt, dnt_counts[-1] * 100.0 /
            d2_dnt_counts[-1] if d2_dnt_counts[-1] > 0 else 0)
    PL.bar(range(len(bar_labels)), bar_heights, width=0.8)
    for i, (hgt, d2cnt,
            cnt) in enumerate(zip(bar_heights, d2_dnt_counts, dnt_counts)):
        PL.text(i - 0.3,
                hgt + 1.5,
                '%d/%d' % (cnt, d2cnt),
                rotation='vertical')
    PL.xticks(range(len(bar_labels)), bar_labels, rotation='vertical')
    PL.ylim((0, 8))
    PL.xlabel('XY')
    PL.title('XY| -> __|')
    PL.ylabel(
        'Percent gRNAs with XY| -> __| deletion\nas most common indel in all 3 replicates'
    )
    PL.show(block=False)

    #|__
    PL.subplot(2, 3, 3)
    bar_heights, bar_labels, d2_dnt_counts, dnt_counts = [], [], [], []
    for dnt in [y + x for x in 'ATGC' for y in 'ATGC']:
        has_dnt = lambda guide: guide[-3:-1] == dnt
        dnt_data = mci_merged_data_guides.loc[
            mci_merged_data_guides['Guide'].apply(has_dnt)]
        dnt_counts.append(
            len(
                set(l1_data['Oligo Id']).intersection(set(
                    dnt_data['Oligo Id']))))
        d2_dnt_counts.append(len(dnt_data))
        bar_heights.append(dnt_counts[-1] * 100.0 /
                           d2_dnt_counts[-1] if d2_dnt_counts[-1] > 0 else 0)
        bar_labels.append(dnt)
        print(
            dnt, dnt_counts[-1] * 100.0 /
            d2_dnt_counts[-1] if d2_dnt_counts[-1] > 0 else 0)
    PL.bar(range(len(bar_labels)), bar_heights, width=0.8)
    for i, (hgt, d2cnt,
            cnt) in enumerate(zip(bar_heights, d2_dnt_counts, dnt_counts)):
        PL.text(i - 0.3,
                hgt + 1.5,
                '%d/%d' % (cnt, d2cnt),
                rotation='vertical')
    PL.xticks(range(len(bar_labels)), bar_labels, rotation='vertical')
    PL.ylim((0, 8))
    PL.xlabel('XY')
    PL.title('|XY -> |__')
    PL.ylabel(
        'Percent gRNAs with |XY -> |__ deletion\nas most common indel in all 3 replicates'
    )
    PL.show(block=False)

    #XY|X->X
    PL.subplot(2, 3, 4)
    bar_heights, bar_labels, d2_nt_counts, nt_counts = [], [], [], []
    for nt in 'ATGC':
        has_nt = lambda guide: guide[-3] == nt and guide[-5] == nt
        nt_data = mci_merged_data_guides.loc[
            mci_merged_data_guides['Guide'].apply(has_nt)]
        nt_counts.append(
            len(
                set(lrpt_data['Oligo Id']).intersection(
                    set(nt_data['Oligo Id']))))
        d2_nt_counts.append(len(nt_data))
        bar_heights.append(nt_counts[-1] * 100.0 /
                           d2_nt_counts[-1] if d2_nt_counts[-1] > 0 else 0)
        bar_labels.append(nt)
    PL.bar(range(len(bar_labels)), bar_heights, width=0.8)
    for i, (hgt, d2cnt,
            cnt) in enumerate(zip(bar_heights, d2_nt_counts, nt_counts)):
        PL.text(i - 0.3, hgt + 0.05, '%d/%d' % (cnt, d2cnt))
    PL.xticks(range(len(bar_labels)), bar_labels)
    PL.ylim((0, 5))
    PL.xlabel('X')
    PL.title('XY|X->X')
    PL.ylabel(
        'Percent gRNAs with XY|X->X deletion\nas most common indel in all 3 replicates'
    )
    PL.show(block=False)

    #X|YX->X
    PL.subplot(2, 3, 5)
    bar_heights, bar_labels, d2_nt_counts, nt_counts = [], [], [], []
    for nt in 'ATGC':
        has_nt = lambda guide: guide[-4] == nt and guide[-2] == nt
        nt_data = mci_merged_data_guides.loc[
            mci_merged_data_guides['Guide'].apply(has_nt)]
        nt_counts.append(
            len(
                set(rrpt_data['Oligo Id']).intersection(
                    set(nt_data['Oligo Id']))))
        d2_nt_counts.append(len(nt_data))
        bar_heights.append(nt_counts[-1] * 100.0 /
                           d2_nt_counts[-1] if d2_nt_counts[-1] > 0 else 0)
        bar_labels.append(nt)
    PL.bar(range(len(bar_labels)), bar_heights, width=0.8)
    for i, (hgt, d2cnt,
            cnt) in enumerate(zip(bar_heights, d2_nt_counts, nt_counts)):
        PL.text(i - 0.3, hgt + 0.05, '%d/%d' % (cnt, d2cnt))
    PL.xticks(range(len(bar_labels)), bar_labels)
    PL.ylim((0, 5))
    PL.xlabel('X')
    PL.title('X|YX->X')
    PL.ylabel(
        'Percent gRNAs with X|YX->X deletion\nas most common indel in all 3 replicates'
    )
    PL.show(block=False)

    PL.subplots_adjust(left=0.05,
                       right=0.95,
                       top=0.9,
                       bottom=0.1,
                       hspace=0.3,
                       wspace=0.3)
    saveFig('D2_nts_per_cat_bars')
예제 #23
0
def compareOverbeekProfiles(
        selected_overbeek_id=None,
        pred_results_dir='../indel_prediction/model_testing'):

    new_dirs = [
        'ST_June_2017/data/K562_800x_LV7A_DPI7/mapped_reads/Oligos_71',
        'ST_June_2017/data/K562_800x_LV7A_DPI10/mapped_reads/Oligos_71',
        'ST_June_2017/data/K562_800x_LV7B_DPI7/mapped_reads/Oligos_71',
        'ST_June_2017/data/K562_800x_LV7B_DPI10/mapped_reads/Oligos_71',
        'ST_June_2017/data/K562_1600x_LV7B_DPI5/mapped_reads/Oligos_71',
        'ST_Feb_2018/data/CAS9_12NA_1600X_DPI7/mapped_reads/Oligos_71'
    ]

    #Old Samples
    old_dirs = [
        'ST_June_2017/data/K562_1600x_6OA_DPI5/mapped_reads/Oligos_71',
        'ST_June_2017/data/K562_1600x_6OA_DPI7/mapped_reads/Oligos_71',
        'ST_April_2017/data/K562_800x_6OA_DPI3_Old7/mapped_reads/Oligos_71',
        'ST_April_2017/data/K562_800x_6OA_DPI7_Old8/mapped_reads/Oligos_71',
        'ST_April_2017/data/K562_800x_6OA_DPI10_Old9/mapped_reads/Oligos_71',
        'ST_April_2017/data/K562_800x_6OB_DPI3_Old10/mapped_reads/Oligos_71',
        'ST_April_2017/data/K562_800x_6OB_DPI7_Old11/mapped_reads/Oligos_71',
        'ST_April_2017/data/K562_800x_6OB_DPI10_Old12/mapped_reads/Oligos_71'
    ]
    remove_long_indels = False
    remove_wt, wt_thresh = True, 3.0
    mappings = loadMappings()

    all_overbeek_profiles, all_new_profiles, all_old_profiles, all_our_profiles, sel_overbeek_ids,oldnew_overbeek_ids, old_ids, new_ids = [],[],[],[], [],[],[],[]

    overbeek_inframes, ours_inframes, oof_sel_overbeek_ids = [], [], []

    kls, kls_old, kls_new, log_reads, overbeek_ids, above30_percentages, log_reads_new, log_reads_old, min_log_reads = [],[],[],[],[],[],[],[], []
    for idx in range(1, 97):

        overbeek_id = 'Overbeek%d' % idx
        if selected_overbeek_id is not None and selected_overbeek_id != overbeek_id:
            continue
        if overbeek_id not in mappings:
            continue

        overbeek_filename = getHighDataDir(
        ) + '/overbeek_fastq_files/' + overbeek_id + '_mappedindelsummary.txt'

        p1, p1_new, p1_old, o1, rep_reads1, rep_reads2 = {}, {}, {}, {}, {}, {}
        nreads2, nreads1, nreads_old, nreads_new, nnull_old, nnull_new, nnull1, nnull2 = 0, 0, 0, 0, 0, 0, 0, 0

        #Read the overbreek profile
        numread2, perc_accept2, num_null2 = readSummaryToProfile(
            overbeek_filename,
            o1,
            oligoid=overbeek_id,
            remove_long_indels=remove_long_indels,
            remove_wt=False)
        if selected_overbeek_id is not None:
            fetchRepresentativeCleanReads(
                getHighDataDir() + '/overbeek_fastq_files/' + overbeek_id +
                '_mappedindelprofiles.txt',
                rep_reads2,
                oligoid=overbeek_id)
            pam_loc2, pam_dir2 = getNullTargetPamDetails(
                getHighDataDir() + '/overbeek_control_fastq_files/' +
                overbeek_id + '_exptargets.txt',
                oligoid=overbeek_id)
        nreads2 += numread2
        nnull2 += num_null2

        if numread2 == 0: continue

        p1_new_reps, p1_old_reps = [{}, {}], [{}, {}]
        rr_new_reps, rr_old_reps = [{}, {}], [{}, {}]
        #Read all the new and old profiles
        pam_loc1, pam_dir1 = None, None
        for oligo_id, is_old in mappings[overbeek_id]:

            #Read all reads for all our K562 profiles
            oligo_idx = eval(oligo_id[5:])
            _, oligo_fileprefix = getFileForOligoIdx(oligo_idx, ext='')
            oligo_filename = oligo_fileprefix + '_mappedindelsummary.txt'
            read_filename = oligo_fileprefix + '_mappedindelprofiles.txt'
            exptarget_filename = oligo_fileprefix + '_exptargets.txt'
            if is_old:
                oligo_dirs, p1_old_new, null_oligo_dir = old_dirs, p1_old, 'ST_April_2017/data/NULL_Old/mapped_reads/Oligos_71'
                p1_reps, rr_reps = p1_old_reps, rr_old_reps
            else:
                oligo_dirs, p1_old_new, null_oligo_dir = new_dirs, p1_new, 'ST_April_2017/data/NULL_New/mapped_reads/Oligos_71'
                p1_reps, rr_reps = p1_new_reps, rr_new_reps

            for oligo_dir in [getHighDataDir() + '/' + x for x in oligo_dirs]:
                nr1, pa1, nn1 = readSummaryToProfile(
                    oligo_dir + '/' + oligo_filename,
                    p1_old_new,
                    oligoid=oligo_id,
                    remove_long_indels=remove_long_indels,
                    remove_wt=remove_wt,
                    wt_thresh=wt_thresh)
                numread1, perc_accept1, num_null1 = readSummaryToProfile(
                    oligo_dir + '/' + oligo_filename,
                    p1,
                    oligoid=oligo_id,
                    remove_long_indels=remove_long_indels,
                    remove_wt=remove_wt,
                    wt_thresh=wt_thresh)
                if 'DPI7' in oligo_dir:
                    rep_idx = 0 if '800x' in oligo_dir else 1
                    nr_rep, pa_rep, nn_rep = readSummaryToProfile(
                        oligo_dir + '/' + oligo_filename,
                        p1_reps[rep_idx],
                        oligoid=oligo_id,
                        remove_long_indels=remove_long_indels,
                        remove_wt=remove_wt,
                        wt_thresh=wt_thresh)
                if selected_overbeek_id is not None:
                    fetchRepresentativeCleanReads(oligo_dir + '/' +
                                                  read_filename,
                                                  rep_reads1,
                                                  oligoid=oligo_id)
                    if 'DPI7' in oligo_dir:
                        fetchRepresentativeCleanReads(oligo_dir + '/' +
                                                      read_filename,
                                                      rr_reps[rep_idx],
                                                      oligoid=oligo_id)
                    if pam_loc1 is None:
                        pam_loc1, pam_dir1 = getNullTargetPamDetails(
                            getHighDataDir() + '/' + null_oligo_dir + '/' +
                            exptarget_filename,
                            oligoid=oligo_id)
                if is_old:
                    nreads_old += numread1
                    nnull_old += num_null1
                else:
                    nreads_new += numread1
                    nnull_new += num_null1
                nreads1 += numread1
                nnull1 += num_null1

        kls.append(symmetricKL(p1, o1, True))
        kls_old.append(symmetricKL(p1_old, o1, True))
        kls_new.append(symmetricKL(p1_new, o1, True))

        log_reads.append(np.log10(nreads1 - nnull1 + 0.5))
        log_reads_old.append(np.log10(nreads_old - nnull_old + 0.5))
        log_reads_new.append(np.log10(nreads_new - nnull_new + 0.5))
        min_log_reads.append(min(log_reads_old[-1], log_reads_new[-1]))
        above30_percentages.append(computePercAbove30(o1))
        overbeek_ids.append(overbeek_id)

        if log_reads[-1] > 2.0:
            all_overbeek_profiles.append(o1)
            all_our_profiles.append(p1)
            sel_overbeek_ids.append(overbeek_id[8:])
            if above30_percentages[-1] < 50.0:
                oif, oof, _ = fetchIndelSizeCounts(o1)
                pif, pof, _ = fetchIndelSizeCounts(p1)
                overbeek_inframes.append(oif * 100.0 / (oif + oof))
                ours_inframes.append(pif * 100.0 / (pif + pof))
                oof_sel_overbeek_ids.append(overbeek_id)

        if min_log_reads[-1] > 2.0:
            all_new_profiles.append(p1_new)
            all_old_profiles.append(p1_old)
            oldnew_overbeek_ids.append(overbeek_id)
            old_ids.append(
                [id for id, is_old in mappings[overbeek_id] if is_old][0])
            new_ids.append(
                [id for id, is_old in mappings[overbeek_id] if not is_old][0])

        try:
            print(overbeek_id, [x for (x, y) in mappings[overbeek_id]],
                  kls[-1], nreads2, nreads1)
        except KeyError:
            print('Could not find', overbeek_id)
            print(mappings)

        if selected_overbeek_id is not None:
            title = '%s (KL=%.1f)' % (overbeek_id, kls[-1])
            labels = [
                'Conventional scaffold Rep A', 'Conventional scaffold  Rep B',
                'Improved scaffold Rep A', 'Improved scaffold  Rep B',
                'Endogenous Profile'
            ]
            plotProfiles([
                p1_old_reps[0], p1_old_reps[1], p1_new_reps[0], p1_new_reps[0],
                o1
            ], [
                rr_old_reps[0], rr_old_reps[1], rr_new_reps[0], rr_new_reps[1],
                rep_reads2
            ], [pam_loc1, pam_loc1, pam_loc1, pam_loc1, pam_loc2], [
                x == 'REVERSE'
                for x in [pam_dir1, pam_dir1, pam_dir1, pam_dir1, pam_dir2]
            ],
                         labels,
                         title=title)

    if selected_overbeek_id is None:

        plotInFrame(overbeek_inframes, ours_inframes, oof_sel_overbeek_ids,
                    pred_results_dir)

        i = 1
        PL.figure(figsize=(5.5, 5))
        for thr_l, thr_h in [(0.0, 10.0), (10.0, 20.0), (20.0, 50.0),
                             (50.0, 90.0), (90.0, 100.0)]:
            ydata = [
                kl for (kl, a30, id, reads) in zip(kls, above30_percentages,
                                                   overbeek_ids, log_reads)
                if a30 > thr_l and a30 <= thr_h
            ]
            xdata = [
                reads for (kl, a30, id, reads) in zip(kls, above30_percentages,
                                                      overbeek_ids, log_reads)
                if a30 > thr_l and a30 <= thr_h
            ]
            sel_ids = [
                id for (kl, a30, id, reads) in zip(kls, above30_percentages,
                                                   overbeek_ids, log_reads)
                if a30 > thr_l and a30 <= thr_h
            ]
            PL.plot(xdata,
                    ydata,
                    'o',
                    label='%d-%d%% Deletions > 30' % (thr_l, thr_h))
            for x, y, id in zip(xdata, ydata, sel_ids):
                if y > 3 and x > 2:
                    PL.text(x, y, id)
        PL.legend()
        PL.plot([0, 6], [0.77, 0.77], '--', color='grey')
        PL.text(0.1, 0.5, 'Median between our replicates', color='grey')
        PL.ylabel('Symmetric KL Divergence', fontsize=12)
        PL.xlabel('Log10 Mutated Reads', fontsize=12)
        PL.xlim((0, 5.5))
        PL.ylim((0, 8))
        PL.show(block=False)
        saveFig('scatter_KL')
        i += 1

        print('Median=', np.median(kls), 'Mean KL=', np.mean(kls))
        print(len(overbeek_ids))

        #Compute pairwise KL between overbeek and ours
        N = len(sel_overbeek_ids)
        kl_mat = np.zeros((N, N))
        for i, o1 in enumerate(all_overbeek_profiles):
            for j, p1 in enumerate(all_our_profiles):
                kl_mat[i, j] = symmetricKL(o1, p1)
        PL.figure(figsize=(8, 6))
        PL.imshow(kl_mat,
                  cmap='hot_r',
                  vmin=0.0,
                  vmax=3.0,
                  interpolation='nearest')
        PL.xticks(range(N), sel_overbeek_ids, rotation='vertical', fontsize=6)
        PL.yticks(range(N),
                  sel_overbeek_ids,
                  rotation='horizontal',
                  fontsize=6)
        PL.xlabel('Synthetic Measurement', fontsize=12)
        PL.ylabel('Endogenous Measurement', fontsize=12)
        PL.title('KL', fontsize=12)
        PL.colorbar()
        PL.show(block=False)
        saveFig('heatmap_KL')