def plotDominantBars(all_result_outputs, label=''):
    """Bar-plot how often I1 is the dominant indel, split by guide nucleotide.

    Samples are merged with ``mergeSamples`` (data label ``'i1IndelData'``).
    A gRNA counts as "dominant I1" when its 'Most Common Indel' is identical
    in all three replicate columns and its 'MCI Type' is 'I1'.  Guides are
    grouped by ``guide[-4]`` (labelled on the plot as the nucleotide left of
    the cut site) and the percentage of dominant-I1 guides per group is drawn
    and saved as 'I1_bar_3_rep'.

    Args:
        all_result_outputs: passed straight through to ``mergeSamples``.
        label: unused; kept for a uniform plotting-function signature.
    """
    mci_merged_data = mergeSamples(all_result_outputs, [], data_label='i1IndelData')
    mci_merged_data['Equal MCI'] = (
        (mci_merged_data['Most Common Indel'] == mci_merged_data['Most Common Indel 2'])
        & (mci_merged_data['Most Common Indel'] == mci_merged_data['Most Common Indel 3']))
    mci_merged_data['Is Dominant I1'] = (
        mci_merged_data['Equal MCI'] & (mci_merged_data['MCI Type'] == 'I1'))

    # Attach the guide sequence; oligo ids in the details file carry an
    # underscore that the merged data does not.
    oligo_data = pd.read_csv(getHighDataDir() + '/ST_June_2017/data/self_target_oligos_details_with_pam_details.csv', sep='\t')
    oligo_data['Oligo Id'] = oligo_data['ID'].apply(lambda x: x.replace('_', ''))
    merged_mci_data = pd.merge(mci_merged_data, oligo_data[['Oligo Id', 'Guide']], how='inner', on='Oligo Id')

    nt_perc_i1, cnt_labels = [], []
    nts = 'ATGC'
    for nt in nts:
        # nt is bound as a default so the lambda does not depend on the loop variable.
        has_nt = merged_mci_data['Guide'].apply(lambda guide, nt=nt: guide[-4] == nt)
        nt_data = merged_mci_data.loc[has_nt]
        num_dominant, num_guides = sum(nt_data['Is Dominant I1']), len(nt_data)
        # Guard against a nucleotide with no guides (would divide by zero).
        nt_perc_i1.append(num_dominant * 100.0 / num_guides if num_guides > 0 else 0)
        cnt_labels.append('%d/%d' % (num_dominant, num_guides))

    PL.figure()
    PL.bar(range(4), nt_perc_i1, width=0.8)
    for i, cnt in enumerate(cnt_labels):
        PL.text(i - 0.3, nt_perc_i1[i] + 5.0, cnt)
    PL.xticks(range(4), list(nts))
    PL.xlabel('Nucleotide on Left of cut-site')
    PL.ylabel('Percent gRNAs with single nucleotide insertion\nas most common indel in all 3 replicates')
    PL.show(block=False)
    saveFig('I1_bar_3_rep')
def plotMergedPieDataWithAmbig(all_result_outputs, label='', norm='I1 Total'):
    """Pie chart of the mean share of each I1 read category.

    For each of the four I1 read-count columns, the per-oligo summed reads
    are expressed as a percentage of the summed ``norm`` column and averaged
    over all rows of the merged data.  The figure is saved as 'ambig_pie'.

    Args:
        all_result_outputs: passed straight through to ``mergeSamples``.
        label: unused.
        norm: name of the column used as the denominator.
    """
    read_cols = [
        'I1_Rpt Left Reads - NonAmb',
        'Ambiguous Rpt Reads',
        'I1_Rpt Right Reads - NonAmb',
        'I1_NonRpt Reads',
    ]
    wedge_names = [
        'Repeated\nleft nucleotide',
        'Ambiguous\n(Left = Right)',
        'Repeated\nright nucleotide',
        'Non-repeated\nnucleotide',
    ]
    merged = mergeSamples(all_result_outputs,
                          read_cols + [norm],
                          data_label='i1IndelData',
                          merge_on=['Oligo Id'])

    denominator = merged[norm + ' Sum']
    wedge_sizes = []
    for col in read_cols:
        perc_per_oligo = merged[col + ' Sum'] * 100.0 / denominator
        wedge_sizes.append(perc_per_oligo.mean(axis=0))

    PL.figure(figsize=(3, 3))
    PL.pie(wedge_sizes,
           labels=wedge_names,
           autopct='%.1f',
           labeldistance=1.05,
           startangle=120.0,
           counterclock=False)
    PL.title('Single nucleotide insertions (I1)')
    PL.show(block=False)
    saveFig('ambig_pie')
def plotMCIPie(all_result_outputs, label=''):
    """Pie chart of the most common indel type per gRNA.

    Rows whose 'Most Common Indel' agrees across all three replicate columns
    are counted per 'MCI Type' (one wedge per entry of ``ALL_LABELS``); all
    remaining rows form a final "inconsistent" wedge.  The figure is saved as
    'pie_chart_cats_dominant'.

    Args:
        all_result_outputs: passed straight through to ``mergeSamples``.
        label: unused.
    """
    merged = mergeSamples(all_result_outputs,
                          ['MCI Type', 'Most Common Indel'],
                          data_label='perOligoMCI')
    same_as_rep2 = merged['Most Common Indel'] == merged['Most Common Indel 2']
    same_as_rep3 = merged['Most Common Indel'] == merged['Most Common Indel 3']
    consistent = merged.loc[same_as_rep2 & same_as_rep3]

    wedge_sizes = [
        len(consistent.loc[consistent['MCI Type'] == mci_type])
        for mci_type in ALL_LABELS
    ]
    wedge_names = [mci_type for mci_type in ALL_LABELS]

    # Everything that did not agree across replicates goes in one last wedge.
    wedge_sizes.append(len(merged) - len(consistent))
    wedge_names.append('Inconsistent\nbetween\nreplicates')

    PL.figure(figsize=(4, 4))
    PL.pie(wedge_sizes,
           labels=wedge_names,
           autopct='%.1f',
           labeldistance=1.05,
           startangle=90.0,
           counterclock=False,
           colors=COLORS)
    PL.title('Most frequent\nmutation per gRNA')
    PL.show(block=False)
    saveFig('pie_chart_cats_dominant')
def plotSumPie(all_result_outputs, label=''):
    """Pie chart of the average per-gRNA distribution of mutation categories.

    Each category in ``ALL_LABELS`` is converted to a percentage of
    'Total reads Sum' per oligo; the percentages are dumped to
    'data_dump_indel_pie.txt' (tab separated) and their means are plotted.
    The figure is saved as 'pie_chart_cats'.

    Args:
        all_result_outputs: passed straight through to ``mergeSamples``.
        label: unused.
    """
    # Shorter wedge labels for the deletion categories; others keep their name.
    short_names = {
        'Large D, No MH': 'D>=4,\nno MH',
        'Large D, MH': 'D>=4,\nMH',
        'Small D, No MH': 'D<4, no MH',
        'Small D, MH': 'D<4, MH',
    }
    merged = mergeSamples(all_result_outputs,
                          ['Total reads'] + ALL_LABELS,
                          data_label='perOligoCounts')

    total_reads = merged['Total reads Sum']
    perc_cols = []
    for category in ALL_LABELS:
        perc_col = category + ' Perc'
        merged[perc_col] = merged[category + ' Sum'] * 100.0 / total_reads
        perc_cols.append(perc_col)

    merged.to_csv('data_dump_indel_pie.txt',
                  sep='\t',
                  columns=['Oligo Id'] + perc_cols)

    wedge_sizes = [merged[perc_col].mean() for perc_col in perc_cols]
    wedge_names = [short_names.get(category, category) for category in ALL_LABELS]

    PL.figure(figsize=(4, 4))
    PL.pie(wedge_sizes,
           labels=wedge_names,
           autopct='%.1f',
           labeldistance=1.05,
           startangle=90.0,
           counterclock=False,
           colors=COLORS)
    PL.title('Average distribution\n of mutations\n per gRNA')
    PL.show(block=False)
    saveFig('pie_chart_cats')
def plotDominantPieDataWithAmbig(all_result_outputs, label=''):
    """Pie chart of which I1 read category is the dominant insertion.

    Restricts the merged 'i1IndelData' to gRNAs whose 'Most Common Indel' is
    identical in all three replicate columns and whose 'MCI Type' is 'I1'.
    For each I1 read-count column, the wedge is the percentage of those gRNAs
    whose 'MCI Reads' equals that column.  The figure is saved as
    'I1_dom_pie_3_rep'.

    Args:
        all_result_outputs: passed straight through to ``mergeSamples``.
        label: unused.

    Raises:
        ValueError: if no gRNA has a replicate-consistent dominant I1, since
            the percentages would otherwise be undefined.
    """
    read_cols = ['I1_Rpt Left Reads - NonAmb', 'Ambiguous Rpt Reads', 'I1_Rpt Right Reads - NonAmb', 'I1_NonRpt Reads']
    wedge_labels = ['Repeated\nleft nucleotide', 'Ambiguous\n(Left = Right)', 'Repeated\nright nucleotide', 'Non-repeated\nnucleotide']

    mci_merged_data = mergeSamples(all_result_outputs, [], data_label='i1IndelData')
    mci_merged_data['Equal MCI'] = (
        (mci_merged_data['Most Common Indel'] == mci_merged_data['Most Common Indel 2'])
        & (mci_merged_data['Most Common Indel'] == mci_merged_data['Most Common Indel 3']))
    mci_common_i1 = mci_merged_data.loc[mci_merged_data['Equal MCI'] & (mci_merged_data['MCI Type'] == 'I1')]

    num_common = len(mci_common_i1)
    if num_common == 0:
        raise ValueError('No gRNAs with I1 as most common indel in all 3 replicates')

    # Column-wise comparison replaces a row-wise apply; the loop variable is
    # named read_col so that it does not overwrite the `label` parameter.
    pie_data = [
        sum(mci_common_i1['MCI Reads'] == mci_common_i1[read_col]) * 100.0 / num_common
        for read_col in read_cols
    ]

    PL.figure(figsize=(3, 3))
    PL.pie(pie_data, labels=wedge_labels, autopct='%.1f', labeldistance=1.05, startangle=180.0, counterclock=False)
    PL.title('Dominant single nucleotide insertions (I1)\n%d from %d gRNAs' % (num_common, len(mci_merged_data)))
    PL.show(block=False)
    saveFig('I1_dom_pie_3_rep')
def plotMergedI1Repeats(all_result_outputs, label=''):
    """Bar chart of left-repeat I1 reads by repeated nucleotide.

    For each value of 'Repeat Nucleotide Left' (A, T, G, C) the mean over
    oligos of 'I1_Rpt Left Reads - NonAmb Sum' as a percentage of
    'Total reads Sum' is plotted.  The figure is saved as 'i1_rtp_nt'.

    Args:
        all_result_outputs: passed straight through to ``mergeSamples``.
        label: unused.
    """
    merged = mergeSamples(all_result_outputs,
                          ['I1_Rpt Left Reads - NonAmb', 'Total reads'],
                          data_label='i1IndelData',
                          merge_on=['Oligo Id', 'Repeat Nucleotide Left'])
    nts = ['A', 'T', 'G', 'C']

    def meanPercFor(nt):
        # Mean per-oligo percentage for oligos whose left repeat nucleotide is nt.
        subset = merged.loc[merged['Repeat Nucleotide Left'] == nt]
        perc = subset['I1_Rpt Left Reads - NonAmb Sum'] * 100.0 / subset['Total reads Sum']
        return perc.mean()

    nt_mean_percs = [meanPercFor(nt) for nt in nts]

    PL.figure(figsize=(3, 3))
    PL.bar(range(4), nt_mean_percs)
    for pos, perc in enumerate(nt_mean_percs):
        PL.text(pos - 0.25, perc + 0.8, '%.1f' % perc)
    PL.xticks(range(4), nts)
    PL.ylim((0, 26))
    PL.xlabel('PAM distal nucleotide\nadjacent to the cut site')
    PL.ylabel('I1 repeated left nucleotide\nas percent of total mutated reads')
    PL.show(block=False)
    saveFig('i1_rtp_nt')
def plotMicrohomologyMismatches(all_result_outputs, label=''):
    """Scatter-plot read ratios of mismatched vs. unmismatched microhomology deletions.

    Merges the samples on the oligo / microhomology description columns,
    keeps rows with a non-zero microhomology size and a matched guide, and
    derives per-row read percentages for the original indel and for the
    left/right/merged mutated variants.  Four comparisons are drawn in a 2x2
    grid (saved as 'mm_mismatch_all'); the third comparison
    ('Orig Indel' vs 'All Mut') is additionally drawn on its own figure
    (saved as 'mm_mismatch_one').

    Args:
        all_result_outputs: passed straight through to ``mergeSamples``.
        label: unused.
    """
    mut_hdrs = ['Left Mut', 'Right Mut','Merged Mut1', 'Merged Mut2']
    cols_to_sum = [x + ' Indel Reads in Mut' for x in mut_hdrs] + ['Orig Indel Reads in Orig', 'Mut Non-Null Reads', 'Orig Non-Null Reads']
    common_cols = ['Oligo ID','Mapped Oligo Id','Num Mismatches','Orig MH','Left Mut-MH','Right Mut-MH','Merged Mut 1 MH','Merged Mut 2 MH','Orig Indel','Left Mut-MH Indel','Right Mut-MH Indel','Merge Mut 1 Indel','Merge Mut 2 Indel']
    # NOTE(review): the code below reads '<col> Sum' columns, so mergeSamples
    # presumably adds a ' Sum' column per entry of cols_to_sum - confirm.
    data = mergeSamples(all_result_outputs, cols_to_sum, merge_on=common_cols)

    # Accessors for the 'L', 'R' and 'C' entries of the third element returned by tokFullIndel.
    getLeft = lambda indel: tokFullIndel(indel)[2]['L']
    getRight = lambda indel: tokFullIndel(indel)[2]['R']
    getMHSize = lambda indel: tokFullIndel(indel)[2]['C']

    oligo_data = pd.read_csv(getHighDataDir() + '/ST_June_2017/data/self_target_oligos_details_with_pam_details.csv', sep='\t')
    oligo_data['Guide is matched'] = oligo_data.apply(isMatched, axis=1)
    # Oligo ID -> True when its 'PAM Direction' is 'REVERSE'.
    reverse_lookup = {x: y == 'REVERSE' for (x,y) in zip(oligo_data['ID'],oligo_data['PAM Direction'])}
    is_reverse = lambda x: reverse_lookup[x]
    data = pd.merge(data, oligo_data[['ID','Guide is matched']], left_on='Oligo ID', right_on='ID', how='inner')

    data['MH Size'] = data['Orig Indel'].apply(getMHSize)
    # Keep only rows with a non-zero 'MH Size' and a matched guide.
    data = data.loc[(data['MH Size'] != 0) & (data['Guide is matched'])]
    data['MH Left Loc'] = data['Orig Indel'].apply(getLeft) + data['MH Size']
    data['MH Right Loc'] = data['Orig Indel'].apply(getRight) - data['MH Size']
    data['Is Reverse'] = data['Oligo ID'].apply(is_reverse)

    # Processed in pairs: (Left Mut, Right Mut) and (Merged Mut1, Merged Mut2).
    # For reverse oligos the two read counts of a pair are swapped.
    for hdrL,hdrR in [mut_hdrs[:2], mut_hdrs[2:]]:
        data[hdrL + ' Reads'] = data['Is Reverse']*data[hdrR + ' Indel Reads in Mut Sum'] + (1- data['Is Reverse'])*data[hdrL + ' Indel Reads in Mut Sum']
        data[hdrR + ' Reads'] = data['Is Reverse']*data[hdrL + ' Indel Reads in Mut Sum'] + (1- data['Is Reverse'])*data[hdrR + ' Indel Reads in Mut Sum']
        data[hdrL + ' Reads Ratio'] = data[hdrL + ' Reads']*100.0/data['Mut Non-Null Reads Sum']
        data[hdrR + ' Reads Ratio'] = data[hdrR + ' Reads']*100.0/data['Mut Non-Null Reads Sum']
    data['Orig Indel Reads Ratio'] = data['Orig Indel Reads in Orig Sum']*100.0/data['Orig Non-Null Reads Sum']
    # Combined percentage over all four mutated variants.
    data['All Mut Reads Ratio'] = (data[[x + ' Reads' for x in mut_hdrs]].sum(axis=1))*100.0/data['Mut Non-Null Reads Sum']
    data['MH Dist'] = data['MH Right Loc'] - data['MH Left Loc']
    data['1st Mismatch'] = data.apply(getMismatch, axis=1)
    data['Last Mismatch'] = data.apply(getLastMismatch, axis=1)
    data['MH GC Content'] = data.apply(getMhGC, axis=1)

    # (x-axis quantity, y-axis quantity) for each of the four subplots.
    mh_indel_types = [('Orig Indel','Left Mut'), ('Orig Indel','Right Mut'), ('Orig Indel','All Mut'),('Left Mut','Right Mut') ]

    label_lookup = {'Orig Indel': 'Perc. mutated reads of corresponding microhomology-\nmediated deletion with no sequence mismatches',
                    'Left Mut': 'Perc. mutated reads of mismatched microhomology-\nmediated deletion with retained left sequence',
                    'Right Mut': 'Perc mutated reads of mismatched microhomology-\nmediated deletion with retained right sequence',
                    'All Mut': 'Perc mutated reads of mismatched microhomology-\nmediated deletion (All)' }

    fig1 = PL.figure(figsize=(4,4))
    fig_all = PL.figure(figsize=(10,10))
    for i, (mh_typex, mh_typey) in enumerate(mh_indel_types):
        # The third comparison (i == 2) is drawn both in the grid and on its own figure.
        figs = [(fig_all, True), (fig1,False)] if i==2 else [(fig_all, True)]
        for fig, is_all in figs:
            PL.figure(fig.number)
            if is_all: PL.subplot(2,2,i+1)
            # One series per number of mismatches (1 and 2).
            for nm,clr in zip([1,2],['royalblue','orange']):
                nm_data = data.loc[data['Num Mismatches'] == nm]
                sty, lsty = 'o', '-'
                # Only microhomology sizes from 6 to 15 inclusive are plotted.
                sel_data = nm_data.loc[(nm_data['MH Size'] >= 6) & (nm_data['MH Size'] <= 15)]
                PL.plot(sel_data[mh_typex + ' Reads Ratio'], sel_data[mh_typey + ' Reads Ratio'], sty, color=clr, markersize=4, label='No. MH Mismatches=%d' % (nm))
                rx, ry, grad = getRegrLine(sel_data[[mh_typex + ' Reads Ratio']], sel_data[[mh_typey + ' Reads Ratio']])
                # The gradient is printed only for the stand-alone figure.
                if not is_all: print(grad, nm, mh_typex, mh_typey)
                # No regression line for the last comparison (Left Mut vs Right Mut).
                if i != 3: PL.plot(rx, ry, lsty, color=clr, linewidth=2)
            PL.xlabel(label_lookup[mh_typex])
            PL.ylabel(label_lookup[mh_typey])
            PL.xlim((0,80))
            PL.ylim((0,80))
            # Dashed y = x reference line.
            PL.plot([0,80],[0,80],'k--')
            PL.legend()
            PL.show(block=False)
    # fig_all is the current figure here, because the last iteration drew only to it.
    saveFig('mm_mismatch_all')
    PL.figure(fig1.number)
    saveFig('mm_mismatch_one')
def _plotD2PiePanel(panel, cat_data, keys, wedge_labels, guide_to_key, pattern, print_percs):
    """Draw one pie of the 2x3 grid: the guides of one D2 category split by sequence key."""
    PL.subplot(2, 3, panel)
    guide_keys = cat_data['Guide'].apply(guide_to_key)
    pie_vals = [int((guide_keys == key).sum()) for key in keys]
    if print_percs:
        total = sum(pie_vals)
        for wedge_label, cnt in zip(wedge_labels, pie_vals):
            print(wedge_label, cnt * 100 / total if total > 0 else 0)
    PL.pie(pie_vals, labels=wedge_labels, autopct='%.1f', labeldistance=1.1, counterclock=False, colors=COLORS)
    PL.title('%s\n(%d gRNAs)' % (pattern, len(cat_data)))
    PL.show(block=False)


def _plotD2BarPanel(panel, cat_data, all_guides, keys, guide_has_key, pattern,
                    text_offset, ylim, xlabel, vertical, print_percs):
    """Draw one bar panel of the 2x3 grid: percent of guides with each key that fall in one D2 category."""
    PL.subplot(2, 3, panel)
    cat_ids = set(cat_data['Oligo Id'])
    bar_heights, cat_counts, guide_counts = [], [], []
    for key in keys:
        key_data = all_guides.loc[all_guides['Guide'].apply(lambda guide, key=key: guide_has_key(guide, key))]
        cat_counts.append(len(cat_ids.intersection(set(key_data['Oligo Id']))))
        guide_counts.append(len(key_data))
        perc = cat_counts[-1] * 100.0 / guide_counts[-1] if guide_counts[-1] > 0 else 0
        bar_heights.append(perc)
        if print_percs:
            print(key, perc)
    # Dinucleotide panels rotate their text; single-nucleotide panels use the default.
    rotation = {'rotation': 'vertical'} if vertical else {}
    PL.bar(range(len(keys)), bar_heights, width=0.8)
    for i, (hgt, guide_cnt, cnt) in enumerate(zip(bar_heights, guide_counts, cat_counts)):
        PL.text(i - 0.3, hgt + text_offset, '%d/%d' % (cnt, guide_cnt), **rotation)
    PL.xticks(range(len(keys)), list(keys), **rotation)
    PL.ylim(ylim)
    PL.xlabel(xlabel)
    PL.title(pattern)
    PL.ylabel('Percent gRNAs with %s deletion\nas most common indel in all 3 replicates' % pattern)
    PL.show(block=False)


def plotD2(all_result_outputs, label=''):
    """Plot the breakdown of size-2 deletions that are the dominant indel.

    Considers gRNAs whose 'Most Common Indel' is identical in all three
    replicate columns and whose 'MCI Type' is 'D2', and classifies each by
    guide sequence around ``guide[-5:-1]`` and by the 'L'/'R' entries of the
    tokenised indel.  Three figures are produced and saved:

    * 'pie_chart_D2_indel_cats': number of gRNAs per category.
    * 'D2_nts_per_cat': per category, pie of the nucleotides involved.
    * 'D2_nts_per_cat_bars': per category, percent of all guides carrying a
      given nucleotide pattern that fall into that category.

    Args:
        all_result_outputs: passed straight through to ``mergeSamples``.
        label: unused.

    Raises:
        AssertionError: if a dominant D2 gRNA falls into none of the categories.
    """
    # Merge replicates
    mci_merged_data = mergeSamples(all_result_outputs, [], data_label='perOligoMCI')
    mci_merged_data['Equal MCI'] = (
        (mci_merged_data['Most Common Indel'] == mci_merged_data['Most Common Indel 2'])
        & (mci_merged_data['Most Common Indel'] == mci_merged_data['Most Common Indel 3']))

    oligo_data = pd.read_csv(getHighDataDir() + '/ST_June_2017/data/self_target_oligos_details_with_pam_details.csv', sep='\t')
    oligo_data['Oligo Id'] = oligo_data['ID'].apply(lambda x: x.replace('_', ''))
    mci_merged_data_guides = pd.merge(mci_merged_data, oligo_data[['Oligo Id', 'Guide']], how='inner', on='Oligo Id')

    # The mask must come from the merged frame itself: the inner merge
    # re-indexes (and may drop) rows, so a mask built on the un-merged frame
    # is not guaranteed to align with it.
    mci_common = mci_merged_data_guides.loc[mci_merged_data_guides['Equal MCI']]
    # Note: type check discards equally most common indels
    dmci_data = mci_common.loc[mci_common['MCI Type'] == 'D2']

    # Tokenise each indel once per bound instead of once per predicate.
    guides = dmci_data['Guide']
    left_pos = dmci_data['Most Common Indel'].apply(lambda indel: tokFullIndel(indel)[2]['L'])
    right_pos = dmci_data['Most Common Indel'].apply(lambda indel: tokFullIndel(indel)[2]['R'])
    is_left_rpt = (guides.apply(lambda guide: guide[-5] == guide[-3]).astype(bool)
                   & (right_pos >= 1) & (left_pos <= -3))
    is_right_rpt = (guides.apply(lambda guide: guide[-4] == guide[-2]).astype(bool)
                    & (right_pos >= 2) & (left_pos <= -2))

    lrpt_data = dmci_data.loc[is_left_rpt & ~is_right_rpt]
    rrpt_data = dmci_data.loc[is_right_rpt & ~is_left_rpt]
    rpt_data = dmci_data.loc[is_left_rpt & is_right_rpt]
    ro_data = dmci_data.loc[right_pos == 0]
    l1_data = dmci_data.loc[left_pos == -1]

    # NOTE(review): these wedge labels differ from the panel titles used for
    # the same subsets below (e.g. lrpt_data is 'Y|XY->Y' here but 'XY|X->X'
    # there).  Left unchanged - confirm which orientation is intended.
    pie_labels = ['Y|XY->Y', 'XY|X->X', 'XY|XY->XY', 'Z|XY->Z', 'XY|Z->Z']
    pie_vals = [len(lrpt_data), len(rrpt_data), len(rpt_data), len(ro_data), len(l1_data)]

    # Sanity check: every dominant D2 must fall into at least one category.
    seen_ids = set().union(*(subset['Oligo Id'] for subset in
                             (rpt_data, ro_data, l1_data, lrpt_data, rrpt_data)))
    unseen_data = dmci_data.loc[~dmci_data['Oligo Id'].isin(seen_ids)]
    print(unseen_data)
    assert len(unseen_data) == 0, 'Uncategorised D2 indels found'

    PL.figure(figsize=(4, 4))
    PL.pie(pie_vals, labels=pie_labels, autopct='%.1f', labeldistance=1.1, counterclock=False, colors=COLORS)
    PL.title('Size 2 deletions that are\n"most common" for their gRNA in all 3 replicates\n(%d gRNAs from %d total)' % (len(dmci_data), len(mci_merged_data)))
    PL.show(block=False)
    saveFig('pie_chart_D2_indel_cats')

    nts = 'ATGC'
    dnts = [y + x for x in nts for y in nts]

    # Per-category pies of the nucleotides involved.
    PL.figure(figsize=(12, 8))
    _plotD2PiePanel(1, rpt_data, dnts, dnts, lambda guide: guide[-5:-3], 'XY|XY->XY', True)
    _plotD2PiePanel(2, ro_data, dnts, dnts, lambda guide: guide[-5:-3], 'XY| -> __|', True)
    _plotD2PiePanel(3, l1_data, dnts, dnts, lambda guide: guide[-3:-1], '|XY -> |__', True)
    _plotD2PiePanel(4, lrpt_data, nts, ['%sN|%s -> %s' % (nt, nt, nt) for nt in nts],
                    lambda guide: guide[-5], 'XY|X->X', False)
    _plotD2PiePanel(5, rrpt_data, nts, ['%s|N%s -> %s' % (nt, nt, nt) for nt in nts],
                    lambda guide: guide[-4], 'X|YX->X', False)
    PL.subplots_adjust(left=0.05, right=0.95, top=0.9, bottom=0.1, hspace=0.3, wspace=0.3)
    saveFig('D2_nts_per_cat')

    # Per-category bars: share of all guides with a given pattern that fall in the category.
    PL.figure(figsize=(12, 8))
    _plotD2BarPanel(1, rpt_data, mci_merged_data_guides, dnts,
                    lambda guide, dnt: guide[-5:-3] == dnt and guide[-3:-1] == dnt,
                    'XY|XY->XY', 15, (0, 90), 'XY', True, True)
    _plotD2BarPanel(2, ro_data, mci_merged_data_guides, dnts,
                    lambda guide, dnt: guide[-5:-3] == dnt,
                    'XY| -> __|', 1.5, (0, 8), 'XY', True, True)
    _plotD2BarPanel(3, l1_data, mci_merged_data_guides, dnts,
                    lambda guide, dnt: guide[-3:-1] == dnt,
                    '|XY -> |__', 1.5, (0, 8), 'XY', True, True)
    _plotD2BarPanel(4, lrpt_data, mci_merged_data_guides, nts,
                    lambda guide, nt: guide[-3] == nt and guide[-5] == nt,
                    'XY|X->X', 0.05, (0, 5), 'X', False, False)
    _plotD2BarPanel(5, rrpt_data, mci_merged_data_guides, nts,
                    lambda guide, nt: guide[-4] == nt and guide[-2] == nt,
                    'X|YX->X', 0.05, (0, 5), 'X', False, False)
    PL.subplots_adjust(left=0.05, right=0.95, top=0.9, bottom=0.1, hspace=0.3, wspace=0.3)
    saveFig('D2_nts_per_cat_bars')
def plotD1(all_result_outputs, label=''):
    """Plot the breakdown of size-1 deletions that are the dominant indel.

    Two figures are produced from the merged 'perOligoMCI' data:

    * 'pie_chart_D1': among gRNAs whose most common indel is the same 'D1'
      in all three replicate columns, the share whose 'Altered Sequence' is
      a run of a single nucleotide spanning the cut site, per nucleotide,
      plus one wedge for everything else.
    * 'D1_bar_3_rep': for guides with a doubled nucleotide at
      ``guide[-4:-2]`` (and for all other guides), the percentage having a
      D1 as most common indel in all three replicates.

    Args:
        all_result_outputs: passed straight through to ``mergeSamples``.
        label: unused.
    """
    mci_merged_data = mergeSamples(all_result_outputs, [], data_label='perOligoMCI')
    same_as_rep2 = mci_merged_data['Most Common Indel'] == mci_merged_data['Most Common Indel 2']
    same_as_rep3 = mci_merged_data['Most Common Indel'] == mci_merged_data['Most Common Indel 3']
    mci_merged_data['Equal MCI'] = same_as_rep2 & same_as_rep3
    consistent = mci_merged_data.loc[mci_merged_data['Equal MCI']]
    # Note: type check discards equally most common indels
    d1_data = consistent.loc[consistent['MCI Type'] == 'D1']

    def spansCutsite(indel):
        # True when the deletion has 'L' below -1 and 'R' above 0.
        bounds = tokFullIndel(indel)[2]
        return bounds['L'] < -1 and bounds['R'] > 0

    wedge_sizes, wedge_names = [], []
    for nt in 'ATGC':
        def isRunOfNt(alt_seq):
            # At least two bases, all equal to the current nucleotide.
            return len(alt_seq) >= 2 and alt_seq == (len(alt_seq) * nt)

        run_mask = d1_data['Altered Sequence'].apply(isRunOfNt)
        span_mask = d1_data['Most Common Indel'].apply(spansCutsite)
        num_repeat_nt = len(d1_data.loc[run_mask & span_mask])
        wedge_sizes.append(num_repeat_nt * 100.0 / len(d1_data))
        print(num_repeat_nt)
        wedge_names.append('Removal of %s\nfrom %s|%s' % (nt, nt, nt))

    def isNotRun(seq):
        return len(seq) < 2 or seq != (seq[0] * len(seq))

    not_run_mask = d1_data['Altered Sequence'].apply(isNotRun)
    not_span_mask = ~d1_data['Most Common Indel'].apply(spansCutsite)
    num_non_repeat = len(d1_data.loc[not_run_mask | not_span_mask])
    wedge_sizes.append(num_non_repeat * 100.0 / len(d1_data))
    print(num_non_repeat)
    wedge_names.append('Removal from non-repeat')

    PL.figure(figsize=(4, 4))
    PL.pie(wedge_sizes,
           labels=wedge_names,
           autopct='%.1f',
           labeldistance=1.1,
           counterclock=False,
           colors=OLD_COLORS)
    PL.title(
        'Size 1 deletions that are\n"most common" for their gRNA in all 3 replicates\n(%d gRNAs from %d total)'
        % (len(d1_data), len(mci_merged_data)))
    PL.show(block=False)
    saveFig('pie_chart_D1')

    # Second figure: attach guide sequences and look at doubled nucleotides.
    oligo_data = pd.read_csv(
        getHighDataDir() + '/ST_June_2017/data/self_target_oligos_details_with_pam_details.csv',
        sep='\t')
    oligo_data['Oligo Id'] = oligo_data['ID'].apply(lambda x: x.replace('_', ''))
    with_guides = pd.merge(mci_merged_data,
                           oligo_data[['Oligo Id', 'Guide']],
                           how='inner',
                           on='Oligo Id')
    print(len(with_guides))

    def isD1(indel):
        return indel.split('_')[0] == 'D1'

    bar_percs, count_texts = [], []
    nts = 'ATGC'
    for nt in nts:
        def hasDoubleNt(row):
            return row['Guide'][-4:-2] == (nt + nt)

        dbl_data = with_guides.loc[with_guides.apply(hasDoubleNt, axis=1)]
        # Oligo28137: Corner case where a guide has CT|T and loses the C
        not_corner_case = dbl_data['Oligo Id'] != 'Oligo28137'
        num_dbl_d1 = sum(dbl_data['Most Common Indel'].apply(isD1)
                         & dbl_data['Equal MCI']
                         & not_corner_case)
        bar_percs.append(num_dbl_d1 * 100.0 / len(dbl_data))
        count_texts.append('%d/%d' % (num_dbl_d1, len(dbl_data)))
        print(len(dbl_data))

    def hasNoDoubleNt(row):
        return row['Guide'][-4] != row['Guide'][-3]

    non_dbl_data = with_guides.loc[with_guides.apply(hasNoDoubleNt, axis=1)]
    print(len(non_dbl_data))
    num_non_dbl_d1 = sum(non_dbl_data['Most Common Indel'].apply(isD1)
                         & non_dbl_data['Equal MCI'])
    bar_percs.append(num_non_dbl_d1 * 100.0 / len(non_dbl_data))
    count_texts.append('%d/%d' % (num_non_dbl_d1, len(non_dbl_data)))

    PL.figure()
    PL.bar(range(5), bar_percs, width=0.8)
    for pos, (perc, text) in enumerate(zip(bar_percs, count_texts)):
        PL.text(pos - 0.3, perc + 5.0, text)
    PL.xticks(range(5), [nt * 2 for nt in nts] + ['Other'])
    PL.ylim((0, 40))
    PL.xlabel('Nucleotides on either side of cut site')
    PL.ylabel(
        'Percent gRNAs with single nucleotide deletion\nas most common indel in all 3 replicates'
    )
    PL.show(block=False)
    saveFig('D1_bar_3_rep')