def createOverbeekTemplates(selected_id=None):
    """Write one template FASTA file per Overbeek target.

    Reads the tab-delimited ``overbeek_self_targets.csv`` from the high data
    directory. For each row the target index is taken from the last
    whitespace-separated token of the last column. A file
    ``<id>_template.fasta`` is written to ``overbeek_template_files``; it
    holds a header line ``>id_spacer pam_loc pam_dir`` followed by the
    template sequence returned by ``extractTemplateSequenceAndPamLoc``.

    Args:
        selected_id: When not ``None``, only the target whose id (for example
            ``'Overbeek6'``) equals this value is processed.
    """
    ctrl_samdir = getHighDataDir() + '/overbeek_control_sam_files'
    output_template_dir = getHighDataDir() + '/overbeek_template_files'
    if not os.path.isdir(output_template_dir):
        os.mkdir(output_template_dir)

    lookup = loadLocationSpacerLookup()

    # Context managers guarantee both handles are released, even when a row
    # fails part-way through (previously the input file was never closed).
    with io.open(getHighDataDir() + '/overbeek_self_targets.csv') as f:
        for toks in csv.reader(f, delimiter='\t'):
            # NOTE(review): eval() on file contents executes arbitrary code;
            # kept for compatibility, only safe while the CSV is trusted.
            idx = eval(toks[-1].split()[-1])
            target_id, samid = 'Overbeek%d' % idx, 'Overbeek_%d' % idx
            if selected_id is not None and selected_id != target_id:
                continue
            loc, spacer_seq, primer = lookup[target_id]
            ctrl_sam_file = ctrl_samdir + '/%s.sam' % samid
            template_path = output_template_dir + '/%s_template.fasta' % target_id
            with io.open(template_path, 'w') as fout:
                template_seq, pam_loc, pam_dir = extractTemplateSequenceAndPamLoc(
                    ctrl_sam_file, loc, target_id, spacer_seq, primer)
                fout.write(u'>%s_%s %d %s\n%s\n' %
                           (target_id, spacer_seq, pam_loc, pam_dir,
                            template_seq))
예제 #2
0
def runAnalysis():
    """Run the microhomology-by-length analysis for 'K562 New', then 'DPI7'.

    Both runs read from the same results directory and differ only in the
    per-result callbacks, the all-results callbacks and the sample name.
    """

    def _makeSpec(per_result_funcs, all_results_funcs, sample):
        # Build a fresh spec each time so the two runs share no state
        return {
            'results_dir': getHighDataDir() + '/microhomology/mh_freqs_by_len',
            'dirname_to_result_fn': lambda x: x,
            'result_to_dirname_fn': lambda x: x,
            'py_func_load': loadAllMHLenData,
            'py_funcs_per_result': per_result_funcs,
            'py_funcs_all_results': all_results_funcs,
            'reads_colname': 'Non-Null Reads',
            'check_output_fn': lambda x: True,
            'id_colname': 'Oligo ID',
            'min_reads': MIN_READS,
            'partitions': ['Non-Targeting'],
            'samples': [sample],
        }

    k562_spec = _makeSpec(
        [(plotK562PercScatterAnalysis, 'RegrLines'), (passData, 'Data')],
        [compareMHK562lines, plotGCContent],
        'K562 New')
    analyseResultsPerPartition(k562_spec)

    dpi7_spec = _makeSpec(
        [(plotPercScatterAnalysis, 'RegrLines')],
        [compareMHlines],
        'DPI7')
    analyseResultsPerPartition(dpi7_spec)
예제 #3
0
def loadAllData(guideset,
                sample_selector=lambda x: True,
                label='',
                cols=None,
                allow_pickle=False):
    """Merge the selected comparison result files into one table keyed on 'ID'.

    Every file in the comparison results directory whose two sample
    directories both pass ``sample_selector`` is loaded, restricted to rows
    with more than ``MIN_READS`` mutated reads in both samples and to the
    guides in ``guideset``, and inner-merged on 'ID'. Each file's ``cols``
    end up suffixed with ``'$' + filename``.

    Args:
        guideset: Collection of guide IDs to keep (passed to ``Series.isin``).
        sample_selector: Predicate applied to both directory names derived
            from each filename; the file is used only if both pass.
        label: Label used to build the pickle file name.
        cols: Column names to keep from each file; defaults to
            ``['KL without null']``.
        allow_pickle: When true, read the cached pickle if it exists and
            write it after merging.

    Returns:
        The merged ``pandas.DataFrame``.

    Raises:
        ValueError: If no comparison file passes ``sample_selector``.
    """
    # None stands in for the historical default so no list object is shared
    # between calls.
    cols = ['KL without null'] if cols is None else list(cols)

    pickle_file = '%s/kl_analysis_%s.pickle' % (getPickleDir(),
                                                label.replace(' ', '_'))
    if allow_pickle and os.path.exists(pickle_file):
        return pandas.read_pickle(pickle_file)

    results_dir = getHighDataDir() + '/' + ST_COMPARISON_RESULTS_DIR
    merged_data, first_suffix = None, None
    for filename in os.listdir(results_dir):
        dir1, dir2 = getDirsFromFilename(filename)
        if not sample_selector(dir1) or not sample_selector(dir2):
            continue

        # Load data from file and keep only sufficiently covered guides
        data = pandas.read_csv(results_dir + '/' + filename, sep='\t')
        data['Mutated Reads 1'] = data['Num Reads 1'] - data[
            'Num null reads 1']
        data['Mutated Reads 2'] = data['Num Reads 2'] - data[
            'Num null reads 2']
        data = data.loc[data['Mutated Reads 1'] > MIN_READS]
        data = data.loc[data['Mutated Reads 2'] > MIN_READS]
        data = data.loc[data['ID'].isin(guideset)][['ID'] + cols]
        if merged_data is not None and len(data) < 0.75 * len(merged_data):
            print('Skipping %s, data for insufficient guides (%d vs %d)' %
                  (filename, len(data), len(merged_data)))
            continue

        # Merge with the other data (keep only common Oligos). The first
        # file's columns stay unsuffixed here and are renamed after the loop.
        suffix = '$' + filename
        if merged_data is None:
            merged_data, first_suffix = data, suffix
        else:
            merged_data = merged_data.merge(data,
                                            how='inner',
                                            on='ID',
                                            suffixes=('', suffix))
        print(len(merged_data), filename)

    if merged_data is None:
        # Previously this surfaced as an AttributeError on None.rename
        raise ValueError('No comparison results in %s matched the sample '
                         'selector' % results_dir)

    merged_data = merged_data.rename(
        columns={x: (x + first_suffix)
                 for x in cols})
    if allow_pickle:
        merged_data.to_pickle(pickle_file)
    return merged_data
예제 #4
0
def loadIndelData():
    """Load the new- and old-library I1 indel tables into one data frame.

    Both tab-separated files are read with ``header=1``, concatenated (new
    first, then old) and reduced to the oligo id and the two repeat
    nucleotide columns. A 'Short Oligo Id' column is added by applying
    ``getShortOligoId`` to 'Oligo Id'.
    """
    source_files = ('/i1/exp_target_pam_new_gen_i1_indels.txt',
                    '/i1/exp_target_pam_old_gen_i1_indels.txt')
    kept_columns = [
        'Oligo Id', 'Repeat Nucleotide Left', 'Repeat Nucleotide Right'
    ]

    per_library = []
    for rel_path in source_files:
        table = pd.read_csv(getHighDataDir() + rel_path, sep='\t', header=1)
        per_library.append(table)

    indel_data = pd.concat(per_library)[kept_columns]
    short_ids = indel_data['Oligo Id'].apply(getShortOligoId)
    indel_data['Short Oligo Id'] = short_ids
    return indel_data
def fetchMhMismatchFrequencies(dirname,
                               outdir='mh_mismatch_indel_frequencies'):
    """Tabulate read counts of microhomology indels for mutated/original oligos.

    For every row of ``mh_mismatch_indels.txt`` the indel summaries of the
    mutated oligo ('Oligo ID') and of the original oligo ('Mapped Oligo Id')
    are read from ``dirname``. The output row holds the input columns, the
    read counts of five indels in each of the two profiles, and the non-null
    read totals of both oligos.

    Args:
        dirname: Sample directory containing ``mapped_reads``.
        outdir: Directory for the output file ``<dir label>.txt``; created if
            missing.

    Raises:
        Exception: If ``isOldLib(dirname)`` is true.
    """
    if not os.path.isdir(outdir): os.makedirs(outdir)
    if isOldLib(dirname): raise Exception('Old Lib not supported')

    mh_exp_indels_file = getHighDataDir() + '/mh_mismatch_indels.txt'
    out_file = outdir + '/' + getDirLabel(dirname) + '.txt'

    indel_labels = [
        'Orig', 'Left Mut', 'Right Mut', 'Merged Mut1', 'Merged Mut2'
    ]
    # All five indel columns for the 'Mut' profile first, then for 'Orig'
    hdr_str = '\t'.join(x + ' Indel Reads in ' + y
                        for y in ['Mut', 'Orig'] for x in indel_labels)

    def readsFor(indel, profile):
        # An empty indel string means "no such indel" and always counts as 0
        return profile[indel] if (indel in profile and indel != '') else 0

    # Open the input first so a missing input no longer leaves an empty
    # output file behind; both handles are closed even if a row raises.
    with io.open(mh_exp_indels_file) as f, io.open(out_file, 'w') as fout:
        rdr = csv.DictReader(f, delimiter='\t')
        fout.write(u'%s\t%s\tMut Non-Null Reads\tOrig Non-Null Reads\n' %
                   ('\t'.join(rdr.fieldnames), hdr_str))
        for row in rdr:

            # Load indel profiles for both the original and mutated
            # microhomology forms
            mut_oligo_id = row['Oligo ID'].replace('_', '')
            orig_oligo_id = row['Mapped Oligo Id'].replace('_', '')

            mut_filepath, mut_filename = getFileForOligoIdx(
                getOligoIdxFromId(mut_oligo_id),
                ext='_mappedindelsummary.txt')
            orig_filepath, orig_filename = getFileForOligoIdx(
                getOligoIdxFromId(orig_oligo_id),
                ext='_mappedindelsummary.txt')

            p_mut, p_orig = {}, {}
            stats_mut = readSummaryToProfile(dirname + '/mapped_reads/' +
                                             mut_filepath + '/' + mut_filename,
                                             p_mut,
                                             oligoid=mut_oligo_id)
            stats_orig = readSummaryToProfile(
                dirname + '/mapped_reads/' + orig_filepath + '/' +
                orig_filename,
                p_orig,
                oligoid=orig_oligo_id)

            indels = [
                row['Orig Indel'], row['Left Mut-MH Indel'],
                row['Right Mut-MH Indel'], row['Merge Mut 1 Indel'],
                row['Merge Mut 2 Indel']
            ]
            mut_read_str = '\t'.join(
                ['%d' % readsFor(indel, p_mut) for indel in indels])
            orig_read_str = '\t'.join(
                ['%d' % readsFor(indel, p_orig) for indel in indels])

            # Non-null reads = first stats entry minus the third
            str_args = ('\t'.join([row[col] for col in rdr.fieldnames]),
                        mut_read_str, orig_read_str,
                        stats_mut[0] - stats_mut[2],
                        stats_orig[0] - stats_orig[2])
            fout.write(u'%s\t%s\t%s\t%d\t%d\n' % str_args)
def plotInFrameCorr(data):
    """Scatter measured against predicted in-frame percentages.

    Plots ``data['New In Frame Perc']`` against
    ``data['Predicted In Frame Per']``, overlays the points from
    ``shi_deepseq_frame_shifts.txt`` in orange, and labels those overlay
    points whose measured value exceeds the predicted one by more than 10.
    The Pearson correlation of ``data`` is shown in the title and the figure
    is saved via ``saveFig``.
    """
    shi_data = pd.read_csv(getHighDataDir() + '/shi_deepseq_frame_shifts.txt',
                           sep='\t')

    measured_col = 'New In Frame Perc'
    predicted_col = 'Predicted In Frame Per'
    PL.figure(figsize=(4, 4))

    xdata = data[measured_col]
    ydata = data[predicted_col]
    PL.plot(xdata, ydata, '.', alpha=0.15)

    shi_measured = shi_data['Measured Frame Shift']
    shi_predicted = shi_data['Predicted Frame Shift']
    PL.plot(shi_measured, shi_predicted, '^', color='orange')

    shi_points = zip(shi_data['Measured Frame Shift'],
                     shi_data['Predicted Frame Shift'], shi_data['ID'])
    for measured, predicted, shi_id in shi_points:
        if not (measured - predicted > 10):
            continue
        # Label is the second '/'-separated part of the ID without its last
        # 21 characters
        point_label = shi_id.split('/')[1][:-21]
        PL.text(measured, predicted, point_label)

    PL.plot([0, 100], [0, 100], 'k--')
    corr = pearsonr(xdata, ydata)[0]
    PL.title('R=%.3f' % corr)
    PL.xlabel('percent in frame mutations (measured)')
    PL.ylabel('percent in frame mutations (predicted)')
    PL.ylim((0, 80))
    PL.xlim((0, 80))
    PL.show(block=False)

    fig_name = 'in_frame_corr_%s_%s' % (measured_col.replace(' ', '_'),
                                        predicted_col.replace(' ', '_'))
    saveFig(fig_name)
예제 #7
0
def plotDominantBars(all_result_outputs, label=''):
    """Bar chart of how often 'I1' is the dominant indel, per guide nucleotide.

    A guide counts as "dominant I1" when its three 'Most Common Indel'
    columns agree and 'MCI Type' is 'I1'. Guides are grouped by the
    nucleotide at index -4 of the 'Guide' sequence; each bar shows the
    percentage of dominant-I1 guides in that group and is annotated with the
    raw counts.

    Args:
        all_result_outputs: Per-sample results passed to ``mergeSamples``.
        label: Unused here; kept for the shared plotting-callback signature.
    """
    mci_merged_data = mergeSamples(all_result_outputs, [],
                                   data_label='i1IndelData')
    most_common = mci_merged_data['Most Common Indel']
    mci_merged_data['Equal MCI'] = (
        (most_common == mci_merged_data['Most Common Indel 2']) &
        (most_common == mci_merged_data['Most Common Indel 3']))
    mci_merged_data['Is Dominant I1'] = (
        mci_merged_data['Equal MCI'] & (mci_merged_data['MCI Type'] == 'I1'))

    oligo_data = pd.read_csv(
        getHighDataDir() +
        '/ST_June_2017/data/self_target_oligos_details_with_pam_details.csv',
        sep='\t')
    # Oligo ids in the merged data carry no underscores
    oligo_data['Oligo Id'] = oligo_data['ID'].apply(
        lambda x: x.replace('_', ''))
    merged_mci_data = pd.merge(mci_merged_data,
                               oligo_data[['Oligo Id', 'Guide']],
                               how='inner',
                               on='Oligo Id')

    nt_perc_i1, cnt_labels = [], []
    nts = 'ATGC'
    for nt in nts:
        # Bind nt as a default so the predicate does not depend on the loop
        # variable's later value
        has_nt = merged_mci_data['Guide'].apply(
            lambda guide, nt=nt: guide[-4] == nt)
        nt_data = merged_mci_data.loc[has_nt]
        num_dominant, num_total = sum(nt_data['Is Dominant I1']), len(nt_data)
        # An empty group used to raise ZeroDivisionError; plot it as 0%
        nt_perc_i1.append(num_dominant * 100.0 /
                          num_total if num_total else 0.0)
        cnt_labels.append('%d/%d' % (num_dominant, num_total))

    PL.figure()
    PL.bar(range(len(nts)), nt_perc_i1, width=0.8)
    for i, cnt in enumerate(cnt_labels):
        PL.text(i - 0.3, nt_perc_i1[i] + 5.0, cnt)
    PL.xticks(range(len(nts)), list(nts))
    PL.xlabel('Nucleotide on Left of cut-site')
    PL.ylabel('Percent gRNAs with single nucleotide insertion\nas most common indel in all 3 replicates')
    PL.show(block=False)
    saveFig('I1_bar_3_rep')
예제 #8
0
def loadProfilePair(old_id, new_id):
    """Accumulate indel profiles for an old/new oligo pair over all samples.

    Every directory in ``new_dirs`` is read into one profile for ``new_id``
    and every directory in ``old_dirs`` into one profile for ``old_id``.

    Returns:
        ``(p_old, p_new, mut_reads_old, mut_reads_new)`` where the read
        totals sum the first minus the third value returned by
        ``readSummaryToProfile`` for each directory.
    """
    p_old, p_new = {}, {}
    old_file = getSummaryFileSuffix(old_id)
    new_file = getSummaryFileSuffix(new_id)

    def _accumulate(sample_dirs, summary_file, profile, oligo_id):
        # Read each sample into the shared profile and total its mutated reads
        mutated = 0
        for sample_dir in [getHighDataDir() + '/' + x for x in sample_dirs]:
            summary_path = sample_dir + '/mapped_reads/' + summary_file
            accepted, _, null_reads = readSummaryToProfile(summary_path,
                                                           profile,
                                                           oligoid=oligo_id)
            mutated += (accepted - null_reads)
        return mutated

    # New-library samples are read before the old ones
    mut_reads_new = _accumulate(new_dirs, new_file, p_new, new_id)
    mut_reads_old = _accumulate(old_dirs, old_file, p_old, old_id)
    return p_old, p_new, mut_reads_old, mut_reads_new
예제 #9
0
def loadRepReads(new_id):
    """Return the representative reads for ``new_id`` from the first new sample.

    The indel profile file is located under ``new_dirs[0]`` and handed to
    ``fetchRepresentativeCleanReads``, which fills the returned dict.
    """
    subdir, profilefilename = getFileForOligoIdx(
        getOligoIdxFromId(new_id), ext='_mappedindelprofiles.txt')
    path_parts = [
        getHighDataDir(), new_dirs[0], 'mapped_reads', subdir,
        profilefilename
    ]
    profile_file = '/'.join(path_parts)

    rep_reads = {}
    fetchRepresentativeCleanReads(profile_file, rep_reads, oligoid=new_id)
    return rep_reads
예제 #10
0
def runAnalysis():
    """Run the per-oligo indel pie analyses for 'K562 New', then 'DPI7'.

    Both runs read ``indel_pie_summaries_per_oligo`` for the 'Real Guides'
    partition and differ only in their callbacks and sample name.
    """

    def _makeSpec(per_result_funcs, all_results_funcs, sample):
        # A fresh dict per run; only the callbacks and the sample vary
        return {
            'results_dir':
            getHighDataDir() + '/indel_details/indel_pie_summaries_per_oligo',
            'dirname_to_result_fn': lambda x: '%s.txt' % x,
            'result_to_dirname_fn': lambda x: x.split('/')[-1][:-4],
            'py_func_load': loadData,
            'py_funcs_per_result': per_result_funcs,
            'py_funcs_all_results': all_results_funcs,
            'check_output_fn': lambda x: True,
            'reads_colname': 'Total reads',
            'min_reads': MIN_READS,
            'id_colname': 'Oligo Id',
            'partitions': ['Real Guides'],
            'samples': [sample],
        }

    k562_spec = _makeSpec([(perOligoCounts, 'perOligoCounts'),
                           (perOligoMCI, 'perOligoMCI'),
                           (computePercentages, 'PercData')],
                          [plotSumPie, plotMCIPie, plotPercCorrelations],
                          'K562 New')
    analyseResultsPerPartition(k562_spec)

    dpi7_spec = _makeSpec([(computePieData, 'PieData')],
                          [plotBarSummaryPieIndels], 'DPI7')
    analyseResultsPerPartition(dpi7_spec)
예제 #11
0
def runAnalysis():
    """Run the combined I1 / most-common-indel analysis for 'K562 New'.

    Two result directories (I1 summaries and per-oligo indel pie summaries)
    are described with identical file-name conventions and loaded together
    by ``loadI1andMCIData``.
    """
    result_subdirs = ('/i1/i1_summaries',
                      '/indel_details/indel_pie_summaries_per_oligo')

    results_specs = []
    for subdir in result_subdirs:
        results_specs.append({
            'results_dir': getHighDataDir() + subdir,
            'dirname_to_result_fn': lambda x: '%s.txt' % x,
            'result_to_dirname_fn': lambda x: x.split('/')[-1][:-4],
        })

    all_results_funcs = [
        plotDominantBars,
        plotDominantPieDataWithAmbig,
        plotMergedPieDataWithAmbig,
        plotMergedI1Repeats,
    ]
    spec = dict(
        results_specs=results_specs,
        py_func_load=loadI1andMCIData,
        py_funcs_per_result=[(mergeWithIndelData, 'i1IndelData')],
        py_funcs_all_results=all_results_funcs,
        check_output_fn=lambda x: True,
        reads_colname='Total reads',
        min_reads=MIN_READS,
        id_colname='Oligo Id',
        partitions=['Non-Targeting'],
        samples=['K562 New'],
    )
    analyseResultsPerPartition(spec)
예제 #12
0
def loadProfilesSeparately(old_id, new_id):
    """Load indel profiles for an oligo pair, kept apart in two slots.

    Each sample directory goes into slot 0 when its path contains '800' and
    into slot 1 otherwise.

    Returns:
        ``(p_olds, p_news, old_sep_mr, new_sep_mr)``: two-element lists of
        profiles and of mutated read totals for the old and new oligo.
    """
    p_olds, p_news = [{}, {}], [{}, {}]
    old_sep_mr, new_sep_mr = [0, 0], [0, 0]
    old_file = getSummaryFileSuffix(old_id)
    new_file = getSummaryFileSuffix(new_id)

    # New-library samples are processed first, then old-library samples
    batches = (
        (new_dirs, new_file, p_news, new_sep_mr, new_id),
        (old_dirs, old_file, p_olds, old_sep_mr, old_id),
    )
    for sample_dirs, summary_file, profiles, mut_reads, oligo_id in batches:
        for sample_dir in [getHighDataDir() + '/' + x for x in sample_dirs]:
            slot = 1
            if '800' in sample_dir:
                slot = 0
            stats = readSummaryToProfile(sample_dir + '/mapped_reads/' +
                                         summary_file,
                                         profiles[slot],
                                         oligoid=oligo_id)
            accepted, _, null_reads = stats
            mut_reads[slot] += accepted - null_reads
    return p_olds, p_news, old_sep_mr, new_sep_mr
예제 #13
0
def loadLocationSpacerLookup():
    """Map each Overbeek id to its (location, spacer sequence, primer) tuple.

    Reads the tab-delimited ``overbeek_2016_guides_s1.txt`` from the high
    data directory. Keys are ``'Overbeek<N>'`` where N comes from the
    'Spacer ' column; the trailing space is part of that column name.

    Returns:
        dict of id -> (genomic location, spacer sequence, sgRNA primer).
    """
    # The context manager closes the file even if a row is malformed
    with io.open(getHighDataDir() + '/overbeek_2016_guides_s1.txt') as f:
        reader = csv.DictReader(f, delimiter='\t')
        # NOTE(review): eval() on file contents executes arbitrary code;
        # kept for compatibility, only safe while the file is trusted.
        return {
            'Overbeek%d' % eval(row['Spacer ']):
            (row['Genomic location of spacer (hg19)'], row['Spacer sequence'],
             row['sgRNA primer'])
            for row in reader
        }
예제 #14
0
def prepareExample(example_dir):
    """Prepare generated indels, read counts and features for an example.

    Sets the high data directory to ``example_dir``, runs the indel
    generator executable on the new and old target FASTA files, compiles
    per-sample reads for the generated indels, computes their features, and
    registers the reads and features directories.
    """
    setHighDataDir(example_dir)

    # Generate all possible indels
    new_gen_dir = getHighDataDir() + '/generated_indels_new'
    old_gen_dir = getHighDataDir() + '/generated_indels_old'
    for gen_dir in (new_gen_dir, old_gen_dir):
        if not os.path.isdir(gen_dir):
            os.makedirs(gen_dir)

    generator_jobs = (('/exp_target_pam_new.fasta ', new_gen_dir),
                      ('/exp_target_pam_old.fasta ', old_gen_dir))
    for fasta_part, gen_dir in generator_jobs:
        cmd = getIndelGenExe() + ' ' + getHighDataDir(
        ) + fasta_part + gen_dir + '/'
        print(cmd)
        os.system(cmd)

    # Compile number of reads per sample for each indel
    reads_dir = getHighDataDir() + '/reads_for_gen_indels'
    for gen_dir, sample_dirs in ((new_gen_dir, new_dirs),
                                 (old_gen_dir, old_dirs)):
        compileGenIndelReads(gen_indel_dir=gen_dir,
                             out_dir=reads_dir,
                             sample_dirs=sample_dirs)
    setReadsDir(reads_dir)

    # Compute features for each indel
    features_dir = getHighDataDir() + '/features_for_gen_indels'
    for gen_dir in (new_gen_dir, old_gen_dir):
        computeFeaturesForGenIndels(gen_indel_dir=gen_dir,
                                    out_dir=features_dir)
    setFeaturesDir(features_dir)
예제 #15
0
def loadMappings():
    """Load the Overbeek-to-oligo mapping file.

    Reads the tab-delimited ``overbeek_to_oligo_mapping.txt`` from the high
    data directory. In each row, the last whitespace-separated token of
    column 0 gives the Overbeek number, the part of column 1 before the
    first underscore gives the oligo id, and column 2 equals 'Old' for
    old-library oligos.

    Returns:
        dict of ``'Overbeek<N>'`` -> list of ``(oligo_id, is_old)`` tuples,
        in file order.
    """
    mappings = {}
    # The context manager closes the file even if a row is malformed
    with io.open(getHighDataDir() + '/overbeek_to_oligo_mapping.txt') as f:
        for toks in csv.reader(f, delimiter='\t'):
            overbeek_id = 'Overbeek' + toks[0].split()[-1]
            oligo_id = toks[1].split('_')[0]
            old = (toks[2] == 'Old')
            mappings.setdefault(overbeek_id, []).append((oligo_id, old))
    return mappings
예제 #16
0
def runAnalysis():
    """Plot the heat map for the 'Real Guides' partition on DPI7 samples."""
    detail_dir = getHighDataDir() + '/ST_June_2017/data'
    partitions = partitionGuides(oligo_detail_dir=detail_dir)

    partition_names = ['Real Guides']
    for part_desc in partition_names:
        # Look the selector up per partition
        dpi7_selector = getSampleSelectors()['DPI7']
        guides = partitions[part_desc]

        desc = part_desc + ' DPI7'
        merged = loadAllData(guides, sample_selector=dpi7_selector, label=desc)
        plotHeatMap(merged, label=desc)
예제 #17
0
def collectMhOfLen(filename, mh_len, fout):
    """Write all microhomology indels of length ``mh_len`` found in ``filename``.

    The input is tab-delimited. A row whose first field starts with '@@@'
    opens a new oligo in the form ``@@@<oligo id>:<accepted reads>:<accepted
    non-null reads>``. Every other row describes one microhomology (left,
    right, length, indel, ..., reads). For each oligo, the microhomologies
    of the requested length are written to ``fout``, each followed by the
    oligo's read totals and the details of its microhomology indel with the
    most reads (of any length).

    Args:
        filename: Path of the microhomology file to read.
        mh_len: Microhomology length to collect.
        fout: Writable text stream receiving the output lines.

    Raises:
        ValueError: If a microhomology row appears before any '@@@' header.
    """
    det = loadAllOligoDetails(oligo_detail_dir=getHighDataDir() +
                              '/ST_June_2017/data')
    oligo_details = {'Oligo' + x.split('_')[-1]: val for x, val in det.items()}

    no_best = (0, -1, -1, -1, '')
    indels_to_write = []
    oligo_id, target = None, None
    accpt_reads, accpt_nonnull_reads = 0, 0
    max_reads, len_mh_max_reads, left_max_reads, right_max_reads, max_indel = no_best

    # The input file used to be left open; close it deterministically
    with io.open(filename) as f:
        # Collect indels of the right length, write out with details of the
        # MH indel with max reads for that oligo
        for toks in csv.reader(f, delimiter='\t'):
            if not toks:
                continue  # tolerate blank lines

            # Next oligo (write out the previous one)
            if toks[0][:3] == '@@@':
                _writeMhIndelLines(fout, indels_to_write, accpt_reads,
                                   accpt_nonnull_reads, max_indel, max_reads,
                                   len_mh_max_reads, left_max_reads,
                                   right_max_reads)
                ctoks = toks[0][3:].split(':')
                oligo_id = ctoks[0]
                target = oligo_details[oligo_id]['Target']
                # NOTE(review): eval() on file contents executes arbitrary
                # code; kept for compatibility, only safe on trusted files.
                accpt_reads, accpt_nonnull_reads = eval(ctoks[1]), eval(
                    ctoks[2])
                max_reads, len_mh_max_reads, left_max_reads, right_max_reads, max_indel = no_best
                indels_to_write = []
                continue

            if target is None:
                raise ValueError(
                    'Microhomology row found before any @@@ oligo header in %s'
                    % filename)

            # MH details: collect MHs of the correct length, and track the
            # MH indel with max reads
            left, right, c_mh_len, indel, reads = eval(toks[0]), eval(
                toks[1]), eval(toks[2]), toks[3], eval(toks[-1])
            l_mh_seq = target[left:left + c_mh_len]
            r_mh_seq = target[right:right + c_mh_len]
            assert (l_mh_seq == r_mh_seq)
            # An empty slice used to raise ZeroDivisionError; report 0% GC
            gc_content = (sum([x in ['G', 'C'] for x in l_mh_seq]) * 100.0 /
                          len(l_mh_seq)) if l_mh_seq else 0.0
            if reads > max_reads:
                max_reads, len_mh_max_reads, left_max_reads, right_max_reads, max_indel = reads, c_mh_len, left, right, indel
            if c_mh_len != mh_len: continue
            indels_to_write.append(
                u'%s\t%s\t%d\t%d\t%d\t%.1f' %
                (oligo_id, indel, reads, left, right, gc_content))

    # Write last oligo (if needed)
    _writeMhIndelLines(fout, indels_to_write, accpt_reads, accpt_nonnull_reads,
                       max_indel, max_reads, len_mh_max_reads, left_max_reads,
                       right_max_reads)


def _writeMhIndelLines(fout, indel_lines, accpt_reads, accpt_nonnull_reads,
                       max_indel, max_reads, len_mh_max_reads, left_max_reads,
                       right_max_reads):
    """Write each collected indel line followed by its oligo-level summary."""
    if len(indel_lines) == 0:
        return
    oligo_line = u'%d\t%d\t%s\t%d\t%d\t%d\t%d' % (
        accpt_reads, accpt_nonnull_reads, max_indel, max_reads,
        len_mh_max_reads, left_max_reads, right_max_reads)
    for indel_line in indel_lines:
        fout.write(u'%s\t%s\n' % (indel_line, oligo_line))
def runAnalysis():
    """Run the microhomology-mismatch analysis for 'K562 New' samples."""
    results_dir = (getHighDataDir() +
                   '/microhomology_mismatch/mh_mismatch_indel_frequencies')
    spec = dict(
        results_dir=results_dir,
        # Result files are named '<dirname>.txt'
        dirname_to_result_fn=lambda x: '%s.txt' % x,
        result_to_dirname_fn=lambda x: x.split('/')[-1][:-4],
        py_func_load=loadData,
        py_funcs_per_result=[(passData, 'Data')],
        py_funcs_all_results=[plotMicrohomologyMismatches],
        check_output_fn=lambda x: True,
        reads_colname='Orig Non-Null Reads',
        min_reads=MIN_READS,
        id_colname='Oligo ID',
        partitions=['Non-Targeting'],
        samples=['K562 New'],
    )
    analyseResultsPerPartition(spec)
예제 #19
0
def compileGenIndelReads(gen_indel_dir='generated_indels',
                         out_dir='reads_for_gen_indels_all_samples',
                         sample_dirs=None):
    """Compile, per generated indel, its read count in every sample.

    For each file in ``gen_indel_dir`` the oligo id is taken from the file
    name (text before the first underscore). The oligo's indel summary is
    read from every sample directory, and a table
    ``<oligo id>_gen_indel_reads.txt`` is written under ``out_dir`` with one
    row per generated indel and one read-count column per sample.

    Args:
        gen_indel_dir: Directory holding the generated indel files.
        out_dir: Output directory; created (with parents) if missing.
        sample_dirs: Sample directory names relative to the high data
            directory; defaults to no samples.
    """
    # None replaces the former mutable default list
    sample_dirs = [] if sample_dirs is None else sample_dirs

    # makedirs also handles a nested out_dir, which os.mkdir could not
    if not os.path.isdir(out_dir): os.makedirs(out_dir)

    for gen_file in os.listdir(gen_indel_dir):

        oligo_id = gen_file.split('_')[0]
        oligo_idx = getOligoIdxFromId(oligo_id)
        oligo_subdir, sum_filename = getFileForOligoIdx(
            oligo_idx, ext='_mappedindelsummary.txt')

        out_subdir = out_dir + '/' + oligo_subdir
        if not os.path.isdir(out_subdir): os.makedirs(out_subdir)

        # Read all profiles for this oligo
        profiles, mut_read_totals = [], []
        for dirname in sample_dirs:
            profiles.append({})
            filename = getHighDataDir(
            ) + '/' + dirname + '/mapped_reads/' + oligo_subdir + '/' + sum_filename
            stats = readSummaryToProfile(filename,
                                         profiles[-1],
                                         oligoid=oligo_id)
            # Mutated reads = first stats entry minus the third
            mut_read_totals.append('%d' % (stats[0] - stats[2]))

        # Compile reads for each indel across all samples; both handles are
        # closed even if a row raises
        out_path = out_subdir + '/%s_gen_indel_reads.txt' % oligo_id
        with io.open(gen_indel_dir + '/' + gen_file) as f, \
                io.open(out_path, 'w') as fout:
            fout.write(f.readline())  # Git commit
            fout.write(u'Indel\tDetails\t%s\n' %
                       '\t'.join([getDirLabel(x) for x in sample_dirs]))
            fout.write(u'All Mutated\t[]\t%s\n' % '\t'.join(mut_read_totals))
            for toks in csv.reader(f, delimiter='\t'):
                indel, indel_details = toks[0], toks[2]
                read_str = '\t'.join(
                    ['%d' % p1.get(indel, 0) for p1 in profiles])
                fout.write(u'%s\t%s\t%s\n' % (indel, indel_details, read_str))
예제 #20
0
def computeFeaturesForGenIndels(gen_indel_dir='generated_indels',
                                out_dir='features_for_gen_indels'):
    """Compute feature files for every generated indel file.

    For each file in ``gen_indel_dir`` the oligo's target sequence and cut
    site are derived from the oligo details (reverse-complemented and
    mirrored for 'REVERSE' PAM direction) and passed to
    ``calculateFeaturesForGenIndelFile``, which writes
    ``<oligo id>_gen_indel_features.txt`` under ``out_dir``.
    """
    if not os.path.isdir(out_dir):
        os.mkdir(out_dir)

    # Load oligo details, keyed by oligo id without underscores
    raw_details = loadAllOligoDetails(oligo_detail_dir=getHighDataDir() +
                                      '/ST_June_2017/data')
    oligo_details = {}
    for (detail_id, detail_row) in raw_details.items():
        oligo_details[detail_id.replace('_', '')] = detail_row

    for gen_file in os.listdir(gen_indel_dir):
        print(gen_file)

        oligo_id = gen_file.split('_')[0]
        oligo_subdir, _ = getFileForOligoIdx(getOligoIdxFromId(oligo_id),
                                             ext='')

        out_subdir = out_dir + '/' + oligo_subdir
        if not os.path.isdir(out_subdir):
            os.mkdir(out_subdir)

        row = oligo_details[oligo_id]

        if row['PAM Direction'] != 'REVERSE':
            uncut_seq = row['Target']
            cut_site = eval(row['PAM Location']) - 3
        else:
            uncut_seq = Bio.Seq.reverse_complement(row['Target'])
            cut_site = 79 - eval(row['PAM Location']) - 3

        calculateFeaturesForGenIndelFile(
            gen_indel_dir + '/' + gen_file,
            uncut_seq,
            cut_site,
            out_subdir + '/%s_gen_indel_features.txt' % oligo_id,
            is_reverse=(row['PAM Direction'] == 'REVERSE'))
예제 #21
0
def runAnalysis():
    """Box-plot KL columns from ``old_new_kl_summaries.txt`` by library type.

    KL columns (excluding 'Class KL' and 'Old v Old' ones) are grouped via
    ``renameCol`` into 'Within Library' and 'Between Library' boxes, drawn
    side by side, and saved as 'kl_compare_old_new_KL'.
    """
    summary_path = getHighDataDir() + '/old_new_kl_summaries.txt'
    data = pd.read_csv(summary_path, sep='\t').fillna(-1.0)

    kl_cols = []
    for col in data.columns:
        if 'KL' not in col:
            continue
        if 'Class KL' in col or 'Old v Old' in col:
            continue
        kl_cols.append(col)

    max_kl = 9
    PL.figure(figsize=(2.5, 4))
    box_types = [('C2', 'Within Library'), ('C0', 'Between Library')]
    bps = []
    for offset, (clr, box_type) in enumerate(box_types):
        col_box_data = []
        for col in kl_cols:
            if renameCol(col) == box_type:
                col_box_data.append(data[col])
        # Interleave the two box types: within at 1, 3, 5; between at 2, 4, 6
        pos = [2 * k + offset + 1 for k in range(len(col_box_data))]
        print('KL', box_type, np.median(col_box_data, axis=1))
        box = PL.boxplot(col_box_data,
                         positions=pos,
                         patch_artist=True,
                         boxprops=dict(facecolor=clr),
                         showfliers=False)
        bps.append(box)

    PL.xticks([1.5, 3.5, 5.5],
              ['Same\ngRNA', 'Other\ngRNA', 'Other\ngRNA\n(Rpt)'])
    # Vertical separators between the three gRNA groups
    for sep_x in (2.5, 4.5):
        PL.plot([sep_x, sep_x], [0, max_kl], '-', color='silver')
    PL.xlim((0.5, 6.5))
    PL.ylim((0, max_kl))
    PL.ylabel('KL')
    PL.subplots_adjust(left=0.1, right=0.95, top=0.95, bottom=0.25)
    legend_handles = [bp["boxes"][0] for bp in bps]
    legend_names = [x[1] for x in box_types]
    PL.legend(legend_handles, legend_names, loc='upper left')
    PL.show(block=False)
    saveFig('kl_compare_old_new_KL')
예제 #22
0
# Configure environment
#----------------------------------------------------------------------
setRunLocal(True)
setHighDataDir('/results/endogenous_processing_example/')
setPythonCmd('python')
setIndelMapExe('/usr/local/bin/indelmap')

#----------------------------------------------------------------
# Processing of raw Van-Overbeek et al reads to produce descriptions of indels
#----------------------------------------------------------------
# Note: this demonstrates the flow on a single oligo, from raw Overbeek reads
#       to indel descriptions. The sam files are assumed to have been
#       collected already (see collect_overbeek_sams.py in the same dir).

printStatus('Create fastq files from Van Overbeek sam files')
# Treated reads first, then the control reads; same target region for both
for _sam_subdir, _fastq_subdir in (
        ('/overbeek_sam_files', '/overbeek_fastq_files'),
        ('/overbeek_control_sam_files', '/overbeek_control_fastq_files')):
    sam_dir = getHighDataDir() + _sam_subdir
    fastq_dir = getHighDataDir() + _fastq_subdir
    if not os.path.isdir(fastq_dir):
        os.makedirs(fastq_dir)
    extractReads(sam_dir + '/Overbeek_6.sam', fastq_dir + '/Overbeek6.fastq',
                 'chrX:66765045-66765067', 'Overbeek6')

printStatus('Compute mutational profile from Van Overbeek data')
createOverbeekTemplates(selected_id='Overbeek6')
computeOverbeekIndelProfiles(highdir=getHighDataDir(), selected_id='Overbeek6')
예제 #23
0
def loadValidationPairs():
    """Return the [old oligo id, new oligo id] pairs used for validation.

    Reads the tab-delimited ``old_new_validation_guides.txt`` from the high
    data directory and returns one two-element list per row, built from the
    'Old Oligo Id' and 'New Oligo Id' columns.
    """
    # The context manager closes the file even if a row lacks a column
    with io.open(getHighDataDir() + '/old_new_validation_guides.txt') as f:
        return [[row['Old Oligo Id'], row['New Oligo Id']]
                for row in csv.DictReader(f, delimiter='\t')]
def plotMicrohomologyMismatches(all_result_outputs, label=''):
    """Plot read percentages of mismatched versus exact microhomology deletions.

    Merges the per-sample results, keeps oligos whose original indel has a
    non-zero microhomology size and whose guide is matched, computes the
    percentage of non-null reads for the original and the mutated indels,
    and draws four scatter comparisons into a 2x2 figure. The third
    comparison ('Orig Indel' vs 'All Mut') is also drawn alone in a second
    figure. Figures are saved as 'mm_mismatch_all' and 'mm_mismatch_one'.

    Args:
        all_result_outputs: Per-sample results passed to ``mergeSamples``.
        label: Not used in this function body.
    """
    
    mut_hdrs =  ['Left Mut', 'Right Mut','Merged Mut1', 'Merged Mut2']
    cols_to_sum = [x + ' Indel Reads in Mut' for x in mut_hdrs] + ['Orig Indel Reads in Orig', 'Mut Non-Null Reads', 'Orig Non-Null Reads'] 
    common_cols = ['Oligo ID','Mapped Oligo Id','Num Mismatches','Orig MH','Left Mut-MH','Right Mut-MH','Merged Mut 1 MH','Merged Mut 2 MH','Orig Indel','Left Mut-MH Indel','Right Mut-MH Indel','Merge Mut 1 Indel','Merge Mut 2 Indel']
    # The code below reads '<col> Sum' columns for every entry of cols_to_sum;
    # presumably mergeSamples adds that suffix when summing across samples - TODO confirm
    data =  mergeSamples(all_result_outputs, cols_to_sum, merge_on=common_cols)

    # Accessors for the 'L', 'R' and 'C' entries of the third element returned by tokFullIndel
    getLeft = lambda indel: tokFullIndel(indel)[2]['L']
    getRight = lambda indel: tokFullIndel(indel)[2]['R']
    getMHSize = lambda indel: tokFullIndel(indel)[2]['C']

    oligo_data = pd.read_csv(getHighDataDir() + '/ST_June_2017/data/self_target_oligos_details_with_pam_details.csv', sep='\t')
    oligo_data['Guide is matched'] = oligo_data.apply(isMatched, axis=1)
    # Oligo ID -> True when its PAM direction is 'REVERSE'
    reverse_lookup = {x: y == 'REVERSE' for (x,y) in zip(oligo_data['ID'],oligo_data['PAM Direction'])}
    is_reverse = lambda x: reverse_lookup[x]

    data = pd.merge(data, oligo_data[['ID','Guide is matched']], left_on='Oligo ID', right_on='ID', how='inner')

    # Keep only oligos with a non-zero microhomology size and a matched guide
    data['MH Size'] = data['Orig Indel'].apply(getMHSize)
    data = data.loc[(data['MH Size'] != 0) & (data['Guide is matched'])]
    data['MH Left Loc'] = data['Orig Indel'].apply(getLeft) + data['MH Size']
    data['MH Right Loc'] = data['Orig Indel'].apply(getRight) - data['MH Size']
    data['Is Reverse'] = data['Oligo ID'].apply(is_reverse)

    # For reverse oligos the left and right read counts are swapped; each pair
    # (Left/Right, Merged Mut1/Merged Mut2) is handled the same way
    for hdrL,hdrR in [mut_hdrs[:2], mut_hdrs[2:]]:
        data[hdrL + ' Reads'] = data['Is Reverse']*data[hdrR + ' Indel Reads in Mut Sum'] + (1- data['Is Reverse'])*data[hdrL + ' Indel Reads in Mut Sum']
        data[hdrR + ' Reads'] = data['Is Reverse']*data[hdrL + ' Indel Reads in Mut Sum'] + (1- data['Is Reverse'])*data[hdrR + ' Indel Reads in Mut Sum']
        data[hdrL + ' Reads Ratio'] =  data[hdrL + ' Reads']*100.0/data['Mut Non-Null Reads Sum']
        data[hdrR + ' Reads Ratio'] =  data[hdrR + ' Reads']*100.0/data['Mut Non-Null Reads Sum']
    # Ratios are percentages of the non-null reads of the respective oligo
    data['Orig Indel Reads Ratio'] = data['Orig Indel Reads in Orig Sum']*100.0/data['Orig Non-Null Reads Sum']
    data['All Mut Reads Ratio'] = (data[[x + ' Reads' for x in mut_hdrs]].sum(axis=1))*100.0/data['Mut Non-Null Reads Sum']
    data['MH Dist'] = data['MH Right Loc'] - data['MH Left Loc']
    data['1st Mismatch'] = data.apply(getMismatch, axis=1)
    data['Last Mismatch'] = data.apply(getLastMismatch, axis=1)
    data['MH GC Content'] = data.apply(getMhGC, axis=1)

    # (x, y) pairs of '<name> Reads Ratio' columns to compare, one subplot each
    mh_indel_types = [('Orig Indel','Left Mut'), ('Orig Indel','Right Mut'), ('Orig Indel','All Mut'),('Left Mut','Right Mut') ]
 
    label_lookup = {'Orig Indel': 'Perc. mutated reads of corresponding microhomology-\nmediated deletion with no sequence mismatches',
                    'Left Mut': 'Perc. mutated reads of mismatched microhomology-\nmediated deletion with retained left sequence',
                    'Right Mut': 'Perc mutated reads of mismatched microhomology-\nmediated deletion with retained right sequence',
                    'All Mut': 'Perc mutated reads of mismatched microhomology-\nmediated deletion (All)'
        }

    fig1 = PL.figure(figsize=(4,4))
    fig_all = PL.figure(figsize=(10,10))
    for i, (mh_typex, mh_typey) in enumerate(mh_indel_types):
        # The third comparison (i == 2) is drawn in the 2x2 figure and again alone in fig1
        figs = [(fig_all, True), (fig1,False)] if i==2 else [(fig_all, True)]
        for fig, is_all in figs:
            PL.figure(fig.number)
            if is_all: PL.subplot(2,2,i+1)
            # One series per number of mismatches (1 and 2)
            for nm,clr  in zip([1,2],['royalblue','orange']):
                nm_data = data.loc[data['Num Mismatches'] == nm]

                sty, lsty = 'o', '-'
                # Only microhomology sizes from 6 to 15 (inclusive) are plotted
                sel_data = nm_data.loc[(nm_data['MH Size'] >= 6) & (nm_data['MH Size'] <= 15)]

                PL.plot(sel_data[mh_typex + ' Reads Ratio'], sel_data[mh_typey + ' Reads Ratio'], sty, color=clr, markersize=4, label='No. MH Mismatches=%d' % (nm))
                rx, ry, grad = getRegrLine(sel_data[[mh_typex + ' Reads Ratio']], sel_data[[mh_typey + ' Reads Ratio']])
                if not is_all: print(grad, nm, mh_typex, mh_typey)
                # No regression line is drawn for the last comparison (Left Mut vs Right Mut)
                if i != 3: PL.plot(rx, ry, lsty, color=clr, linewidth=2)

            PL.xlabel(label_lookup[mh_typex])
            PL.ylabel(label_lookup[mh_typey])
            PL.xlim((0,80))
            PL.ylim((0,80))
            PL.plot([0,80],[0,80],'k--')
            PL.legend()
            PL.show(block=False)
    # fig_all is the active figure after the loop; presumably saveFig saves the active figure - TODO confirm
    saveFig('mm_mismatch_all')
    PL.figure(fig1.number)
    saveFig('mm_mismatch_one')
예제 #25
0
def runAnalysis():
    """Plot the KL box plots and in-frame correlation from the summary table.

    Reads the tab-separated ``old_new_kl_predicted_summaries.txt`` file from
    the high-level data directory, replaces missing values with -1.0, and
    hands the resulting table to ``plotKLBoxes`` and ``plotInFrameCorr``.
    """
    summary_path = getHighDataDir() + '/old_new_kl_predicted_summaries.txt'
    summaries = pd.read_csv(summary_path, sep='\t')
    # Missing entries are encoded as -1.0 before plotting
    summaries = summaries.fillna(-1.0)
    plotKLBoxes(summaries)
    plotInFrameCorr(summaries)
예제 #26
0
def computeAndComparePredicted(theta_file,
                               selected_id=None,
                               out_dir='.',
                               start_count=0,
                               end_count=10000):
    """Write a per-guide-pair summary comparing measured and predicted profiles.

    For every (old, new) oligo pair returned by ``loadValidationPairs()`` the
    measured old, new and combined profiles, the separately loaded per-sample
    profiles and the model-predicted profile are obtained, and one
    tab-separated row of mutated read counts, in-frame percentages and
    symmetric KL divergences is appended to
    ``<out_dir>/old_new_kl_predicted_summaries.txt`` (the file is overwritten
    at the start of each call).

    Args:
        theta_file: Path passed to ``readTheta`` to obtain the model
            parameters, the training set ids and the feature columns.
        selected_id: When not None, only the pair whose *old* oligo id equals
            this value is processed, and its profiles are also plotted via
            ``plotProfiles``.
        out_dir: Directory in which the summary file is written.
        start_count: Unused; retained so existing callers keep working.
        end_count: Unused; retained so existing callers keep working.

    Raises:
        Exception: If either id of a validation pair is part of the training
            set read from ``theta_file``. This check runs before the
            ``selected_id`` filter, so it applies to every pair visited.
    """
    theta, train_set, feature_columns = readTheta(theta_file)

    new_sep_labels = 'New 2x800x', 'New 1600x'
    old_sep_labels = 'Old 2x800x', 'Old 1600x'
    sep_names = [x.split('/')[-1] for x in new_sep_labels + old_sep_labels]

    #Note: here old refers to conventional scaffold library, new refers to improved scaffold library
    # The file name carries no format fields: the previous code applied
    # '% (start_count, end_count)' to it, which raised TypeError on every call.
    summary_path = out_dir + '/old_new_kl_predicted_summaries.txt'

    # Context manager guarantees the file is closed even if the training-set
    # check below raises part-way through.
    with io.open(summary_path, 'w') as fout:
        #Header line
        fout.write(
            u'Old Oligo Id\tNew Oligo Id\tOld Mut Reads\tNew Mut Reads\tCombined Mut Reads\t'
        )
        fout.write(u'\t'.join('%s Mut Reads' % x for x in sep_names))
        fout.write(
            u'\tOld In Frame Perc\tNew In Frame Perc\tCombined in Frame Perc\tPredicted In Frame Per\t'
        )
        fout.write(u'\t'.join('%s In Frame Perc' % x for x in sep_names))
        fout.write(
            u'\tOld v New KL\tOld v Predicted KL\tNew v Predicted KL\tCombined v Predicted KL\t'
        )
        fout.write(u'\t'.join('%s vs Predicted KL' % x
                              for x in sep_names) + '\t')
        fout.write(u'\t'.join([
            '%s vs %s KL' % (x.split('/')[-1], y.split('/')[-1])
            for x, y in (getCombs(new_sep_labels) + getCombs(old_sep_labels))
        ]) + '\n')

        id_pairs = loadValidationPairs()
        for (old_id, new_id) in id_pairs:
            if old_id in train_set or new_id in train_set:
                raise Exception('Bad!!! Testing on Training data: %s %s' %
                                (old_id, new_id))

            if selected_id is not None and selected_id != old_id:
                continue  #Guide pair selected for plotting

            #Load Old and new profiles, and produce combined profile from the two
            p_old, p_new, mut_reads_old, mut_reads_new = loadProfilePair(
                old_id, new_id)
            p_comb, mut_reads_comb = combineProfiles(p_old, p_new,
                                                     mut_reads_old,
                                                     mut_reads_new)

            #Predict the profile (old and new will be the same so just do one)
            feature_data = loadOligoFeaturesAndReadCounts(new_id, [])
            p_predict, _ = computePredictedProfile(feature_data, theta,
                                                   feature_columns)

            #Load separate profiles too
            p_old_sep, p_new_sep, old_sep_mr, new_sep_mr = loadProfilesSeparately(
                old_id, new_id)

            #Compute in frame percentages (-1 marks a separate profile with
            #at most one entry, for which no percentage is computed)
            old_if_perc = getInFramePerc(p_old)
            new_if_perc = getInFramePerc(p_new)
            comb_if_perc = getInFramePerc(p_comb)
            pred_if_perc = getInFramePerc(p_predict)
            new_sep_if_percs = [
                getInFramePerc(profile) if len(profile) > 1 else -1
                for profile in p_new_sep
            ]
            old_sep_if_percs = [
                getInFramePerc(profile) if len(profile) > 1 else -1
                for profile in p_old_sep
            ]

            #Plot the comparison
            if selected_id is not None:
                rrds = loadRepReads(new_id)
                plotProfiles([p_new_sep[0], p_new_sep[1], p_predict],
                             [rrds, rrds, rrds], [56, 56, 56],
                             [False, False, False],
                             ['Replicate 1', 'Replicate 2', 'Predicted'],
                             title='%s (KL=%.2f, KL=%.2f)' %
                             (new_id, symmetricKL(p_new_sep[0], p_new_sep[1]),
                              symmetricKL(p_new, p_predict)))

            #Assemble the row: KL columns, in-frame columns, read-count columns
            str_args = (symmetricKL(p_old, p_new),
                        symmetricKL(p_old, p_predict),
                        symmetricKL(p_new, p_predict),
                        symmetricKL(p_comb, p_predict))
            kl_str = u'\t%.5f\t%.5f\t%.5f\t%.5f\t' % str_args
            kl_str += u'\t'.join([
                '%.5f' % symmetricKL(p_predict, x)
                for x in p_new_sep + p_old_sep
            ])
            kl_str += u'\t' + u'\t'.join([
                '%.5f' % symmetricKL(x, y)
                for (x, y) in (getCombs(p_new_sep) + getCombs(p_old_sep))
            ])
            if_str = u'\t'.join(
                ['%.3f' % x for x in new_sep_if_percs + old_sep_if_percs])
            mut_str = u'\t'.join(['%d' % x for x in new_sep_mr + old_sep_mr])
            fout.write(
                u'%s\t%s\t%d\t%d\t%d\t%s\t%.3f\t%.3f\t%.3f\t%.3f\t%s%s\n' %
                (old_id, new_id, mut_reads_old, mut_reads_new, mut_reads_comb,
                 mut_str, old_if_perc, new_if_perc, comb_if_perc,
                 pred_if_perc, if_str, kl_str))
            #Flush per row so partial results survive an interrupted run
            fout.flush()
예제 #27
0
def compareOverbeekProfiles(
        selected_overbeek_id=None,
        pred_results_dir='../indel_prediction/model_testing'):
    """Compare endogenous (Overbeek) indel profiles with our synthetic ones.

    For each id ``Overbeek1`` .. ``Overbeek96`` that is present in
    ``loadMappings()``, the endogenous profile is read from the Overbeek
    summary file and the profiles of the mapped oligos are accumulated over
    the K562 sample directories listed below: pooled over all samples, split
    into old (conventional scaffold) and new (improved scaffold) samples, and
    per DPI7 replicate. The symmetric KL divergence between the endogenous
    and synthetic profiles is recorded for each id.

    Args:
        selected_overbeek_id: When not None, only this id is processed and
            its replicate profiles are plotted with ``plotProfiles``; the
            summary figures are skipped. When None, all ids are processed and
            the in-frame comparison, the KL-vs-reads scatter plot
            ('scatter_KL') and the pairwise KL heat map ('heatmap_KL') are
            produced.
        pred_results_dir: Passed through unchanged to ``plotInFrame``.
    """
    new_dirs = [
        'ST_June_2017/data/K562_800x_LV7A_DPI7/mapped_reads/Oligos_71',
        'ST_June_2017/data/K562_800x_LV7A_DPI10/mapped_reads/Oligos_71',
        'ST_June_2017/data/K562_800x_LV7B_DPI7/mapped_reads/Oligos_71',
        'ST_June_2017/data/K562_800x_LV7B_DPI10/mapped_reads/Oligos_71',
        'ST_June_2017/data/K562_1600x_LV7B_DPI5/mapped_reads/Oligos_71',
        'ST_Feb_2018/data/CAS9_12NA_1600X_DPI7/mapped_reads/Oligos_71'
    ]

    #Old Samples
    old_dirs = [
        'ST_June_2017/data/K562_1600x_6OA_DPI5/mapped_reads/Oligos_71',
        'ST_June_2017/data/K562_1600x_6OA_DPI7/mapped_reads/Oligos_71',
        'ST_April_2017/data/K562_800x_6OA_DPI3_Old7/mapped_reads/Oligos_71',
        'ST_April_2017/data/K562_800x_6OA_DPI7_Old8/mapped_reads/Oligos_71',
        'ST_April_2017/data/K562_800x_6OA_DPI10_Old9/mapped_reads/Oligos_71',
        'ST_April_2017/data/K562_800x_6OB_DPI3_Old10/mapped_reads/Oligos_71',
        'ST_April_2017/data/K562_800x_6OB_DPI7_Old11/mapped_reads/Oligos_71',
        'ST_April_2017/data/K562_800x_6OB_DPI10_Old12/mapped_reads/Oligos_71'
    ]
    remove_long_indels = False
    remove_wt, wt_thresh = True, 3.0
    mappings = loadMappings()

    #Profiles/ids kept for the summary plots. The old/new lists are collected
    #here but are not consumed further down in this function.
    all_overbeek_profiles, all_our_profiles, sel_overbeek_ids = [], [], []
    all_new_profiles, all_old_profiles = [], []
    oldnew_overbeek_ids, old_ids, new_ids = [], [], []

    overbeek_inframes, ours_inframes, oof_sel_overbeek_ids = [], [], []

    #Per-id statistics, all appended in lock-step
    kls, kls_old, kls_new = [], [], []
    log_reads, log_reads_new, log_reads_old, min_log_reads = [], [], [], []
    overbeek_ids, above30_percentages = [], []

    for idx in range(1, 97):

        overbeek_id = 'Overbeek%d' % idx
        if selected_overbeek_id is not None and selected_overbeek_id != overbeek_id:
            continue
        if overbeek_id not in mappings:
            continue

        overbeek_filename = getHighDataDir(
        ) + '/overbeek_fastq_files/' + overbeek_id + '_mappedindelsummary.txt'

        p1, p1_new, p1_old, o1, rep_reads1, rep_reads2 = {}, {}, {}, {}, {}, {}
        nreads2, nreads1, nreads_old, nreads_new, nnull_old, nnull_new, nnull1, nnull2 = 0, 0, 0, 0, 0, 0, 0, 0

        #Read the overbeek profile
        numread2, perc_accept2, num_null2 = readSummaryToProfile(
            overbeek_filename,
            o1,
            oligoid=overbeek_id,
            remove_long_indels=remove_long_indels,
            remove_wt=False)
        if selected_overbeek_id is not None:
            #Representative reads and PAM details are only needed for plotting
            fetchRepresentativeCleanReads(
                getHighDataDir() + '/overbeek_fastq_files/' + overbeek_id +
                '_mappedindelprofiles.txt',
                rep_reads2,
                oligoid=overbeek_id)
            pam_loc2, pam_dir2 = getNullTargetPamDetails(
                getHighDataDir() + '/overbeek_control_fastq_files/' +
                overbeek_id + '_exptargets.txt',
                oligoid=overbeek_id)
        nreads2 += numread2
        nnull2 += num_null2

        if numread2 == 0: continue

        #Per-replicate profiles and representative reads (index 0 / 1)
        p1_new_reps, p1_old_reps = [{}, {}], [{}, {}]
        rr_new_reps, rr_old_reps = [{}, {}], [{}, {}]
        #Read all the new and old profiles
        pam_loc1, pam_dir1 = None, None
        for oligo_id, is_old in mappings[overbeek_id]:

            #Read all reads for all our K562 profiles
            #Characters after the first five of the oligo id are its numeric
            #index; int() replaces the previous eval() on this string.
            oligo_idx = int(oligo_id[5:])
            _, oligo_fileprefix = getFileForOligoIdx(oligo_idx, ext='')
            oligo_filename = oligo_fileprefix + '_mappedindelsummary.txt'
            read_filename = oligo_fileprefix + '_mappedindelprofiles.txt'
            exptarget_filename = oligo_fileprefix + '_exptargets.txt'
            if is_old:
                oligo_dirs, p1_old_new, null_oligo_dir = old_dirs, p1_old, 'ST_April_2017/data/NULL_Old/mapped_reads/Oligos_71'
                p1_reps, rr_reps = p1_old_reps, rr_old_reps
            else:
                oligo_dirs, p1_old_new, null_oligo_dir = new_dirs, p1_new, 'ST_April_2017/data/NULL_New/mapped_reads/Oligos_71'
                p1_reps, rr_reps = p1_new_reps, rr_new_reps

            for oligo_dir in [getHighDataDir() + '/' + x for x in oligo_dirs]:
                summary_file = oligo_dir + '/' + oligo_filename
                #Same summary file is read into the old-or-new profile ...
                nr1, pa1, nn1 = readSummaryToProfile(
                    summary_file,
                    p1_old_new,
                    oligoid=oligo_id,
                    remove_long_indels=remove_long_indels,
                    remove_wt=remove_wt,
                    wt_thresh=wt_thresh)
                #... into the pooled profile ...
                numread1, perc_accept1, num_null1 = readSummaryToProfile(
                    summary_file,
                    p1,
                    oligoid=oligo_id,
                    remove_long_indels=remove_long_indels,
                    remove_wt=remove_wt,
                    wt_thresh=wt_thresh)
                #... and, for DPI7 samples, into a replicate profile
                #(index 0 when the directory name contains '800x', else 1)
                if 'DPI7' in oligo_dir:
                    rep_idx = 0 if '800x' in oligo_dir else 1
                    nr_rep, pa_rep, nn_rep = readSummaryToProfile(
                        summary_file,
                        p1_reps[rep_idx],
                        oligoid=oligo_id,
                        remove_long_indels=remove_long_indels,
                        remove_wt=remove_wt,
                        wt_thresh=wt_thresh)
                if selected_overbeek_id is not None:
                    fetchRepresentativeCleanReads(oligo_dir + '/' +
                                                  read_filename,
                                                  rep_reads1,
                                                  oligoid=oligo_id)
                    if 'DPI7' in oligo_dir:
                        fetchRepresentativeCleanReads(oligo_dir + '/' +
                                                      read_filename,
                                                      rr_reps[rep_idx],
                                                      oligoid=oligo_id)
                    #PAM details are looked up once, from the first oligo seen
                    if pam_loc1 is None:
                        pam_loc1, pam_dir1 = getNullTargetPamDetails(
                            getHighDataDir() + '/' + null_oligo_dir + '/' +
                            exptarget_filename,
                            oligoid=oligo_id)
                if is_old:
                    nreads_old += numread1
                    nnull_old += num_null1
                else:
                    nreads_new += numread1
                    nnull_new += num_null1
                nreads1 += numread1
                nnull1 += num_null1

        kls.append(symmetricKL(p1, o1, True))
        kls_old.append(symmetricKL(p1_old, o1, True))
        kls_new.append(symmetricKL(p1_new, o1, True))

        #Log10 of (reads - null reads); +0.5 keeps the log finite at zero
        log_reads.append(np.log10(nreads1 - nnull1 + 0.5))
        log_reads_old.append(np.log10(nreads_old - nnull_old + 0.5))
        log_reads_new.append(np.log10(nreads_new - nnull_new + 0.5))
        min_log_reads.append(min(log_reads_old[-1], log_reads_new[-1]))
        above30_percentages.append(computePercAbove30(o1))
        overbeek_ids.append(overbeek_id)

        if log_reads[-1] > 2.0:
            all_overbeek_profiles.append(o1)
            all_our_profiles.append(p1)
            #Drop the 'Overbeek' prefix to get a short tick label
            sel_overbeek_ids.append(overbeek_id[8:])
            if above30_percentages[-1] < 50.0:
                oif, oof, _ = fetchIndelSizeCounts(o1)
                pif, pof, _ = fetchIndelSizeCounts(p1)
                overbeek_inframes.append(oif * 100.0 / (oif + oof))
                ours_inframes.append(pif * 100.0 / (pif + pof))
                oof_sel_overbeek_ids.append(overbeek_id)

        if min_log_reads[-1] > 2.0:
            all_new_profiles.append(p1_new)
            all_old_profiles.append(p1_old)
            oldnew_overbeek_ids.append(overbeek_id)
            old_ids.append(
                [oid for oid, is_old in mappings[overbeek_id] if is_old][0])
            new_ids.append(
                [oid for oid, is_old in mappings[overbeek_id]
                 if not is_old][0])

        #overbeek_id is known to be in mappings here (checked at loop top)
        print(overbeek_id, [x for (x, y) in mappings[overbeek_id]], kls[-1],
              nreads2, nreads1)

        if selected_overbeek_id is not None:
            title = '%s (KL=%.1f)' % (overbeek_id, kls[-1])
            labels = [
                'Conventional scaffold Rep A', 'Conventional scaffold  Rep B',
                'Improved scaffold Rep A', 'Improved scaffold  Rep B',
                'Endogenous Profile'
            ]
            #Fourth panel is new replicate B: use p1_new_reps[1] so that it
            #matches its label and its representative reads (rr_new_reps[1]).
            plotProfiles([
                p1_old_reps[0], p1_old_reps[1], p1_new_reps[0], p1_new_reps[1],
                o1
            ], [
                rr_old_reps[0], rr_old_reps[1], rr_new_reps[0], rr_new_reps[1],
                rep_reads2
            ], [pam_loc1, pam_loc1, pam_loc1, pam_loc1, pam_loc2], [
                x == 'REVERSE'
                for x in [pam_dir1, pam_dir1, pam_dir1, pam_dir1, pam_dir2]
            ],
                         labels,
                         title=title)

    if selected_overbeek_id is None:

        plotInFrame(overbeek_inframes, ours_inframes, oof_sel_overbeek_ids,
                    pred_results_dir)

        #Scatter of KL vs log reads, one series per bin of the
        #computePercAbove30 value of the endogenous profile
        PL.figure(figsize=(5.5, 5))
        for thr_l, thr_h in [(0.0, 10.0), (10.0, 20.0), (20.0, 50.0),
                             (50.0, 90.0), (90.0, 100.0)]:
            in_bin = [(reads, kl, ob_id)
                      for (kl, a30, ob_id,
                           reads) in zip(kls, above30_percentages,
                                         overbeek_ids, log_reads)
                      if a30 > thr_l and a30 <= thr_h]
            xdata = [reads for (reads, _, _) in in_bin]
            ydata = [kl for (_, kl, _) in in_bin]
            PL.plot(xdata,
                    ydata,
                    'o',
                    label='%d-%d%% Deletions > 30' % (thr_l, thr_h))
            #Label the points with high KL despite a reasonable read count
            for x, y, ob_id in in_bin:
                if y > 3 and x > 2:
                    PL.text(x, y, ob_id)
        PL.legend()
        PL.plot([0, 6], [0.77, 0.77], '--', color='grey')
        PL.text(0.1, 0.5, 'Median between our replicates', color='grey')
        PL.ylabel('Symmetric KL Divergence', fontsize=12)
        PL.xlabel('Log10 Mutated Reads', fontsize=12)
        PL.xlim((0, 5.5))
        PL.ylim((0, 8))
        PL.show(block=False)
        saveFig('scatter_KL')

        print('Median=', np.median(kls), 'Mean KL=', np.mean(kls))
        print(len(overbeek_ids))

        #Compute pairwise KL between overbeek and ours
        N = len(sel_overbeek_ids)
        kl_mat = np.zeros((N, N))
        for row, endo_profile in enumerate(all_overbeek_profiles):
            for col, our_profile in enumerate(all_our_profiles):
                kl_mat[row, col] = symmetricKL(endo_profile, our_profile)
        PL.figure(figsize=(8, 6))
        PL.imshow(kl_mat,
                  cmap='hot_r',
                  vmin=0.0,
                  vmax=3.0,
                  interpolation='nearest')
        PL.xticks(range(N), sel_overbeek_ids, rotation='vertical', fontsize=6)
        PL.yticks(range(N),
                  sel_overbeek_ids,
                  rotation='horizontal',
                  fontsize=6)
        PL.xlabel('Synthetic Measurement', fontsize=12)
        PL.ylabel('Endogenous Measurement', fontsize=12)
        PL.title('KL', fontsize=12)
        PL.colorbar()
        PL.show(block=False)
        saveFig('heatmap_KL')
예제 #28
0
def plotD1(all_result_outputs, label=''):
    """Plot how single-nucleotide deletions (D1) arise as most common indel.

    Draws two figures and saves them through ``saveFig``:

    * 'pie_chart_D1': among gRNAs whose most common indel is identical in
      all three merged samples and has 'MCI Type' == 'D1', the share whose
      altered sequence is a repeat of A, T, G or C spanning the cut site,
      versus all remaining ones.
    * 'D1_bar_3_rep': percentage of gRNAs with a D1 most common indel,
      grouped by the two guide nucleotides at positions -4 and -3
      (AA, TT, GG, CC, or two differing nucleotides).

    Args:
        all_result_outputs: Passed to ``mergeSamples`` to obtain the merged
            per-oligo most-common-indel table.
        label: Not used by this function.
    """
    merged = mergeSamples(all_result_outputs, [], data_label='perOligoMCI')

    # A gRNA counts only if all three samples agree on the most common indel
    agrees_with_2 = merged['Most Common Indel'] == merged['Most Common Indel 2']
    agrees_with_3 = merged['Most Common Indel'] == merged['Most Common Indel 3']
    merged['Equal MCI'] = agrees_with_2 & agrees_with_3
    agreed = merged.loc[merged['Equal MCI']]
    #Note: type check discards equally most common indels
    d1_rows = agreed.loc[(agreed['MCI Type'] == 'D1')]
    total_d1 = len(d1_rows)

    def spans_cutsite(indel):
        bounds = tokFullIndel(indel)[2]
        return bounds['L'] < -1 and bounds['R'] > 0

    def is_non_repeat(seq):
        return len(seq) < 2 or seq != (seq[0] * len(seq))

    alt_seqs = d1_rows['Altered Sequence']
    crosses_cut = d1_rows['Most Common Indel'].apply(spans_cutsite)

    # ---- Pie chart: which nucleotide repeat the deletion came from ----
    pie_vals, pie_labels = [], []
    for nt in 'ATGC':

        def is_nt_repeat(alt_seq):
            return len(alt_seq) >= 2 and alt_seq == (nt * len(alt_seq))

        repeat_rows = d1_rows.loc[alt_seqs.apply(is_nt_repeat) & crosses_cut]
        num_repeat_nt = len(repeat_rows)
        pie_vals.append(num_repeat_nt * 100.0 / total_d1)
        print(num_repeat_nt)
        pie_labels.append('Removal of %s\nfrom %s|%s' % (nt, nt, nt))

    other_rows = d1_rows.loc[alt_seqs.apply(is_non_repeat) | ~crosses_cut]
    num_non_repeat = len(other_rows)
    pie_vals.append(num_non_repeat * 100.0 / total_d1)
    print(num_non_repeat)
    pie_labels.append('Removal from non-repeat')

    PL.figure(figsize=(4, 4))
    PL.pie(pie_vals,
           labels=pie_labels,
           autopct='%.1f',
           labeldistance=1.1,
           counterclock=False,
           colors=OLD_COLORS)
    pie_title = (
        'Size 1 deletions that are\n"most common" for their gRNA in all 3 replicates\n(%d gRNAs from %d total)'
        % (total_d1, len(merged)))
    PL.title(pie_title)
    PL.show(block=False)
    saveFig('pie_chart_D1')

    # ---- Attach guide sequences to the merged table ----
    details_path = (
        getHighDataDir() +
        '/ST_June_2017/data/self_target_oligos_details_with_pam_details.csv')
    oligo_data = pd.read_csv(details_path, sep='\t')
    # Ids in the details file contain underscores; the merged table's do not
    oligo_data['Oligo Id'] = oligo_data['ID'].apply(
        lambda raw_id: raw_id.replace('_', ''))
    with_guides = pd.merge(merged,
                           oligo_data[['Oligo Id', 'Guide']],
                           how='inner',
                           on='Oligo Id')
    print(len(with_guides))

    # ---- Bar chart: D1 frequency by nucleotides at guide[-4], guide[-3] ----
    def is_d1(indel):
        return indel.split('_')[0] == 'D1'

    nts = 'ATGC'
    bar_percs, count_texts = [], []
    for nt in nts:
        pair = nt + nt
        has_pair = with_guides.apply(
            lambda row: row['Guide'][-4:-2] == pair, axis=1)
        pair_rows = with_guides.loc[has_pair]
        #Oligo28137: Corner case where a guide has CT|T and loses the C
        d1_mask = (pair_rows['Most Common Indel'].apply(is_d1)
                   & pair_rows['Equal MCI']
                   & (pair_rows['Oligo Id'] != 'Oligo28137'))
        num_pair_d1 = sum(d1_mask)
        bar_percs.append(num_pair_d1 * 100.0 / len(pair_rows))
        count_texts.append('%d/%d' % (num_pair_d1, len(pair_rows)))
        print(len(pair_rows))

    differs = with_guides.apply(
        lambda row: row['Guide'][-4] != row['Guide'][-3], axis=1)
    other_guides = with_guides.loc[differs]
    print(len(other_guides))
    num_other_d1 = sum(other_guides['Most Common Indel'].apply(is_d1)
                       & other_guides['Equal MCI'])
    bar_percs.append(num_other_d1 * 100.0 / len(other_guides))
    count_texts.append('%d/%d' % (num_other_d1, len(other_guides)))

    PL.figure()
    PL.bar(range(5), bar_percs, width=0.8)
    for pos, text in enumerate(count_texts):
        PL.text(pos - 0.3, bar_percs[pos] + 5.0, text)
    PL.xticks(range(5), [nt * 2 for nt in nts] + ['Other'])
    PL.ylim((0, 40))
    PL.xlabel('Nucleotides on either side of cut site')
    PL.ylabel(
        'Percent gRNAs with single nucleotide deletion\nas most common indel in all 3 replicates'
    )
    PL.show(block=False)
    saveFig('D1_bar_3_rep')
예제 #29
0
    compileGenIndelReads(gen_indel_dir=old_gen_dir,
                         out_dir=reads_dir,
                         sample_dirs=old_dirs)
    setReadsDir(reads_dir)

    #Compute features for each indel
    features_dir = getHighDataDir() + '/features_for_gen_indels'
    computeFeaturesForGenIndels(gen_indel_dir=new_gen_dir,
                                out_dir=features_dir)
    computeFeaturesForGenIndels(gen_indel_dir=old_gen_dir,
                                out_dir=features_dir)
    setFeaturesDir(features_dir)


if __name__ == '__main__':

    # Configure the indel generator executable and plot output settings
    setIndelGenExe('/usr/local/bin/indelgen')
    setPlotDir('/results/plots')
    setFigType('png')

    # Copy the example data into the results area and prepare it there
    example_src = '/data/predicted_vs_measured_example'
    example_dst = '/results/predicted_vs_measured_example'
    shutil.copytree(example_src, example_dst)
    prepareExample(example_dst)

    #Predict mutations using pre-trained model and compare to actual (for one oligo only)
    theta_name = 'model_output_10000_0.01000000_0.01000000_-0.607_theta.txt_cf0.txt'
    theta_file = getHighDataDir() + '/' + theta_name
    computeAndComparePredicted(theta_file,
                               selected_id='Oligo35785',
                               out_dir='.')
예제 #30
0
def plotD2(all_result_outputs, label=''):

    #Merge replicates
    mci_merged_data = mergeSamples(all_result_outputs, [],
                                   data_label='perOligoMCI')
    mci_merged_data['Equal MCI'] = (
        mci_merged_data['Most Common Indel']
        == mci_merged_data['Most Common Indel 2']) & (
            mci_merged_data['Most Common Indel']
            == mci_merged_data['Most Common Indel 3'])

    oligo_data = pd.read_csv(
        getHighDataDir() +
        '/ST_June_2017/data/self_target_oligos_details_with_pam_details.csv',
        sep='\t')
    remove_under = lambda x: x.replace('_', '')
    oligo_data['Oligo Id'] = oligo_data['ID'].apply(remove_under)
    mci_merged_data_guides = pd.merge(mci_merged_data,
                                      oligo_data[['Oligo Id', 'Guide']],
                                      how='inner',
                                      on='Oligo Id')
    mci_common = mci_merged_data_guides.loc[mci_merged_data['Equal MCI']]
    dmci_data = mci_common.loc[(
        mci_common['MCI Type'] == 'D2'
    )]  #Note: type check discards equally most common indels

    pie_vals, pie_labels = [], []
    is_left_rpt = lambda row: row['Guide'][-5] == row['Guide'][
        -3] and tokFullIndel(row['Most Common Indel'])[2][
            'R'] >= 1 and tokFullIndel(row['Most Common Indel'])[2]['L'] <= -3
    is_right_rpt = lambda row: row['Guide'][-4] == row['Guide'][
        -2] and tokFullIndel(row['Most Common Indel'])[2][
            'R'] >= 2 and tokFullIndel(row['Most Common Indel'])[2]['L'] <= -2
    is_left_only_rpt = lambda row: is_left_rpt(row) and not is_right_rpt(row)
    is_right_only_rpt = lambda row: is_right_rpt(row) and not is_left_rpt(row)
    is_both_rpt = lambda row: is_right_rpt(row) and is_left_rpt(row)

    lrpt_data = dmci_data.loc[dmci_data.apply(is_left_only_rpt, axis=1)]
    pie_labels.append('Y|XY->Y')
    pie_vals.append(len(lrpt_data))
    rrpt_data = dmci_data.loc[dmci_data.apply(is_right_only_rpt, axis=1)]
    pie_labels.append('XY|X->X')
    pie_vals.append(len(rrpt_data))
    rpt_data = dmci_data.loc[dmci_data.apply(is_both_rpt, axis=1)]
    pie_labels.append('XY|XY->XY')
    pie_vals.append(len(rpt_data))

    is_r0 = lambda row: tokFullIndel(row['Most Common Indel'])[2]['R'] == 0
    ro_data = dmci_data.loc[dmci_data.apply(is_r0, axis=1)]
    pie_labels.append('Z|XY->Z')
    pie_vals.append(len(ro_data))
    is_l1 = lambda row: tokFullIndel(row['Most Common Indel'])[2]['L'] == -1
    l1_data = dmci_data.loc[dmci_data.apply(is_l1, axis=1)]
    pie_labels.append('XY|Z->Z')
    pie_vals.append(len(l1_data))

    seen_ids = set(rpt_data['Oligo Id']).union(set(ro_data['Oligo Id'])).union(
        set(l1_data['Oligo Id'])).union(set(lrpt_data['Oligo Id'])).union(
            set(rrpt_data['Oligo Id']))
    is_unseen = lambda id: id not in seen_ids
    unseen_data = dmci_data.loc[dmci_data['Oligo Id'].apply(is_unseen)]
    print(unseen_data)
    assert (len(unseen_data) == 0)
    #pie_labels.append('Other')
    #pie_vals.append(len(unseen_data))

    #pie_labels = [x for x in dmci_data['Most Common Indel'].unique()]
    #pie_vals  = [len(dmci_data.loc[dmci_data['Most Common Indel']==indel]) for indel in pie_labels]
    PL.figure(figsize=(4, 4))
    PL.pie(pie_vals,
           labels=pie_labels,
           autopct='%.1f',
           labeldistance=1.1,
           counterclock=False,
           colors=COLORS)
    PL.title(
        'Size 2 deletions that are\n"most common" for their gRNA in all 3 replicates\n(%d gRNAs from %d total)'
        % (len(dmci_data), len(mci_merged_data)))
    PL.show(block=False)
    saveFig('pie_chart_D2_indel_cats')

    PL.figure(figsize=(12, 8))

    #XY|XY->XY
    PL.subplot(2, 3, 1)
    pie_vals, pie_labels = [], []
    for mh_str in [y + x for x in 'ATGC' for y in 'ATGC']:
        pie_labels.append(mh_str)
        is_mh_str = lambda guide: guide[-5:-3] == mh_str
        pie_vals.append(len(rpt_data.loc[rpt_data['Guide'].apply(is_mh_str)]))
    for dnt, cnt in zip(pie_labels, pie_vals):
        print(dnt, cnt * 100 / sum(pie_vals))
    PL.pie(pie_vals,
           labels=pie_labels,
           autopct='%.1f',
           labeldistance=1.1,
           counterclock=False,
           colors=COLORS)
    PL.title('XY|XY->XY\n(%d gRNAs)' % len(rpt_data))
    PL.show(block=False)

    #__|
    PL.subplot(2, 3, 2)
    pie_vals, pie_labels = [], []
    for mh_str in [y + x for x in 'ATGC' for y in 'ATGC']:
        pie_labels.append(mh_str)
        is_mh_str = lambda guide: guide[-5:-3] == mh_str
        pie_vals.append(len(ro_data.loc[ro_data['Guide'].apply(is_mh_str)]))
    for dnt, cnt in zip(pie_labels, pie_vals):
        print(dnt, cnt * 100 / sum(pie_vals))
    PL.pie(pie_vals,
           labels=pie_labels,
           autopct='%.1f',
           labeldistance=1.1,
           counterclock=False,
           colors=COLORS)
    PL.title('XY| -> __|\n(%d gRNAs)' % len(ro_data))
    PL.show(block=False)

    #|__
    PL.subplot(2, 3, 3)
    pie_vals, pie_labels = [], []
    for mh_str in [y + x for x in 'ATGC' for y in 'ATGC']:
        pie_labels.append(mh_str)
        is_mh_str = lambda guide: guide[-3:-1] == mh_str
        pie_vals.append(len(l1_data.loc[l1_data['Guide'].apply(is_mh_str)]))
    for dnt, cnt in zip(pie_labels, pie_vals):
        print(dnt, cnt * 100 / sum(pie_vals))
    PL.pie(pie_vals,
           labels=pie_labels,
           autopct='%.1f',
           labeldistance=1.1,
           counterclock=False,
           colors=COLORS)
    PL.title('|XY -> |__\n(%d gRNAs)' % len(l1_data))
    PL.show(block=False)

    #XY|X->X
    PL.subplot(2, 3, 4)
    pie_vals, pie_labels = [], []
    for nt in 'ATGC':
        pie_labels.append('%sN|%s -> %s' % (nt, nt, nt))
        is_mh_str = lambda guide: guide[-5] == nt
        pie_vals.append(len(
            lrpt_data.loc[lrpt_data['Guide'].apply(is_mh_str)]))
    PL.pie(pie_vals,
           labels=pie_labels,
           autopct='%.1f',
           labeldistance=1.1,
           counterclock=False,
           colors=COLORS)
    PL.title('XY|X->X\n(%d gRNAs)' % len(lrpt_data))
    PL.show(block=False)

    #X|YX->X
    PL.subplot(2, 3, 5)
    pie_vals, pie_labels = [], []
    for nt in 'ATGC':
        pie_labels.append('%s|N%s -> %s' % (nt, nt, nt))
        is_mh_str = lambda guide: guide[-4] == nt
        pie_vals.append(len(
            rrpt_data.loc[rrpt_data['Guide'].apply(is_mh_str)]))
    PL.pie(pie_vals,
           labels=pie_labels,
           autopct='%.1f',
           labeldistance=1.1,
           counterclock=False,
           colors=COLORS)
    PL.title('X|YX->X\n(%d gRNAs)' % len(rrpt_data))
    PL.show(block=False)
    PL.subplots_adjust(left=0.05,
                       right=0.95,
                       top=0.9,
                       bottom=0.1,
                       hspace=0.3,
                       wspace=0.3)
    saveFig('D2_nts_per_cat')

    PL.figure(figsize=(12, 8))

    #XY|XY->XY
    PL.subplot(2, 3, 1)
    bar_heights, bar_labels, d2_dnt_counts, dnt_counts = [], [], [], []
    for dnt in [y + x for x in 'ATGC' for y in 'ATGC']:
        has_dnt = lambda guide: guide[-5:-3] == dnt and guide[-3:-1] == dnt
        dnt_data = mci_merged_data_guides.loc[
            mci_merged_data_guides['Guide'].apply(has_dnt)]
        dnt_counts.append(
            len(
                set(rpt_data['Oligo Id']).intersection(
                    set(dnt_data['Oligo Id']))))
        d2_dnt_counts.append(len(dnt_data))
        bar_heights.append(dnt_counts[-1] * 100.0 /
                           d2_dnt_counts[-1] if d2_dnt_counts[-1] > 0 else 0)
        bar_labels.append(dnt)
        print(
            dnt, dnt_counts[-1] * 100.0 /
            d2_dnt_counts[-1] if d2_dnt_counts[-1] > 0 else 0)
    PL.bar(range(len(bar_labels)), bar_heights, width=0.8)
    for i, (hgt, d2cnt,
            cnt) in enumerate(zip(bar_heights, d2_dnt_counts, dnt_counts)):
        PL.text(i - 0.3, hgt + 15, '%d/%d' % (cnt, d2cnt), rotation='vertical')
    PL.xticks(range(len(bar_labels)), bar_labels, rotation='vertical')
    PL.ylim((0, 90))
    PL.xlabel('XY')
    PL.title('XY|XY->XY')
    PL.ylabel(
        'Percent gRNAs with XY|XY->XY deletion\nas most common indel in all 3 replicates'
    )
    PL.show(block=False)

    #__|
    PL.subplot(2, 3, 2)
    bar_heights, bar_labels, d2_dnt_counts, dnt_counts = [], [], [], []
    for dnt in [y + x for x in 'ATGC' for y in 'ATGC']:
        has_dnt = lambda guide: guide[-5:-3] == dnt
        dnt_data = mci_merged_data_guides.loc[
            mci_merged_data_guides['Guide'].apply(has_dnt)]
        dnt_counts.append(
            len(
                set(ro_data['Oligo Id']).intersection(set(
                    dnt_data['Oligo Id']))))
        d2_dnt_counts.append(len(dnt_data))
        bar_heights.append(dnt_counts[-1] * 100.0 /
                           d2_dnt_counts[-1] if d2_dnt_counts[-1] > 0 else 0)
        bar_labels.append(dnt)
        print(
            dnt, dnt_counts[-1] * 100.0 /
            d2_dnt_counts[-1] if d2_dnt_counts[-1] > 0 else 0)
    PL.bar(range(len(bar_labels)), bar_heights, width=0.8)
    for i, (hgt, d2cnt,
            cnt) in enumerate(zip(bar_heights, d2_dnt_counts, dnt_counts)):
        PL.text(i - 0.3,
                hgt + 1.5,
                '%d/%d' % (cnt, d2cnt),
                rotation='vertical')
    PL.xticks(range(len(bar_labels)), bar_labels, rotation='vertical')
    PL.ylim((0, 8))
    PL.xlabel('XY')
    PL.title('XY| -> __|')
    PL.ylabel(
        'Percent gRNAs with XY| -> __| deletion\nas most common indel in all 3 replicates'
    )
    PL.show(block=False)

    #|__
    PL.subplot(2, 3, 3)
    bar_heights, bar_labels, d2_dnt_counts, dnt_counts = [], [], [], []
    for dnt in [y + x for x in 'ATGC' for y in 'ATGC']:
        has_dnt = lambda guide: guide[-3:-1] == dnt
        dnt_data = mci_merged_data_guides.loc[
            mci_merged_data_guides['Guide'].apply(has_dnt)]
        dnt_counts.append(
            len(
                set(l1_data['Oligo Id']).intersection(set(
                    dnt_data['Oligo Id']))))
        d2_dnt_counts.append(len(dnt_data))
        bar_heights.append(dnt_counts[-1] * 100.0 /
                           d2_dnt_counts[-1] if d2_dnt_counts[-1] > 0 else 0)
        bar_labels.append(dnt)
        print(
            dnt, dnt_counts[-1] * 100.0 /
            d2_dnt_counts[-1] if d2_dnt_counts[-1] > 0 else 0)
    PL.bar(range(len(bar_labels)), bar_heights, width=0.8)
    for i, (hgt, d2cnt,
            cnt) in enumerate(zip(bar_heights, d2_dnt_counts, dnt_counts)):
        PL.text(i - 0.3,
                hgt + 1.5,
                '%d/%d' % (cnt, d2cnt),
                rotation='vertical')
    PL.xticks(range(len(bar_labels)), bar_labels, rotation='vertical')
    PL.ylim((0, 8))
    PL.xlabel('XY')
    PL.title('|XY -> |__')
    PL.ylabel(
        'Percent gRNAs with |XY -> |__ deletion\nas most common indel in all 3 replicates'
    )
    PL.show(block=False)

    #XY|X->X
    PL.subplot(2, 3, 4)
    bar_heights, bar_labels, d2_nt_counts, nt_counts = [], [], [], []
    for nt in 'ATGC':
        has_nt = lambda guide: guide[-3] == nt and guide[-5] == nt
        nt_data = mci_merged_data_guides.loc[
            mci_merged_data_guides['Guide'].apply(has_nt)]
        nt_counts.append(
            len(
                set(lrpt_data['Oligo Id']).intersection(
                    set(nt_data['Oligo Id']))))
        d2_nt_counts.append(len(nt_data))
        bar_heights.append(nt_counts[-1] * 100.0 /
                           d2_nt_counts[-1] if d2_nt_counts[-1] > 0 else 0)
        bar_labels.append(nt)
    PL.bar(range(len(bar_labels)), bar_heights, width=0.8)
    for i, (hgt, d2cnt,
            cnt) in enumerate(zip(bar_heights, d2_nt_counts, nt_counts)):
        PL.text(i - 0.3, hgt + 0.05, '%d/%d' % (cnt, d2cnt))
    PL.xticks(range(len(bar_labels)), bar_labels)
    PL.ylim((0, 5))
    PL.xlabel('X')
    PL.title('XY|X->X')
    PL.ylabel(
        'Percent gRNAs with XY|X->X deletion\nas most common indel in all 3 replicates'
    )
    PL.show(block=False)

    #X|YX->X
    PL.subplot(2, 3, 5)
    bar_heights, bar_labels, d2_nt_counts, nt_counts = [], [], [], []
    for nt in 'ATGC':
        has_nt = lambda guide: guide[-4] == nt and guide[-2] == nt
        nt_data = mci_merged_data_guides.loc[
            mci_merged_data_guides['Guide'].apply(has_nt)]
        nt_counts.append(
            len(
                set(rrpt_data['Oligo Id']).intersection(
                    set(nt_data['Oligo Id']))))
        d2_nt_counts.append(len(nt_data))
        bar_heights.append(nt_counts[-1] * 100.0 /
                           d2_nt_counts[-1] if d2_nt_counts[-1] > 0 else 0)
        bar_labels.append(nt)
    PL.bar(range(len(bar_labels)), bar_heights, width=0.8)
    for i, (hgt, d2cnt,
            cnt) in enumerate(zip(bar_heights, d2_nt_counts, nt_counts)):
        PL.text(i - 0.3, hgt + 0.05, '%d/%d' % (cnt, d2cnt))
    PL.xticks(range(len(bar_labels)), bar_labels)
    PL.ylim((0, 5))
    PL.xlabel('X')
    PL.title('X|YX->X')
    PL.ylabel(
        'Percent gRNAs with X|YX->X deletion\nas most common indel in all 3 replicates'
    )
    PL.show(block=False)

    PL.subplots_adjust(left=0.05,
                       right=0.95,
                       top=0.9,
                       bottom=0.1,
                       hspace=0.3,
                       wspace=0.3)
    saveFig('D2_nts_per_cat_bars')