def fetchMhMismatchFrequencies(dirname, outdir='mh_mismatch_indel_frequencies'):
    """Tabulate read counts of microhomology-mismatch indels for one sample.

    For every row of ``<high data dir>/mh_mismatch_indels.txt`` the indel
    summaries of the mutated oligo (column ``Oligo ID``) and of the oligo it
    maps to (column ``Mapped Oligo Id``) are loaded from
    ``dirname/mapped_reads``. The read counts of the five indels named in the
    row are then looked up in both profiles. One line per input row is written
    to ``outdir/<dir label>.txt`` containing: the original columns, the five
    counts in the mutated profile, the five counts in the original profile
    and, for each oligo, ``stats[0] - stats[2]`` of the tuple returned by
    ``readSummaryToProfile`` (written under the "Non-Null Reads" headers).

    Args:
        dirname: Sample directory containing a ``mapped_reads`` sub-directory.
        outdir: Directory that receives the output table; created if missing.

    Raises:
        Exception: If ``isOldLib(dirname)`` is true.
    """
    if not os.path.isdir(outdir):
        os.makedirs(outdir)
    if isOldLib(dirname):
        raise Exception('Old Lib not supported')

    mh_exp_indels_file = getHighDataDir() + '/mh_mismatch_indels.txt'
    out_file = outdir + '/' + getDirLabel(dirname) + '.txt'

    # Header: the five indel kinds counted in the mutated profile, then the
    # same five counted in the original profile.
    hdr_str = '\t'.join([
        '\t'.join([
            x + ' Indel Reads in ' + y for x in
            ['Orig', 'Left Mut', 'Right Mut', 'Merged Mut1', 'Merged Mut2']
        ]) for y in ['Mut', 'Orig']
    ])
    indel_cols = [
        'Orig Indel', 'Left Mut-MH Indel', 'Right Mut-MH Indel',
        'Merge Mut 1 Indel', 'Merge Mut 2 Indel'
    ]

    def _readsFor(indel, profile):
        # An empty indel field never counts, even if '' is a profile key.
        return 0 if indel == '' else profile.get(indel, 0)

    # Context managers guarantee both handles are closed even when a row
    # fails part-way through (missing summary file, unexpected column, ...).
    with io.open(mh_exp_indels_file) as f, io.open(out_file, 'w') as fout:
        rdr = csv.DictReader(f, delimiter='\t')
        fout.write(u'%s\t%s\tMut Non-Null Reads\tOrig Non-Null Reads\n' %
                   ('\t'.join(rdr.fieldnames), hdr_str))
        for row in rdr:
            # Load indel profiles for both the original and mutated
            # microhomology forms (underscores are stripped from the IDs).
            mut_oligo_id = row['Oligo ID'].replace('_', '')
            orig_oligo_id = row['Mapped Oligo Id'].replace('_', '')
            mut_filepath, mut_filename = getFileForOligoIdx(
                getOligoIdxFromId(mut_oligo_id),
                ext='_mappedindelsummary.txt')
            orig_filepath, orig_filename = getFileForOligoIdx(
                getOligoIdxFromId(orig_oligo_id),
                ext='_mappedindelsummary.txt')

            p_mut, p_orig = {}, {}
            stats_mut = readSummaryToProfile(
                dirname + '/mapped_reads/' + mut_filepath + '/' + mut_filename,
                p_mut, oligoid=mut_oligo_id)
            stats_orig = readSummaryToProfile(
                dirname + '/mapped_reads/' + orig_filepath + '/' + orig_filename,
                p_orig, oligoid=orig_oligo_id)

            indels = [row[col] for col in indel_cols]
            mut_read_str = '\t'.join(
                ['%d' % _readsFor(indel, p_mut) for indel in indels])
            orig_read_str = '\t'.join(
                ['%d' % _readsFor(indel, p_orig) for indel in indels])

            str_args = ('\t'.join([row[col] for col in rdr.fieldnames]),
                        mut_read_str, orig_read_str,
                        stats_mut[0] - stats_mut[2],
                        stats_orig[0] - stats_orig[2])
            fout.write(u'%s\t%s\t%s\t%d\t%d\n' % str_args)
def loadOligoFeaturesAndReadCounts(oligo_id, sample_names):
    """Load generated-indel features for an oligo, optionally with read fractions.

    The feature table is read from
    ``FEATURES_DIR/<subdir>/<oligo_id>_gen_indel_features.txt``. When
    ``sample_names`` is non-empty, the per-sample read counts in
    ``READS_DIR/<subdir>/<oligo_id>_gen_indel_reads.txt`` are summed across
    those sample columns (plus 0.5 per indel), the ``'All Mutated'`` row is
    dropped, and each indel's share of the remaining total is inner-joined
    onto the features as ``'Frac Sample Reads'``.

    Args:
        oligo_id: Oligo identifier used to locate both files.
        sample_names: Column names of the samples to sum; may be empty.

    Returns:
        A DataFrame of features with an ``'Indel'`` column, plus
        ``'Frac Sample Reads'`` when ``sample_names`` is non-empty.

    Raises:
        Exception: If the summed reads over all remaining indels equal zero.
    """
    subdir, _ = getFileForOligoIdx(getOligoIdxFromId(oligo_id), ext='')
    features_file = (FEATURES_DIR + '/' + subdir +
                     '/%s_gen_indel_features.txt' % oligo_id)
    reads_file = (READS_DIR + '/' + subdir +
                  '/%s_gen_indel_reads.txt' % oligo_id)

    # The cut site is looked up but its value is not used below.
    cut_site = getCutSite(features_file)
    features, feature_cols = readFeaturesData(features_file)

    # Without samples there is nothing to merge: expose the index as 'Indel'.
    if len(sample_names) == 0:
        features['Indel'] = features.index
        return features

    # skiprows=1 skips the first line of the reads file before the header.
    reads = pd.read_csv(reads_file, skiprows=1, sep='\t')
    reads['Sum Sample Reads'] = reads[sample_names].sum(axis=1) + 0.5
    is_indel_row = reads['Indel'] != 'All Mutated'
    reads = reads.loc[is_indel_row]

    total_reads = reads['Sum Sample Reads'].sum()
    if total_reads == 0:
        raise Exception('No Mutated Reads in %s' % reads_file)
    reads['Frac Sample Reads'] = reads['Sum Sample Reads'] / total_reads

    fractions = reads[['Indel', 'Frac Sample Reads']]
    return pd.merge(features,
                    fractions,
                    left_index=True,
                    right_on='Indel',
                    how='inner')
def createDirectories(lookup, output_dir):
    """Create ``output_dir`` and one sub-directory per oligo file directory.

    Args:
        lookup: Mapping whose values are ``(oligo_id, oligo_idx)`` pairs.
        output_dir: Root directory; created (with parents) if it is missing.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    for key in lookup:
        _, oligo_idx = lookup[key]
        filedir, _ = getFileForOligoIdx(oligo_idx)
        subdir = output_dir + '/' + filedir
        # Test the target path directly instead of re-listing output_dir on
        # every iteration; this also behaves correctly when filedir has more
        # than one path component.
        if not os.path.isdir(subdir):
            os.makedirs(subdir)
def loadRepReads(new_id):
    """Return the representative clean reads for ``new_id``.

    The reads are taken from the oligo's ``_mappedindelprofiles.txt`` file
    under the first entry of the module-level ``new_dirs``.

    Args:
        new_id: Oligo identifier.

    Returns:
        The dict populated by ``fetchRepresentativeCleanReads``.
    """
    subdir, profilefilename = getFileForOligoIdx(
        getOligoIdxFromId(new_id), ext='_mappedindelprofiles.txt')
    sample_root = getHighDataDir() + '/' + new_dirs[0]
    profile_file = sample_root + '/mapped_reads/' + subdir + '/' + profilefilename

    collected = {}
    fetchRepresentativeCleanReads(profile_file, collected, oligoid=new_id)
    return collected
def compileGenIndelReads(gen_indel_dir='generated_indels',
                         out_dir='reads_for_gen_indels_all_samples',
                         sample_dirs=None):
    """Compile per-sample read counts for every generated indel of every oligo.

    For each file in ``gen_indel_dir`` (the oligo ID is the part of the file
    name before the first ``_``), the oligo's indel summary is read from every
    sample directory and a table is written to
    ``out_dir/<oligo subdir>/<oligo_id>_gen_indel_reads.txt``. The table holds:

    * the first line of the generated-indel file, copied unchanged;
    * a header ``Indel, Details, <one label per sample>``;
    * an ``All Mutated`` row holding ``stats[0] - stats[2]`` per sample, where
      ``stats`` is the value returned by ``readSummaryToProfile``;
    * one row per generated indel (columns 0 and 2 of the input row) with the
      read count of that indel in each sample (0 when absent).

    Args:
        gen_indel_dir: Directory of generated-indel files.
        out_dir: Output root directory; created if missing.
        sample_dirs: Sample directories, relative to the high data dir.
            ``None`` (the default) means no samples.
    """
    # Avoid a shared mutable default argument.
    if sample_dirs is None:
        sample_dirs = []
    if not os.path.isdir(out_dir):
        os.mkdir(out_dir)

    for gen_file in os.listdir(gen_indel_dir):
        oligo_id = gen_file.split('_')[0]
        oligo_idx = getOligoIdxFromId(oligo_id)
        oligo_subdir, sum_filename = getFileForOligoIdx(
            oligo_idx, ext='_mappedindelsummary.txt')
        out_subdir = out_dir + '/' + oligo_subdir
        if not os.path.isdir(out_subdir):
            os.mkdir(out_subdir)

        # Read all profiles for this oligo
        profiles, mut_read_totals = [], []
        for dirname in sample_dirs:
            profile = {}
            filename = (getHighDataDir() + '/' + dirname + '/mapped_reads/' +
                        oligo_subdir + '/' + sum_filename)
            stats = readSummaryToProfile(filename, profile, oligoid=oligo_id)
            profiles.append(profile)
            mut_read_totals.append('%d' % (stats[0] - stats[2]))

        # Compile reads for each indel across all samples. The with-block
        # closes both files even if a row is malformed.
        in_path = gen_indel_dir + '/' + gen_file
        out_path = out_subdir + '/%s_gen_indel_reads.txt' % oligo_id
        with io.open(in_path) as f, io.open(out_path, 'w') as fout:
            fout.write(f.readline())  # Git commit
            fout.write(u'Indel\tDetails\t%s\n' %
                       '\t'.join([getDirLabel(x) for x in sample_dirs]))
            fout.write(u'All Mutated\t[]\t%s\n' % '\t'.join(mut_read_totals))
            for toks in csv.reader(f, delimiter='\t'):
                if not toks:
                    # csv.reader yields [] for blank lines (e.g. a trailing
                    # newline); there is nothing to record for those.
                    continue
                indel, indel_details = toks[0], toks[2]
                read_str = '\t'.join(
                    ['%d' % p1.get(indel, 0) for p1 in profiles])
                fout.write(u'%s\t%s\t%s\n' % (indel, indel_details, read_str))
def recordProfiles(output_dir, theta, guideset, feature_columns):
    """Write the predicted indel summary of every oligo in ``guideset``.

    For each oligo, ``computePredictedProfile`` is evaluated and the file
    ``output_dir/<subdir>/<oligo_id>_mappedindelsummary_predicted.txt`` is
    written. It holds a ``@@@<oligo_id>`` header line followed by one
    ``<indel>\\t-\\t<count>`` line for every entry whose count is at least 1.

    Args:
        output_dir: Root output directory. Rank 0 creates it; other ranks
            poll every 5 seconds until it exists.
        theta: Passed through to ``computePredictedProfile``.
        guideset: Iterable of oligo IDs to record.
        feature_columns: Passed through to ``computePredictedProfile``.
    """
    while not os.path.isdir(output_dir):
        if mpi_rank == 0:
            os.mkdir(output_dir)
        else:
            time.sleep(5)

    for oligo_id in guideset:
        profile, counts = computePredictedProfile(oligo_id, theta,
                                                  feature_columns)
        idx = getOligoIdxFromId(oligo_id)
        filepath, filename = getFileForOligoIdx(idx)

        subdir = output_dir + '/' + filepath
        if not os.path.isdir(subdir):
            try:
                os.mkdir(subdir)
            except OSError:
                # Another rank may have created the directory between the
                # check and the mkdir; only re-raise if it is still missing.
                if not os.path.isdir(subdir):
                    raise

        out_path = subdir + '/%s_mappedindelsummary_predicted.txt' % oligo_id
        # The with-block closes the file even if a write fails.
        with io.open(out_path, 'w') as fout:
            fout.write(u'@@@%s\n' % oligo_id)
            for val, indel, perc1, perc2 in counts:
                if val >= 1:
                    # io.open text handles accept only unicode text, so this
                    # literal needs the same u'' prefix as the header line.
                    fout.write(u'%s\t-\t%d\n' % (indel, val))
def computeFeaturesForGenIndels(gen_indel_dir='generated_indels',
                                out_dir='features_for_gen_indels'):
    """Compute feature files for every generated-indel file in a directory.

    For each file in ``gen_indel_dir`` the oligo's details are looked up, and
    ``calculateFeaturesForGenIndelFile`` is called to write
    ``out_dir/<oligo subdir>/<oligo_id>_gen_indel_features.txt``. Oligos whose
    ``'PAM Direction'`` is ``'REVERSE'`` use the reverse complement of their
    target and a cut site of ``79 - PAM location - 3``; all others use the
    target as-is and ``PAM location - 3``.

    Args:
        gen_indel_dir: Directory of generated-indel files; the oligo ID is the
            part of each file name before the first ``_``.
        out_dir: Output root directory; created if missing.
    """
    if not os.path.isdir(out_dir):
        os.mkdir(out_dir)

    # Load oligo details, keyed by ID with underscores removed.
    raw_details = loadAllOligoDetails(oligo_detail_dir=getHighDataDir() +
                                      '/ST_June_2017/data')
    oligo_details = {}
    for raw_id, detail_row in raw_details.items():
        oligo_details[raw_id.replace('_', '')] = detail_row

    for gen_file in os.listdir(gen_indel_dir):
        print(gen_file)
        oligo_id = gen_file.split('_')[0]
        oligo_subdir, _ = getFileForOligoIdx(getOligoIdxFromId(oligo_id),
                                             ext='')
        out_subdir = out_dir + '/' + oligo_subdir
        if not os.path.isdir(out_subdir):
            os.mkdir(out_subdir)

        row = oligo_details[oligo_id]
        is_reverse = (row['PAM Direction'] == 'REVERSE')

        if is_reverse:
            uncut_seq = Bio.Seq.reverse_complement(row['Target'])
        else:
            uncut_seq = row['Target']

        # NOTE(review): eval() is applied to a value read from the oligo
        # detail data; presumably a plain number - confirm the source is
        # trusted.
        pam_location = eval(row['PAM Location'])
        if is_reverse:
            cut_site = 79 - pam_location - 3
        else:
            cut_site = pam_location - 3

        calculateFeaturesForGenIndelFile(
            gen_indel_dir + '/' + gen_file,
            uncut_seq,
            cut_site,
            out_subdir + '/%s_gen_indel_features.txt' % oligo_id,
            is_reverse=is_reverse)
nulldir = getNullDir(dirname) repeat_indelmaps = set() repeat_reformat = set() f = io.open('../quality_checks/mapped_read_summaries/%s.txt' % dirlabel) for row in csv.DictReader(f, delimiter='\t'): if eval(row['Mapping Files']) != eval(row['Split Fasta File']): print row print 'PROBLEM IN MAPPED SPLIT - RERUN!', dirname break elif eval(row['Split Fasta File']) != eval( row['Mapped Split']) or eval( row['Mapped Split Assigned']) != eval( row['Mapped Split']): oligo_idx = eval(row['ID'][5:]) filepath, filename = getFileForOligoIdx(oligo_idx) repeat_indelmaps.add((filename, filepath)) print row elif eval(row['Mapped Split Assigned']) != eval(row['Summary']): oligo_idx = eval(row['ID'][5:]) filepath, filename = getFileForOligoIdx(oligo_idx) print row repeat_reformat.add((filename, filepath)) f.close() print 'INDELMAP', idx, dirname for filename, subdir in repeat_indelmaps: cmd = '~/run_python.sh indelmap_subdir.py %s %s %s -1 1 %s 0' % ( dirname, nulldir, subdir, filename) idx = runCmdCheckIdx(cmd, idx,
def compareOverbeekProfiles(
        selected_overbeek_id=None,
        pred_results_dir='../indel_prediction/model_testing'):
    """Compare Overbeek (endogenous) indel profiles with our mapped profiles.

    For ``Overbeek1`` .. ``Overbeek96`` (restricted to those present in
    ``loadMappings()``), the Overbeek indel summary is loaded together with
    the summaries of every mapped oligo across the hard-coded "new" and "old"
    sample directories. Symmetric KL divergences between the pooled profile
    and the Overbeek profile are collected per target.

    * When ``selected_overbeek_id`` is given, only that target is processed
      and its old/new DPI7 replicate profiles are plotted next to the
      Overbeek profile via ``plotProfiles``.
    * When it is ``None``, summary figures are produced: the in-frame
      comparison (``plotInFrame``), a KL-vs-log10-reads scatter plot saved as
      ``scatter_KL`` and a pairwise KL heat map saved as ``heatmap_KL``.

    Args:
        selected_overbeek_id: e.g. ``'Overbeek12'``, or ``None`` for all.
        pred_results_dir: Passed through to ``plotInFrame``.
    """
    new_dirs = [
        'ST_June_2017/data/K562_800x_LV7A_DPI7/mapped_reads/Oligos_71',
        'ST_June_2017/data/K562_800x_LV7A_DPI10/mapped_reads/Oligos_71',
        'ST_June_2017/data/K562_800x_LV7B_DPI7/mapped_reads/Oligos_71',
        'ST_June_2017/data/K562_800x_LV7B_DPI10/mapped_reads/Oligos_71',
        'ST_June_2017/data/K562_1600x_LV7B_DPI5/mapped_reads/Oligos_71',
        'ST_Feb_2018/data/CAS9_12NA_1600X_DPI7/mapped_reads/Oligos_71'
    ]
    # Old Samples
    old_dirs = [
        'ST_June_2017/data/K562_1600x_6OA_DPI5/mapped_reads/Oligos_71',
        'ST_June_2017/data/K562_1600x_6OA_DPI7/mapped_reads/Oligos_71',
        'ST_April_2017/data/K562_800x_6OA_DPI3_Old7/mapped_reads/Oligos_71',
        'ST_April_2017/data/K562_800x_6OA_DPI7_Old8/mapped_reads/Oligos_71',
        'ST_April_2017/data/K562_800x_6OA_DPI10_Old9/mapped_reads/Oligos_71',
        'ST_April_2017/data/K562_800x_6OB_DPI3_Old10/mapped_reads/Oligos_71',
        'ST_April_2017/data/K562_800x_6OB_DPI7_Old11/mapped_reads/Oligos_71',
        'ST_April_2017/data/K562_800x_6OB_DPI10_Old12/mapped_reads/Oligos_71'
    ]

    remove_long_indels = False
    remove_wt, wt_thresh = True, 3.0
    mappings = loadMappings()

    all_overbeek_profiles, all_new_profiles, all_old_profiles, all_our_profiles = [], [], [], []
    sel_overbeek_ids, oldnew_overbeek_ids, old_ids, new_ids = [], [], [], []
    overbeek_inframes, ours_inframes, oof_sel_overbeek_ids = [], [], []
    kls, kls_old, kls_new, log_reads, overbeek_ids = [], [], [], [], []
    above30_percentages, log_reads_new, log_reads_old, min_log_reads = [], [], [], []

    for idx in range(1, 97):
        overbeek_id = 'Overbeek%d' % idx
        if selected_overbeek_id is not None and selected_overbeek_id != overbeek_id:
            continue
        if overbeek_id not in mappings:
            continue

        overbeek_filename = (getHighDataDir() + '/overbeek_fastq_files/' +
                             overbeek_id + '_mappedindelsummary.txt')

        p1, p1_new, p1_old, o1, rep_reads1, rep_reads2 = {}, {}, {}, {}, {}, {}
        nreads2, nreads1, nreads_old, nreads_new = 0, 0, 0, 0
        nnull_old, nnull_new, nnull1, nnull2 = 0, 0, 0, 0

        # Read the Overbeek profile
        numread2, perc_accept2, num_null2 = readSummaryToProfile(
            overbeek_filename,
            o1,
            oligoid=overbeek_id,
            remove_long_indels=remove_long_indels,
            remove_wt=False)
        if selected_overbeek_id is not None:
            fetchRepresentativeCleanReads(
                getHighDataDir() + '/overbeek_fastq_files/' + overbeek_id +
                '_mappedindelprofiles.txt',
                rep_reads2,
                oligoid=overbeek_id)
            pam_loc2, pam_dir2 = getNullTargetPamDetails(
                getHighDataDir() + '/overbeek_control_fastq_files/' +
                overbeek_id + '_exptargets.txt',
                oligoid=overbeek_id)
        nreads2 += numread2
        nnull2 += num_null2

        if numread2 == 0:
            continue

        # Index 0 / 1 of each *_reps pair: see rep_idx below.
        p1_new_reps, p1_old_reps = [{}, {}], [{}, {}]
        rr_new_reps, rr_old_reps = [{}, {}], [{}, {}]

        # Read all the new and old profiles
        pam_loc1, pam_dir1 = None, None
        for oligo_id, is_old in mappings[overbeek_id]:
            # Read all reads for all our K562 profiles
            # NOTE(review): eval() on an ID suffix taken from the mappings
            # data; presumably always an integer - confirm source is trusted.
            oligo_idx = eval(oligo_id[5:])
            _, oligo_fileprefix = getFileForOligoIdx(oligo_idx, ext='')
            oligo_filename = oligo_fileprefix + '_mappedindelsummary.txt'
            read_filename = oligo_fileprefix + '_mappedindelprofiles.txt'
            exptarget_filename = oligo_fileprefix + '_exptargets.txt'

            if is_old:
                oligo_dirs, p1_old_new, null_oligo_dir = old_dirs, p1_old, 'ST_April_2017/data/NULL_Old/mapped_reads/Oligos_71'
                p1_reps, rr_reps = p1_old_reps, rr_old_reps
            else:
                oligo_dirs, p1_old_new, null_oligo_dir = new_dirs, p1_new, 'ST_April_2017/data/NULL_New/mapped_reads/Oligos_71'
                p1_reps, rr_reps = p1_new_reps, rr_new_reps

            for oligo_dir in [getHighDataDir() + '/' + x for x in oligo_dirs]:
                summary_path = oligo_dir + '/' + oligo_filename

                # The same summary is read into the old-or-new profile and
                # into the pooled profile p1.
                readSummaryToProfile(summary_path,
                                     p1_old_new,
                                     oligoid=oligo_id,
                                     remove_long_indels=remove_long_indels,
                                     remove_wt=remove_wt,
                                     wt_thresh=wt_thresh)
                numread1, perc_accept1, num_null1 = readSummaryToProfile(
                    summary_path,
                    p1,
                    oligoid=oligo_id,
                    remove_long_indels=remove_long_indels,
                    remove_wt=remove_wt,
                    wt_thresh=wt_thresh)

                if 'DPI7' in oligo_dir:
                    # DPI7 samples feed the replicate profiles: directories
                    # containing '800x' go to slot 0, all others to slot 1.
                    rep_idx = 0 if '800x' in oligo_dir else 1
                    readSummaryToProfile(summary_path,
                                         p1_reps[rep_idx],
                                         oligoid=oligo_id,
                                         remove_long_indels=remove_long_indels,
                                         remove_wt=remove_wt,
                                         wt_thresh=wt_thresh)

                # NOTE(review): representative reads and PAM details are only
                # consumed by the single-target plot below, so they are
                # fetched only when one ID is selected - confirm nesting.
                if selected_overbeek_id is not None:
                    fetchRepresentativeCleanReads(oligo_dir + '/' +
                                                  read_filename,
                                                  rep_reads1,
                                                  oligoid=oligo_id)
                    if 'DPI7' in oligo_dir:
                        fetchRepresentativeCleanReads(oligo_dir + '/' +
                                                      read_filename,
                                                      rr_reps[rep_idx],
                                                      oligoid=oligo_id)
                    if pam_loc1 is None:
                        pam_loc1, pam_dir1 = getNullTargetPamDetails(
                            getHighDataDir() + '/' + null_oligo_dir + '/' +
                            exptarget_filename,
                            oligoid=oligo_id)

                if is_old:
                    nreads_old += numread1
                    nnull_old += num_null1
                else:
                    nreads_new += numread1
                    nnull_new += num_null1
                nreads1 += numread1
                nnull1 += num_null1

        kls.append(symmetricKL(p1, o1, True))
        kls_old.append(symmetricKL(p1_old, o1, True))
        kls_new.append(symmetricKL(p1_new, o1, True))
        # +0.5 keeps log10 finite when there are no non-null reads.
        log_reads.append(np.log10(nreads1 - nnull1 + 0.5))
        log_reads_old.append(np.log10(nreads_old - nnull_old + 0.5))
        log_reads_new.append(np.log10(nreads_new - nnull_new + 0.5))
        min_log_reads.append(min(log_reads_old[-1], log_reads_new[-1]))
        above30_percentages.append(computePercAbove30(o1))
        overbeek_ids.append(overbeek_id)

        if log_reads[-1] > 2.0:
            all_overbeek_profiles.append(o1)
            all_our_profiles.append(p1)
            sel_overbeek_ids.append(overbeek_id[8:])
            if above30_percentages[-1] < 50.0:
                oif, oof, _ = fetchIndelSizeCounts(o1)
                pif, pof, _ = fetchIndelSizeCounts(p1)
                overbeek_inframes.append(oif * 100.0 / (oif + oof))
                ours_inframes.append(pif * 100.0 / (pif + pof))
                oof_sel_overbeek_ids.append(overbeek_id)

        if min_log_reads[-1] > 2.0:
            all_new_profiles.append(p1_new)
            all_old_profiles.append(p1_old)
            oldnew_overbeek_ids.append(overbeek_id)
            old_ids.append([
                mapped_id for mapped_id, mapped_is_old in mappings[overbeek_id]
                if mapped_is_old
            ][0])
            new_ids.append([
                mapped_id for mapped_id, mapped_is_old in mappings[overbeek_id]
                if not mapped_is_old
            ][0])

        try:
            print(overbeek_id, [x for (x, y) in mappings[overbeek_id]],
                  kls[-1], nreads2, nreads1)
        except KeyError:
            print('Could not find', overbeek_id)
            print(mappings)

        if selected_overbeek_id is not None:
            title = '%s (KL=%.1f)' % (overbeek_id, kls[-1])
            labels = [
                'Conventional scaffold Rep A', 'Conventional scaffold Rep B',
                'Improved scaffold Rep A', 'Improved scaffold Rep B',
                'Endogenous Profile'
            ]
            # The fourth profile is replicate slot 1 of the new samples, so
            # it matches its 'Rep B' label and its reads (rr_new_reps[1]).
            plotProfiles([
                p1_old_reps[0], p1_old_reps[1], p1_new_reps[0],
                p1_new_reps[1], o1
            ], [
                rr_old_reps[0], rr_old_reps[1], rr_new_reps[0],
                rr_new_reps[1], rep_reads2
            ], [pam_loc1, pam_loc1, pam_loc1, pam_loc1, pam_loc2], [
                x == 'REVERSE'
                for x in [pam_dir1, pam_dir1, pam_dir1, pam_dir1, pam_dir2]
            ],
                         labels,
                         title=title)

    if selected_overbeek_id is None:
        plotInFrame(overbeek_inframes, ours_inframes, oof_sel_overbeek_ids,
                    pred_results_dir)

        # Scatter of KL vs log10 reads, one series per bin of the
        # percentage returned by computePercAbove30.
        PL.figure(figsize=(5.5, 5))
        for thr_l, thr_h in [(0.0, 10.0), (10.0, 20.0), (20.0, 50.0),
                             (50.0, 90.0), (90.0, 100.0)]:
            # NOTE(review): the lower bound is exclusive, so a target with
            # exactly 0% falls in no bin - confirm this is intended.
            in_bin = [(kl, reads, oid)
                      for (kl, a30, oid, reads) in zip(
                          kls, above30_percentages, overbeek_ids, log_reads)
                      if a30 > thr_l and a30 <= thr_h]
            ydata = [kl for (kl, reads, oid) in in_bin]
            xdata = [reads for (kl, reads, oid) in in_bin]
            sel_ids = [oid for (kl, reads, oid) in in_bin]
            PL.plot(xdata,
                    ydata,
                    'o',
                    label='%d-%d%% Deletions > 30' % (thr_l, thr_h))
            # Label only outliers with high KL and more than 100 reads.
            for x, y, oid in zip(xdata, ydata, sel_ids):
                if y > 3 and x > 2:
                    PL.text(x, y, oid)
        PL.legend()
        PL.plot([0, 6], [0.77, 0.77], '--', color='grey')
        PL.text(0.1, 0.5, 'Median between our replicates', color='grey')
        PL.ylabel('Symmetric KL Divergence', fontsize=12)
        PL.xlabel('Log10 Mutated Reads', fontsize=12)
        PL.xlim((0, 5.5))
        PL.ylim((0, 8))
        PL.show(block=False)
        saveFig('scatter_KL')

        print('Median=', np.median(kls), 'Mean KL=', np.mean(kls))
        print(len(overbeek_ids))

        # Compute pairwise KL between overbeek and ours
        N = len(sel_overbeek_ids)
        kl_mat = np.zeros((N, N))
        for i, overbeek_profile in enumerate(all_overbeek_profiles):
            for j, our_profile in enumerate(all_our_profiles):
                kl_mat[i, j] = symmetricKL(overbeek_profile, our_profile)
        PL.figure(figsize=(8, 6))
        PL.imshow(kl_mat,
                  cmap='hot_r',
                  vmin=0.0,
                  vmax=3.0,
                  interpolation='nearest')
        PL.xticks(range(N), sel_overbeek_ids, rotation='vertical', fontsize=6)
        PL.yticks(range(N),
                  sel_overbeek_ids,
                  rotation='horizontal',
                  fontsize=6)
        PL.xlabel('Synthetic Measurement', fontsize=12)
        PL.ylabel('Endogenous Measurement', fontsize=12)
        PL.title('KL', fontsize=12)
        PL.colorbar()
        PL.show(block=False)
        saveFig('heatmap_KL')
def getFileSuffix(oligo_id):
    """Return ``'<subdir>/<filename>'`` of the oligo's mapped indel summary.

    Args:
        oligo_id: Oligo identifier.

    Returns:
        The sub-directory and ``_mappedindelsummary.txt`` file name reported
        by ``getFileForOligoIdx``, joined with ``/``.
    """
    location = getFileForOligoIdx(getOligoIdxFromId(oligo_id),
                                  ext='_mappedindelsummary.txt')
    subdir, sumfilename = location
    return '/'.join((subdir, sumfilename))