def main():
    """Predict and plot the mutation profile of a built-in example target.

    Uses DEFAULT_MODEL and a hard-coded target sequence with PAM index 42,
    plots the single predicted profile, then drops into the pdb debugger.
    """
    import pdb

    theta_file = DEFAULT_MODEL
    # Hard-coded example target; the PAM index below refers to this string.
    target_seq = ('CTGAGTAGCTATGCGGCCAGCAGCGAGACGCTCAGCGTGAAGCGGCAGTATCCCTC'
                  'TTTCCTGCGCACCATCCCCAATC')
    pam_idx = 42

    profile, rep_reads, in_frame = predictMutations(theta_file, target_seq,
                                                    pam_idx)

    # plotProfiles takes parallel lists, one entry per profile to draw.
    profiles, reads, pam_idxs = [profile], [rep_reads], [pam_idx]
    plotProfiles(profiles, reads, pam_idxs, [False], ['Predicted'])

    # Stop in the debugger once the plot call has returned.
    pdb.set_trace()
def build_plot_by_profile(filename, profile, oligo_id):
    """Plot a single predicted profile, titled with its in-frame percentage.

    Args:
        filename: passed straight through to fetchReads.
        profile: the profile handed to plotProfiles.
        oligo_id: passed straight through to fetchReads.

    Returns:
        Whatever plotProfiles returns (assigned to ``fig`` by the original
        code, so presumably a figure object).
    """
    # fetchReads receives this dict as an output argument; the FRAME_SHIFT
    # entry is read back from it for the title below.
    reads_by_key = {}
    fetchReads(filename, reads_by_key, oligo_id)

    setFigType('png')

    in_frame_title = 'In Frame: %.1f%%' % reads_by_key[FRAME_SHIFT]
    return plotProfiles([profile], [reads_by_key], [43], [False],
                        ['Predicted'],
                        title=in_frame_title)
def plot_predictions(theta_file, target_seq, pam_idx, out_filename=None):
    """Validate a target/PAM pair, predict its profile, save and plot it.

    Args:
        theta_file: model file handed to predictMutations.
        target_seq: target sequence; must contain only A, T, G and C.
        pam_idx: index of the PAM within ``target_seq``; the two characters
            after it must be 'GG'.
        out_filename: where to write the predicted profile summary. When
            empty/None, '<target_seq>_<pam_idx>.txt' is used.

    Returns:
        Whatever plotProfiles returns for the predicted profile.

    Raises:
        ValueError: if the PAM index is out of range, the sequence contains
            other characters, the sequence is too short / the PAM is too
            close to an edge, or the PAM is not NGG. (ValueError is a
            subclass of Exception, which earlier versions raised directly.)
    """
    seq_len = len(target_seq)

    if pam_idx < 0 or pam_idx >= (seq_len - 3):
        raise ValueError('PAM idx out of range')
    if not all(base in ('A', 'T', 'G', 'C') for base in target_seq):
        raise ValueError('Sequence must be composed of A,T,G,or C only')
    if seq_len < 20 or pam_idx < 13 or pam_idx > seq_len - 7:
        raise ValueError(
            'Sequence too short or PAM too close to edge of sequence (must have at least 10nt either side of cut site)'
        )
    if target_seq[pam_idx + 1:pam_idx + 3] != 'GG':
        raise ValueError('Non NGG PAM (check correct index of PAM)')

    profile, rep_reads, in_frame = predictMutations(theta_file, target_seq,
                                                    pam_idx)

    if not out_filename:
        out_filename = '%s_%d.txt' % (target_seq, pam_idx)

    # Context manager so the file is closed even if writing the summary fails.
    with io.open(out_filename, 'w') as fout:
        # First line carries the in-frame percentage, prefixed by '@@@'.
        fout.write(u'@@@%s\n' % ('%.1f' % in_frame))
        writePredictedProfileToSummary(profile, fout)

    setFigType('png')
    fig = plotProfiles([profile], [rep_reads], [pam_idx], [False],
                       ['Predicted'],
                       title='In Frame: %.1f%%' % in_frame)
    return fig
def computeAndComparePredicted(theta_file,
                               selected_id=None,
                               out_dir='.',
                               start_count=0,
                               end_count=10000):
    """Compare measured old/new profiles with predictions, writing a summary.

    For every validation pair (old_id, new_id) this loads the measured
    profiles (combined and per-replicate), computes a predicted profile from
    the model in ``theta_file``, and writes one tab-separated row of mutated
    read counts, in-frame percentages and symmetric KL divergences.

    Args:
        theta_file: model file read with readTheta.
        selected_id: if given, only the pair whose old id equals this value
            is processed, and its replicate/predicted profiles are plotted.
        out_dir: directory in which the summary file is created.
        start_count: first number embedded in the output filename.
        end_count: second number embedded in the output filename.

    Raises:
        Exception: if any validation id is also present in the training set.
    """
    # NOTE(review): features_dir is not used anywhere below.
    features_dir = getHighDataDir() + '/gen_indels/features_for_gen_indels'
    theta, train_set, feature_columns = readTheta(theta_file)

    new_sep_labels = 'New 2x800x', 'New 1600x'
    old_sep_labels = 'Old 2x800x', 'Old 1600x'
    sep_labels = new_sep_labels + old_sep_labels

    # The filename previously had no conversion specifiers, so applying
    # `% (start_count, end_count)` to it raised TypeError on every call.
    # NOTE(review): start_count/end_count only label the output file here;
    # they do not restrict which validation pairs are processed - confirm
    # whether the pair list should be sliced as well.
    out_filename = out_dir + '/old_new_kl_predicted_summaries_%d_%d.txt' % (
        start_count, end_count)

    #Note: here old refers to conventional scaffold library, new refers to improved scaffold library
    with io.open(out_filename, 'w') as fout:
        # Header, written in the same column order as the data rows below.
        fout.write(
            u'Old Oligo Id\tNew Oligo Id\tOld Mut Reads\tNew Mut Reads\tCombined Mut Reads\t'
        )
        fout.write(u'\t'.join('%s Mut Reads' % x.split('/')[-1]
                              for x in sep_labels))
        fout.write(
            u'\tOld In Frame Perc\tNew In Frame Perc\tCombined in Frame Perc\tPredicted In Frame Per\t'
        )
        fout.write(u'\t'.join('%s In Frame Perc' % x.split('/')[-1]
                              for x in sep_labels))
        fout.write(
            u'\tOld v New KL\tOld v Predicted KL\tNew v Predicted KL\tCombined v Predicted KL\t'
        )
        fout.write(u'\t'.join('%s vs Predicted KL' % x.split('/')[-1]
                              for x in sep_labels) + '\t')
        fout.write(u'\t'.join([
            '%s vs %s KL' % (x.split('/')[-1], y.split('/')[-1])
            for x, y in (getCombs(new_sep_labels) + getCombs(old_sep_labels))
        ]) + '\n')

        id_pairs = loadValidationPairs()
        for (old_id, new_id) in id_pairs:
            if old_id in train_set or new_id in train_set:
                raise Exception('Bad!!! Testing on Training data: %s %s' %
                                (old_id, new_id))

            if selected_id is not None and selected_id != old_id:
                continue  #Guide pair selected for plotting

            #Load Old and new profiles, and produce combined profile from the two
            p_old, p_new, mut_reads_old, mut_reads_new = loadProfilePair(
                old_id, new_id)
            p_comb, mut_reads_comb = combineProfiles(p_old, p_new,
                                                     mut_reads_old,
                                                     mut_reads_new)

            #Predict the profile (old and new will be the same so just do one)
            feature_data = loadOligoFeaturesAndReadCounts(new_id, [])
            p_predict, _ = computePredictedProfile(feature_data, theta,
                                                   feature_columns)

            #Load separate profiles too
            p_old_sep, p_new_sep, old_sep_mr, new_sep_mr = loadProfilesSeparately(
                old_id, new_id)

            #Compute in frame percentages
            old_if_perc = getInFramePerc(p_old)
            new_if_perc = getInFramePerc(p_new)
            comb_if_perc = getInFramePerc(p_comb)
            pred_if_perc = getInFramePerc(p_predict)
            # Separate profiles with at most one entry are reported as -1.
            new_sep_if_percs = [
                getInFramePerc(profile) if len(profile) > 1 else -1
                for profile in p_new_sep
            ]
            old_sep_if_percs = [
                getInFramePerc(profile) if len(profile) > 1 else -1
                for profile in p_old_sep
            ]

            #Plot the comparison
            if selected_id is not None:
                rrds = loadRepReads(new_id)
                plotProfiles(
                    [p_new_sep[0], p_new_sep[1], p_predict],
                    [rrds, rrds, rrds], [56, 56, 56], [False, False, False],
                    ['Replicate 1', 'Replicate 2', 'Predicted'],
                    title='%s (KL=%.2f, KL=%.2f)' %
                    (new_id, symmetricKL(p_new_sep[0], p_new_sep[1]),
                     symmetricKL(p_new, p_predict)))

            str_args = (symmetricKL(p_old, p_new),
                        symmetricKL(p_old, p_predict),
                        symmetricKL(p_new, p_predict),
                        symmetricKL(p_comb, p_predict))
            kl_str = u'\t%.5f\t%.5f\t%.5f\t%.5f\t' % str_args
            kl_str += u'\t'.join([
                '%.5f' % symmetricKL(p_predict, x)
                for x in p_new_sep + p_old_sep
            ])
            kl_str += u'\t' + u'\t'.join([
                '%.5f' % symmetricKL(x, y)
                for (x, y) in (getCombs(p_new_sep) + getCombs(p_old_sep))
            ])
            if_str = u'\t'.join(
                ['%.3f' % x for x in new_sep_if_percs + old_sep_if_percs])
            mut_str = u'\t'.join(['%d' % x for x in new_sep_mr + old_sep_mr])

            fout.write(
                u'%s\t%s\t%d\t%d\t%d\t%s\t%.3f\t%.3f\t%.3f\t%.3f\t%s%s\n' %
                (old_id, new_id, mut_reads_old, mut_reads_new, mut_reads_comb,
                 mut_str, old_if_perc, new_if_perc, comb_if_perc,
                 pred_if_perc, if_str, kl_str))
            # Flush per row so partial results are on disk during long runs.
            fout.flush()
def build_plot_by_profile(filename, profile, oligo_id):
    """Plot a single predicted profile using reads fetched for one oligo.

    Args:
        filename: passed straight through to fetchReads.
        profile: the profile handed to plotProfiles.
        oligo_id: passed straight through to fetchReads.

    Returns:
        Whatever plotProfiles returns (assigned to ``fig`` by the original
        code, so presumably a figure object).
    """
    # fetchReads receives this dict as an output argument.
    reads_by_key = {}
    fetchReads(filename, reads_by_key, oligo_id)

    setFigType('png')

    # plotProfiles takes parallel lists, one entry per profile to draw.
    profiles, reads = [profile], [reads_by_key]
    return plotProfiles(profiles, reads, [43], [False], ['Predicted'])
# NOTE(review): the statements up to the first `def` below look like the tail
# of a function whose `def` line lies outside this view (they use theta_file,
# target_seq, pam_idx and out_prefix, none of which are assigned here). They
# are reproduced unchanged; confirm the enclosing definition before relying
# on them.
print('Predicting mutations...')
p_predict, rep_reads, in_frame_perc = predictMutations(
    theta_file, target_seq, pam_idx)
print('Writing to file...')
# A single-entry list of (label, profile, rep_reads, in_frame_perc).
writeProfilesToFile(out_prefix,
                    [('Test Guide', p_predict, rep_reads, in_frame_perc)],
                    write_rr=True)
print('Done!')


def predictMutationsBulk(target_file, out_prefix, theta_file=DEFAULT_MODEL):
    """Predict profiles for every target in ``target_file`` and write them out.

    Args:
        target_file: file of targets, handed to predictProfilesBulk.
        out_prefix: output prefix handed to writeProfilesToFile.
        theta_file: model file; defaults to DEFAULT_MODEL.
    """
    #Target File: a tab-delimited file with columns: ID, Target, PAM Index
    print('Predicting mutations...')
    profiles_and_rr = predictProfilesBulk(theta_file, target_file)
    print('Writing to file...')
    writeProfilesToFile(out_prefix, profiles_and_rr, write_rr=True)
    print('Done!')


if __name__ == '__main__':
    # Script entry point: predict and plot a hard-coded example target with
    # the default model, then stop in the debugger.
    theta_file = DEFAULT_MODEL
    target_seq = 'CTGAGTAGCTATGCGGCCAGCAGCGAGACGCTCAGCGTGAAGCGGCAGTATCCCTCTTTCCTGCGCACCATCCCCAATC'
    pam_idx = 42
    profile, rep_reads, in_frame = predictMutations(theta_file, target_seq,
                                                    pam_idx)
    plotProfiles([profile], [rep_reads], [pam_idx], [False], ['Predicted'])
    import pdb
    pdb.set_trace()
def compareOverbeekProfiles(
        selected_overbeek_id=None,
        pred_results_dir='../indel_prediction/model_testing'):
    """Compare Overbeek (endogenous) profiles with our measured profiles.

    For 'Overbeek1' .. 'Overbeek96' (skipping ids absent from loadMappings()
    or with zero Overbeek reads) this accumulates our profiles over all old
    and new sample directories, computes the symmetric KL divergence against
    the Overbeek profile, and records read counts and the value returned by
    computePercAbove30.

    Args:
        selected_overbeek_id: if given, only that id is processed and its
            per-replicate profiles are plotted next to the Overbeek profile.
            If None, the in-frame comparison, a KL-vs-reads scatter plot and
            a pairwise KL heatmap are produced instead.
        pred_results_dir: passed through to plotInFrame.

    Returns:
        None. Output is via plots, saved figures and printed summaries.
    """
    #New Samples
    new_dirs = [
        'ST_June_2017/data/K562_800x_LV7A_DPI7/mapped_reads/Oligos_71',
        'ST_June_2017/data/K562_800x_LV7A_DPI10/mapped_reads/Oligos_71',
        'ST_June_2017/data/K562_800x_LV7B_DPI7/mapped_reads/Oligos_71',
        'ST_June_2017/data/K562_800x_LV7B_DPI10/mapped_reads/Oligos_71',
        'ST_June_2017/data/K562_1600x_LV7B_DPI5/mapped_reads/Oligos_71',
        'ST_Feb_2018/data/CAS9_12NA_1600X_DPI7/mapped_reads/Oligos_71',
    ]

    #Old Samples
    old_dirs = [
        'ST_June_2017/data/K562_1600x_6OA_DPI5/mapped_reads/Oligos_71',
        'ST_June_2017/data/K562_1600x_6OA_DPI7/mapped_reads/Oligos_71',
        'ST_April_2017/data/K562_800x_6OA_DPI3_Old7/mapped_reads/Oligos_71',
        'ST_April_2017/data/K562_800x_6OA_DPI7_Old8/mapped_reads/Oligos_71',
        'ST_April_2017/data/K562_800x_6OA_DPI10_Old9/mapped_reads/Oligos_71',
        'ST_April_2017/data/K562_800x_6OB_DPI3_Old10/mapped_reads/Oligos_71',
        'ST_April_2017/data/K562_800x_6OB_DPI7_Old11/mapped_reads/Oligos_71',
        'ST_April_2017/data/K562_800x_6OB_DPI10_Old12/mapped_reads/Oligos_71',
    ]

    remove_long_indels = False
    remove_wt, wt_thresh = True, 3.0
    mappings = loadMappings()

    all_overbeek_profiles, all_new_profiles, all_old_profiles, all_our_profiles, sel_overbeek_ids, oldnew_overbeek_ids, old_ids, new_ids = [], [], [], [], [], [], [], []
    overbeek_inframes, ours_inframes, oof_sel_overbeek_ids = [], [], []
    kls, kls_old, kls_new, log_reads, overbeek_ids, above30_percentages, log_reads_new, log_reads_old, min_log_reads = [], [], [], [], [], [], [], [], []

    for idx in range(1, 97):
        overbeek_id = 'Overbeek%d' % idx
        if selected_overbeek_id is not None and selected_overbeek_id != overbeek_id:
            continue
        if overbeek_id not in mappings:
            continue

        overbeek_filename = getHighDataDir(
        ) + '/overbeek_fastq_files/' + overbeek_id + '_mappedindelsummary.txt'

        p1, p1_new, p1_old, o1, rep_reads1, rep_reads2 = {}, {}, {}, {}, {}, {}
        nreads2, nreads1, nreads_old, nreads_new, nnull_old, nnull_new, nnull1, nnull2 = 0, 0, 0, 0, 0, 0, 0, 0

        #Read the Overbeek profile
        numread2, perc_accept2, num_null2 = readSummaryToProfile(
            overbeek_filename,
            o1,
            oligoid=overbeek_id,
            remove_long_indels=remove_long_indels,
            remove_wt=False)
        if selected_overbeek_id is not None:
            # Representative reads and PAM details are only needed for the
            # per-id plot further down.
            fetchRepresentativeCleanReads(
                getHighDataDir() + '/overbeek_fastq_files/' + overbeek_id +
                '_mappedindelprofiles.txt',
                rep_reads2,
                oligoid=overbeek_id)
            pam_loc2, pam_dir2 = getNullTargetPamDetails(
                getHighDataDir() + '/overbeek_control_fastq_files/' +
                overbeek_id + '_exptargets.txt',
                oligoid=overbeek_id)
        nreads2 += numread2
        nnull2 += num_null2

        if numread2 == 0:
            continue

        # Two per-replicate profiles / read sets for each library; which slot
        # a directory fills is decided by rep_idx below.
        p1_new_reps, p1_old_reps = [{}, {}], [{}, {}]
        rr_new_reps, rr_old_reps = [{}, {}], [{}, {}]

        #Read all the new and old profiles
        pam_loc1, pam_dir1 = None, None
        for oligo_id, is_old in mappings[overbeek_id]:

            #Read all reads for all our K562 profiles
            # int() replaces the earlier eval() on the id suffix: same result
            # for a plain integer suffix, without executing arbitrary text.
            oligo_idx = int(oligo_id[5:])
            _, oligo_fileprefix = getFileForOligoIdx(oligo_idx, ext='')
            oligo_filename = oligo_fileprefix + '_mappedindelsummary.txt'
            read_filename = oligo_fileprefix + '_mappedindelprofiles.txt'
            exptarget_filename = oligo_fileprefix + '_exptargets.txt'
            if is_old:
                oligo_dirs, p1_old_new, null_oligo_dir = old_dirs, p1_old, 'ST_April_2017/data/NULL_Old/mapped_reads/Oligos_71'
                p1_reps, rr_reps = p1_old_reps, rr_old_reps
            else:
                oligo_dirs, p1_old_new, null_oligo_dir = new_dirs, p1_new, 'ST_April_2017/data/NULL_New/mapped_reads/Oligos_71'
                p1_reps, rr_reps = p1_new_reps, rr_new_reps

            for oligo_dir in [getHighDataDir() + '/' + x for x in oligo_dirs]:
                summary_path = oligo_dir + '/' + oligo_filename
                reads_path = oligo_dir + '/' + read_filename

                # The same summary is accumulated into the library-specific
                # profile and into the overall profile p1.
                nr1, pa1, nn1 = readSummaryToProfile(
                    summary_path,
                    p1_old_new,
                    oligoid=oligo_id,
                    remove_long_indels=remove_long_indels,
                    remove_wt=remove_wt,
                    wt_thresh=wt_thresh)
                numread1, perc_accept1, num_null1 = readSummaryToProfile(
                    summary_path,
                    p1,
                    oligoid=oligo_id,
                    remove_long_indels=remove_long_indels,
                    remove_wt=remove_wt,
                    wt_thresh=wt_thresh)
                if 'DPI7' in oligo_dir:
                    # DPI7 directories also feed a replicate slot: 0 when the
                    # path contains '800x', otherwise 1.
                    rep_idx = 0 if '800x' in oligo_dir else 1
                    nr_rep, pa_rep, nn_rep = readSummaryToProfile(
                        summary_path,
                        p1_reps[rep_idx],
                        oligoid=oligo_id,
                        remove_long_indels=remove_long_indels,
                        remove_wt=remove_wt,
                        wt_thresh=wt_thresh)
                if selected_overbeek_id is not None:
                    fetchRepresentativeCleanReads(reads_path,
                                                  rep_reads1,
                                                  oligoid=oligo_id)
                    if 'DPI7' in oligo_dir:
                        fetchRepresentativeCleanReads(reads_path,
                                                      rr_reps[rep_idx],
                                                      oligoid=oligo_id)
                    if pam_loc1 is None:
                        pam_loc1, pam_dir1 = getNullTargetPamDetails(
                            getHighDataDir() + '/' + null_oligo_dir + '/' +
                            exptarget_filename,
                            oligoid=oligo_id)
                if is_old:
                    nreads_old += numread1
                    nnull_old += num_null1
                else:
                    nreads_new += numread1
                    nnull_new += num_null1
                nreads1 += numread1
                nnull1 += num_null1

        kls.append(symmetricKL(p1, o1, True))
        kls_old.append(symmetricKL(p1_old, o1, True))
        kls_new.append(symmetricKL(p1_new, o1, True))

        # +0.5 keeps log10 finite when there are no non-null reads.
        log_reads.append(np.log10(nreads1 - nnull1 + 0.5))
        log_reads_old.append(np.log10(nreads_old - nnull_old + 0.5))
        log_reads_new.append(np.log10(nreads_new - nnull_new + 0.5))
        min_log_reads.append(min(log_reads_old[-1], log_reads_new[-1]))
        above30_percentages.append(computePercAbove30(o1))
        overbeek_ids.append(overbeek_id)

        if log_reads[-1] > 2.0:
            all_overbeek_profiles.append(o1)
            all_our_profiles.append(p1)
            # Drop the 'Overbeek' prefix to get a short tick label.
            sel_overbeek_ids.append(overbeek_id[8:])
            if above30_percentages[-1] < 50.0:
                oif, oof, _ = fetchIndelSizeCounts(o1)
                pif, pof, _ = fetchIndelSizeCounts(p1)
                overbeek_inframes.append(oif * 100.0 / (oif + oof))
                ours_inframes.append(pif * 100.0 / (pif + pof))
                oof_sel_overbeek_ids.append(overbeek_id)

        if min_log_reads[-1] > 2.0:
            all_new_profiles.append(p1_new)
            all_old_profiles.append(p1_old)
            oldnew_overbeek_ids.append(overbeek_id)
            old_ids.append([
                oligo for oligo, is_old in mappings[overbeek_id] if is_old
            ][0])
            new_ids.append([
                oligo for oligo, is_old in mappings[overbeek_id]
                if not is_old
            ][0])

        try:
            print(overbeek_id, [x for (x, y) in mappings[overbeek_id]],
                  kls[-1], nreads2, nreads1)
        except KeyError:
            print('Could not find', overbeek_id)
            print(mappings)

        if selected_overbeek_id is not None:
            title = '%s (KL=%.1f)' % (overbeek_id, kls[-1])
            labels = [
                'Conventional scaffold Rep A', 'Conventional scaffold Rep B',
                'Improved scaffold Rep A', 'Improved scaffold Rep B',
                'Endogenous Profile'
            ]
            # The fourth profile is replicate slot 1 of the new library; it
            # was previously slot 0 repeated, which did not match its label
            # or its representative reads (rr_new_reps[1]).
            plotProfiles([
                p1_old_reps[0], p1_old_reps[1], p1_new_reps[0],
                p1_new_reps[1], o1
            ], [
                rr_old_reps[0], rr_old_reps[1], rr_new_reps[0],
                rr_new_reps[1], rep_reads2
            ], [pam_loc1, pam_loc1, pam_loc1, pam_loc1, pam_loc2], [
                x == 'REVERSE'
                for x in [pam_dir1, pam_dir1, pam_dir1, pam_dir1, pam_dir2]
            ],
                         labels,
                         title=title)

    # NOTE(review): the nesting of this summary section was reconstructed
    # from whitespace-collapsed source - confirm that all of it belongs under
    # the "no single id selected" branch.
    if selected_overbeek_id is None:

        plotInFrame(overbeek_inframes, ours_inframes, oof_sel_overbeek_ids,
                    pred_results_dir)

        PL.figure(figsize=(5.5, 5))
        for thr_l, thr_h in [(0.0, 10.0), (10.0, 20.0), (20.0, 50.0),
                             (50.0, 90.0), (90.0, 100.0)]:
            # Points whose above-30 percentage lies in (thr_l, thr_h].
            in_bin = [(reads, kl, label)
                      for (kl, a30, label, reads) in zip(
                          kls, above30_percentages, overbeek_ids, log_reads)
                      if a30 > thr_l and a30 <= thr_h]
            xdata = [reads for (reads, kl, label) in in_bin]
            ydata = [kl for (reads, kl, label) in in_bin]
            PL.plot(xdata,
                    ydata,
                    'o',
                    label='%d-%d%% Deletions > 30' % (thr_l, thr_h))
            # Label only outlying points (high KL with enough reads).
            for x, y, label in in_bin:
                if y > 3 and x > 2:
                    PL.text(x, y, label)
        PL.legend()
        PL.plot([0, 6], [0.77, 0.77], '--', color='grey')
        PL.text(0.1, 0.5, 'Median between our replicates', color='grey')
        PL.ylabel('Symmetric KL Divergence', fontsize=12)
        PL.xlabel('Log10 Mutated Reads', fontsize=12)
        PL.xlim((0, 5.5))
        PL.ylim((0, 8))
        PL.show(block=False)
        saveFig('scatter_KL')

        print('Median=', np.median(kls), 'Mean KL=', np.mean(kls))
        print(len(overbeek_ids))

        #Compute pairwise KL between overbeek and ours
        N = len(sel_overbeek_ids)
        kl_mat = np.zeros((N, N))
        for row, overbeek_profile in enumerate(all_overbeek_profiles):
            for col, our_profile in enumerate(all_our_profiles):
                kl_mat[row, col] = symmetricKL(overbeek_profile, our_profile)
        PL.figure(figsize=(8, 6))
        PL.imshow(kl_mat,
                  cmap='hot_r',
                  vmin=0.0,
                  vmax=3.0,
                  interpolation='nearest')
        PL.xticks(range(N),
                  sel_overbeek_ids,
                  rotation='vertical',
                  fontsize=6)
        PL.yticks(range(N),
                  sel_overbeek_ids,
                  rotation='horizontal',
                  fontsize=6)
        PL.xlabel('Synthetic Measurement', fontsize=12)
        PL.ylabel('Endogenous Measurement', fontsize=12)
        PL.title('KL', fontsize=12)
        PL.colorbar()
        PL.show(block=False)
        saveFig('heatmap_KL')
def computeAndComparePredicted(theta_file, selected_id=None, out_dir='.'):
    """Compare measured old/new profiles with predictions, writing a summary.

    For every validation pair (old_id, new_id) this loads the two measured
    profiles, combines them, computes a predicted profile from the model in
    ``theta_file`` and writes one tab-separated row with mutated read counts,
    in-frame percentages and symmetric KL divergences to
    '<out_dir>/old_new_kl_predicted_summaries.txt'.

    Args:
        theta_file: model file read with readTheta.
        selected_id: if given, only the pair whose old id equals this value
            is processed, and its profiles are plotted.
        out_dir: directory in which the summary file is created.

    Raises:
        Exception: if any validation id is also present in the training set.
    """
    # NOTE(review): features_dir is not used anywhere below.
    features_dir = getHighDataDir() + '/gen_indels/features_for_gen_indels'
    theta, train_set, feature_columns = readTheta(theta_file)

    #Note: here old refers to conventional scaffold library, new refers to improved scaffold library
    # Context manager so the file is closed even when the training-data check
    # below (or any helper) raises part-way through.
    with io.open(out_dir + '/old_new_kl_predicted_summaries.txt',
                 'w') as fout:
        fout.write(
            u'Old Oligo Id\tNew Oligo Id\tOld Mut Reads\tNew Mut Reads\tCombined Mut Reads\tOld In Frame Perc\tNew In Frame Perc\tCombined in Frame Perc\tPredicted In Frame Per'
        )
        fout.write(
            u'\tOld v New KL\tOld v Predicted KL\tNew v Predicted KL\tCombined v Predicted KL\n'
        )

        id_pairs = loadValidationPairs()
        for (old_id, new_id) in id_pairs:
            if old_id in train_set or new_id in train_set:
                raise Exception('Bad!!! Testing on Training data: %s %s' %
                                (old_id, new_id))

            if selected_id is not None and selected_id != old_id:
                continue  #Guide pair selected for plotting

            #Load Old and new profiles, and produce combined profile from the two
            p_old, p_new, mut_reads_old, mut_reads_new = loadProfilePair(
                old_id, new_id)
            p_comb, mut_reads_comb = combineProfiles(p_old, p_new,
                                                     mut_reads_old,
                                                     mut_reads_new)

            #Predict the profile (old and new will be the same so just do one)
            feature_data = loadOligoFeaturesAndReadCounts(new_id, [])
            p_predict, _ = computePredictedProfile(feature_data, theta,
                                                   feature_columns)

            #Compute in frame percentages
            if_percs = []
            for measured in (p_old, p_new, p_comb, p_predict):
                n_in, n_out, _ = fetchIndelSizeCounts(measured)
                if_percs.append(n_in * 100.0 / (n_in + n_out))
            old_if_perc, new_if_perc, comb_if_perc, pred_if_perc = if_percs

            #Plot the comparison
            if selected_id is not None:
                rrds = loadRepReads(new_id)
                plotProfiles(
                    [p_old, p_new, p_predict], [rrds, rrds, rrds],
                    [42, 42, 42], [False, False, False],
                    ['Replicate 1', 'Replicate 2', 'Predicted'],
                    title='%s (KL=%.2f, KL=%.2f)' %
                    (new_id, symmetricKL(p_old, p_new),
                     symmetricKL(p_comb, p_predict)))

            str_args = (symmetricKL(p_old, p_new),
                        symmetricKL(p_old, p_predict),
                        symmetricKL(p_new, p_predict),
                        symmetricKL(p_comb, p_predict))
            kl_str = u'\t%.5f\t%.5f\t%.5f\t%.5f' % str_args
            fout.write(u'%s\t%s\t%d\t%d\t%d\t%.3f\t%.3f\t%.3f\t%.3f%s\n' %
                       (old_id, new_id, mut_reads_old, mut_reads_new,
                        mut_reads_comb, old_if_perc, new_if_perc,
                        comb_if_perc, pred_if_perc, kl_str))
            # Flush per row so partial results are on disk during long runs.
            fout.flush()