def predictMutations(theta_file, target_seq, pam_idx, add_null=True):
    """Predict the mutation profile for a single target sequence.

    Candidate indels are generated by running the external
    ``INDELGENTARGET_EXE`` tool, features are computed for them, and the
    model read from ``theta_file`` is applied to those features.

    Args:
        theta_file: Path handed to ``readTheta``.
        target_seq: Target sequence. It is also embedded in the names of the
            temporary files written to the current working directory.
        pam_idx: PAM index passed to the indel generator; ``pam_idx - 3`` is
            passed to the feature calculation.
        add_null: When true, a ``'-'`` entry is added to both the predicted
            profile (value 1000) and the representative reads (the target
            sequence from ``left_trim`` onwards).

    Returns:
        Tuple ``(p_predict, rep_reads, in_frame_perc)``.

    Raises:
        subprocess.CalledProcessError: If the indel generator exits non-zero.
        Exception: If a feature column stored with the model is absent from
            the computed features.
    """
    theta, train_set, theta_feature_columns = readTheta(theta_file)

    tmp_genindels_file = 'tmp_genindels_%s_%d.txt' % (
        target_seq, random.randint(0, 100000))
    tmp_features_file = 'tmp_features_%s_%d.txt' % (
        target_seq, random.randint(0, 100000))

    try:
        # Generate indels
        cmd = INDELGENTARGET_EXE + ' %s %d %s' % (
            target_seq, pam_idx, tmp_genindels_file)
        print(cmd)
        subprocess.check_call(cmd.split())
        rep_reads = fetchRepReads(tmp_genindels_file)

        left_trim = 0
        if len(rep_reads) > 0:
            isize, smallest_indel = min(
                (tokFullIndel(x)[1], x) for x in rep_reads)
            if isize > 0:
                trim_idx = target_seq.find(rep_reads[smallest_indel][:10])
                # find() returns -1 when the prefix is absent; using that as
                # a slice start would keep only the last character, so fall
                # back to no trimming instead.
                if trim_idx >= 0:
                    left_trim = trim_idx

        # Compute features for all generated indels
        calculateFeaturesForGenIndelFile(
            tmp_genindels_file, target_seq, pam_idx - 3, tmp_features_file)
        feature_data, feature_columns = readFeaturesData(tmp_features_file)
    finally:
        # Always clean up, including when the subprocess or a parser fails.
        for tmp_file in (tmp_genindels_file, tmp_features_file):
            if os.path.exists(tmp_file):
                os.remove(tmp_file)

    theta_cols, computed_cols = set(theta_feature_columns), set(feature_columns)
    if theta_cols - computed_cols:
        raise Exception('Stored feature names associated with model thetas are not contained in those computed')
    if len(theta_cols | computed_cols) != len(theta_feature_columns):
        # Extra computed columns exist: restrict to those the model knows.
        feature_data = feature_data[['Indel'] + theta_feature_columns]

    # Predict the profile
    p_predict, _ = computePredictedProfile(
        feature_data, theta, theta_feature_columns)
    in_frame, out_frame, _ = fetchIndelSizeCounts(p_predict)
    in_frame_perc = in_frame * 100.0 / (in_frame + out_frame)

    if add_null:
        p_predict['-'] = 1000
        rep_reads['-'] = target_seq[left_trim:]
    return p_predict, rep_reads, in_frame_perc
def computePercAbove30(profile):
    """Return the percentage of counted indels that are 'D' entries above 30.

    ``fetchIndelSizeCounts(profile)`` supplies a mapping whose ``'D'`` and
    ``'I'`` entries map a key to a count (presumably deletion and insertion
    sizes -- confirm against ``fetchIndelSizeCounts``). The result is the
    share of the total ``'D'`` + ``'I'`` count that belongs to ``'D'`` keys
    greater than 30, expressed as a percentage.

    Raises:
        ZeroDivisionError: If the total count is zero.
    """
    _, _, size_counts = fetchIndelSizeCounts(profile)
    del_counts = size_counts['D']
    ins_counts = size_counts['I']

    large_total = sum(del_counts[size] for size in del_counts if size > 30)
    small_total = sum(del_counts[size] for size in del_counts if not size > 30)
    ins_total = sum(ins_counts[size] for size in ins_counts)

    return large_total * 100.0 / (small_total + large_total + ins_total)
def getInFramePerc(profile):
    """Return the in-frame percentage of ``profile``.

    Uses the first two values returned by ``fetchIndelSizeCounts`` (in-frame
    and out-of-frame counts) and returns ``in_frame * 100 / total``.

    Raises:
        ZeroDivisionError: If both counts are zero.
    """
    in_frame, out_of_frame, _ = fetchIndelSizeCounts(profile)
    total = in_frame + out_of_frame
    return in_frame * 100.0 / total
def compareOverbeekProfiles(
        selected_overbeek_id=None,
        pred_results_dir='../indel_prediction/model_testing'):
    """Compare Overbeek (endogenous) indel profiles with our measured ones.

    For each ``Overbeek<N>`` id (N in 1..96) present in ``loadMappings()``,
    the Overbeek profile and the profiles of the mapped oligos (summed over
    the old/new sample directories listed below) are read and their
    symmetric KL divergence is recorded.

    Args:
        selected_overbeek_id: If given (e.g. ``'Overbeek12'``), only that id
            is processed and its replicate profiles are plotted next to the
            endogenous profile. If ``None``, all ids are processed and the
            summary figures (in-frame plot, KL scatter, KL heatmap) are
            produced.
        pred_results_dir: Passed through to ``plotInFrame``.

    Returns:
        None. Results are printed and plotted.
    """
    # NOTE(review): block nesting in this function was reconstructed from a
    # whitespace-collapsed listing -- confirm against version control.
    new_dirs = [
        'ST_June_2017/data/K562_800x_LV7A_DPI7/mapped_reads/Oligos_71',
        'ST_June_2017/data/K562_800x_LV7A_DPI10/mapped_reads/Oligos_71',
        'ST_June_2017/data/K562_800x_LV7B_DPI7/mapped_reads/Oligos_71',
        'ST_June_2017/data/K562_800x_LV7B_DPI10/mapped_reads/Oligos_71',
        'ST_June_2017/data/K562_1600x_LV7B_DPI5/mapped_reads/Oligos_71',
        'ST_Feb_2018/data/CAS9_12NA_1600X_DPI7/mapped_reads/Oligos_71',
    ]
    # Old samples
    old_dirs = [
        'ST_June_2017/data/K562_1600x_6OA_DPI5/mapped_reads/Oligos_71',
        'ST_June_2017/data/K562_1600x_6OA_DPI7/mapped_reads/Oligos_71',
        'ST_April_2017/data/K562_800x_6OA_DPI3_Old7/mapped_reads/Oligos_71',
        'ST_April_2017/data/K562_800x_6OA_DPI7_Old8/mapped_reads/Oligos_71',
        'ST_April_2017/data/K562_800x_6OA_DPI10_Old9/mapped_reads/Oligos_71',
        'ST_April_2017/data/K562_800x_6OB_DPI3_Old10/mapped_reads/Oligos_71',
        'ST_April_2017/data/K562_800x_6OB_DPI7_Old11/mapped_reads/Oligos_71',
        'ST_April_2017/data/K562_800x_6OB_DPI10_Old12/mapped_reads/Oligos_71',
    ]
    remove_long_indels = False
    remove_wt, wt_thresh = True, 3.0
    mappings = loadMappings()

    all_overbeek_profiles, all_our_profiles, sel_overbeek_ids = [], [], []
    # The old/new accumulators below are filled but not consumed within
    # this function.
    all_new_profiles, all_old_profiles = [], []
    oldnew_overbeek_ids, old_ids, new_ids = [], [], []
    overbeek_inframes, ours_inframes, oof_sel_overbeek_ids = [], [], []
    kls, kls_old, kls_new = [], [], []
    log_reads, log_reads_old, log_reads_new, min_log_reads = [], [], [], []
    overbeek_ids, above30_percentages = [], []

    for idx in range(1, 97):
        overbeek_id = 'Overbeek%d' % idx
        if selected_overbeek_id is not None and selected_overbeek_id != overbeek_id:
            continue
        if overbeek_id not in mappings:
            continue

        high_dir = getHighDataDir()
        overbeek_prefix = high_dir + '/overbeek_fastq_files/' + overbeek_id

        p1, p1_new, p1_old, o1, rep_reads1, rep_reads2 = {}, {}, {}, {}, {}, {}
        nreads2, nreads1, nreads_old, nreads_new = 0, 0, 0, 0
        nnull_old, nnull_new, nnull1, nnull2 = 0, 0, 0, 0

        # Read the Overbeek profile
        numread2, perc_accept2, num_null2 = readSummaryToProfile(
            overbeek_prefix + '_mappedindelsummary.txt', o1,
            oligoid=overbeek_id, remove_long_indels=remove_long_indels,
            remove_wt=False)
        if selected_overbeek_id is not None:
            fetchRepresentativeCleanReads(
                overbeek_prefix + '_mappedindelprofiles.txt', rep_reads2,
                oligoid=overbeek_id)
            pam_loc2, pam_dir2 = getNullTargetPamDetails(
                high_dir + '/overbeek_control_fastq_files/' + overbeek_id
                + '_exptargets.txt', oligoid=overbeek_id)
        nreads2 += numread2
        nnull2 += num_null2
        if numread2 == 0:
            continue

        # Per-replicate profiles / representative reads, indexed by rep_idx.
        p1_new_reps, p1_old_reps = [{}, {}], [{}, {}]
        rr_new_reps, rr_old_reps = [{}, {}], [{}, {}]

        # Read all the new and old profiles
        pam_loc1, pam_dir1 = None, None
        for oligo_id, is_old in mappings[overbeek_id]:
            # Read all reads for all our K562 profiles.
            # int() replaces the original eval(): the suffix is only ever
            # used as a numeric index and must not be executed as code.
            oligo_idx = int(oligo_id[5:])
            _, oligo_fileprefix = getFileForOligoIdx(oligo_idx, ext='')
            oligo_filename = oligo_fileprefix + '_mappedindelsummary.txt'
            read_filename = oligo_fileprefix + '_mappedindelprofiles.txt'
            exptarget_filename = oligo_fileprefix + '_exptargets.txt'

            if is_old:
                oligo_dirs, p1_old_new = old_dirs, p1_old
                null_oligo_dir = 'ST_April_2017/data/NULL_Old/mapped_reads/Oligos_71'
                p1_reps, rr_reps = p1_old_reps, rr_old_reps
            else:
                oligo_dirs, p1_old_new = new_dirs, p1_new
                null_oligo_dir = 'ST_April_2017/data/NULL_New/mapped_reads/Oligos_71'
                p1_reps, rr_reps = p1_new_reps, rr_new_reps

            for oligo_dir in [high_dir + '/' + x for x in oligo_dirs]:
                summary_path = oligo_dir + '/' + oligo_filename
                reads_path = oligo_dir + '/' + read_filename
                is_dpi7 = 'DPI7' in oligo_dir

                # Accumulate into the old-only / new-only profile ...
                readSummaryToProfile(
                    summary_path, p1_old_new, oligoid=oligo_id,
                    remove_long_indels=remove_long_indels,
                    remove_wt=remove_wt, wt_thresh=wt_thresh)
                # ... and into the combined profile.
                numread1, perc_accept1, num_null1 = readSummaryToProfile(
                    summary_path, p1, oligoid=oligo_id,
                    remove_long_indels=remove_long_indels,
                    remove_wt=remove_wt, wt_thresh=wt_thresh)

                if is_dpi7:
                    rep_idx = 0 if '800x' in oligo_dir else 1
                    readSummaryToProfile(
                        summary_path, p1_reps[rep_idx], oligoid=oligo_id,
                        remove_long_indels=remove_long_indels,
                        remove_wt=remove_wt, wt_thresh=wt_thresh)

                if selected_overbeek_id is not None:
                    fetchRepresentativeCleanReads(
                        reads_path, rep_reads1, oligoid=oligo_id)
                    if is_dpi7:
                        fetchRepresentativeCleanReads(
                            reads_path, rr_reps[rep_idx], oligoid=oligo_id)
                    if pam_loc1 is None:
                        pam_loc1, pam_dir1 = getNullTargetPamDetails(
                            high_dir + '/' + null_oligo_dir + '/'
                            + exptarget_filename, oligoid=oligo_id)

                if is_old:
                    nreads_old += numread1
                    nnull_old += num_null1
                else:
                    nreads_new += numread1
                    nnull_new += num_null1
                nreads1 += numread1
                nnull1 += num_null1

        kls.append(symmetricKL(p1, o1, True))
        kls_old.append(symmetricKL(p1_old, o1, True))
        kls_new.append(symmetricKL(p1_new, o1, True))
        # +0.5 keeps log10 defined when there are no mutated reads.
        log_reads.append(np.log10(nreads1 - nnull1 + 0.5))
        log_reads_old.append(np.log10(nreads_old - nnull_old + 0.5))
        log_reads_new.append(np.log10(nreads_new - nnull_new + 0.5))
        min_log_reads.append(min(log_reads_old[-1], log_reads_new[-1]))
        above30_percentages.append(computePercAbove30(o1))
        overbeek_ids.append(overbeek_id)

        if log_reads[-1] > 2.0:
            all_overbeek_profiles.append(o1)
            all_our_profiles.append(p1)
            sel_overbeek_ids.append(overbeek_id[8:])
            if above30_percentages[-1] < 50.0:
                oif, oof, _ = fetchIndelSizeCounts(o1)
                pif, pof, _ = fetchIndelSizeCounts(p1)
                overbeek_inframes.append(oif * 100.0 / (oif + oof))
                ours_inframes.append(pif * 100.0 / (pif + pof))
                oof_sel_overbeek_ids.append(overbeek_id)

        if min_log_reads[-1] > 2.0:
            all_new_profiles.append(p1_new)
            all_old_profiles.append(p1_old)
            oldnew_overbeek_ids.append(overbeek_id)
            old_ids.append(
                [oid for oid, is_old in mappings[overbeek_id] if is_old][0])
            new_ids.append(
                [oid for oid, is_old in mappings[overbeek_id] if not is_old][0])

        # overbeek_id is known to be in mappings here (checked above), so
        # no KeyError handling is needed.
        print(overbeek_id, [x for (x, y) in mappings[overbeek_id]],
              kls[-1], nreads2, nreads1)

        if selected_overbeek_id is not None:
            title = '%s (KL=%.1f)' % (overbeek_id, kls[-1])
            labels = [
                'Conventional scaffold Rep A', 'Conventional scaffold Rep B',
                'Improved scaffold Rep A', 'Improved scaffold Rep B',
                'Endogenous Profile'
            ]
            # Fourth profile is replicate B of the improved scaffold
            # (index 1), matching rr_new_reps[1] in the reads list.
            plotProfiles(
                [p1_old_reps[0], p1_old_reps[1],
                 p1_new_reps[0], p1_new_reps[1], o1],
                [rr_old_reps[0], rr_old_reps[1],
                 rr_new_reps[0], rr_new_reps[1], rep_reads2],
                [pam_loc1, pam_loc1, pam_loc1, pam_loc1, pam_loc2],
                [x == 'REVERSE'
                 for x in [pam_dir1, pam_dir1, pam_dir1, pam_dir1, pam_dir2]],
                labels, title=title)

    if selected_overbeek_id is None:
        plotInFrame(overbeek_inframes, ours_inframes, oof_sel_overbeek_ids,
                    pred_results_dir)

        # Scatter of KL against read depth, grouped by % deletions > 30.
        PL.figure(figsize=(5.5, 5))
        for thr_l, thr_h in [(0.0, 10.0), (10.0, 20.0), (20.0, 50.0),
                             (50.0, 90.0), (90.0, 100.0)]:
            in_bin = [
                (reads, kl, oid)
                for (kl, a30, oid, reads) in zip(
                    kls, above30_percentages, overbeek_ids, log_reads)
                if a30 > thr_l and a30 <= thr_h
            ]
            xdata = [reads for (reads, kl, oid) in in_bin]
            ydata = [kl for (reads, kl, oid) in in_bin]
            PL.plot(xdata, ydata, 'o',
                    label='%d-%d%% Deletions > 30' % (thr_l, thr_h))
            # Label outliers that are well covered but still divergent.
            for x, y, oid in in_bin:
                if y > 3 and x > 2:
                    PL.text(x, y, oid)
        PL.legend()
        PL.plot([0, 6], [0.77, 0.77], '--', color='grey')
        PL.text(0.1, 0.5, 'Median between our replicates', color='grey')
        PL.ylabel('Symmetric KL Divergence', fontsize=12)
        PL.xlabel('Log10 Mutated Reads', fontsize=12)
        PL.xlim((0, 5.5))
        PL.ylim((0, 8))
        PL.show(block=False)
        saveFig('scatter_KL')

        print('Median=', np.median(kls), 'Mean KL=', np.mean(kls))
        print(len(overbeek_ids))

        # Compute pairwise KL between overbeek and ours
        N = len(sel_overbeek_ids)
        kl_mat = np.zeros((N, N))
        for row, overbeek_profile in enumerate(all_overbeek_profiles):
            for col, our_profile in enumerate(all_our_profiles):
                kl_mat[row, col] = symmetricKL(overbeek_profile, our_profile)

        PL.figure(figsize=(8, 6))
        PL.imshow(kl_mat, cmap='hot_r', vmin=0.0, vmax=3.0,
                  interpolation='nearest')
        PL.xticks(range(N), sel_overbeek_ids, rotation='vertical', fontsize=6)
        PL.yticks(range(N), sel_overbeek_ids, rotation='horizontal', fontsize=6)
        PL.xlabel('Synthetic Measurement', fontsize=12)
        PL.ylabel('Endogenous Measurement', fontsize=12)
        PL.title('KL', fontsize=12)
        PL.colorbar()
        PL.show(block=False)
        saveFig('heatmap_KL')
random.shuffle(alt2_id_pairs) print(len(id_pairs), len(alt_id_pairs), len(alt2_id_pairs)) for (old_id, new_id), (alt_old_id, alt_new_id), (alt2_old_id, alt2_new_id) in zip( id_pairs, alt_id_pairs, alt2_id_pairs): print(old_id, new_id) p_old, p_new, mut_reads_old, mut_reads_new = loadProfilePair( old_id, new_id) old_ps, new_ps = loadSeparateProfilePairs(old_id, new_id) p_old_alt, p_new_alt, mut_reads_old_alt, mut_reads_new_alt = loadProfilePair( alt_old_id, alt_new_id) p_old_alt2, p_new_alt2, mut_reads_old_alt2, mut_reads_new_alt2 = loadProfilePair( alt2_old_id, alt2_new_id) old_if, old_of, _ = fetchIndelSizeCounts(p_old) new_if, new_of, _ = fetchIndelSizeCounts(p_new) old_if_perc = old_if * 100.0 / (old_if + old_of) new_if_perc = new_if * 100.0 / (new_if + new_of) out_str = '' for kl_func in [symmetricKL, classSymmetricKL]: str_args = (kl_func(p_old, p_new), meanSymKL(old_ps, kl_func=kl_func), meanSymKL(new_ps, kl_func=kl_func)) out_str += u'\t%.5f\t%.5f\t%.5f' % str_args out_str += u'\t%s\t%s\t%d\t%d' % (alt_old_id, alt_new_id, mut_reads_old_alt, mut_reads_new_alt) for kl_func in [symmetricKL, classSymmetricKL]: str_args = (np.mean([ kl_func(p_old, p_new_alt),
def computeAndComparePredicted(theta_file, selected_id=None, out_dir='.'):
    """Compare measured profiles of validation oligo pairs with predictions.

    For each ``(old_id, new_id)`` pair from ``loadValidationPairs()`` the
    old, new and combined measured profiles are compared with the profile
    predicted by the model in ``theta_file``. One tab-separated row per pair
    is written to ``<out_dir>/old_new_kl_predicted_summaries.txt``.

    Note: "old" refers to the conventional scaffold library and "new" to the
    improved scaffold library.

    Args:
        theta_file: Path handed to ``readTheta``.
        selected_id: If given, only the pair whose old id equals this value
            is processed, and its profiles are also plotted.
        out_dir: Directory in which the summary file is written.

    Raises:
        Exception: If any validation id is part of the model's training set.
    """
    theta, train_set, feature_columns = readTheta(theta_file)

    def in_frame_perc(profile):
        # Percentage of in-frame counts among in-frame + out-of-frame.
        n_if, n_of, _ = fetchIndelSizeCounts(profile)
        return n_if * 100.0 / (n_if + n_of)

    # The context manager closes the file even when the training-set check
    # or any loader below raises.
    with io.open(out_dir + '/old_new_kl_predicted_summaries.txt', 'w') as fout:
        fout.write(
            u'Old Oligo Id\tNew Oligo Id\tOld Mut Reads\tNew Mut Reads\tCombined Mut Reads\tOld In Frame Perc\tNew In Frame Perc\tCombined in Frame Perc\tPredicted In Frame Per'
        )
        fout.write(
            u'\tOld v New KL\tOld v Predicted KL\tNew v Predicted KL\tCombined v Predicted KL\n'
        )

        id_pairs = loadValidationPairs()
        for (old_id, new_id) in id_pairs:
            if old_id in train_set or new_id in train_set:
                raise Exception('Bad!!! Testing on Training data: %s %s' %
                                (old_id, new_id))
            if selected_id is not None and selected_id != old_id:
                continue  # Guide pair selected for plotting

            # Load old and new profiles, and produce the combined profile
            p_old, p_new, mut_reads_old, mut_reads_new = loadProfilePair(
                old_id, new_id)
            p_comb, mut_reads_comb = combineProfiles(
                p_old, p_new, mut_reads_old, mut_reads_new)

            # Predict the profile (old and new will be the same so just do one)
            feature_data = loadOligoFeaturesAndReadCounts(new_id, [])
            p_predict, _ = computePredictedProfile(
                feature_data, theta, feature_columns)

            # Compute in-frame percentages
            old_if_perc = in_frame_perc(p_old)
            new_if_perc = in_frame_perc(p_new)
            comb_if_perc = in_frame_perc(p_comb)
            pred_if_perc = in_frame_perc(p_predict)

            # KL divergences, computed once and reused for plot and output
            kl_old_new = symmetricKL(p_old, p_new)
            kl_old_pred = symmetricKL(p_old, p_predict)
            kl_new_pred = symmetricKL(p_new, p_predict)
            kl_comb_pred = symmetricKL(p_comb, p_predict)

            # Plot the comparison
            if selected_id is not None:
                rrds = loadRepReads(new_id)
                plotProfiles(
                    [p_old, p_new, p_predict], [rrds, rrds, rrds],
                    [42, 42, 42], [False, False, False],
                    ['Replicate 1', 'Replicate 2', 'Predicted'],
                    title='%s (KL=%.2f, KL=%.2f)' % (new_id, kl_old_new,
                                                     kl_comb_pred))

            kl_str = u'\t%.5f\t%.5f\t%.5f\t%.5f' % (
                kl_old_new, kl_old_pred, kl_new_pred, kl_comb_pred)
            fout.write(
                u'%s\t%s\t%d\t%d\t%d\t%.3f\t%.3f\t%.3f\t%.3f%s\n' %
                (old_id, new_id, mut_reads_old, mut_reads_new, mut_reads_comb,
                 old_if_perc, new_if_perc, comb_if_perc, pred_if_perc, kl_str))
            # Flush per row so partial results survive a later failure.
            fout.flush()