def prepare_indelphi(seq, cut, celltype):
    """Return inDelphi predictions for one target, renormalized and ranked.

    Prints ``celltype``, initializes the inDelphi model for it, predicts the
    outcomes for ``seq`` at position ``cut``, and passes the prediction table
    through ``add_mhless_genotypes``, ``add_genotype_column`` and
    ``add_name_column`` in that order.

    The 'Predicted frequency' column is divided by its own sum, and the table
    is returned sorted by that column in descending order.
    """
    print(celltype)
    inDelphi.init_model(celltype=celltype)

    predictions, stats = inDelphi.predict(seq, cut)

    # Each annotation step takes and returns the prediction table.
    annotation_steps = (
        inDelphi.add_mhless_genotypes,
        inDelphi.add_genotype_column,
        inDelphi.add_name_column,
    )
    for annotate in annotation_steps:
        predictions = annotate(predictions, stats)

    # Rescale so the frequencies sum to one.
    freq_col = 'Predicted frequency'
    raw_freq = predictions.loc[:, freq_col]
    predictions.loc[:, freq_col] = raw_freq / raw_freq.sum()

    ranked = predictions.sort_values(by=[freq_col], ascending=False)
    return ranked
ref = ref.upper() m = re.search(grna, ref) if m: cut = m.end() - 3 #? orientation = 1 else: ref = str(Seq(ref).reverse_complement()) m = re.search(grna, ref) orientation = -1 cut = m.end() - 3 return ref, cut, orientation if __name__ == '__main__': f = sys.argv[1] #input inDelphi.init_model(celltype='U2OS') lines = open(f).readlines() h = lines[0].rstrip( ) + "\twt_cut\twt_orientation\tmut_cut\tmut_orientation" Fr = open(f.split(".")[0] + "_ori.xls", "w") Fr.write(h + "\n") for x in lines[1:]: x = x.rstrip() l = x.split("\t") wt_grna = l[2].upper() mut_grna = l[3].upper() wt_ref = l[6].upper() mut_ref = l[7].upper() wt_ref, wt_cut, wt_orientation = my_cut(wt_grna, wt_ref) mut_ref, mut_cut, mut_orientation = my_cut(mut_grna, mut_ref) #print wt_cut,wt_orientation
import inDelphi
from scipy.stats import linregress
import numpy as np
import pandas as pd

# NOTE(review): `sys` and `os` are used below but not imported in this span;
# presumably they are imported earlier in the file -- confirm.
sys.path.append("/cluster/bh0085")
from mybio import util
from _config import REDUCED_LIB, OUT_PLACE
# NOTE(review): `imp` is deprecated and was removed in Python 3.12;
# `importlib.reload` is the documented replacement.
import imp

# Fall back to a fixed name when the script is run without __file__ defined.
if "__file__" not in vars():
    __file__ = "f_test"
NAME = util.get_fn(__file__)
OUT_DIR = os.path.join(OUT_PLACE, NAME)
util.ensure_dir_exists(OUT_DIR)

# Cut position handed to inDelphi.predict for every library sequence.
CUTSITE = 34

# Collect the per-target tables and concatenate once at the end.
# DataFrame.append copied the whole accumulated frame on every call and was
# removed in pandas 2.0.
prediction_frames = []
for model in ["mESC", "U2OS"]:
    # Reload the module before initializing it for the next cell type.
    imp.reload(inDelphi)
    inDelphi.init_model(celltype=model)
    for k, row in REDUCED_LIB.iterrows():
        target_seq = row[
            "Designed sequence (61-bp, cutsite at position 34 by 0-index)"]
        pred_df, stats = inDelphi.predict(target_seq, CUTSITE)
        pred_df = inDelphi.add_mhless_genotypes(pred_df, stats)
        # Tag every row with the model and the library index it came from.
        pred_df = pred_df.assign(**{"celltype": model, "libid": k})
        prediction_frames.append(pred_df)

# pd.concat raises on an empty list; keep the original behaviour of writing
# an empty table when the library has no rows.
if prediction_frames:
    all_predictions = pd.concat(prediction_frames, ignore_index=True)
else:
    all_predictions = pd.DataFrame()

all_predictions.to_csv(os.path.join(OUT_DIR, "indelphi_genotypes.csv"),
                       index=False)
sim_info = pd.DataFrame() bar = Bar('Simulating sequences:', max=len(target_seqs_data)) for gRNA_id, target_seq in target_seqs_data.items(): # Calculate activity score using doench for that guide ## 30mer: 4bp 5', 20bp guide, 3bp PAM, 3bp 5' seq_for_doench = target_seq[cutsite - 21:cutsite + 9] doench_score = calc_doench_score(seq_for_doench) ## doench score is from 0 to 100, scale to get numb of edited reads n_edit_seqs = round(doench_score * sim_reads / 100) #round to have an integer number of reads # Calculate editing outcomes inDelphi.init_model(celltype='mESC') pred_df, stats = inDelphi.predict(target_seq, cutsite) pred_df = inDelphi.add_mhless_genotypes(pred_df, stats) #pred_df = inDelphi.add_genotype_column(pred_df,stats) ## adds gaps in the deletions, use add_genotype_column to avoid gaps, but sequences could be confused pred_df = inDelphi.add_genotype_column_wgaps(pred_df, stats) pred_frequency = np.array(pred_df["Predicted frequency"]) # normalize probabilities to sum exactly 1 pred_frequency /= pred_frequency.sum() # Simulate data ## first, create the edited reads edit_seqs = np.random.choice(pred_df["Genotype"], p=pred_frequency, size=(n_edit_seqs)) ## add non edited reads up to the sim_reads objective
def runDelphi(dataframe, truth_reference, max_oligos = 1, cell_type = 'mESC', file_prefix = "inDelphi", pathout = ""):
    """Compare inDelphi predictions with ground truth and write the results.

    Rows of ``dataframe`` are visited in positional order. From each row the
    function reads column 0 (oligo id), column 2 (sequence) and column 6
    (PAM index, cast to ``int``). A row counts as processed only when every
    step inside the ``try`` succeeds; failing rows are reported on stdout,
    their ids collected, and the loop moves on to the next row.

    The loop stops after ``max_oligos`` rows were processed successfully or
    when ``dataframe`` has no rows left, whichever comes first.

    Four files named ``pathout + file_prefix + "_" + cell_type + suffix`` are
    written, with suffixes ``_statistics.txt``,
    ``_indels_frequency_predicted.txt``, ``_indels_frequency_actual.txt`` and
    ``_modelstats.txt``. They are closed on every exit path.

    Args:
        dataframe: table indexed with ``.iloc``; see the column use above.
        truth_reference: passed unchanged to ``format_ground_truth``.
        max_oligos: number of successfully processed rows to stop after.
        cell_type: cell type handed to ``inDelphi.init_model``; also part of
            the output file names.
        file_prefix: prefix of the output file names.
        pathout: string prepended verbatim to the output file names.

    Returns:
        None.
    """
    # init model
    inDelphi.init_model(celltype = cell_type)
    print("init runDelphi..")
    time.sleep(1)

    out_base = pathout + file_prefix + "_" + cell_type

    # Open files so that data can be written; the context manager closes all
    # four even when a row raises outside the per-oligo try block.
    with open(out_base + "_statistics.txt", 'w') as outfile1, \
            open(out_base + "_indels_frequency_predicted.txt", 'w') as outfile2, \
            open(out_base + "_indels_frequency_actual.txt", 'w') as outfile3, \
            open(out_base + "_modelstats.txt", 'w') as outfile4:

        # prepare dictionaries for storing relevant data
        mutations_reference_actual = compile_mutations()
        mutations_reference_pred = compile_mutations()

        # Both frequency files share the same header, built from the keys of
        # the prediction reference.
        header = "oligo_id" + "\t" + "\t".join(mutations_reference_pred.keys()) + "\n"
        outfile2.write(header)
        outfile3.write(header)

        errs = []
        klavg = 0  # running sum of the per-oligo loss

        # Variables for loop
        elapsed_time = 0
        iteration = 0  # positional row index, advanced on success and failure
        n_oligos = 0   # rows processed without error
        n_rows = len(dataframe)

        # Stop at the end of the table as well: failed rows do not count
        # towards max_oligos, so the row index can otherwise run past the
        # last row and raise IndexError.
        while n_oligos < max_oligos and iteration < n_rows:
            start = time.time()
            oligo = dataframe.iloc[[iteration]].values.tolist()[0]
            oligo_id = oligo[0]
            seq = oligo[2]
            pam_idx = int(oligo[6])
            sys.stdout.write("Query [{0}/{1}]: {2}\n".format(n_oligos, max_oligos, oligo_id))

            try:
                # Get a dataframe from the ground truth and the prediction
                # of inDelphi. Merge afterwards.
                actual = format_ground_truth(truth_reference, oligo_id)
                prediction, stats = format_predction(seq, pam_idx)  # cutsite is modified in func
                df = actual.merge(prediction, how='outer', on='type').fillna(0)

                ## Run statistical analysis
                loss = L(list(df['pred']), list(df['actual']))
                stat01 = str(1 - stats['Frameshift frequency'] / 100)
                stat02 = stats_frameshift(df, 'actual')

                # NOTE(review): `x` and `y` are never assigned in this
                # function. Unless they exist as module globals, the lines
                # using them raise NameError and every row lands in the
                # except branch. The unused stat_freq_pred / stat_freq_actual
                # below may be the intended source -- confirm.
                acc01 = accuracy_type_agreement(x, y, top = 1)
                acc02 = accuracy_type_agreement(x, y, top = 2)
                acc03 = accuracy_type_agreement(x, y, top = 3)
                # Placeholders that keep the statistics file at ten columns.
                acc04 = None
                acc05 = None
                acc06 = None

                stat_freq_pred = stats_frequency_mutations(df, 'pred', mutations_reference_pred.copy())
                stat_freq_actual = stats_frequency_mutations(actual, 'actual', mutations_reference_actual.copy())

                outfile2.write(oligo_id + "\t" + "\t".join([str(v) for v in x]) + "\n")  # predicted
                outfile3.write(oligo_id + "\t" + "\t".join([str(v) for v in y]) + "\n")
                outfile1.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\n".format(
                    oligo_id, loss, acc01, acc02, acc03, acc04, acc05, acc06, stat01, stat02))
                # One record per line; without the newline consecutive
                # records ran together with no separator.
                outfile4.write("\t".join([str(v) for v in stats.values()]) + "\n")

                # Gather statistics about the code, i.e. timing
                klavg += loss
                n_oligos += 1
                end = time.time()
                elapsed_time += (end - start)

                # Periodic progress report every 20 processed rows.
                # NOTE(review): the nesting of these three statements was
                # lost in the collapsed source -- confirm.
                if n_oligos % 20 == 0:
                    print("KL=", np.round(klavg / n_oligos, 5))
                    time.sleep(1)
                    sys.stdout.write("SElapsed Time: " + str(round(elapsed_time, 0)) + "s\n")

            except Exception as e:
                # Best effort: report the failing oligo and continue with
                # the next row.
                sys.stdout.write("\n>>>ERROR: " + str(e) + "\n")
                errs.append(oligo_id)
                print("CUR ERRORS: " + str(errs))
                time.sleep(3)

            iteration += 1