示例#1
0
def prepare_indelphi(seq, cut, celltype):
    """Predict editing outcomes for one target and return them ranked.

    Initialises inDelphi for ``celltype`` (which is also echoed to stdout),
    predicts outcomes for ``seq`` cut at ``cut``, and annotates the prediction
    table with MH-less genotypes, a genotype column and a name column.
    The 'Predicted frequency' column is then rescaled so that it sums to 1
    and the table is returned sorted by that column, highest first.
    """
    print(celltype)
    inDelphi.init_model(celltype=celltype)

    predictions, stats = inDelphi.predict(seq, cut)
    with_mhless = inDelphi.add_mhless_genotypes(predictions, stats)
    with_genotype = inDelphi.add_genotype_column(with_mhless, stats)
    table = inDelphi.add_name_column(with_genotype, stats)

    # Renormalise so the reported frequencies form a distribution.
    freq_col = 'Predicted frequency'
    raw = table.loc[:, freq_col]
    table.loc[:, freq_col] = raw / raw.sum()

    ranked = table.sort_values(by=[freq_col], ascending=False)
    return ranked
示例#2
0
    # Upper-case the reference before searching for the guide in it.
    ref = ref.upper()
    # grna is passed to re.search, so it is treated as a regular-expression
    # pattern rather than a literal string.
    m = re.search(grna, ref)
    if m:
        # Guide found on the strand as given: the cut index is 3 characters
        # before the end of the match.
        cut = m.end() - 3  #?
        orientation = 1
    else:
        # Not found: retry on the reverse complement and report orientation -1.
        # Seq is presumably Bio.Seq.Seq -- confirm against the file's imports.
        ref = str(Seq(ref).reverse_complement())
        m = re.search(grna, ref)
        orientation = -1
        # NOTE(review): if the guide matches neither strand, m is None here
        # and m.end() raises AttributeError.
        cut = m.end() - 3
    # ref is returned in the orientation in which the guide matched, and cut
    # is an index into that returned string.
    return ref, cut, orientation


if __name__ == '__main__':
    # Script entry point: for every data row of a tab-separated input table,
    # compute the cut index and orientation of the wild-type and mutant guides.
    # NOTE(review): only the computation is visible in this section; writing
    # the rows to Fr and closing it are not shown here.
    f = sys.argv[1]  #input
    inDelphi.init_model(celltype='U2OS')
    # NOTE(review): this input handle is never closed explicitly.
    lines = open(f).readlines()
    # Output header = input header plus four new column names.
    h = lines[0].rstrip(
    ) + "\twt_cut\twt_orientation\tmut_cut\tmut_orientation"
    # Output path is the text before the FIRST "." of the input path plus
    # "_ori.xls"; a dot in a directory name (or a leading "./") therefore
    # truncates the path earlier than the file extension.
    Fr = open(f.split(".")[0] + "_ori.xls", "w")
    Fr.write(h + "\n")
    for x in lines[1:]:
        x = x.rstrip()
        l = x.split("\t")
        # Zero-based columns 2/3 are read as the wild-type/mutant guides and
        # columns 6/7 as the wild-type/mutant reference sequences.
        wt_grna = l[2].upper()
        mut_grna = l[3].upper()
        wt_ref = l[6].upper()
        mut_ref = l[7].upper()
        # my_cut's result is unpacked as (reference, cut index, orientation);
        # the returned reference replaces the one read from the row.
        wt_ref, wt_cut, wt_orientation = my_cut(wt_grna, wt_ref)
        mut_ref, mut_cut, mut_orientation = my_cut(mut_grna, mut_ref)
        #print wt_cut,wt_orientation
示例#3
0
import inDelphi
from scipy.stats import linregress
import numpy as np
import pandas as pd

sys.path.append("/cluster/bh0085")
from mybio import util
from _config import REDUCED_LIB, OUT_PLACE
import imp

# Fall back to a fixed script name when __file__ is not defined
# (e.g. when the code is pasted into an interactive session).
if "__file__" not in vars():
    __file__ = "f_test"
NAME = util.get_fn(__file__)
OUT_DIR = os.path.join(OUT_PLACE, NAME)
util.ensure_dir_exists(OUT_DIR)

# Column of REDUCED_LIB holding the target sequence, and the cut index that
# the column name itself documents. Both are loop-invariant.
SEQ_COLUMN = "Designed sequence (61-bp, cutsite at position 34 by 0-index)"
CUTSITE = 34

# Collect one frame per (cell type, library entry) and concatenate once at
# the end: DataFrame.append copied the whole accumulated table on every call
# and was removed in pandas 2.0.
prediction_frames = []
for model in ["mESC", "U2OS"]:
    # The module is reloaded before each init_model call, presumably so that
    # state from the previous cell type does not carry over -- confirm.
    # NOTE: `imp` is deprecated and removed in Python 3.12; importlib.reload
    # is the replacement once the file-level import is updated.
    imp.reload(inDelphi)
    inDelphi.init_model(celltype=model)
    for k, row in REDUCED_LIB.iterrows():
        target_seq = row[SEQ_COLUMN]
        pred_df, stats = inDelphi.predict(target_seq, CUTSITE)
        pred_df = inDelphi.add_mhless_genotypes(pred_df, stats)
        # Tag every predicted genotype with its cell type and library index.
        pred_df = pred_df.assign(**{"celltype": model, "libid": k})
        prediction_frames.append(pred_df)

# pd.concat raises on an empty list, so keep the original behaviour of
# producing an empty table when the library has no rows.
if prediction_frames:
    all_predictions = pd.concat(prediction_frames, ignore_index=True)
else:
    all_predictions = pd.DataFrame()

all_predictions.to_csv(os.path.join(OUT_DIR, "indelphi_genotypes.csv"),
                       index=False)
示例#4
0
# Table created empty here; it is not filled in the visible part of the loop.
sim_info = pd.DataFrame()

# Progress bar sized to the number of targets (Bar is presumably
# progress.bar.Bar -- confirm against the file's imports).
bar = Bar('Simulating sequences:', max=len(target_seqs_data))

# target_seqs_data maps a gRNA id to its target sequence.
for gRNA_id, target_seq in target_seqs_data.items():

    # Calculate activity score using doench for that guide
    ## 30mer: 4bp 5', 20bp guide, 3bp PAM, 3bp 3'
    ## (the slice below spans cutsite-21 .. cutsite+9, i.e. 30 characters)
    seq_for_doench = target_seq[cutsite - 21:cutsite + 9]
    doench_score = calc_doench_score(seq_for_doench)
    ## doench score is from 0 to 100, scale to get numb of edited reads
    n_edit_seqs = round(doench_score * sim_reads /
                        100)  #round to have an integer number of reads

    # Calculate editing outcomes
    # NOTE(review): init_model is called with the same argument on every
    # iteration; it looks loop-invariant -- confirm before hoisting.
    inDelphi.init_model(celltype='mESC')
    pred_df, stats = inDelphi.predict(target_seq, cutsite)
    pred_df = inDelphi.add_mhless_genotypes(pred_df, stats)
    #pred_df = inDelphi.add_genotype_column(pred_df,stats)
    ## adds gaps in the deletions, use add_genotype_column to avoid gaps, but sequences could be confused
    pred_df = inDelphi.add_genotype_column_wgaps(pred_df, stats)
    # np.array copies by default, so the in-place division below does not
    # modify the column stored in pred_df.
    pred_frequency = np.array(pred_df["Predicted frequency"])
    # normalize probabilities to sum exactly 1
    pred_frequency /= pred_frequency.sum()

    # Simulate data
    ## first, create the edited reads
    ## (sample n_edit_seqs genotypes, weighted by the normalized frequencies)
    edit_seqs = np.random.choice(pred_df["Genotype"],
                                 p=pred_frequency,
                                 size=(n_edit_seqs))
    ## add non edited reads up to the sim_reads objective
示例#5
0
def runDelphi(dataframe, truth_reference, max_oligos = 1, cell_type = 'mESC', file_prefix = "inDelphi", pathout = ""):
    """Benchmark inDelphi predictions against ground truth, oligo by oligo.

    Rows of ``dataframe`` are visited in order until ``max_oligos`` of them
    have been processed successfully or the rows run out. For each row the
    oligo id, sequence and PAM index are read from columns 0, 2 and 6.
    A row whose processing raises is reported on stdout, recorded in the
    running error list and skipped; it does not count towards ``max_oligos``.

    Four tab-separated files are written, all named
    ``pathout + file_prefix + "_" + cell_type + <suffix>``:
    ``_statistics.txt``, ``_indels_frequency_predicted.txt``,
    ``_indels_frequency_actual.txt`` and ``_modelstats.txt``.

    Args:
        dataframe: table of oligos, indexed positionally (``.iloc``).
        truth_reference: passed through to ``format_ground_truth``.
        max_oligos: number of successfully processed oligos to stop at.
        cell_type: cell type handed to ``inDelphi.init_model``; also part of
            the output file names.
        file_prefix: prefix of the output file names.
        pathout: string prepended verbatim to every output file name (no
            path separator is inserted).

    Returns:
        None. Results are written to the output files and progress to stdout.
    """
    # init model
    inDelphi.init_model(celltype=cell_type)

    print("init runDelphi.."); time.sleep(1)

    out_base = pathout + file_prefix + "_" + cell_type

    # Open the outputs in a with-statement so they are closed even when an
    # exception escapes the loop below.
    with open(out_base + "_statistics.txt", 'w') as outfile1, \
            open(out_base + "_indels_frequency_predicted.txt", 'w') as outfile2, \
            open(out_base + "_indels_frequency_actual.txt", 'w') as outfile3, \
            open(out_base + "_modelstats.txt", 'w') as outfile4:

        # prepare dictionaries for storing relevant data
        mutations_reference_actual = compile_mutations()
        mutations_reference_pred = compile_mutations()

        # Both frequency files share the same header line.
        header = "oligo_id" + "\t" + "\t".join(mutations_reference_pred.keys()) + "\n"
        outfile2.write(header)
        outfile3.write(header)

        errs = []
        klavg = 0
        elapsed_time = 0
        iteration = 0
        n_oligos = 0
        n_rows = len(dataframe)

        # Stop at the end of the table as well: failed rows do not advance
        # n_oligos, so without this bound iloc would eventually raise
        # IndexError when too few rows succeed.
        while n_oligos < max_oligos and iteration < n_rows:
            start = time.time()
            oligo = dataframe.iloc[[iteration]].values.tolist()[0]
            oligo_id = oligo[0]; seq = oligo[2]; pam_idx = int(oligo[6])
            sys.stdout.write("Query [{0}/{1}]: {2}\n".format(n_oligos, max_oligos, oligo_id))
            try:
                # Get a dataframe from the ground truth and one from the
                # inDelphi prediction, then merge them on mutation type.
                actual = format_ground_truth(truth_reference, oligo_id)
                prediction, stats = format_predction(seq, pam_idx)  # cutsite is modified in func
                df = actual.merge(prediction, how='outer', on='type').fillna(0)

                ## Run statistical analysis
                loss = L(list(df['pred']), list(df['actual']))
                stat01 = str(1 - stats['Frameshift frequency'] / 100)
                stat02 = stats_frameshift(df, 'actual')
                # NOTE(review): x and y are not defined anywhere in this
                # function. Unless they are module-level globals, every
                # iteration raises NameError here and is counted as an error.
                # They may have been meant to be the stat_freq_pred /
                # stat_freq_actual values computed below -- confirm and fix.
                acc01 = accuracy_type_agreement(x, y, top = 1)
                acc02 = accuracy_type_agreement(x, y, top = 2)
                acc03 = accuracy_type_agreement(x, y, top = 3)
                # Reading-frame accuracies are currently disabled.
                acc04 = None
                acc05 = None
                acc06 = None
                stat_freq_pred = stats_frequency_mutations(df, 'pred', mutations_reference_pred.copy())
                stat_freq_actual = stats_frequency_mutations(actual, 'actual', mutations_reference_actual.copy())

                outfile2.write(oligo_id + "\t" + "\t".join([str(v) for v in x]) + "\n")  # predicted
                outfile3.write(oligo_id + "\t" + "\t".join([str(v) for v in y]) + "\n")
                outfile1.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\n".format(oligo_id, loss, acc01, acc02, acc03, acc04, acc05, acc06, stat01, stat02))
                # One line per oligo; the newline was missing, which fused
                # consecutive rows together.
                outfile4.write("\t".join([str(v) for _, v in stats.items()]) + "\n")

                # Gather statistics about the run (only successful rows count).
                klavg += loss
                n_oligos += 1

                end = time.time(); elapsed_time += (end - start)

                if n_oligos % 20 == 0:
                    print("KL=", np.round(klavg / n_oligos, 5))
                    time.sleep(1)

                sys.stdout.write("SElapsed Time: " + str(round(elapsed_time, 0)) + "s\n")
            except Exception as e:
                # Deliberate best-effort: report the failing oligo and move on.
                sys.stdout.write("\n>>>ERROR: " + str(e) + "\n")
                errs.append(oligo_id)
                print("CUR ERRORS: " + str(errs))
                time.sleep(3)
            iteration += 1