Пример #1
0
def prepare_indelphi(seq, cut, celltype):
    """Predict editing outcomes for ``seq`` with inDelphi and tidy the table.

    The cell type is echoed to stdout, the inDelphi model is initialised for
    it, and a prediction is made for ``seq`` cut at index ``cut``.  The
    prediction table is then extended by the MH-less genotype, genotype
    column and name column helpers, its 'Predicted frequency' column is
    rescaled so that it sums to 1, and the rows are ordered from the most to
    the least frequent outcome.

    Args:
        seq: target sequence handed to ``inDelphi.predict``.
        cut: cut-site index handed to ``inDelphi.predict``.
        celltype: cell-type name handed to ``inDelphi.init_model``.

    Returns:
        The annotated, renormalised and sorted prediction DataFrame.
    """
    print(celltype)
    inDelphi.init_model(celltype=celltype)
    pred_df, stats = inDelphi.predict(seq, cut)

    # Each helper takes the table plus the stats and returns the extended
    # table, so they can be applied one after another.
    annotators = (
        inDelphi.add_mhless_genotypes,
        inDelphi.add_genotype_column,
        inDelphi.add_name_column,
    )
    for annotate in annotators:
        pred_df = annotate(pred_df, stats)

    # Rescale the frequencies so they sum to exactly 1.
    column = 'Predicted frequency'
    frequencies = pred_df.loc[:, column]
    total = frequencies.sum()
    pred_df.loc[:, column] = frequencies / total

    return pred_df.sort_values(by=[column], ascending=False)
Пример #2
0
def format_predction(seq, pam_idx):
    """Run inDelphi on ``seq`` and summarise the prediction per outcome class.

    The cut site is taken to be 3 positions before ``pam_idx``.  The raw
    prediction is grouped by ('Category', 'Length') with the predicted
    frequencies summed, and each group is labelled with the upper-cased
    first character of its category followed by its length.

    Args:
        seq: target sequence handed to ``inDelphi.predict``.
        pam_idx: index of the PAM within ``seq``.

    Returns:
        A ``(pred_df, stats)`` tuple.  ``pred_df`` has two columns: 'type'
        (the label described above) and 'pred' (the summed predicted
        frequency divided by 100 -- presumably converting percent to a
        fraction; confirm against inDelphi's output scale).  ``stats`` is
        returned unchanged from ``inDelphi.predict``.
    """
    cutsite = pam_idx - 3
    pred_df, stats = inDelphi.predict(seq, cutsite)

    grouped = pred_df.groupby(['Category', 'Length'], as_index=False).agg(
        {'Predicted frequency': 'sum'})
    grouped['New category'] = (
        grouped['Category'].astype(str).str[0] + grouped['Length'].map(str))

    # Work on an explicit copy: assigning columns on a column-subset slice
    # of ``grouped`` is what triggers pandas' SettingWithCopyWarning.
    summary = grouped[['New category', 'Predicted frequency']].copy()
    summary.columns = ['type', 'pred']
    summary['type'] = summary['type'].str.upper()
    summary['pred'] = summary['pred'].astype(float) / 100

    return summary, stats
"""
Run each component of inDelphi to understand its inputs/outputs
"""

# zzjfrank, 2020-10-11

from inDelphi import init_model, predict

# Example target taken from the inDelphi web server, written as the two
# halves that sit on either side of the cut.
left = 'GCAGTCAGTGCAGTAGAGGATGTGTCGCTCTCCCGTACGGCGTGAAAATGACTAGCAAAG'
right = 'TTGGGGCCTTTTTGGAAGACCTAGAGCCTTAGGCCACGGTACACAATGGTGTCCTGCATA'

# The cut falls directly after the left half, so its index is len(left).
cutsite = len(left)
seq = ''.join((left, right))

pred_df, stats = predict(seq, cutsite)
Пример #4
0
    # Process every record after the first entry of ``lines`` (lines[0] is
    # skipped -- presumably a header row; confirm against the code that
    # builds ``lines``, which is outside this fragment).
    for x in lines[1:]:
        x = x.rstrip()
        # Records are tab-separated.  Going by the variable names, fields 2/3
        # hold the wild-type / mutant gRNA and fields 6/7 the matching
        # reference sequences; everything is upper-cased before use.
        l = x.split("\t")
        wt_grna = l[2].upper()
        mut_grna = l[3].upper()
        wt_ref = l[6].upper()
        mut_ref = l[7].upper()
        # my_cut is defined elsewhere.  It yields a (sequence, cut position,
        # orientation) triple, and the returned sequence replaces the
        # reference that was passed in.
        wt_ref, wt_cut, wt_orientation = my_cut(wt_grna, wt_ref)
        mut_ref, mut_cut, mut_orientation = my_cut(mut_grna, mut_ref)
        #print wt_cut,wt_orientation
        #print mut_cut,mut_orientation
        # Summary row: the first five input fields followed by both
        # references as returned by my_cut and their cut / orientation values.
        fm = "\t".join(l[:5]) + "\t" + wt_ref + "\t" + mut_ref + "\t" + str(
            wt_cut) + "\t" + str(wt_orientation) + "\t" + str(
                mut_cut) + "\t" + str(mut_orientation)
        Fr.write(fm + "\n")
        # Predict editing outcomes for both alleles.  The returned stats are
        # not used below (only by the commented-out lines that follow).
        wt_pred_df, wt_stats = inDelphi.predict(wt_ref, wt_cut)
        mut_pred_df, mut_stats = inDelphi.predict(mut_ref, mut_cut)
        #wt_stats['gRNA']=wt_grna
        #mut_stats['gRNA']=mut_grna
        #wt_stats['gRNA orientation']=wt_orientation
        #mut_stats['gRNA orientation']=mut_orientation
        #print wt_pred_df
        # Reduce each prediction with inDelphi.get_indel_length_fqs and write
        # one file per allele, named after field 1 of the record.
        # NOTE: despite the ".xls" extension these are tab-separated text.
        wt_df_indel = inDelphi.get_indel_length_fqs(wt_pred_df)
        mut_df_indel = inDelphi.get_indel_length_fqs(mut_pred_df)
        wt_df_indel.to_csv("wt_" + l[1] + "_indel.xls", sep="\t", index=False)
        mut_df_indel.to_csv("mut_" + l[1] + "_indel.xls",
                            sep="\t",
                            index=False)
        #break
    # Fr is the summary output handle opened outside this fragment.
    Fr.close()
Пример #5
0
bar = Bar('Simulating sequences:', max=len(target_seqs_data))

# The cell-type model does not depend on the guide, so initialise it once
# here rather than on every loop iteration.
inDelphi.init_model(celltype='mESC')

for gRNA_id, target_seq in target_seqs_data.items():

    # Calculate activity score using doench for that guide
    ## 30mer: 4bp 5', 20bp guide, 3bp PAM, 3bp 3'
    ## (21 bases before the cut site and 9 after it)
    seq_for_doench = target_seq[cutsite - 21:cutsite + 9]
    doench_score = calc_doench_score(seq_for_doench)
    ## doench score is from 0 to 100, scale to get numb of edited reads.
    ## int() guarantees a true integer: round() can return a float when the
    ## score is a NumPy scalar, and the value is used below as a sample size
    ## and a repeat count.
    n_edit_seqs = int(round(doench_score * sim_reads / 100))

    # Calculate editing outcomes
    pred_df, stats = inDelphi.predict(target_seq, cutsite)
    pred_df = inDelphi.add_mhless_genotypes(pred_df, stats)
    ## adds gaps in the deletions, use add_genotype_column to avoid gaps,
    ## but sequences could be confused
    pred_df = inDelphi.add_genotype_column_wgaps(pred_df, stats)
    ## copy into a float array so the in-place division below is always valid
    pred_frequency = np.array(pred_df["Predicted frequency"], dtype=float)
    # normalize probabilities to sum exactly 1
    pred_frequency /= pred_frequency.sum()

    # Simulate data
    ## first, create the edited reads by sampling genotypes according to
    ## their predicted frequency
    edit_seqs = np.random.choice(pred_df["Genotype"],
                                 p=pred_frequency,
                                 size=n_edit_seqs)
    ## add non edited reads up to the sim_reads objective
    wt_seqs = np.repeat(target_seq, sim_reads - n_edit_seqs)
Пример #6
0
import inDelphi
from scipy.stats import linregress
import numpy as np
import pandas as pd

sys.path.append("/cluster/bh0085")
from mybio import util
from _config import REDUCED_LIB, OUT_PLACE
import imp

# When run without a __file__ (e.g. pasted into an interactive session),
# fall back to a fixed script name.
if "__file__" not in vars():
    __file__ = "f_test"

# Outputs go to a directory under OUT_PLACE named by util.get_fn(__file__);
# create it if needed.
NAME = util.get_fn(__file__)
OUT_DIR = os.path.join(OUT_PLACE, NAME)
util.ensure_dir_exists(OUT_DIR)

# importlib.reload replaces the deprecated imp.reload (the imp module was
# removed in Python 3.12).
import importlib

# Cut-site index used for every library sequence (see the column name below).
CUTSITE = 34

# Collect one frame per (cell type, library entry) and concatenate once at
# the end: DataFrame.append was removed in pandas 2.0, and it re-copied the
# whole accumulated table on every call.
prediction_frames = []
for model in ["mESC", "U2OS"]:
    # Reload inDelphi before initialising the next cell type -- presumably
    # because init_model keeps module-level state; confirm against inDelphi.
    importlib.reload(inDelphi)
    inDelphi.init_model(celltype=model)
    for k, row in REDUCED_LIB.iterrows():
        target_seq = row[
            "Designed sequence (61-bp, cutsite at position 34 by 0-index)"]
        pred_df, stats = inDelphi.predict(target_seq, CUTSITE)
        pred_df = inDelphi.add_mhless_genotypes(pred_df, stats)
        # Tag each row with the cell type and the library index it came from.
        prediction_frames.append(pred_df.assign(celltype=model, libid=k))

# pd.concat rejects an empty list, so fall back to an empty frame to keep the
# original behaviour when REDUCED_LIB has no rows.
if prediction_frames:
    all_predictions = pd.concat(prediction_frames, ignore_index=True)
else:
    all_predictions = pd.DataFrame()

all_predictions.to_csv(os.path.join(OUT_DIR, "indelphi_genotypes.csv"),
                       index=False)