# These functions assume module-level imports from the surrounding repo:
# numpy as np, pandas as pd, defaultdict from collections, pearsonr from
# scipy.stats, plus the repo-local _lib, _predict2, _config, and util modules.
# Ratios assume float division (Python 3, or `from __future__ import division`).

def calc_statistics(df, exp, alldf_dict):
  # Calculate statistics on df, saving to alldf_dict
  # Skip experiments with too few CRISPR-attributable reads
  if sum(_lib.crispr_subset(df)['Count']) <= 1000:
    return

  # 1-bp insertions without mismatches
  ins_criteria = (df['Category'] == 'ins') & (df['Length'] == 1) & (df['Indel with Mismatches'] != 'yes')
  ins_count = sum(df[ins_criteria]['Count'])

  # All deletions without mismatches
  del_criteria = (df['Category'] == 'del') & (df['Indel with Mismatches'] != 'yes')
  del_count = sum(df[del_criteria]['Count'])
  if del_count == 0:
    return
  alldf_dict['Ins1bp/Del Ratio'].append(ins_count / (del_count + ins_count))

  # Microhomology-based deletions without mismatches
  mhdel_crit = (df['Category'] == 'del') & (df['Indel with Mismatches'] != 'yes') & (df['Microhomology-Based'] == 'yes')
  mhdel_count = sum(df[mhdel_crit]['Count'])
  try:
    alldf_dict['Ins1bp/MHDel Ratio'].append(ins_count / (mhdel_count + ins_count))
  except ZeroDivisionError:
    alldf_dict['Ins1bp/MHDel Ratio'].append(0)

  ins_ratio = ins_count / sum(_lib.crispr_subset(df)['Count'])
  alldf_dict['Ins1bp Ratio'].append(ins_ratio)

  # Local sequence features around the cutsite; the 4 bp at positions
  # -30..-27 of the context are masked with Ns
  seq, cutsite = _lib.get_sequence_cutsite(df)
  alldf_dict['Sequence Context'].append(seq[-55:-30] + 'NNNN' + seq[-26:])
  alldf_dict['Fourbp'].append(seq[cutsite - 2 : cutsite + 2])
  alldf_dict['Base1'].append(seq[cutsite - 2])
  alldf_dict['Base2'].append(seq[cutsite - 1])
  alldf_dict['Base3'].append(seq[cutsite])
  alldf_dict['Base4'].append(seq[cutsite + 1])

  _predict2.init_model()
  del_score = _predict2.total_deletion_score(seq, cutsite)
  alldf_dict['Del Score'].append(del_score)

  # Normalized entropy of the predicted deletion length distribution
  dlpred = _predict2.deletion_length_distribution(seq, cutsite)
  from scipy.stats import entropy
  norm_entropy = entropy(dlpred) / np.log(len(dlpred))
  alldf_dict['Entropy'].append(norm_entropy)

  alldf_dict['_Experiment'].append(exp)
  return alldf_dict
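# A minimal driver sketch (an assumption, not from the source): each
# calc_statistics variant appends one value per statistic into a shared
# defaultdict(list), and its early returns skip low-coverage experiments
# before any append, so the dict stays rectangular and converts cleanly
# into a DataFrame. `gather_statistics`, `load_experiment_df`, and
# `experiments` are hypothetical names.
from collections import defaultdict
import pandas as pd

def gather_statistics(experiments):
  alldf_dict = defaultdict(list)
  for exp in experiments:
    df = load_experiment_df(exp)  # hypothetical loader for one experiment
    calc_statistics(df, exp, alldf_dict)
  # Every key holds one entry per retained experiment
  return pd.DataFrame(alldf_dict)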
def calc_statistics(orig_df, exp, alldf_dict):
  # Calculate statistics on df, saving to alldf_dict
  df = _lib.crispr_subset(orig_df).copy()  # .copy() avoids a pandas SettingWithCopyWarning below
  if sum(df['Count']) <= 1000:
    return
  df['Frequency'] = _lib.normalize_frequency(df)

  # Wildtype repair frequency
  dlwt = _config.d.DISLIB_WT
  row = dlwt[dlwt['name'] == exp].iloc[0]
  if row['wt_repairable'] != 'yes':
    alldf_dict['wt_obs'].append(np.nan)
    alldf_dict['dl'].append(np.nan)
  else:
    # 'dls' and 'gts' are paired, semicolon-delimited lists of deletion
    # lengths and genotype positions whose repair restores the wildtype
    dls = [int(s) for s in row['dls'].split(';')]
    gts = [int(s) for s in row['gts'].split(';')]
    obs_freq = 0
    for dl, gt in zip(dls, gts):
      crit = (df['Length'] == dl) & (df['Genotype Position'] == gt)
      obs_freq += sum(df[crit]['Frequency'])
    alldf_dict['wt_obs'].append(obs_freq)
    alldf_dict['dl'].append(set(dls).pop())  # arbitrary element; assumes all dls are equal

  alldf_dict['_Experiment'].append(exp)
  return alldf_dict
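# A toy check of the wildtype-repair aggregation above (assumed data; the
# real dls/gts values come from _config.d.DISLIB_WT). Each (length, genotype
# position) pair selects matching rows, and their frequencies are summed.
import pandas as pd

toy = pd.DataFrame({
  'Length':            [3, 3, 5],
  'Genotype Position': [2, 4, 1],
  'Frequency':         [0.4, 0.1, 0.5],
})
dls = [int(s) for s in '3;5'.split(';')]
gts = [int(s) for s in '2;1'.split(';')]
obs_freq = 0
for dl, gt in zip(dls, gts):
  crit = (toy['Length'] == dl) & (toy['Genotype Position'] == gt)
  obs_freq += sum(toy[crit]['Frequency'])
assert abs(obs_freq - 0.9) < 1e-9  # rows (3, 2) and (5, 1) match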
def calc_statistics(df, exp, alldf_dict):
  # Calculate statistics on df, saving to alldf_dict
  # The experiment name encodes the long duplication length, e.g. 'x_27_y' -> 27
  longdup_len = int(exp.split('_')[1])

  df = _lib.crispr_subset(df).copy()  # .copy() avoids a pandas SettingWithCopyWarning
  if sum(df['Count']) <= 500:
    return
  df['Frequency'] = _lib.normalize_frequency(df)

  # Frequency of deletions whose length exactly matches the duplication length
  criteria = (df['Category'] == 'del') & (df['Length'] == longdup_len)
  freq = sum(df[criteria]['Frequency'])

  alldf_dict['Length'].append(longdup_len)
  alldf_dict['Frequency'].append(freq)
  alldf_dict['_Experiment'].append(exp)
  return alldf_dict
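# _lib.normalize_frequency is not shown in this file; a minimal sketch that
# is consistent with how it is used above (assumption: per-row frequency is
# the row's count divided by the subset's total count).
def normalize_frequency(df):
  # Returns a Series aligned with df's index, summing to 1 over the subset
  return df['Count'] / sum(df['Count'])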
def calc_statistics(df, exp, alldf_dict):
  # Calculate statistics on df, saving to alldf_dict
  # Deletion positions; denominator is all non-noise categories
  df = _lib.notnoise_subset(df).copy()  # .copy() avoids a pandas SettingWithCopyWarning
  df['Frequency'] = _lib.normalize_frequency(df)
  if sum(df['Frequency']) == 0:
    return

  # Consider only deletions, anywhere
  del_df = _lib.del_subset(df)

  # Left side of the cutsite: genotype positions -10..-1
  for del_pos in range(-10, -1 + 1):
    total_freq = sum(del_df[del_df['Genotype Position'] == del_pos]['Frequency'])
    alldf_dict[str(del_pos)].append(total_freq)

  # Right side of the cutsite: positions +1..+10
  for del_pos in range(1, 10 + 1):
    criteria = (del_df['Genotype Position'] - del_df['Length'] == del_pos)
    total_freq = sum(del_df[criteria]['Frequency'])
    alldf_dict[str(del_pos)].append(total_freq)

  editing_rate = sum(_lib.crispr_subset(df)['Frequency']) / sum(df['Frequency'])
  alldf_dict['Editing Rate'].append(editing_rate)
  alldf_dict['_Experiment'].append(exp)

  # Test alternative hypothesis: is asymmetry actually meaningful? If -1
  # arises from 0gt and sequencing mismatch, and +1 arises from Ngt and
  # sequencing mismatch, then we should see asymmetry in 0gt vs. Ngt.
  def detect_0gt_microhomology(row):
    # Returns the row's frequency if its genotype position lies in the same
    # microhomology run as position 0, else 0
    if row['Category'] != 'del':
      return 0
    cutsite = int(row['_Cutsite'])
    seq = row['_Sequence Context']
    gt_pos = int(row['Genotype Position'])
    del_len = int(row['Length'])

    left = seq[cutsite - del_len : cutsite]
    right = seq[cutsite : cutsite + del_len]
    if len(left) != len(right):
      return 0

    # Build runs of consecutive matching positions between the two flanks
    mhs = []
    mh = [0]
    for idx, (c1, c2) in enumerate(zip(left, right)):
      if c1 == c2:
        mh.append(idx + 1)
      else:
        mhs.append(mh)
        mh = [idx + 1]
    mhs.append(mh)

    for mh in mhs:
      if gt_pos in mh and 0 in mh:
        return row['Frequency']
    return 0

  freq_0gt = sum(del_df.apply(detect_0gt_microhomology, axis=1))
  alldf_dict['0gt Frequency'].append(freq_0gt)

  criteria = (del_df['Genotype Position'] - del_df['Length'] == 0)
  freq_Ngt = sum(del_df[criteria]['Frequency'])
  alldf_dict['Ngt Frequency'].append(freq_Ngt)
  return
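# A standalone trace of the run-building logic in detect_0gt_microhomology,
# on toy flanks (assumed example, not from the source). For left = 'AGC' and
# right = 'AGT', positions 1 and 2 match and position 3 does not, so the runs
# are [0, 1, 2] and [3]: genotype positions 1 and 2 share a run with 0.
left, right = 'AGC', 'AGT'
mhs, mh = [], [0]
for idx, (c1, c2) in enumerate(zip(left, right)):
  if c1 == c2:
    mh.append(idx + 1)
  else:
    mhs.append(mh)
    mh = [idx + 1]
mhs.append(mh)
assert mhs == [[0, 1, 2], [3]]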
def calc_statistics(df1, df2, exp, alldf_dict):
  # Calculate statistics on df, saving to alldf_dict
  # Pearson correlation of indel frequencies between two replicates,
  # computed over several event subsets
  df1 = _lib.crispr_subset(df1)
  df2 = _lib.crispr_subset(df2)
  if sum(df1['Count']) < 1000 or sum(df2['Count']) < 1000:
    return

  def get_r_from_subsets(d1, d2):
    if sum(d1['Count']) < 100 or sum(d2['Count']) < 100:
      return np.nan
    d1 = d1.copy()  # avoid mutating the caller's frames
    d2 = d2.copy()
    d1['Frequency'] = _lib.normalize_frequency(d1)
    d2['Frequency'] = _lib.normalize_frequency(d2)
    mdf = _lib.merge_crispr_events(d1, d2, '_1', '_2')
    return pearsonr(mdf['Frequency_1'], mdf['Frequency_2'])[0]

  # Everything
  alldf_dict['all'].append(get_r_from_subsets(df1, df2))

  # All CRISPR dels
  d1 = _lib.del_subset(df1)
  d2 = _lib.del_subset(df2)
  alldf_dict['All del'].append(get_r_from_subsets(d1, d2))

  # Dels at the cutsite
  d1 = df1[df1['Category'] == 'del']
  d2 = df2[df2['Category'] == 'del']
  alldf_dict['del'].append(get_r_from_subsets(d1, d2))

  # MH dels
  d1 = df1[(df1['Category'] == 'del') & (df1['Microhomology-Based'] == 'yes')]
  d2 = df2[(df2['Category'] == 'del') & (df2['Microhomology-Based'] == 'yes')]
  alldf_dict['mh_del'].append(get_r_from_subsets(d1, d2))

  # MH-less dels
  d1 = df1[(df1['Category'] == 'del') & (df1['Microhomology-Based'] == 'no')]
  d2 = df2[(df2['Category'] == 'del') & (df2['Microhomology-Based'] == 'no')]
  alldf_dict['nomh_del'].append(get_r_from_subsets(d1, d2))

  # Dels not at the cutsite
  d1 = df1[df1['Category'] == 'del_notatcut']
  d2 = df2[df2['Category'] == 'del_notatcut']
  alldf_dict['del_notatcut'].append(get_r_from_subsets(d1, d2))

  # All CRISPR ins
  d1 = _lib.ins_subset(df1)
  d2 = _lib.ins_subset(df2)
  alldf_dict['All ins'].append(get_r_from_subsets(d1, d2))

  # Ins at the cutsite
  d1 = df1[df1['Category'] == 'ins']
  d2 = df2[df2['Category'] == 'ins']
  alldf_dict['ins'].append(get_r_from_subsets(d1, d2))

  # 1-bp ins
  d1 = df1[(df1['Category'] == 'ins') & (df1['Length'] == 1)]
  d2 = df2[(df2['Category'] == 'ins') & (df2['Length'] == 1)]
  alldf_dict['ins_1bp'].append(get_r_from_subsets(d1, d2))

  # 2-bp+ ins
  d1 = df1[(df1['Category'] == 'ins') & (df1['Length'] > 1)]
  d2 = df2[(df2['Category'] == 'ins') & (df2['Length'] > 1)]
  alldf_dict['ins_2bpplus'].append(get_r_from_subsets(d1, d2))

  # Ins not at the cutsite
  d1 = df1[df1['Category'] == 'ins_notatcut']
  d2 = df2[df2['Category'] == 'ins_notatcut']
  alldf_dict['ins_notatcut'].append(get_r_from_subsets(d1, d2))

  alldf_dict['_Experiment'].append(exp)
  return
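# A minimal replicate-correlation sketch with toy numbers. It assumes
# _lib.merge_crispr_events aligns the two event tables on their identifying
# columns, so the two frequency vectors below stand in for its output.
import numpy as np
from scipy.stats import pearsonr

freq_1 = np.array([0.50, 0.30, 0.15, 0.05])  # replicate 1, aligned events
freq_2 = np.array([0.48, 0.33, 0.14, 0.05])  # replicate 2, same events
r = pearsonr(freq_1, freq_2)[0]
assert r > 0.99  # near-identical replicates correlate strongly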
def calc_statistics(orig_df, exp, rate_model, bp_model, alldf_dict, rs, data_nm):
  # Calculate statistics on df, saving to alldf_dict
  # Skip experiments with too few mismatch-free MH deletion reads
  df = _lib.mh_del_subset(orig_df)
  df = _lib.indels_without_mismatches_subset(df)
  if sum(df['Count']) <= 1000:
    return

  obs_d = defaultdict(list)
  df = orig_df

  # Grab observed deletions, MH and MH-less, for deletion lengths 1..59
  for del_len in range(1, 59 + 1):
    crit = (df['Category'] == 'del') & (df['Indel with Mismatches'] != 'yes') & (df['Length'] == del_len)
    s = df[crit]

    mh_s = s[s['Microhomology-Based'] == 'yes']
    for idx, row in mh_s.iterrows():
      obs_d['Count'].append(row['Count'])
      obs_d['Genotype Position'].append(row['Genotype Position'])
      obs_d['Length'].append(row['Length'])
      obs_d['Category'].append('del')

    # MH-less deletions of each length are pooled under the sentinel position 'e'
    mhless_s = s[s['Microhomology-Based'] != 'yes']
    obs_d['Length'].append(del_len)
    obs_d['Count'].append(sum(mhless_s['Count']))
    obs_d['Genotype Position'].append('e')
    obs_d['Category'].append('del')

  obs_df = pd.DataFrame(obs_d)

  # Grab observed 1-bp insertions, pooled by inserted base
  ins_crit = (orig_df['Category'] == 'ins') & (orig_df['Length'] == 1) & (orig_df['Indel with Mismatches'] != 'yes')
  ins_df = orig_df[ins_crit]
  truncated_ins_d = defaultdict(list)
  for ins_base in list('ACGT'):
    crit = (ins_df['Inserted Bases'] == ins_base)
    tot_count = sum(ins_df[crit]['Count'])
    truncated_ins_d['Count'].append(tot_count)
    truncated_ins_d['Inserted Bases'].append(ins_base)
    truncated_ins_d['Category'].append('ins')
    truncated_ins_d['Length'].append(1)
  ins_df = pd.DataFrame(truncated_ins_d)

  # pd.concat replaces the deprecated DataFrame.append
  obs_df = pd.concat([obs_df, ins_df], ignore_index = True)
  obs_df['Frequency'] = _lib.normalize_frequency(obs_df)

  crispr_subset = _lib.crispr_subset(orig_df)
  frac_explained = sum(obs_df['Count']) / sum(crispr_subset['Count'])

  # Save this for aggregate plotting
  alldf_dict['Fraction Explained'].append(frac_explained)

  # Predict MH dels and MH-less dels
  _predict2.init_model()
  seq, cutsite = _lib.get_sequence_cutsite(orig_df)
  pred_df = _predict2.predict_indels(seq, cutsite, rate_model, bp_model)

  # Merge observed and predicted, then check their correlation
  mdf = obs_df.merge(pred_df, how = 'outer', on = ['Category', 'Genotype Position', 'Inserted Bases', 'Length'])
  mdf['Frequency'].fillna(value = 0, inplace = True)
  mdf['Predicted_Frequency'].fillna(value = 0, inplace = True)
  r = pearsonr(mdf['Frequency'], mdf['Predicted_Frequency'])[0]

  # Store per-experiment results so they can be plotted later;
  # out_dir is assumed to be a module-level output path
  data_nm_out_dir = out_dir + data_nm + '/'
  util.ensure_dir_exists(data_nm_out_dir)
  exp_out_dir = data_nm_out_dir + exp + '/'
  util.ensure_dir_exists(exp_out_dir)
  out_fn = exp_out_dir + '%.3f.csv' % (r)
  mdf.to_csv(out_fn)

  # Store in alldf_dict
  alldf_dict['_Experiment'].append(exp)
  alldf_dict['rs'].append(rs)
  return alldf_dict
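# A toy version of the observed-vs-predicted merge above (assumed data).
# The outer join keeps events seen in only one table; filling NaNs with 0
# then treats an absent event as frequency zero on the side that lacks it.
import pandas as pd

obs = pd.DataFrame({'Category': ['del', 'del'], 'Length': [2, 3],
                    'Frequency': [0.7, 0.3]})
pred = pd.DataFrame({'Category': ['del', 'del'], 'Length': [2, 4],
                     'Predicted_Frequency': [0.6, 0.4]})
mdf = obs.merge(pred, how = 'outer', on = ['Category', 'Length'])
mdf['Frequency'] = mdf['Frequency'].fillna(0)
mdf['Predicted_Frequency'] = mdf['Predicted_Frequency'].fillna(0)
# Length 3 appears only in obs and length 4 only in pred; both survive
assert len(mdf) == 3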
def calc_statistics(df, exp, alldf_dict):
  # Calculate statistics on df, saving to alldf_dict
  # Skip experiments with too few CRISPR-attributable reads
  if sum(_lib.crispr_subset(df)['Count']) <= 1000:
    return

  editing_rate = sum(_lib.crispr_subset(df)['Count']) / sum(_lib.notnoise_subset(df)['Count'])
  alldf_dict['Editing Rate'].append(editing_rate)

  # 1-bp insertions without mismatches
  ins_criteria = (df['Category'] == 'ins') & (df['Length'] == 1) & (df['Indel with Mismatches'] != 'yes')
  ins_count = sum(df[ins_criteria]['Count'])

  # All deletions without mismatches
  del_criteria = (df['Category'] == 'del') & (df['Indel with Mismatches'] != 'yes')
  del_count = sum(df[del_criteria]['Count'])
  if del_count == 0:
    return
  alldf_dict['Ins1bp/Del Ratio'].append(ins_count / (del_count + ins_count))

  # Microhomology-based deletions without mismatches
  mhdel_crit = (df['Category'] == 'del') & (df['Indel with Mismatches'] != 'yes') & (df['Microhomology-Based'] == 'yes')
  mhdel_count = sum(df[mhdel_crit]['Count'])
  try:
    alldf_dict['Ins1bp/MHDel Ratio'].append(ins_count / (mhdel_count + ins_count))
  except ZeroDivisionError:
    alldf_dict['Ins1bp/MHDel Ratio'].append(0)

  ins_ratio = ins_count / sum(_lib.crispr_subset(df)['Count'])
  alldf_dict['Ins1bp Ratio'].append(ins_ratio)

  seq, cutsite = _lib.get_sequence_cutsite(df)
  fivebase = seq[cutsite - 1]
  alldf_dict['Fivebase'].append(fivebase)

  _predict2.init_model()
  del_score = _predict2.total_deletion_score(seq, cutsite)
  alldf_dict['Del Score'].append(del_score)

  # Normalized entropy of the predicted deletion length distribution
  dlpred = _predict2.deletion_length_distribution(seq, cutsite)
  from scipy.stats import entropy
  norm_entropy = entropy(dlpred) / np.log(len(dlpred))
  alldf_dict['Entropy'].append(norm_entropy)

  # GC content in the 8-bp window centered on the cutsite
  local_seq = seq[cutsite - 4 : cutsite + 4]
  gc = (local_seq.count('C') + local_seq.count('G')) / len(local_seq)
  alldf_dict['GC'].append(gc)

  # One-hot encode the -1 base ('fivebase') and +1 base ('threebase');
  # a dict lookup replaces the original repeated if-chains
  one_hot = {
    'A': np.array([1, 0, 0, 0]),
    'C': np.array([0, 1, 0, 0]),
    'G': np.array([0, 0, 1, 0]),
    'T': np.array([0, 0, 0, 1]),
  }
  alldf_dict['Fivebase_OH'].append(one_hot[fivebase])

  threebase = seq[cutsite]
  alldf_dict['Threebase'].append(threebase)
  alldf_dict['Threebase_OH'].append(one_hot[threebase])

  alldf_dict['_Experiment'].append(exp)
  return alldf_dict
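# A quick check of the entropy normalization used above (toy distributions).
# scipy's entropy() uses the natural log, so dividing by log(n) maps the
# result to [0, 1]: 1.0 for a uniform distribution, 0.0 for a point mass.
import numpy as np
from scipy.stats import entropy

uniform = np.ones(10) / 10
assert abs(entropy(uniform) / np.log(len(uniform)) - 1.0) < 1e-9
point = np.array([1.0] + [0.0] * 9)
assert entropy(point) / np.log(len(point)) == 0.0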