# Imports assumed by the functions below, inferred from usage; _lib, _predict2,
# _config, and util are project-internal modules.
from collections import defaultdict

import numpy as np
import pandas as pd
from scipy.stats import pearsonr, entropy

import _config
import _lib
import _predict2
import util


def calc_statistics(orig_df1, orig_df2, exp, alldf_dict):
  # Calculate statistics on df, saving to alldf_dict
  # Deletion positions
  df1 = _lib.mh_del_subset(orig_df1)
  df1 = _lib.indels_without_mismatches_subset(df1)
  df2 = _lib.mh_del_subset(orig_df2)
  df2 = _lib.indels_without_mismatches_subset(df2)
  if sum(df1['Count']) <= 1000 or sum(df2['Count']) <= 1000:
    return

  df1['Frequency'] = _lib.normalize_frequency(df1)
  df2['Frequency'] = _lib.normalize_frequency(df2)

  join_cols = ['Category', 'Genotype Position', 'Length']
  mdf = df1.merge(df2, how='outer', on=join_cols, suffixes=['_1', '_2'])
  mdf['Frequency_1'].fillna(value=0, inplace=True)
  mdf['Frequency_2'].fillna(value=0, inplace=True)

  # Replicate concordance; r is printed but only the experiment name is stored
  r = pearsonr(mdf['Frequency_1'], mdf['Frequency_2'])[0]
  print(exp, r)

  alldf_dict['_Experiment'].append(exp)
  return alldf_dict
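# A minimal driver sketch showing the accumulator contract shared by all of
# the calc_statistics variants below. The experiment list and data-loading
# helper are hypothetical placeholders, not part of the original scripts:
# alldf_dict is a defaultdict(list) whose parallel lists become the columns
# of a summary DataFrame once every experiment has been processed.
def run_all_experiments(experiments, load_replicates):
  alldf_dict = defaultdict(list)
  for exp in experiments:
    orig_df1, orig_df2 = load_replicates(exp)  # hypothetical loader
    calc_statistics(orig_df1, orig_df2, exp, alldf_dict)
  return pd.DataFrame(alldf_dict)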
def get_r_from_subsets(d1, d2):
  # Pearson r between the normalized frequencies of two replicate subsets;
  # returns NaN when either replicate is undersampled.
  if sum(d1['Count']) < 100 or sum(d2['Count']) < 100:
    return np.nan
  d1['Frequency'] = _lib.normalize_frequency(d1)
  d2['Frequency'] = _lib.normalize_frequency(d2)
  mdf = _lib.merge_crispr_events(d1, d2, '_1', '_2')
  return pearsonr(mdf['Frequency_1'], mdf['Frequency_2'])[0]
def featurize(orig_df):
  seq, cutsite = _lib.get_sequence_cutsite(orig_df)

  mh_lens, gc_fracs, del_lens, freqs = [], [], [], []
  dl_freqs = []
  DELLEN_LIMIT = 60

  df = _lib.mh_del_subset(orig_df)
  df = _lib.indels_without_mismatches_subset(df)
  df = df[df['Length'] <= DELLEN_LIMIT]
  if sum(df['Count']) < 1000:
    return None

  # Deletion-length frequency distribution over 1-28 bp deletions
  criteria = (orig_df['Category'] == 'del') & (orig_df['Length'] <= 28)
  s = orig_df[criteria]
  s['Frequency'] = _lib.normalize_frequency(s)
  for del_len in range(1, 28 + 1):
    dl_freq = sum(s[s['Length'] == del_len]['Frequency'])
    dl_freqs.append(dl_freq)

  # Featurize every microhomology at every deletion length
  df['Frequency'] = _lib.normalize_frequency(df)
  for del_len in range(1, DELLEN_LIMIT + 1):
    left = seq[cutsite - del_len : cutsite]
    right = seq[cutsite : cutsite + del_len]

    mhs = find_microhomologies(left, right)
    for mh in mhs:
      mh_len = len(mh) - 1
      if mh_len > 0:
        gtpos = max(mh)
        start = cutsite - del_len + gtpos - mh_len
        end = start + mh_len
        mh_seq = seq[start:end]
        gc_frac = get_gc_frac(mh_seq)

        criteria = (df['Length'] == del_len) & (df['Genotype Position'] == gtpos)
        freq = sum(df[criteria]['Frequency'])

        mh_lens.append(mh_len)
        gc_fracs.append(gc_frac)
        del_lens.append(del_len)
        freqs.append(freq)

  return mh_lens, gc_fracs, del_lens, freqs, dl_freqs
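# featurize() calls find_microhomologies and get_gc_frac, which are defined
# elsewhere in the project. A minimal sketch of compatible implementations,
# inferred from the inline microhomology logic in detect_0gt_microhomology
# further below; treat these as assumptions, not the project's exact code.
def find_microhomologies(left, right):
  # Partition positions 0..len(left) into runs of matching characters
  # between the left and right deletion flanks.
  mhs = []
  mh = [0]
  for idx, (c1, c2) in enumerate(zip(left, right)):
    if c1 == c2:
      mh.append(idx + 1)
    else:
      mhs.append(mh)
      mh = [idx + 1]
  mhs.append(mh)
  return mhs

def get_gc_frac(seq):
  # Fraction of G/C bases in a microhomology sequence.
  if len(seq) == 0:
    return 0.0
  return (seq.count('C') + seq.count('G')) / float(len(seq))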
def calc_statistics(df, exp, alldf_dict):
  # Calculate statistics on df, saving to alldf_dict
  # Deletion positions
  # Denominator is 1-27 bp dels
  df = _lib.crispr_del_27bp_subset(df)
  if sum(df['Count']) <= 500:
    return
  df['Frequency'] = _lib.normalize_frequency(df)
  if sum(df['Frequency']) == 0:
    return

  for del_len in range(1, 27 + 1):
    df_lensubset = df[df['Length'] == del_len]
    mhyes_freq = sum(df_lensubset[df_lensubset['Microhomology-Based'] == 'yes']['Frequency'])
    mhno_freq = sum(df_lensubset[df_lensubset['Microhomology-Based'] == 'no']['Frequency'])

    alldf_dict['Deletion Length'].append(del_len)
    alldf_dict['Microhomology-Based'].append('has MH')
    alldf_dict['Frequency'].append(mhyes_freq)
    alldf_dict['_Experiment'].append(exp)

    alldf_dict['Deletion Length'].append(del_len)
    alldf_dict['Microhomology-Based'].append('no MH')
    alldf_dict['Frequency'].append(mhno_freq)
    alldf_dict['_Experiment'].append(exp)

  return alldf_dict
def calc_statistics(df, exp, alldf_dict):
  # Calculate statistics on df, saving to alldf_dict
  # Deletion positions
  # Denominator is 1-27 bp dels
  df = _lib.crispr_del_27bp_subset(df)
  if sum(df['Count']) <= 500:
    return
  df['Frequency'] = _lib.normalize_frequency(df)
  if sum(df['Frequency']) == 0:
    return

  df_s = df[df['Microhomology-Based'] == 'no']
  gt0 = sum(df_s[df_s['Genotype Position'] == 0]['Frequency'])
  gtN = sum(df_s[df_s['Genotype Position'] == df_s['Length']]['Frequency'])
  gtMid = sum(df_s['Frequency']) - gt0 - gtN

  alldf_dict['0gt'].append(gt0)
  alldf_dict['Ngt'].append(gtN)
  alldf_dict['Mid'].append(gtMid)
  alldf_dict['_Experiment'].append(exp)
  return alldf_dict
def calc_statistics(df, exp, alldf_dict):
  # Calculate statistics on df, saving to alldf_dict
  # Deletion positions
  df = _lib.mh_del_subset(df)
  df = _lib.indels_without_mismatches_subset(df)
  df = df[df['Length'] >= 5]
  if sum(df['Count']) <= 1000:
    return
  df['Frequency'] = _lib.normalize_frequency(df)

  _predict2.init_model()
  seq, cutsite = _lib.get_sequence_cutsite(df)
  pred_df = _predict2.predict_mhdel(seq, cutsite)
  pred_df = pred_df[pred_df['Length'] >= 5]
  pred_df['Predicted_Frequency'] = pred_df['Predicted_Frequency'] / sum(pred_df['Predicted_Frequency'])

  join_cols = ['Category', 'Genotype Position', 'Length']
  mdf = df.merge(pred_df, how='outer', on=join_cols)
  mdf['Frequency'].fillna(value=0, inplace=True)
  mdf['Predicted_Frequency'].fillna(value=0, inplace=True)

  # Deletions whose genotype position equals their length ("Ngt")
  ns_criteria = (mdf['Length'] - mdf['Genotype Position'] == 0)
  s = mdf[ns_criteria]
  alldf_dict['Predicted Ngt'] += list(s['Predicted_Frequency'])
  alldf_dict['Observed Ngt'] += list(s['Frequency'])
  alldf_dict['_Experiment'] += [exp] * len(s['Frequency'])
  return alldf_dict
def calc_statistics(df, exp, alldf_dict):
  # Calculate statistics on df, saving to alldf_dict
  # Deletion positions
  # Denominator is 3-27 bp dels
  # Remove obvious subpopulation of 1 bp, mh vs. no mh
  df = _lib.crispr_del_3bp_27bp_subset(df)
  if sum(df['Count']) <= 500:
    return
  df['Frequency'] = _lib.normalize_frequency(df)

  # Consider only deletion length 5
  df = df[df['Length'] == 5]
  df_s = df[df['Microhomology-Based'] == 'no']
  gt0 = sum(df_s[df_s['Genotype Position'] == 0]['Frequency'])
  gtN = sum(df_s[df_s['Genotype Position'] == df_s['Length']]['Frequency'])
  gt0N = gt0 + gtN
  mh_freq = sum(df[df['Microhomology-Based'] == 'yes']['Frequency'])

  alldf_dict['0N_freq'].append(gt0N)
  alldf_dict['MH_freq'].append(mh_freq)
  alldf_dict['_Experiment'].append(exp)
  return alldf_dict
def calc_statistics(orig_df, exp, alldf_dict):
  # Calculate statistics on df, saving to alldf_dict
  # Deletion positions
  df = _lib.mh_del_subset(orig_df)
  df = _lib.indels_without_mismatches_subset(df)
  df = df[df['Length'] >= 5]
  if sum(df['Count']) <= 500:
    return
  df['Frequency'] = _lib.normalize_frequency(df)

  _predict2.init_model()
  seq, cutsite = _lib.get_sequence_cutsite(df)
  pred_df = _predict2.predict_mhdel_cpf1(seq, cutsite)

  join_cols = ['Category', 'Genotype Position', 'Length']
  mdf = df.merge(pred_df, how='outer', on=join_cols)
  mdf['Frequency'].fillna(value=0, inplace=True)
  mdf['Predicted_Frequency'].fillna(value=0, inplace=True)

  obs = mdf['Frequency']
  pred = mdf['Predicted_Frequency']
  r = pearsonr(obs, pred)[0]
  alldf_dict['gt_r'].append(r)
  alldf_dict['_Experiment'].append(exp)
  return alldf_dict
def calc_statistics(orig_df, exp, alldf_dict):
  # Calculate statistics on df, saving to alldf_dict
  # Deletion positions
  df = _lib.crispr_subset(orig_df)
  if sum(df['Count']) <= 1000:
    return
  df['Frequency'] = _lib.normalize_frequency(df)

  # Wildtype repair frequency
  dlwt = _config.d.DISLIB_WT
  row = dlwt[dlwt['name'] == exp].iloc[0]
  if row['wt_repairable'] != 'yes':
    alldf_dict['wt_obs'].append(np.nan)
    alldf_dict['dl'].append(np.nan)
  else:
    dls = [int(s) for s in row['dls'].split(';')]
    gts = [int(s) for s in row['gts'].split(';')]
    obs_freq = 0
    for dl, gt in zip(dls, gts):
      crit = (df['Length'] == dl) & (df['Genotype Position'] == gt)
      obs_freq += sum(df[crit]['Frequency'])
    alldf_dict['wt_obs'].append(obs_freq)
    alldf_dict['dl'].append(set(dls).pop())

  alldf_dict['_Experiment'].append(exp)
  return alldf_dict
def calc_statistics(orig_df, exp, alldf_dict):
  # Calculate statistics on df, saving to alldf_dict
  # Deletion positions
  df = _lib.mh_del_subset(orig_df)
  df = _lib.indels_without_mismatches_subset(df)
  if sum(df['Count']) <= 1000:
    return
  df['Frequency'] = _lib.normalize_frequency(df)

  _predict2.init_model()
  seq, cutsite = _lib.get_sequence_cutsite(df)
  pred_df = _predict2.predict_mhdel(seq, cutsite)

  join_cols = ['Category', 'Genotype Position', 'Length']
  mdf = df.merge(pred_df, how='outer', on=join_cols)
  mdf['Frequency'].fillna(value=0, inplace=True)
  mdf['Predicted_Frequency'].fillna(value=0, inplace=True)

  # Genotype-level entropy, normalized to [0, 1] by log(num outcomes)
  obs = mdf['Frequency']
  pred = mdf['Predicted_Frequency']
  obs_entropy = entropy(obs) / np.log(len(obs))
  pred_entropy = entropy(pred) / np.log(len(pred))
  alldf_dict['obs gt entropy'].append(obs_entropy)
  alldf_dict['pred gt entropy'].append(pred_entropy)

  # Deletion-length entropy over 1-28 bp deletions
  df = orig_df[orig_df['Category'] == 'del']
  df = df[df['Length'] <= 28]
  df['Frequency'] = _lib.normalize_frequency(df)
  obs_dl = []
  for del_len in range(1, 28 + 1):
    freq = sum(df[df['Length'] == del_len]['Frequency'])
    obs_dl.append(freq)
  pred_dl = _predict2.deletion_length_distribution(seq, cutsite)

  obs_entropy = entropy(obs_dl) / np.log(len(obs_dl))
  pred_entropy = entropy(pred_dl) / np.log(len(pred_dl))
  alldf_dict['obs dl entropy'].append(obs_entropy)
  alldf_dict['pred dl entropy'].append(pred_entropy)

  alldf_dict['_Experiment'].append(exp)
  return alldf_dict
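# The statistics above are normalized Shannon entropies: scipy's entropy()
# divided by log(n), so values land in [0, 1] regardless of the number of
# outcomes (1 = uniform distribution, 0 = one dominant genotype). A quick
# standalone check of that property, for illustration only:
def _normalized_entropy_demo():
  uniform = [0.25] * 4
  concentrated = [1.0, 0.0, 0.0, 0.0]
  assert np.isclose(entropy(uniform) / np.log(4), 1.0)
  assert np.isclose(entropy(concentrated) / np.log(4), 0.0)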
def calc_statistics(df, exp, alldf_dict):
  # Calculate statistics on df, saving to alldf_dict
  # Deletion positions
  # Denominator is ins
  df = df[df['Category'] == 'ins']
  if sum(df['Count']) <= 500:
    return
  df['Frequency'] = _lib.normalize_frequency(df)

  def record_fractions(df_s, length_label):
    # Partition insertions into templated, five-homopolymer, and other
    criteria = (df_s['Ins Template Length'] >= 6)
    frac_templated = sum(df_s[criteria]['Frequency'])
    criteria = (df_s['Ins Fivehomopolymer'] == 'yes') & (df_s['Ins Template Length'] < 6)
    frac_fivehomopolymer = sum(df_s[criteria]['Frequency'])
    criteria = (df_s['Ins Fivehomopolymer'] == 'no') & (df_s['Ins Template Length'] < 6)
    frac_other = sum(df_s[criteria]['Frequency'])

    alldf_dict['Frac 5hm'].append(frac_fivehomopolymer)
    alldf_dict['Frac templated'].append(frac_templated)
    alldf_dict['Frac other'].append(frac_other)
    alldf_dict['Frac total'].append(sum(df_s['Frequency']))
    alldf_dict['Length'].append(length_label)
    alldf_dict['_Experiment'].append(exp)

  for ins_len in range(1, 15 + 1):
    record_fractions(df[df['Length'] == ins_len], ins_len)

  # Bucket all insertions longer than 15 bp under length label 16
  record_fractions(df[df['Length'] > 15], 16)

  return alldf_dict
def calc_statistics(df, exp, alldf_dict):
  # Calculate statistics on df, saving to alldf_dict
  # Deletion positions
  longdup_len = int(exp.split('_')[1])

  # Denominator is all CRISPR events
  df = _lib.crispr_subset(df)
  if sum(df['Count']) <= 500:
    return
  df['Frequency'] = _lib.normalize_frequency(df)

  criteria = (df['Category'] == 'del') & (df['Length'] == longdup_len)
  freq = sum(df[criteria]['Frequency'])

  alldf_dict['Length'].append(longdup_len)
  alldf_dict['Frequency'].append(freq)
  alldf_dict['_Experiment'].append(exp)
  return alldf_dict
def calc_statistics(df, exp, alldf_dict):
  # Calculate statistics on df, saving to alldf_dict
  # Deletion positions
  # Denominator is all dels
  df = df[df['Category'] == 'del']
  if sum(df['Count']) <= 1000:
    return
  df['Frequency'] = _lib.normalize_frequency(df)
  if sum(df['Frequency']) == 0:
    return

  for del_len in range(1, 80 + 1):
    cum_freq = sum(df[df['Length'] <= del_len]['Frequency'])
    alldf_dict['Cumulative Frequency'].append(cum_freq)
    alldf_dict['Length'].append(del_len)
    alldf_dict['_Experiment'].append(exp)

  return alldf_dict
def calc_statistics(df, exp, alldf_dict):
  # Calculate statistics on df, saving to alldf_dict
  # Deletion positions
  # Denominator is all non-noise categories
  df = _lib.notnoise_subset(df)
  df['Frequency'] = _lib.normalize_frequency(df)
  if sum(df['Frequency']) == 0:
    return

  # Consider only deletions, anywhere
  del_df = _lib.del_subset(df)

  # Left side
  for del_pos in range(-10, -1 + 1):
    total_freq = sum(del_df[del_df['Genotype Position'] == del_pos]['Frequency'])
    alldf_dict[str(del_pos)].append(total_freq)

  # Right side
  for del_pos in range(1, 10 + 1):
    criteria = (del_df['Genotype Position'] - del_df['Length'] == del_pos)
    total_freq = sum(del_df[criteria]['Frequency'])
    alldf_dict[str(del_pos)].append(total_freq)

  editing_rate = sum(_lib.crispr_subset(df)['Frequency']) / sum(df['Frequency'])
  alldf_dict['Editing Rate'].append(editing_rate)
  alldf_dict['_Experiment'].append(exp)

  # Test an alternative hypothesis: is the asymmetry actually meaningful?
  # If -1 arises from 0gt plus a sequencing mismatch, and +1 arises from Ngt
  # plus a sequencing mismatch, then we should see asymmetry in 0gt vs. Ngt.
  def detect_0gt_microhomology(row):
    # Returns the row's frequency if its microhomology includes position 0
    if row['Category'] != 'del':
      return 0
    cutsite = int(row['_Cutsite'])
    seq = row['_Sequence Context']
    gt_pos = int(row['Genotype Position'])
    del_len = int(row['Length'])

    left = seq[cutsite - del_len : cutsite]
    right = seq[cutsite : cutsite + del_len]
    if len(left) != len(right):
      return 0

    mhs = []
    mh = [0]
    for idx, (c1, c2) in enumerate(zip(left, right)):
      if c1 == c2:
        mh.append(idx + 1)
      else:
        mhs.append(mh)
        mh = [idx + 1]
    mhs.append(mh)

    for mh in mhs:
      if gt_pos in mh and 0 in mh:
        return row['Frequency']
    return 0

  freq_0gt = sum(del_df.apply(detect_0gt_microhomology, axis=1))
  alldf_dict['0gt Frequency'].append(freq_0gt)

  criteria = (del_df['Genotype Position'] - del_df['Length'] == 0)
  freq_Ngt = sum(del_df[criteria]['Frequency'])
  alldf_dict['Ngt Frequency'].append(freq_Ngt)
  return
def calc_statistics(orig_df, exp, rate_model, bp_model, alldf_dict, rs, data_nm):
  # Calculate statistics on df, saving to alldf_dict
  # Deletion positions
  df = _lib.mh_del_subset(orig_df)
  df = _lib.indels_without_mismatches_subset(df)
  if sum(df['Count']) <= 1000:
    return

  # Grab observed deletions, MH and MH-less
  obs_d = defaultdict(list)
  df = orig_df
  for del_len in range(1, 59 + 1):
    crit = (df['Category'] == 'del') & (df['Indel with Mismatches'] != 'yes') & (df['Length'] == del_len)
    s = df[crit]

    mh_s = s[s['Microhomology-Based'] == 'yes']
    for idx, row in mh_s.iterrows():
      obs_d['Count'].append(row['Count'])
      obs_d['Genotype Position'].append(row['Genotype Position'])
      obs_d['Length'].append(row['Length'])
      obs_d['Category'].append('del')

    # Collapse MH-less deletions of this length into a single 'e' row
    mhless_s = s[s['Microhomology-Based'] != 'yes']
    obs_d['Length'].append(del_len)
    obs_d['Count'].append(sum(mhless_s['Count']))
    obs_d['Genotype Position'].append('e')
    obs_d['Category'].append('del')

  obs_df = pd.DataFrame(obs_d)

  # Grab observed 1 bp insertions, totaled per inserted base
  ins_crit = (orig_df['Category'] == 'ins') & (orig_df['Length'] == 1) & (orig_df['Indel with Mismatches'] != 'yes')
  ins_df = orig_df[ins_crit]
  truncated_ins_d = defaultdict(list)
  for ins_base in list('ACGT'):
    crit = (ins_df['Inserted Bases'] == ins_base)
    tot_count = sum(ins_df[crit]['Count'])
    truncated_ins_d['Count'].append(tot_count)
    truncated_ins_d['Inserted Bases'].append(ins_base)
    truncated_ins_d['Category'].append('ins')
    truncated_ins_d['Length'].append(1)
  ins_df = pd.DataFrame(truncated_ins_d)

  obs_df = obs_df.append(ins_df, ignore_index=True)
  obs_df['Frequency'] = _lib.normalize_frequency(obs_df)

  crispr_subset = _lib.crispr_subset(orig_df)
  frac_explained = sum(obs_df['Count']) / sum(crispr_subset['Count'])

  # Save this for aggregate plotting
  alldf_dict['Fraction Explained'].append(frac_explained)

  # Predict MH dels and MH-less dels
  _predict2.init_model()
  seq, cutsite = _lib.get_sequence_cutsite(orig_df)
  pred_df = _predict2.predict_indels(seq, cutsite, rate_model, bp_model)

  # Merge observed and predicted, then check the correlation
  mdf = obs_df.merge(pred_df, how='outer', on=['Category', 'Genotype Position', 'Inserted Bases', 'Length'])
  mdf['Frequency'].fillna(value=0, inplace=True)
  mdf['Predicted_Frequency'].fillna(value=0, inplace=True)
  r = pearsonr(mdf['Frequency'], mdf['Predicted_Frequency'])[0]

  # Store per-experiment merged tables so they can be plotted later
  # (out_dir is assumed to be a module-level output directory)
  data_nm_out_dir = out_dir + data_nm + '/'
  util.ensure_dir_exists(data_nm_out_dir)
  exp_out_dir = data_nm_out_dir + exp + '/'
  util.ensure_dir_exists(exp_out_dir)
  out_fn = exp_out_dir + '%.3f.csv' % (r)
  mdf.to_csv(out_fn)

  # Store in alldf_dict
  alldf_dict['_Experiment'].append(exp)
  alldf_dict['rs'].append(rs)
  return alldf_dict
def calc_statistics(orig_df, exp, rate_model, bp_model, alldf_dict, rs):
  # Calculate statistics on df, saving to alldf_dict
  # Deletion positions
  df = _lib.mh_del_subset(orig_df)
  df = _lib.indels_without_mismatches_subset(df)
  if sum(df['Count']) <= 1000:
    return

  # Append observed 1 bp insertions, totaled per inserted base
  ins_crit = (orig_df['Category'] == 'ins') & (orig_df['Length'] == 1)
  ins_df = orig_df[ins_crit]
  truncated_ins_d = defaultdict(list)
  for ins_base in list('ACGT'):
    crit = (ins_df['Inserted Bases'] == ins_base)
    tot_count = sum(ins_df[crit]['Count'])
    truncated_ins_d['Count'].append(tot_count)
    truncated_ins_d['Inserted Bases'].append(ins_base)
    truncated_ins_d['Category'].append('ins')
    truncated_ins_d['Length'].append(1)
  ins_df = pd.DataFrame(truncated_ins_d)
  df = df.append(ins_df, ignore_index=True)
  df['Frequency'] = _lib.normalize_frequency(df)

  _predict2.init_model()
  seq, cutsite = _lib.get_sequence_cutsite(orig_df)
  pred_df = _predict2.predict_mhdel(seq, cutsite)

  # Predict rate of 1 bp insertions
  # Featurize first
  del_score = _predict2.total_deletion_score(seq, cutsite)
  dlpred = _predict2.deletion_length_distribution(seq, cutsite)
  norm_entropy = entropy(dlpred) / np.log(len(dlpred))
  ohmapper = {'A': [1, 0, 0, 0],
              'C': [0, 1, 0, 0],
              'G': [0, 0, 1, 0],
              'T': [0, 0, 0, 1]}
  fivebase = seq[cutsite - 1]
  onebp_features = ohmapper[fivebase] + [norm_entropy] + [del_score]
  onebp_features = np.array(onebp_features).reshape(1, -1)
  rate_1bpins = float(rate_model.predict(onebp_features))

  # Predict 1 bp genotype frequencies, scaled to the deletion odds
  pred_1bpins_d = defaultdict(list)
  for ins_base in bp_model[fivebase]:
    freq = bp_model[fivebase][ins_base]
    freq *= rate_1bpins / (1 - rate_1bpins)
    pred_1bpins_d['Category'].append('ins')
    pred_1bpins_d['Length'].append(1)
    pred_1bpins_d['Inserted Bases'].append(ins_base)
    pred_1bpins_d['Predicted_Frequency'].append(freq)
  pred_1bpins_df = pd.DataFrame(pred_1bpins_d)
  pred_df = pred_df.append(pred_1bpins_df, ignore_index=True)
  pred_df['Predicted_Frequency'] /= sum(pred_df['Predicted_Frequency'])

  join_cols = ['Category', 'Genotype Position', 'Length', 'Inserted Bases']
  mdf = df.merge(pred_df, how='outer', on=join_cols)
  mdf['Frequency'].fillna(value=0, inplace=True)
  mdf['Predicted_Frequency'].fillna(value=0, inplace=True)

  obs = mdf['Frequency']
  pred = mdf['Predicted_Frequency']
  r = pearsonr(obs, pred)[0]
  alldf_dict['gt_r'].append(r)

  obs_entropy = entropy(obs) / np.log(len(obs))
  pred_entropy = entropy(pred) / np.log(len(pred))
  alldf_dict['obs entropy'].append(obs_entropy)
  alldf_dict['pred entropy'].append(pred_entropy)

  alldf_dict['_Experiment'].append(exp)
  alldf_dict['rs'].append(rs)
  return alldf_dict
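# Sketch of the model objects the two functions above expect; these are
# assumptions inferred from usage, not the project's actual definitions.
# rate_model is any regressor exposing an sklearn-style .predict() over the
# 6-dim feature vector [one-hot fivebase (4), normalized deletion-length
# entropy, total deletion score]; bp_model maps the -1 base to a dict of
# inserted-base frequencies.
class ConstantRateModel(object):
  # Hypothetical stand-in that predicts a fixed 1 bp insertion rate.
  def __init__(self, rate=0.1):
    self.rate = rate

  def predict(self, X):
    return np.full(len(X), self.rate)

# Placeholder uniform frequencies, for shape only.
example_bp_model = {fivebase: {ins_base: 0.25 for ins_base in 'ACGT'}
                    for fivebase in 'ACGT'}
# Note the odds conversion above: predicted insertion frequencies are scaled
# by rate_1bpins / (1 - rate_1bpins) so they are on the same footing as the
# deletion predictions before the joint renormalization.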