def calc_statistics(df, exp, alldf_dict):
    # Calculate statistics on df, saving to alldf_dict
    # Statistics: 1-bp insertion vs. deletion ratios, cutsite sequence context,
    # predicted deletion score, and normalized deletion-length entropy

    # Require enough CRISPR-induced reads to compute ratios
    if sum(_lib.crispr_subset(df)['Count']) <= 1000:
        return

    ins_criteria = ((df['Category'] == 'ins') & (df['Length'] == 1) &
                    (df['Indel with Mismatches'] != 'yes'))
    ins_count = sum(df[ins_criteria]['Count'])

    del_criteria = ((df['Category'] == 'del') &
                    (df['Indel with Mismatches'] != 'yes'))
    del_count = sum(df[del_criteria]['Count'])
    if del_count == 0:
        return
    alldf_dict['Ins1bp/Del Ratio'].append(ins_count / (del_count + ins_count))

    mhdel_crit = ((df['Category'] == 'del') &
                  (df['Indel with Mismatches'] != 'yes') &
                  (df['Microhomology-Based'] == 'yes'))
    mhdel_count = sum(df[mhdel_crit]['Count'])
    try:
        alldf_dict['Ins1bp/MHDel Ratio'].append(ins_count /
                                                (mhdel_count + ins_count))
    except ZeroDivisionError:
        alldf_dict['Ins1bp/MHDel Ratio'].append(0)

    ins_ratio = ins_count / sum(_lib.crispr_subset(df)['Count'])
    alldf_dict['Ins1bp Ratio'].append(ins_ratio)

    seq, cutsite = _lib.get_sequence_cutsite(df)
    # Sequence context with a 4-bp window masked as 'NNNN'
    alldf_dict['Sequence Context'].append(seq[-55:-30] + 'NNNN' + seq[-26:])

    # Four bases spanning the cutsite: two 5' and two 3' of the cut
    alldf_dict['Fourbp'].append(seq[cutsite - 2:cutsite + 2])

    alldf_dict['Base1'].append(seq[cutsite - 2])
    alldf_dict['Base2'].append(seq[cutsite - 1])
    alldf_dict['Base3'].append(seq[cutsite])
    alldf_dict['Base4'].append(seq[cutsite + 1])

    _predict2.init_model()
    del_score = _predict2.total_deletion_score(seq, cutsite)
    alldf_dict['Del Score'].append(del_score)

    dlpred = _predict2.deletion_length_distribution(seq, cutsite)
    from scipy.stats import entropy
    # Shannon entropy of the predicted deletion-length distribution, normalized
    # by its maximum possible value (log of the number of outcomes)
    norm_entropy = entropy(dlpred) / np.log(len(dlpred))
    alldf_dict['Entropy'].append(norm_entropy)

    alldf_dict['_Experiment'].append(exp)

    return alldf_dict
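For context, these calc_statistics variants are meant to be called once per experiment, accumulating one value per statistic into alldf_dict. A minimal sketch of that assumed driver pattern, using the three-argument form above (build_stats_table, experiments, and load_data are illustrative placeholders, not part of the original code):

from collections import defaultdict
import pandas as pd

def build_stats_table(experiments, load_data):
    # Accumulate one entry per statistic per experiment, then tabulate:
    # each key of alldf_dict becomes a column, each experiment a row.
    alldf_dict = defaultdict(list)
    for exp in experiments:
        df = load_data(exp)  # placeholder loader returning one experiment's table
        calc_statistics(df, exp, alldf_dict)
    return pd.DataFrame(alldf_dict)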
def calc_statistics(orig_df, exp, alldf_dict):
  # Calculate statistics on orig_df, saving to alldf_dict
  # Statistic: observed frequency of deletions that reconstruct the wild-type allele

  df = _lib.crispr_subset(orig_df)
  if sum(df['Count']) <= 1000:
    return
  df['Frequency'] = _lib.normalize_frequency(df)

  # Wildtype repair frequency
  dlwt = _config.d.DISLIB_WT
  row = dlwt[dlwt['name'] == exp].iloc[0]
  if row['wt_repairable'] != 'yes':
    alldf_dict['wt_obs'].append(np.nan)
    alldf_dict['dl'].append(np.nan)
  else:
    dls = [int(s) for s in row['dls'].split(';')]
    gts = [int(s) for s in row['gts'].split(';')]

    obs_freq = 0
    for dl, gt in zip(dls, gts):
      crit = (df['Length'] == dl) & (df['Genotype Position'] == gt)
      obs_freq += sum(df[crit]['Frequency'])

    alldf_dict['wt_obs'].append(obs_freq)
    # Deletion length; set(dls).pop() assumes the listed deletions share one length
    alldf_dict['dl'].append(set(dls).pop())

  alldf_dict['_Experiment'].append(exp)
  return alldf_dict
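_lib.normalize_frequency is project-internal and not shown here; based on how it is used throughout these examples, a plausible stand-in converts read counts into within-table fractions. This sketch is an assumption about its behavior, not the project's actual implementation:

def normalize_frequency(df):
    # Fraction of the table's total read count contributed by each row
    return df['Count'] / df['Count'].sum()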
Example #3
def calc_statistics(df, exp, alldf_dict):
    # Calculate statistics on df, saving to alldf_dict
    # Statistic: frequency of deletions whose length equals the long-duplication length

    longdup_len = int(exp.split('_')[1])

    # Denominator is all CRISPR-induced events
    df = _lib.crispr_subset(df)
    if sum(df['Count']) <= 500:
        return
    df['Frequency'] = _lib.normalize_frequency(df)

    criteria = (df['Category'] == 'del') & (df['Length'] == longdup_len)
    freq = sum(df[criteria]['Frequency'])

    alldf_dict['Length'].append(longdup_len)
    alldf_dict['Frequency'].append(freq)
    alldf_dict['_Experiment'].append(exp)

    return alldf_dict
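The long-duplication length is read out of the experiment name itself. A tiny illustration of the assumed naming convention (the example name is made up):

exp = 'longdup_27_rep1'                 # hypothetical experiment name
longdup_len = int(exp.split('_')[1])    # second underscore-separated field -> 27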
Example #4
def calc_statistics(df, exp, alldf_dict):
    # Calculate statistics on df, saving to alldf_dict
    # Statistics: deletion frequency by genotype position around the cutsite,
    # editing rate, and 0gt vs. Ngt microhomology frequencies

    # Denominator is all non-noise categories
    df = _lib.notnoise_subset(df)
    if sum(df['Count']) == 0:
        return
    df['Frequency'] = _lib.normalize_frequency(df)

    # Consider only deletions, anywhere
    del_df = _lib.del_subset(df)
    # Get left side
    for del_pos in range(-10, -1 + 1):
        total_freq = sum(
            del_df[del_df['Genotype Position'] == del_pos]['Frequency'])
        alldf_dict[str(del_pos)].append(total_freq)

    # Get right side
    for del_pos in range(1, 10 + 1):
        criteria = (del_df['Genotype Position'] - del_df['Length'] == del_pos)
        total_freq = sum(del_df[criteria]['Frequency'])
        alldf_dict[str(del_pos)].append(total_freq)

    editing_rate = sum(_lib.crispr_subset(df)['Frequency']) / sum(
        df['Frequency'])
    alldf_dict['Editing Rate'].append(editing_rate)
    alldf_dict['_Experiment'].append(exp)

    # Test an alternative hypothesis: is the asymmetry actually meaningful?
    # If -1 arises from 0gt plus a sequencing mismatch, and +1 arises from Ngt
    # plus a sequencing mismatch, then we should see asymmetry in 0gt vs. Ngt.
    def detect_0gt_microhomology(row):
        # Returns a frequency
        if row['Category'] != 'del':
            return 0
        cutsite = int(row['_Cutsite'])
        seq = row['_Sequence Context']
        gt_pos = int(row['Genotype Position'])
        del_len = int(row['Length'])

        # Deleted window immediately 5' of the cut vs. the same-length window 3' of it
        left = seq[cutsite - del_len:cutsite]
        right = seq[cutsite:cutsite + del_len]
        if len(left) != len(right):
            return 0

        # Group positions 0..del_len into runs of consecutive matching bases
        # (candidate microhomology blocks)
        mhs = []
        mh = [0]
        for idx, (c1, c2) in enumerate(zip(left, right)):
            if c1 == c2:
                mh.append(idx + 1)
            else:
                mhs.append(mh)
                mh = [idx + 1]
        mhs.append(mh)

        # Count this deletion if its genotype position shares a run with position 0
        for mh in mhs:
            if gt_pos in mh:
                if 0 in mh:
                    return row['Frequency']
        return 0

    freq_0gt = sum(del_df.apply(detect_0gt_microhomology, axis=1))
    alldf_dict['0gt Frequency'].append(freq_0gt)

    criteria = (del_df['Genotype Position'] - del_df['Length'] == 0)
    freq_Ngt = sum(del_df[criteria]['Frequency'])
    alldf_dict['Ngt Frequency'].append(freq_Ngt)
    return
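A standalone walk-through of the grouping step inside detect_0gt_microhomology, on a made-up 4-bp deletion window, may make the run construction easier to follow (the sequences here are invented):

left, right = 'ACGT', 'ACTT'   # deleted window vs. the window right of the cut
mhs, mh = [], [0]
for idx, (c1, c2) in enumerate(zip(left, right)):
    if c1 == c2:
        mh.append(idx + 1)
    else:
        mhs.append(mh)
        mh = [idx + 1]
mhs.append(mh)
print(mhs)   # [[0, 1, 2], [3, 4]]

A deletion reported at genotype position 1 or 2 falls in the run containing 0 and would be counted toward the 0gt frequency; one at position 3 or 4 would not.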
def calc_statistics(df1, df2, exp, alldf_dict):
  # Calculate statistics on df1 and df2, saving to alldf_dict
  # Statistics: replicate-vs-replicate Pearson correlations for event subsets
  df1 = _lib.crispr_subset(df1)
  df2 = _lib.crispr_subset(df2)

  if sum(df1['Count']) < 1000 or sum(df2['Count']) < 1000:
    return

  def get_r_from_subsets(d1, d2):
    # Pearson correlation of per-event frequencies between the two replicates;
    # skip subsets with too few reads in either replicate
    if sum(d1['Count']) < 100 or sum(d2['Count']) < 100:
      return np.nan
    d1['Frequency'] = _lib.normalize_frequency(d1)
    d2['Frequency'] = _lib.normalize_frequency(d2)
    mdf = _lib.merge_crispr_events(d1, d2, '_1', '_2')
    return pearsonr(mdf['Frequency_1'], mdf['Frequency_2'])[0]

  # everything
  alldf_dict['all'].append(get_r_from_subsets(df1, df2)) 

  # All CRISPR dels
  d1 = _lib.del_subset(df1)
  d2 = _lib.del_subset(df2)
  alldf_dict['All del'].append(get_r_from_subsets(d1, d2))

  # Del at cut
  d1 = df1[df1['Category'] == 'del']
  d2 = df2[df2['Category'] == 'del']
  alldf_dict['del'].append(get_r_from_subsets(d1, d2))

  # MH dels
  d1 = df1[(df1['Category'] == 'del') & (df1['Microhomology-Based'] == 'yes')]
  d2 = df2[(df2['Category'] == 'del') & (df2['Microhomology-Based'] == 'yes')]
  alldf_dict['mh_del'].append(get_r_from_subsets(d1, d2))

  # MHless dels
  d1 = df1[(df1['Category'] == 'del') & (df1['Microhomology-Based'] == 'no')]
  d2 = df2[(df2['Category'] == 'del') & (df2['Microhomology-Based'] == 'no')]
  alldf_dict['nomh_del'].append(get_r_from_subsets(d1, d2))

  # Del not at cut
  d1 = df1[df1['Category'] == 'del_notatcut']
  d2 = df2[df2['Category'] == 'del_notatcut']
  alldf_dict['del_notatcut'].append(get_r_from_subsets(d1, d2))

  # All CRISPR ins
  d1 = _lib.ins_subset(df1)
  d2 = _lib.ins_subset(df2)
  alldf_dict['All ins'].append(get_r_from_subsets(d1, d2))

  # All ins at cutsite
  d1 = df1[df1['Category'] == 'ins']
  d2 = df2[df2['Category'] == 'ins']
  alldf_dict['ins'].append(get_r_from_subsets(d1, d2))

  # 1bp ins
  d1 = df1[(df1['Category'] == 'ins') & (df1['Length'] == 1)]
  d2 = df2[(df2['Category'] == 'ins') & (df2['Length'] == 1)]
  alldf_dict['ins_1bp'].append(get_r_from_subsets(d1, d2))

  # 2bp+ ins
  d1 = df1[(df1['Category'] == 'ins') & (df1['Length'] > 1)]
  d2 = df2[(df2['Category'] == 'ins') & (df2['Length'] > 1)]
  alldf_dict['ins_2bpplus'].append(get_r_from_subsets(d1, d2))

  # Ins not at cut
  d1 = df1[df1['Category'] == 'ins_notatcut']
  d2 = df2[df2['Category'] == 'ins_notatcut']
  alldf_dict['ins_notatcut'].append(get_r_from_subsets(d1, d2))

  alldf_dict['_Experiment'].append(exp)
  return
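For reference, a self-contained sketch of the same correlation idea without the project-internal helpers; the merge keys and the count-to-frequency normalization are assumptions standing in for _lib.normalize_frequency and _lib.merge_crispr_events:

import pandas as pd
from scipy.stats import pearsonr

def replicate_r(d1, d2, keys=('Category', 'Genotype Position', 'Length')):
    # Convert counts to within-replicate frequencies, align the two replicates
    # on the identifying columns, and correlate the aligned frequencies.
    d1 = d1.assign(Frequency=d1['Count'] / d1['Count'].sum())
    d2 = d2.assign(Frequency=d2['Count'] / d2['Count'].sum())
    mdf = d1.merge(d2, on=list(keys), suffixes=('_1', '_2'))
    return pearsonr(mdf['Frequency_1'], mdf['Frequency_2'])[0]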
def calc_statistics(orig_df, exp, rate_model, bp_model, alldf_dict, rs, data_nm):
  # Calculate statistics on orig_df, saving to alldf_dict
  # Statistics: fraction of CRISPR reads explained by the modeled indel classes,
  # and observed vs. predicted indel frequency correlation

  # Threshold on microhomology deletions without mismatches, then build the
  # observed table from the full dataframe below
  df = _lib.mh_del_subset(orig_df)
  df = _lib.indels_without_mismatches_subset(df)
  if sum(df['Count']) <= 1000:
    return

  obs_d = defaultdict(list)

  df = orig_df
  # Grab observed deletions, MH and MH-less
  for del_len in range(1, 59+1):
    crit = (df['Category'] == 'del') & (df['Indel with Mismatches'] != 'yes') & (df['Length'] == del_len)
    s = df[crit]

    mh_s = s[s['Microhomology-Based'] == 'yes']
    for idx, row in mh_s.iterrows():
      obs_d['Count'].append(row['Count'])
      obs_d['Genotype Position'].append(row['Genotype Position'])
      obs_d['Length'].append(row['Length'])
      obs_d['Category'].append('del')

    mhless_s = s[s['Microhomology-Based'] != 'yes']
    # MH-less deletions are pooled per length; 'e' is a placeholder genotype position
    obs_d['Length'].append(del_len)
    obs_d['Count'].append(sum(mhless_s['Count']))
    obs_d['Genotype Position'].append('e')
    obs_d['Category'].append('del')

  obs_df = pd.DataFrame(obs_d) 

  # Grab observed 1 bp insertions
  ins_crit = (orig_df['Category'] == 'ins') & (orig_df['Length'] == 1) & (orig_df['Indel with Mismatches'] != 'yes')
  ins_df = orig_df[ins_crit]
  truncated_ins_d = defaultdict(list)
  for ins_base in list('ACGT'):
    crit = (ins_df['Inserted Bases'] == ins_base)
    tot_count = sum(ins_df[crit]['Count'])
    truncated_ins_d['Count'].append(tot_count)
    truncated_ins_d['Inserted Bases'].append(ins_base)
    truncated_ins_d['Category'].append('ins')
    truncated_ins_d['Length'].append(1)
  ins_df = pd.DataFrame(truncated_ins_d)
  obs_df = pd.concat([obs_df, ins_df], ignore_index = True)

  obs_df['Frequency'] = _lib.normalize_frequency(obs_df)

  crispr_subset = _lib.crispr_subset(orig_df)
  frac_explained = sum(obs_df['Count']) / sum(crispr_subset['Count'])
  # print frac_explained
  # Save this for aggregate plotting
  alldf_dict['Fraction Explained'].append(frac_explained)

  # Predict MH dels and MH-less dels
  _predict2.init_model()
  seq, cutsite = _lib.get_sequence_cutsite(orig_df)
  pred_df = _predict2.predict_indels(seq, cutsite, 
                                     rate_model, bp_model)


  # Merge observed and predicted on the identifying columns; indels present in
  # only one table get frequency 0 in the other, then correlate
  mdf = obs_df.merge(pred_df, how = 'outer', on = ['Category', 'Genotype Position', 'Inserted Bases', 'Length'])
  mdf['Frequency'] = mdf['Frequency'].fillna(0)
  mdf['Predicted_Frequency'] = mdf['Predicted_Frequency'].fillna(0)
  r = pearsonr(mdf['Frequency'], mdf['Predicted_Frequency'])[0]

  # Store the merged table on disk so observed vs. predicted can be plotted later

  data_nm_out_dir = out_dir + data_nm + '/'
  util.ensure_dir_exists(data_nm_out_dir)
  exp_out_dir = data_nm_out_dir + exp + '/'
  util.ensure_dir_exists(exp_out_dir)
  out_fn = exp_out_dir + '%.3f.csv' % (r)
  mdf.to_csv(out_fn)

  # Store in alldf_dict
  alldf_dict['_Experiment'].append(exp)
  alldf_dict['rs'].append(rs)

  return alldf_dict
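A toy, self-contained version of the align-fill-correlate step above; all values and keys here are made up purely to illustrate how the outer merge and the zero-fill behave:

import pandas as pd
from scipy.stats import pearsonr

obs = pd.DataFrame({'Category': ['del', 'del'],
                    'Genotype Position': [3, 5],
                    'Length': [5, 5],
                    'Frequency': [0.6, 0.4]})
pred = pd.DataFrame({'Category': ['del', 'del'],
                     'Genotype Position': [3, 7],
                     'Length': [5, 5],
                     'Predicted_Frequency': [0.55, 0.05]})
# Outer merge keeps indels seen in only one table; their missing frequency is 0
mdf = obs.merge(pred, how='outer', on=['Category', 'Genotype Position', 'Length'])
mdf['Frequency'] = mdf['Frequency'].fillna(0)
mdf['Predicted_Frequency'] = mdf['Predicted_Frequency'].fillna(0)
r = pearsonr(mdf['Frequency'], mdf['Predicted_Frequency'])[0]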
Example #7
def calc_statistics(df, exp, alldf_dict):
    # Calculate statistics on df, saving to alldf_dict
    # Statistics: editing rate, insertion/deletion ratios, cutsite bases and
    # their one-hot encodings, GC content, deletion score, and entropy

    # Require enough CRISPR-induced reads to compute ratios
    if sum(_lib.crispr_subset(df)['Count']) <= 1000:
        return

    editing_rate = sum(_lib.crispr_subset(df)['Count']) / sum(
        _lib.notnoise_subset(df)['Count'])

    ins_criteria = ((df['Category'] == 'ins') & (df['Length'] == 1) &
                    (df['Indel with Mismatches'] != 'yes'))
    ins_count = sum(df[ins_criteria]['Count'])

    del_criteria = ((df['Category'] == 'del') &
                    (df['Indel with Mismatches'] != 'yes'))
    del_count = sum(df[del_criteria]['Count'])
    if del_count == 0:
        # Return before appending anything so every list in alldf_dict keeps
        # the same length
        return
    alldf_dict['Editing Rate'].append(editing_rate)
    alldf_dict['Ins1bp/Del Ratio'].append(ins_count / (del_count + ins_count))

    mhdel_crit = ((df['Category'] == 'del') &
                  (df['Indel with Mismatches'] != 'yes') &
                  (df['Microhomology-Based'] == 'yes'))
    mhdel_count = sum(df[mhdel_crit]['Count'])
    try:
        alldf_dict['Ins1bp/MHDel Ratio'].append(ins_count /
                                                (mhdel_count + ins_count))
    except ZeroDivisionError:
        alldf_dict['Ins1bp/MHDel Ratio'].append(0)

    ins_ratio = ins_count / sum(_lib.crispr_subset(df)['Count'])
    alldf_dict['Ins1bp Ratio'].append(ins_ratio)

    seq, cutsite = _lib.get_sequence_cutsite(df)
    fivebase = seq[cutsite - 1]
    alldf_dict['Fivebase'].append(fivebase)

    _predict2.init_model()
    del_score = _predict2.total_deletion_score(seq, cutsite)
    alldf_dict['Del Score'].append(del_score)

    dlpred = _predict2.deletion_length_distribution(seq, cutsite)
    from scipy.stats import entropy
    # Shannon entropy of the predicted deletion-length distribution, normalized
    # by its maximum possible value (log of the number of outcomes)
    norm_entropy = entropy(dlpred) / np.log(len(dlpred))
    alldf_dict['Entropy'].append(norm_entropy)

    local_seq = seq[cutsite - 4:cutsite + 4]
    gc = (local_seq.count('C') + local_seq.count('G')) / len(local_seq)
    alldf_dict['GC'].append(gc)

    if fivebase == 'A':
        fivebase_oh = np.array([1, 0, 0, 0])
    if fivebase == 'C':
        fivebase_oh = np.array([0, 1, 0, 0])
    if fivebase == 'G':
        fivebase_oh = np.array([0, 0, 1, 0])
    if fivebase == 'T':
        fivebase_oh = np.array([0, 0, 0, 1])
    alldf_dict['Fivebase_OH'].append(fivebase_oh)

    threebase = seq[cutsite]
    alldf_dict['Threebase'].append(threebase)
    if threebase == 'A':
        threebase_oh = np.array([1, 0, 0, 0])
    if threebase == 'C':
        threebase_oh = np.array([0, 1, 0, 0])
    if threebase == 'G':
        threebase_oh = np.array([0, 0, 1, 0])
    if threebase == 'T':
        threebase_oh = np.array([0, 0, 0, 1])
    alldf_dict['Threebase_OH'].append(threebase_oh)

    alldf_dict['_Experiment'].append(exp)

    return alldf_dict
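The A/C/G/T if-chains above leave fivebase_oh or threebase_oh undefined (a NameError on append) if an unexpected base appears. An equivalent lookup-table formulation, shown here only as an alternative sketch, fails fast with a KeyError instead:

import numpy as np

# One-hot encodings for the four canonical bases (A, C, G, T order)
ONE_HOT = {'A': np.array([1, 0, 0, 0]),
           'C': np.array([0, 1, 0, 0]),
           'G': np.array([0, 0, 1, 0]),
           'T': np.array([0, 0, 0, 1])}

def one_hot_base(base):
    # Raises KeyError for anything other than A/C/G/T
    return ONE_HOT[base]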