# Imports assumed by the functions below, inferred from usage; _lib, _predict2,
# _config, and util are project-internal modules.
from collections import defaultdict

import numpy as np
import pandas as pd
from scipy.stats import pearsonr, entropy

import _config
import _lib
import _predict2
import util


def calc_statistics(orig_df1, orig_df2, exp, alldf_dict):
  # Calculate statistics on df, saving to alldf_dict
  # Deletion positions
  df1 = _lib.mh_del_subset(orig_df1)
  df1 = _lib.indels_without_mismatches_subset(df1)
  df2 = _lib.mh_del_subset(orig_df2)
  df2 = _lib.indels_without_mismatches_subset(df2)
  if sum(df1['Count']) <= 1000 or sum(df2['Count']) <= 1000:
    return

  df1['Frequency'] = _lib.normalize_frequency(df1)
  df2['Frequency'] = _lib.normalize_frequency(df2)

  join_cols = ['Category', 'Genotype Position', 'Length']
  mdf = df1.merge(df2, how='outer', on=join_cols, suffixes=['_1', '_2'])
  mdf['Frequency_1'].fillna(value=0, inplace=True)
  mdf['Frequency_2'].fillna(value=0, inplace=True)

  # Replicate concordance; r is printed but only the experiment name is stored
  r = pearsonr(mdf['Frequency_1'], mdf['Frequency_2'])[0]
  print(exp, r)

  alldf_dict['_Experiment'].append(exp)
  return alldf_dict
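# A minimal driver sketch showing the accumulator contract shared by all of
# the calc_statistics variants below. The experiment list and data-loading
# helper are hypothetical placeholders, not part of the original scripts:
# alldf_dict is a defaultdict(list) whose parallel lists become the columns
# of a summary DataFrame once every experiment has been processed.
def run_all_experiments(experiments, load_replicates):
  alldf_dict = defaultdict(list)
  for exp in experiments:
    orig_df1, orig_df2 = load_replicates(exp)  # hypothetical loader
    calc_statistics(orig_df1, orig_df2, exp, alldf_dict)
  return pd.DataFrame(alldf_dict)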
def get_r_from_subsets(d1, d2):
  # Pearson r between the normalized frequencies of two replicate subsets;
  # returns NaN when either replicate is undersampled.
  if sum(d1['Count']) < 100 or sum(d2['Count']) < 100:
    return np.nan
  d1['Frequency'] = _lib.normalize_frequency(d1)
  d2['Frequency'] = _lib.normalize_frequency(d2)
  mdf = _lib.merge_crispr_events(d1, d2, '_1', '_2')
  return pearsonr(mdf['Frequency_1'], mdf['Frequency_2'])[0]
def featurize(orig_df):
  seq, cutsite = _lib.get_sequence_cutsite(orig_df)

  mh_lens, gc_fracs, del_lens, freqs = [], [], [], []
  dl_freqs = []
  DELLEN_LIMIT = 60

  df = _lib.mh_del_subset(orig_df)
  df = _lib.indels_without_mismatches_subset(df)
  df = df[df['Length'] <= DELLEN_LIMIT]
  if sum(df['Count']) < 1000:
    return None

  # Deletion-length frequency distribution over 1-28 bp deletions
  criteria = (orig_df['Category'] == 'del') & (orig_df['Length'] <= 28)
  s = orig_df[criteria]
  s['Frequency'] = _lib.normalize_frequency(s)
  for del_len in range(1, 28 + 1):
    dl_freq = sum(s[s['Length'] == del_len]['Frequency'])
    dl_freqs.append(dl_freq)

  # Featurize every microhomology at every deletion length
  df['Frequency'] = _lib.normalize_frequency(df)
  for del_len in range(1, DELLEN_LIMIT + 1):
    left = seq[cutsite - del_len : cutsite]
    right = seq[cutsite : cutsite + del_len]

    mhs = find_microhomologies(left, right)
    for mh in mhs:
      mh_len = len(mh) - 1
      if mh_len > 0:
        gtpos = max(mh)
        start = cutsite - del_len + gtpos - mh_len
        end = start + mh_len
        mh_seq = seq[start:end]
        gc_frac = get_gc_frac(mh_seq)

        criteria = (df['Length'] == del_len) & (df['Genotype Position'] == gtpos)
        freq = sum(df[criteria]['Frequency'])

        mh_lens.append(mh_len)
        gc_fracs.append(gc_frac)
        del_lens.append(del_len)
        freqs.append(freq)

  return mh_lens, gc_fracs, del_lens, freqs, dl_freqs
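# featurize() calls find_microhomologies and get_gc_frac, which are defined
# elsewhere in the project. A minimal sketch of compatible implementations,
# inferred from the inline microhomology logic in detect_0gt_microhomology
# further below; treat these as assumptions, not the project's exact code.
def find_microhomologies(left, right):
  # Partition positions 0..len(left) into runs of matching characters
  # between the left and right deletion flanks.
  mhs = []
  mh = [0]
  for idx, (c1, c2) in enumerate(zip(left, right)):
    if c1 == c2:
      mh.append(idx + 1)
    else:
      mhs.append(mh)
      mh = [idx + 1]
  mhs.append(mh)
  return mhs

def get_gc_frac(seq):
  # Fraction of G/C bases in a microhomology sequence.
  if len(seq) == 0:
    return 0.0
  return (seq.count('C') + seq.count('G')) / float(len(seq))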
def calc_statistics(df, exp, alldf_dict):
  # Calculate statistics on df, saving to alldf_dict
  # Deletion positions
  # Denominator is 1-27 bp dels
  df = _lib.crispr_del_27bp_subset(df)
  if sum(df['Count']) <= 500:
    return
  df['Frequency'] = _lib.normalize_frequency(df)
  if sum(df['Frequency']) == 0:
    return

  for del_len in range(1, 27 + 1):
    df_lensubset = df[df['Length'] == del_len]
    mhyes_freq = sum(df_lensubset[df_lensubset['Microhomology-Based'] == 'yes']['Frequency'])
    mhno_freq = sum(df_lensubset[df_lensubset['Microhomology-Based'] == 'no']['Frequency'])

    alldf_dict['Deletion Length'].append(del_len)
    alldf_dict['Microhomology-Based'].append('has MH')
    alldf_dict['Frequency'].append(mhyes_freq)
    alldf_dict['_Experiment'].append(exp)

    alldf_dict['Deletion Length'].append(del_len)
    alldf_dict['Microhomology-Based'].append('no MH')
    alldf_dict['Frequency'].append(mhno_freq)
    alldf_dict['_Experiment'].append(exp)

  return alldf_dict
def calc_statistics(df, exp, alldf_dict):
  # Calculate statistics on df, saving to alldf_dict
  # Deletion positions
  # Denominator is 1-27 bp dels
  df = _lib.crispr_del_27bp_subset(df)
  if sum(df['Count']) <= 500:
    return
  df['Frequency'] = _lib.normalize_frequency(df)
  if sum(df['Frequency']) == 0:
    return

  df_s = df[df['Microhomology-Based'] == 'no']
  gt0 = sum(df_s[df_s['Genotype Position'] == 0]['Frequency'])
  gtN = sum(df_s[df_s['Genotype Position'] == df_s['Length']]['Frequency'])
  gtMid = sum(df_s['Frequency']) - gt0 - gtN

  alldf_dict['0gt'].append(gt0)
  alldf_dict['Ngt'].append(gtN)
  alldf_dict['Mid'].append(gtMid)
  alldf_dict['_Experiment'].append(exp)
  return alldf_dict
def calc_statistics(df, exp, alldf_dict):
  # Calculate statistics on df, saving to alldf_dict
  # Deletion positions
  df = _lib.mh_del_subset(df)
  df = _lib.indels_without_mismatches_subset(df)
  df = df[df['Length'] >= 5]
  if sum(df['Count']) <= 1000:
    return
  df['Frequency'] = _lib.normalize_frequency(df)

  _predict2.init_model()
  seq, cutsite = _lib.get_sequence_cutsite(df)
  pred_df = _predict2.predict_mhdel(seq, cutsite)
  pred_df = pred_df[pred_df['Length'] >= 5]
  pred_df['Predicted_Frequency'] = pred_df['Predicted_Frequency'] / sum(pred_df['Predicted_Frequency'])

  join_cols = ['Category', 'Genotype Position', 'Length']
  mdf = df.merge(pred_df, how='outer', on=join_cols)
  mdf['Frequency'].fillna(value=0, inplace=True)
  mdf['Predicted_Frequency'].fillna(value=0, inplace=True)

  # Deletions whose genotype position equals their length ("Ngt")
  ns_criteria = (mdf['Length'] - mdf['Genotype Position'] == 0)
  s = mdf[ns_criteria]
  alldf_dict['Predicted Ngt'] += list(s['Predicted_Frequency'])
  alldf_dict['Observed Ngt'] += list(s['Frequency'])
  alldf_dict['_Experiment'] += [exp] * len(s['Frequency'])
  return alldf_dict
def calc_statistics(df, exp, alldf_dict):
  # Calculate statistics on df, saving to alldf_dict
  # Deletion positions
  # Denominator is 3-27 bp dels
  # Remove obvious subpopulation of 1 bp, mh vs. no mh
  df = _lib.crispr_del_3bp_27bp_subset(df)
  if sum(df['Count']) <= 500:
    return
  df['Frequency'] = _lib.normalize_frequency(df)

  # Consider only deletion length 5
  df = df[df['Length'] == 5]
  df_s = df[df['Microhomology-Based'] == 'no']
  gt0 = sum(df_s[df_s['Genotype Position'] == 0]['Frequency'])
  gtN = sum(df_s[df_s['Genotype Position'] == df_s['Length']]['Frequency'])
  gt0N = gt0 + gtN
  mh_freq = sum(df[df['Microhomology-Based'] == 'yes']['Frequency'])

  alldf_dict['0N_freq'].append(gt0N)
  alldf_dict['MH_freq'].append(mh_freq)
  alldf_dict['_Experiment'].append(exp)
  return alldf_dict
def calc_statistics(orig_df, exp, alldf_dict):
  # Calculate statistics on df, saving to alldf_dict
  # Deletion positions
  df = _lib.mh_del_subset(orig_df)
  df = _lib.indels_without_mismatches_subset(df)
  df = df[df['Length'] >= 5]
  if sum(df['Count']) <= 500:
    return
  df['Frequency'] = _lib.normalize_frequency(df)

  _predict2.init_model()
  seq, cutsite = _lib.get_sequence_cutsite(df)
  pred_df = _predict2.predict_mhdel_cpf1(seq, cutsite)

  join_cols = ['Category', 'Genotype Position', 'Length']
  mdf = df.merge(pred_df, how='outer', on=join_cols)
  mdf['Frequency'].fillna(value=0, inplace=True)
  mdf['Predicted_Frequency'].fillna(value=0, inplace=True)

  obs = mdf['Frequency']
  pred = mdf['Predicted_Frequency']
  r = pearsonr(obs, pred)[0]
  alldf_dict['gt_r'].append(r)
  alldf_dict['_Experiment'].append(exp)
  return alldf_dict
def calc_statistics(orig_df, exp, alldf_dict):
  # Calculate statistics on df, saving to alldf_dict
  # Deletion positions
  df = _lib.crispr_subset(orig_df)
  if sum(df['Count']) <= 1000:
    return
  df['Frequency'] = _lib.normalize_frequency(df)

  # Wildtype repair frequency
  dlwt = _config.d.DISLIB_WT
  row = dlwt[dlwt['name'] == exp].iloc[0]
  if row['wt_repairable'] != 'yes':
    alldf_dict['wt_obs'].append(np.nan)
    alldf_dict['dl'].append(np.nan)
  else:
    dls = [int(s) for s in row['dls'].split(';')]
    gts = [int(s) for s in row['gts'].split(';')]
    obs_freq = 0
    for dl, gt in zip(dls, gts):
      crit = (df['Length'] == dl) & (df['Genotype Position'] == gt)
      obs_freq += sum(df[crit]['Frequency'])
    alldf_dict['wt_obs'].append(obs_freq)
    alldf_dict['dl'].append(set(dls).pop())

  alldf_dict['_Experiment'].append(exp)
  return alldf_dict
def calc_statistics(orig_df, exp, alldf_dict):
  # Calculate statistics on df, saving to alldf_dict
  # Deletion positions
  df = _lib.mh_del_subset(orig_df)
  df = _lib.indels_without_mismatches_subset(df)
  if sum(df['Count']) <= 1000:
    return
  df['Frequency'] = _lib.normalize_frequency(df)

  _predict2.init_model()
  seq, cutsite = _lib.get_sequence_cutsite(df)
  pred_df = _predict2.predict_mhdel(seq, cutsite)

  join_cols = ['Category', 'Genotype Position', 'Length']
  mdf = df.merge(pred_df, how='outer', on=join_cols)
  mdf['Frequency'].fillna(value=0, inplace=True)
  mdf['Predicted_Frequency'].fillna(value=0, inplace=True)

  # Genotype-level entropy, normalized to [0, 1] by log(num outcomes)
  obs = mdf['Frequency']
  pred = mdf['Predicted_Frequency']
  obs_entropy = entropy(obs) / np.log(len(obs))
  pred_entropy = entropy(pred) / np.log(len(pred))
  alldf_dict['obs gt entropy'].append(obs_entropy)
  alldf_dict['pred gt entropy'].append(pred_entropy)

  # Deletion-length entropy over 1-28 bp deletions
  df = orig_df[orig_df['Category'] == 'del']
  df = df[df['Length'] <= 28]
  df['Frequency'] = _lib.normalize_frequency(df)
  obs_dl = []
  for del_len in range(1, 28 + 1):
    freq = sum(df[df['Length'] == del_len]['Frequency'])
    obs_dl.append(freq)
  pred_dl = _predict2.deletion_length_distribution(seq, cutsite)

  obs_entropy = entropy(obs_dl) / np.log(len(obs_dl))
  pred_entropy = entropy(pred_dl) / np.log(len(pred_dl))
  alldf_dict['obs dl entropy'].append(obs_entropy)
  alldf_dict['pred dl entropy'].append(pred_entropy)

  alldf_dict['_Experiment'].append(exp)
  return alldf_dict
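# The statistics above are normalized Shannon entropies: scipy's entropy()
# divided by log(n), so values land in [0, 1] regardless of the number of
# outcomes (1 = uniform distribution, 0 = one dominant genotype). A quick
# standalone check of that property, for illustration only:
def _normalized_entropy_demo():
  uniform = [0.25] * 4
  concentrated = [1.0, 0.0, 0.0, 0.0]
  assert np.isclose(entropy(uniform) / np.log(4), 1.0)
  assert np.isclose(entropy(concentrated) / np.log(4), 0.0)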
def calc_statistics(df, exp, alldf_dict):
  # Calculate statistics on df, saving to alldf_dict
  # Deletion positions
  # Denominator is ins
  df = df[df['Category'] == 'ins']
  if sum(df['Count']) <= 500:
    return
  df['Frequency'] = _lib.normalize_frequency(df)

  def record_fractions(df_s, length_label):
    # Partition insertions into templated, five-homopolymer, and other
    criteria = (df_s['Ins Template Length'] >= 6)
    frac_templated = sum(df_s[criteria]['Frequency'])
    criteria = (df_s['Ins Fivehomopolymer'] == 'yes') & (df_s['Ins Template Length'] < 6)
    frac_fivehomopolymer = sum(df_s[criteria]['Frequency'])
    criteria = (df_s['Ins Fivehomopolymer'] == 'no') & (df_s['Ins Template Length'] < 6)
    frac_other = sum(df_s[criteria]['Frequency'])

    alldf_dict['Frac 5hm'].append(frac_fivehomopolymer)
    alldf_dict['Frac templated'].append(frac_templated)
    alldf_dict['Frac other'].append(frac_other)
    alldf_dict['Frac total'].append(sum(df_s['Frequency']))
    alldf_dict['Length'].append(length_label)
    alldf_dict['_Experiment'].append(exp)

  for ins_len in range(1, 15 + 1):
    record_fractions(df[df['Length'] == ins_len], ins_len)

  # Bucket all insertions longer than 15 bp under length label 16
  record_fractions(df[df['Length'] > 15], 16)

  return alldf_dict
def calc_statistics(df, exp, alldf_dict):
  # Calculate statistics on df, saving to alldf_dict
  # Deletion positions
  longdup_len = int(exp.split('_')[1])

  # Denominator is all CRISPR events
  df = _lib.crispr_subset(df)
  if sum(df['Count']) <= 500:
    return
  df['Frequency'] = _lib.normalize_frequency(df)

  criteria = (df['Category'] == 'del') & (df['Length'] == longdup_len)
  freq = sum(df[criteria]['Frequency'])

  alldf_dict['Length'].append(longdup_len)
  alldf_dict['Frequency'].append(freq)
  alldf_dict['_Experiment'].append(exp)
  return alldf_dict
def calc_statistics(df, exp, alldf_dict):
  # Calculate statistics on df, saving to alldf_dict
  # Deletion positions
  # Denominator is all dels
  df = df[df['Category'] == 'del']
  if sum(df['Count']) <= 1000:
    return
  df['Frequency'] = _lib.normalize_frequency(df)
  if sum(df['Frequency']) == 0:
    return

  for del_len in range(1, 80 + 1):
    cum_freq = sum(df[df['Length'] <= del_len]['Frequency'])
    alldf_dict['Cumulative Frequency'].append(cum_freq)
    alldf_dict['Length'].append(del_len)
    alldf_dict['_Experiment'].append(exp)

  return alldf_dict
def calc_statistics(df, exp, alldf_dict):
  # Calculate statistics on df, saving to alldf_dict
  # Deletion positions
  # Denominator is all non-noise categories
  df = _lib.notnoise_subset(df)
  df['Frequency'] = _lib.normalize_frequency(df)
  if sum(df['Frequency']) == 0:
    return

  # Consider only deletions, anywhere
  del_df = _lib.del_subset(df)

  # Left side
  for del_pos in range(-10, -1 + 1):
    total_freq = sum(del_df[del_df['Genotype Position'] == del_pos]['Frequency'])
    alldf_dict[str(del_pos)].append(total_freq)

  # Right side
  for del_pos in range(1, 10 + 1):
    criteria = (del_df['Genotype Position'] - del_df['Length'] == del_pos)
    total_freq = sum(del_df[criteria]['Frequency'])
    alldf_dict[str(del_pos)].append(total_freq)

  editing_rate = sum(_lib.crispr_subset(df)['Frequency']) / sum(df['Frequency'])
  alldf_dict['Editing Rate'].append(editing_rate)
  alldf_dict['_Experiment'].append(exp)

  # Test an alternative hypothesis: is the asymmetry actually meaningful?
  # If -1 arises from 0gt plus a sequencing mismatch, and +1 arises from Ngt
  # plus a sequencing mismatch, then we should see asymmetry in 0gt vs. Ngt.
  def detect_0gt_microhomology(row):
    # Returns the row's frequency if its microhomology includes position 0
    if row['Category'] != 'del':
      return 0
    cutsite = int(row['_Cutsite'])
    seq = row['_Sequence Context']
    gt_pos = int(row['Genotype Position'])
    del_len = int(row['Length'])

    left = seq[cutsite - del_len : cutsite]
    right = seq[cutsite : cutsite + del_len]
    if len(left) != len(right):
      return 0

    mhs = []
    mh = [0]
    for idx, (c1, c2) in enumerate(zip(left, right)):
      if c1 == c2:
        mh.append(idx + 1)
      else:
        mhs.append(mh)
        mh = [idx + 1]
    mhs.append(mh)

    for mh in mhs:
      if gt_pos in mh and 0 in mh:
        return row['Frequency']
    return 0

  freq_0gt = sum(del_df.apply(detect_0gt_microhomology, axis=1))
  alldf_dict['0gt Frequency'].append(freq_0gt)

  criteria = (del_df['Genotype Position'] - del_df['Length'] == 0)
  freq_Ngt = sum(del_df[criteria]['Frequency'])
  alldf_dict['Ngt Frequency'].append(freq_Ngt)
  return
def calc_statistics(orig_df, exp, rate_model, bp_model, alldf_dict, rs, data_nm):
  # Calculate statistics on df, saving to alldf_dict
  # Deletion positions
  df = _lib.mh_del_subset(orig_df)
  df = _lib.indels_without_mismatches_subset(df)
  if sum(df['Count']) <= 1000:
    return

  # Grab observed deletions, MH and MH-less
  obs_d = defaultdict(list)
  df = orig_df
  for del_len in range(1, 59 + 1):
    crit = (df['Category'] == 'del') & (df['Indel with Mismatches'] != 'yes') & (df['Length'] == del_len)
    s = df[crit]

    mh_s = s[s['Microhomology-Based'] == 'yes']
    for idx, row in mh_s.iterrows():
      obs_d['Count'].append(row['Count'])
      obs_d['Genotype Position'].append(row['Genotype Position'])
      obs_d['Length'].append(row['Length'])
      obs_d['Category'].append('del')

    # Collapse MH-less deletions of this length into a single 'e' row
    mhless_s = s[s['Microhomology-Based'] != 'yes']
    obs_d['Length'].append(del_len)
    obs_d['Count'].append(sum(mhless_s['Count']))
    obs_d['Genotype Position'].append('e')
    obs_d['Category'].append('del')

  obs_df = pd.DataFrame(obs_d)

  # Grab observed 1 bp insertions, totaled per inserted base
  ins_crit = (orig_df['Category'] == 'ins') & (orig_df['Length'] == 1) & (orig_df['Indel with Mismatches'] != 'yes')
  ins_df = orig_df[ins_crit]
  truncated_ins_d = defaultdict(list)
  for ins_base in list('ACGT'):
    crit = (ins_df['Inserted Bases'] == ins_base)
    tot_count = sum(ins_df[crit]['Count'])
    truncated_ins_d['Count'].append(tot_count)
    truncated_ins_d['Inserted Bases'].append(ins_base)
    truncated_ins_d['Category'].append('ins')
    truncated_ins_d['Length'].append(1)
  ins_df = pd.DataFrame(truncated_ins_d)

  obs_df = obs_df.append(ins_df, ignore_index=True)
  obs_df['Frequency'] = _lib.normalize_frequency(obs_df)

  crispr_subset = _lib.crispr_subset(orig_df)
  frac_explained = sum(obs_df['Count']) / sum(crispr_subset['Count'])

  # Save this for aggregate plotting
  alldf_dict['Fraction Explained'].append(frac_explained)

  # Predict MH dels and MH-less dels
  _predict2.init_model()
  seq, cutsite = _lib.get_sequence_cutsite(orig_df)
  pred_df = _predict2.predict_indels(seq, cutsite, rate_model, bp_model)

  # Merge observed and predicted, then check the correlation
  mdf = obs_df.merge(pred_df, how='outer', on=['Category', 'Genotype Position', 'Inserted Bases', 'Length'])
  mdf['Frequency'].fillna(value=0, inplace=True)
  mdf['Predicted_Frequency'].fillna(value=0, inplace=True)
  r = pearsonr(mdf['Frequency'], mdf['Predicted_Frequency'])[0]

  # Store per-experiment merged tables so they can be plotted later
  # (out_dir is assumed to be a module-level output directory)
  data_nm_out_dir = out_dir + data_nm + '/'
  util.ensure_dir_exists(data_nm_out_dir)
  exp_out_dir = data_nm_out_dir + exp + '/'
  util.ensure_dir_exists(exp_out_dir)
  out_fn = exp_out_dir + '%.3f.csv' % (r)
  mdf.to_csv(out_fn)

  # Store in alldf_dict
  alldf_dict['_Experiment'].append(exp)
  alldf_dict['rs'].append(rs)
  return alldf_dict
def calc_statistics(orig_df, exp, rate_model, bp_model, alldf_dict, rs):
  # Calculate statistics on df, saving to alldf_dict
  # Deletion positions
  df = _lib.mh_del_subset(orig_df)
  df = _lib.indels_without_mismatches_subset(df)
  if sum(df['Count']) <= 1000:
    return

  # Append observed 1 bp insertions, totaled per inserted base
  ins_crit = (orig_df['Category'] == 'ins') & (orig_df['Length'] == 1)
  ins_df = orig_df[ins_crit]
  truncated_ins_d = defaultdict(list)
  for ins_base in list('ACGT'):
    crit = (ins_df['Inserted Bases'] == ins_base)
    tot_count = sum(ins_df[crit]['Count'])
    truncated_ins_d['Count'].append(tot_count)
    truncated_ins_d['Inserted Bases'].append(ins_base)
    truncated_ins_d['Category'].append('ins')
    truncated_ins_d['Length'].append(1)
  ins_df = pd.DataFrame(truncated_ins_d)
  df = df.append(ins_df, ignore_index=True)
  df['Frequency'] = _lib.normalize_frequency(df)

  _predict2.init_model()
  seq, cutsite = _lib.get_sequence_cutsite(orig_df)
  pred_df = _predict2.predict_mhdel(seq, cutsite)

  # Predict rate of 1 bp insertions
  # Featurize first
  del_score = _predict2.total_deletion_score(seq, cutsite)
  dlpred = _predict2.deletion_length_distribution(seq, cutsite)
  norm_entropy = entropy(dlpred) / np.log(len(dlpred))
  ohmapper = {'A': [1, 0, 0, 0],
              'C': [0, 1, 0, 0],
              'G': [0, 0, 1, 0],
              'T': [0, 0, 0, 1]}
  fivebase = seq[cutsite - 1]
  onebp_features = ohmapper[fivebase] + [norm_entropy] + [del_score]
  onebp_features = np.array(onebp_features).reshape(1, -1)
  rate_1bpins = float(rate_model.predict(onebp_features))

  # Predict 1 bp genotype frequencies, scaled to the deletion odds
  pred_1bpins_d = defaultdict(list)
  for ins_base in bp_model[fivebase]:
    freq = bp_model[fivebase][ins_base]
    freq *= rate_1bpins / (1 - rate_1bpins)
    pred_1bpins_d['Category'].append('ins')
    pred_1bpins_d['Length'].append(1)
    pred_1bpins_d['Inserted Bases'].append(ins_base)
    pred_1bpins_d['Predicted_Frequency'].append(freq)
  pred_1bpins_df = pd.DataFrame(pred_1bpins_d)
  pred_df = pred_df.append(pred_1bpins_df, ignore_index=True)
  pred_df['Predicted_Frequency'] /= sum(pred_df['Predicted_Frequency'])

  join_cols = ['Category', 'Genotype Position', 'Length', 'Inserted Bases']
  mdf = df.merge(pred_df, how='outer', on=join_cols)
  mdf['Frequency'].fillna(value=0, inplace=True)
  mdf['Predicted_Frequency'].fillna(value=0, inplace=True)

  obs = mdf['Frequency']
  pred = mdf['Predicted_Frequency']
  r = pearsonr(obs, pred)[0]
  alldf_dict['gt_r'].append(r)

  obs_entropy = entropy(obs) / np.log(len(obs))
  pred_entropy = entropy(pred) / np.log(len(pred))
  alldf_dict['obs entropy'].append(obs_entropy)
  alldf_dict['pred entropy'].append(pred_entropy)

  alldf_dict['_Experiment'].append(exp)
  alldf_dict['rs'].append(rs)
  return alldf_dict
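# Sketch of the model objects the two functions above expect; these are
# assumptions inferred from usage, not the project's actual definitions.
# rate_model is any regressor exposing an sklearn-style .predict() over the
# 6-dim feature vector [one-hot fivebase (4), normalized deletion-length
# entropy, total deletion score]; bp_model maps the -1 base to a dict of
# inserted-base frequencies.
class ConstantRateModel(object):
  # Hypothetical stand-in that predicts a fixed 1 bp insertion rate.
  def __init__(self, rate=0.1):
    self.rate = rate

  def predict(self, X):
    return np.full(len(X), self.rate)

# Placeholder uniform frequencies, for shape only.
example_bp_model = {fivebase: {ins_base: 0.25 for ins_base in 'ACGT'}
                    for fivebase in 'ACGT'}
# Note the odds conversion above: predicted insertion frequencies are scaled
# by rate_1bpins / (1 - rate_1bpins) so they are on the same footing as the
# deletion predictions before the joint renormalization.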