Example #1
def remove_garbage(variants, garbage):
    garbage_variants = {}
    N = len(variants)
    for varid in common.extract_vids(variants):
        if varid in garbage:
            garbage_variants[varid] = variants[varid]
            del variants[varid]
    assert len(variants) == N - len(garbage)
    return garbage_variants
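A minimal usage sketch of remove_garbage. The variant-dict layout, the 's0'-style ids, and the stub for the project's common module are assumptions made for illustration; common.extract_vids is only assumed to return the ids of the variants dict.

# Stand-in for the project's `common` module (illustrative assumption only).
class common:
    @staticmethod
    def extract_vids(variants):
        return sorted(variants.keys())

variants = {'s0': {'id': 's0'}, 's1': {'id': 's1'}, 's2': {'id': 's2'}}
garbage = {'s1'}
removed = remove_garbage(variants, garbage)
# `variants` is mutated in place and now holds only 's0' and 's2';
# `removed` holds the entry that was filtered out.
assert set(variants) == {'s0', 's2'} and set(removed) == {'s1'}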
Example #2
def _check_clusters(variants, clusters, garbage):
  for C in clusters:
    assert len(C) > 0

  vids = common.extract_vids(variants)
  clustered = [child for C in clusters for child in C]
  garbage = set(garbage)
  clustered = set(clustered)
  assert len(clustered & garbage) == 0
  assert set(vids) == (clustered | garbage)
Example #3
def make_varids_contiguous(variants, garbage, clusters):
    mapping = {}
    for new_idx, old_varid in enumerate(common.extract_vids(variants)):
        mapping[old_varid] = 's%s' % new_idx

    new_variants = {mapping[V]: variants[V] for V in variants.keys()}
    for V in new_variants.keys():
        new_variants[V]['id'] = V

    new_clusters = [
        common.sort_vids([mapping[V] for V in C]) for C in clusters
    ]

    assert set(new_variants.keys()) == \
      set([V for C in new_clusters for V in C]) == \
      set([V['id'] for V in new_variants.values()])
    assert not np.any(np.array([len(C) for C in new_clusters]) == 0)

    return (new_variants, new_clusters)
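A small illustration of the renumbering under assumed inputs. The stubs below stand in for the project's common helpers (extract_vids and sort_vids are only assumed to return sorted id lists), and numpy must be importable for the assertion inside the function above.

import numpy as np

# Stand-ins for the project's `common` module (illustrative assumptions only).
class common:
    @staticmethod
    def extract_vids(variants):
        return sorted(variants.keys())
    @staticmethod
    def sort_vids(vids):
        return sorted(vids)

variants = {'s0': {'id': 's0'}, 's3': {'id': 's3'}, 's7': {'id': 's7'}}
clusters = [['s0'], ['s3', 's7']]
new_variants, new_clusters = make_varids_contiguous(variants, [], clusters)
# Ids become contiguous: 's0' -> 's0', 's3' -> 's1', 's7' -> 's2'.
assert sorted(new_variants) == ['s0', 's1', 's2']
assert new_clusters == [['s0'], ['s1', 's2']]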
Example #4
def main():
    parser = argparse.ArgumentParser(
        description='LOL HI THERE',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('ssm_fn')
    parser.add_argument('mutphi_fn')
    args = parser.parse_args()

    variants = inputparser.load_ssms(args.ssm_fn)
    vids = common.extract_vids(variants)
    var_reads = np.array([variants[V]['var_reads'] for V in vids])
    total_reads = np.array([variants[V]['total_reads'] for V in vids])
    omega_v = np.array([variants[V]['omega_v'] for V in vids])

    mle_phi = (1 / omega_v) * (var_reads / total_reads)
    assert np.all(0 <= mle_phi)
    mle_phi = np.minimum(1, mle_phi)

    clusters = [[V] for V in vids]
    llhs = [0]
    counts = [1]
    mphi = mutphi.calc_mutphi([mle_phi], llhs, [clusters], args.ssm_fn, counts)
    mutstat.write(mphi, args.mutphi_fn)
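A quick numeric check of the MLE computed above, on toy read counts (illustrative assumptions, not values from the source): with 20 variant reads out of 100 total and omega_v = 0.5, the estimate is (1 / 0.5) * (20 / 100) = 0.4, and anything above 1 would be clipped by np.minimum.

import numpy as np

# Toy values chosen for illustration only.
var_reads, total_reads, omega_v = np.array([20.]), np.array([100.]), np.array([0.5])
mle_phi = (1 / omega_v) * (var_reads / total_reads)  # -> array([0.4])
mle_phi = np.minimum(1, mle_phi)                     # estimates above 1 get clipped to 1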
Example #5
def _fit_phis(adj, superclusters, supervars, method, iterations, parallel):
    # Calling `import` on each function call should be cheap, as Python caches a
    # reference to the module after the first load.
    if method in ('graddesc_old', 'rprop_old'):
        import phi_fitter_iterative
        eta = phi_fitter_iterative.fit_etas(adj, superclusters, supervars,
                                            method[:-4], iterations, parallel)

    elif method == 'rprop':
        import phi_fitter_lol
        eta = phi_fitter_lol.fit_etas(adj,
                                      superclusters,
                                      supervars,
                                      'rprop',
                                      iterations,
                                      parallel,
                                      eta_init='mle')

    elif method == 'projection':
        import phi_fitter_projection
        eta = phi_fitter_projection.fit_etas(adj, superclusters, supervars)

    elif method == 'proj_rprop':
        import phi_fitter_projection
        import phi_fitter_lol
        eta_proj = phi_fitter_projection.fit_etas(adj, superclusters,
                                                  supervars)
        eta = phi_fitter_lol.fit_etas(adj,
                                      superclusters,
                                      supervars,
                                      'rprop',
                                      iterations,
                                      parallel,
                                      eta_init=eta_proj)

    elif method == 'debug':
        import phi_fitter_iterative
        import phi_fitter_projection
        import phi_fitter_lol
        import time
        fitters = {
            #'rprop_init_mle': lambda: phi_fitter_iterative.fit_etas(adj, superclusters, supervars, 'rprop', iterations, parallel, eta_init=None),
            'lol_init_mle':
            lambda: phi_fitter_lol.fit_etas(adj,
                                            superclusters,
                                            supervars,
                                            'rprop',
                                            iterations,
                                            parallel,
                                            eta_init='mle'),
            'lol_init_dirichlet':
            lambda: phi_fitter_lol.fit_etas(adj,
                                            superclusters,
                                            supervars,
                                            'rprop',
                                            iterations,
                                            parallel,
                                            eta_init='dirichlet'),
            'projection':
            lambda: phi_fitter_projection.fit_etas(adj, superclusters,
                                                   supervars),
        }
        #fitters['lol_init_proj'] = lambda: phi_fitter_lol.fit_etas(adj, superclusters, supervars, 'rprop', iterations, parallel, eta_init=fitters['projection']())
        #fitters['lol_init_prev'] = lambda: phi_fitter_lol.fit_etas(adj, superclusters, supervars, 'rprop', iterations, parallel, eta_init=last_eta[0])

        Z = util.make_ancestral_from_adj(adj)
        svids = common.extract_vids(supervars)
        total_reads = np.array(
            [supervars[svid]['total_reads'] for svid in svids])
        var_reads = np.array([supervars[svid]['var_reads'] for svid in svids])
        omega = np.array([supervars[svid]['omega_v'] for svid in svids])

        etas = {}
        scores = {}
        times = {}
        zeros = {}
        l1_dists = {}
        l2_dists = {}
        for name, F in fitters.items():
            time_start = time.perf_counter_ns()
            etas[name] = F()
            time_end = time.perf_counter_ns()
            phi = np.dot(Z, etas[name])
            scores[name] = _calc_llh(phi, var_reads, total_reads, omega)
            times[name] = (time_end - time_start) / 1e6
            zeros[name] = np.sum(phi == 0)
            l1_dists[name] = util.lpdist(var_reads / (total_reads * omega),
                                         phi[1:],
                                         p=1)
            l2_dists[name] = util.lpdist(var_reads / (total_reads * omega),
                                         phi[1:],
                                         p=2)

        eta = etas['lol_init_mle']
        last_eta[0] = np.copy(eta)

        names = sorted(etas.keys())
        sep = '\t'
        if True and not hasattr(_fit_phis, 'printed_header'):
            print(*names, sep=sep)
            _fit_phis.printed_header = True
        print(*['%.3f' % scores[name][2] for name in names],
              np.nan,
              *['%.3f' % times[name] for name in names],
              np.nan,
              *[zeros[name] for name in names],
              np.nan,
              *['%.3f' % l1_dists[name] for name in names],
              np.nan,
              *['%.3f' % l2_dists[name] for name in names],
              sep=sep,
              flush=True)

    else:
        raise Exception('Unknown phi fitter %s' % method)

    assert np.allclose(1, np.sum(eta, axis=0))
    Z = util.make_ancestral_from_adj(adj)
    phi = np.dot(Z, eta)
    return (phi, eta)
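The last two lines convert eta (one fraction per population) into phi. Assuming util.make_ancestral_from_adj returns a matrix with Z[i, j] = 1 exactly when node i is node j or one of its ancestors, the dot product sums each population's eta over its own subtree. A minimal sketch of that relationship under this assumption:

import numpy as np

# Toy three-node chain 0 -> 1 -> 2 (an assumed tree, for illustration only).
Z = np.array([[1, 1, 1],   # node 0 (root) covers every population
              [0, 1, 1],   # node 1 covers itself and node 2
              [0, 0, 1]])  # node 2 covers only itself
eta = np.array([0.5, 0.3, 0.2])  # population fractions, summing to 1
phi = np.dot(Z, eta)             # -> array([1. , 0.5, 0.2])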
Example #6
def make_superclusters(supervars):
  N = len(supervars)
  svids = common.extract_vids(supervars)
  superclusters = [[S] for S in svids]
  return superclusters