Exemplo n.º 1
def make_mutrel_from_trees_and_unique_clusterings(structs, llhs, clusterings):
    '''
    Build a weighted-average mutation-relation object from sampled trees,
    where every tree comes with its own clustering.

    Relative to `make_mutrel_from_trees_and_single_clustering`, this function is
    slower and more memory intensive, but also more flexible. It differs in two
    respects:

    1. It doesn't assume that the user has already computed counts for all unique
    samples -- i.e., it allows duplicate samples.

    2. It allows unique clusterings for every sample.

    Args:
        structs: sequence of tree structures, each accepted by
            `util.convert_parents_to_adjmatrix`.
        llhs: log-likelihoods, one per tree; converted to weights with
            `util.softmax`.
        clusterings: one clustering per tree, passed to
            `make_mutrel_from_cluster_adj` together with that tree's
            adjacency matrix.

    Returns:
        A `mutrel.Mutrel` whose `rels` is the weighted sum of the per-tree
        `rels` arrays and whose `vids` are shared by all per-tree results.
    '''
    assert len(structs) == len(llhs) == len(clusterings)
    weights = util.softmax(llhs)
    vids = None

    for struct, clustering, weight in zip(structs, clusterings, weights):
        adjm = util.convert_parents_to_adjmatrix(struct)
        mrel = make_mutrel_from_cluster_adj(adjm, clustering)
        if vids is None:
            # First sample: adopt its variant IDs and allocate the accumulator.
            vids = mrel.vids
            soft_mutrel = np.zeros(mrel.rels.shape)
        else:
            # Every later sample must describe the same variants in the same
            # order, or the element-wise accumulation would be meaningless.
            assert mrel.vids == vids
        soft_mutrel += weight * mrel.rels

    soft_mutrel = fix_rounding_errors(soft_mutrel)
    # NOTE(review): the constructor call was truncated in the extracted source;
    # the keyword form mirrors the `mutrel.Mutrel(rels=..., vids=...)` call in
    # `make_mutrel_from_trees_and_single_clustering` -- confirm.
    return mutrel.Mutrel(rels=soft_mutrel, vids=vids)
Exemplo n.º 2
def make_mutrel_from_trees_and_single_clustering(structs, llhs, counts, clustering):
    '''
    Build a weighted-average mutation-relation object from unique sampled
    trees that all share a single clustering.

    Args:
        structs: sequence of tree structures, each accepted by
            `util.convert_parents_to_adjmatrix`.
        llhs: log-likelihoods, one per structure.
        counts: number of times each structure was sampled; combined with
            `llhs` as `llhs + np.log(counts)` before the softmax.
        clustering: the clustering shared by every tree, used to expand the
            averaged cluster-level relations to mutation-level relations.

    Returns:
        The result of `make_mutrel_from_clustrel` applied to the averaged
        cluster relations and `clustering`.
    '''
    # Oftentimes, we will have many samples of the same adjacency matrix paired
    # with the same clustering. This will produce the same mutrel. As computing
    # the mutrel from adjm + clustering is expensive, we want to avoid repeating
    # this unnecessarily. Instead, we just modify the associated weight of the
    # pairing to reflect this.
    #
    # Observe that if we have `C` copies of the LLH `W`, we obtain
    # equivalent post-softmax linear-space weights under either of the following
    # two methods:
    #
    # 1. (naive) Represent the associated samples `C` separate times in the softmax
    # 2. (smart) Set `W' = W + log(C)`, as `exp(W') = Cexp(W)`
    weights = util.softmax(llhs + np.log(counts))
    vids = None

    for struct, weight in zip(structs, weights):
        adjm = util.convert_parents_to_adjmatrix(struct)
        crel = make_clustrel_from_cluster_adj(adjm)

        if vids is None:
            # First tree: adopt its IDs and allocate the accumulator.
            vids = crel.vids
            soft_clustrel = np.zeros(crel.rels.shape)
        else:
            # All trees must index clusters identically for the sum to be valid.
            assert crel.vids == vids
        soft_clustrel += weight * crel.rels

    soft_clustrel = fix_rounding_errors(soft_clustrel)
    clustrel = mutrel.Mutrel(rels=soft_clustrel, vids=vids)
    mrel = make_mutrel_from_clustrel(clustrel, clustering)
    return mrel
Exemplo n.º 3
def _compute_cna_influence(struct, cna_events, ssm_segs, ssm_pops, ssm_phases, ssm_timing):
  '''
  Build the `MxC` SSM-by-CNA influence matrix.

  `infl[i,j] = 1` iff SSM `i` is influenced by CNA `j`. That is, SSM `i`
  occurred in the same phase on the same segment as CNA `j` in a population
  ancestral to where `j` occurred, or `i` occurred in the same phase on the
  same segment as `j` in the same population with timing such that `i` was
  before (not after) `j`.

  Args:
    struct: tree structure accepted by `util.convert_parents_to_adjmatrix`.
    cna_events: sequence of events exposing `.pop`, `.seg` and `.phase`.
    ssm_segs, ssm_pops, ssm_phases, ssm_timing: equal-length arrays giving,
      for each SSM, its segment, population, phase and timing flag. They are
      compared element-wise against scalars, so they are presumably NumPy
      arrays -- confirm against the caller.

  Returns:
    An `(M, C)` array of dtype `int8` holding 0/1 flags.
  '''
  assert len(ssm_segs) == len(ssm_pops) == len(ssm_phases) == len(ssm_timing)
  M = len(ssm_segs)
  C = len(cna_events)

  infl = np.zeros((M, C), dtype=np.int8)
  adjm = util.convert_parents_to_adjmatrix(struct)
  anc = util.make_ancestral_from_adj(adjm)
  # Drop self-ancestry so a population never counts as its own ancestor.
  np.fill_diagonal(anc, 0)

  for cna_idx, event in enumerate(cna_events):
    # Elsewhere in this code base `anc[i,j] = 1` iff node `i` is ancestral to
    # node `j` (row 0 sums to the number of nodes), so the ancestors of
    # `event.pop` are found in its *column*. Reading the row instead would
    # select descendants, contradicting the contract documented above.
    anc_pops = np.flatnonzero(anc[:, event.pop])
    assert event.pop not in anc_pops

    same_seg_and_phase = np.logical_and(
      ssm_segs == event.seg,
      ssm_phases == event.phase,
    )
    ancestral_ssm_mask = np.logical_and(
      np.isin(ssm_pops, anc_pops),
      same_seg_and_phase,
    )
    # SSMs in the CNA's own population are influenced only if they sit on the
    # same segment and phase as the CNA *and* preceded it.
    before_cna_ssm_mask = np.logical_and.reduce((
      ssm_pops == event.pop,
      same_seg_and_phase,
      ssm_timing == TIMING_BEFORE,
    ))
    ssm_mask = np.logical_or(ancestral_ssm_mask, before_cna_ssm_mask)
    infl[ssm_mask, cna_idx] = 1

  return infl
Exemplo n.º 4
def calc_cadi(eta, struct):
    '''
    Compute the clone and ancestor diversity index (CADI), which is the joint
    entropy of eta and the subclones ancestral to a clone.

    Args:
        eta: `K x S` array (`K` nodes, `S` samples). Row 0 is presumably the
            root, as it is excluded from the per-node ancestor counts.
        struct: tree structure accepted by `util.convert_parents_to_adjmatrix`.

    Returns:
        A length-`S` array with one CADI value per sample.

    >>> eta = np.array([[0.5], [0.2], [0.2], [0.1]])
    >>> struct = [0, 1, 1]
    >>> cadi = calc_cadi(eta, struct)
    >>> bool(np.isclose(cadi[0], 2.1219280948873624))
    True
    '''
    K, S = eta.shape

    adj = util.convert_parents_to_adjmatrix(struct)
    anc = util.make_ancestral_from_adj(adj, check_validity=True)
    assert anc.shape == (K, K)
    # Column sums of the ancestry matrix, minus one to discount each node's
    # own diagonal entry, leave the number of proper ancestors per node.
    A = np.sum(anc, axis=0) - 1
    # Drop the first node and replicate the counts across all `S` samples.
    A = np.repeat(A[1:][:, np.newaxis], S, axis=1)
    assert np.all(A >= 1)

    # `_fix_eta` must return an array with one row fewer than its input for
    # the shape check below to hold; see that helper for details.
    eta = _fix_eta(eta)
    assert A.shape == eta.shape

    H_joint = -ma.sum(eta * (ma.log2(eta) - np.log2(A)), axis=0)
    assert H_joint.shape == (S, )
    return H_joint
Exemplo n.º 5
def calc_cmdi(eta, clusters, struct):
    '''Compute the clone and mutation diversity index (CMDI).

    The CMDI is the joint entropy of eta and the mutations present in a clone
    (i.e., the mutations specific to it as well as the mutations inherited
    from its ancestors). One value is returned per sample (column of `eta`).
    '''
    num_nodes, num_samples = eta.shape

    ancestry = util.make_ancestral_from_adj(
        util.convert_parents_to_adjmatrix(struct),
        check_validity=True,
    )
    assert ancestry.shape == (num_nodes, num_nodes)

    vids, membership = util.make_membership_mat(clusters)
    num_muts = len(vids)
    # Prepend an all-zero column for the root node, which has no associated
    # mutations.
    membership = np.insert(membership, 0, 0, axis=1)
    assert membership.shape == (num_muts, num_nodes)
    assert np.sum(membership) == num_muts

    # `inherited[i,j] = 1` iff mutation `i` occurred in node `j` or in a node
    # ancestral to it.
    inherited = np.dot(membership, ancestry)
    # Per clone: how many mutations occurred in it or in any of its ancestors.
    muts_per_clone = np.sum(inherited, axis=0)
    assert muts_per_clone[0] == 0 and np.all(muts_per_clone[1:] > 0)

    # Skip the root and replicate the counts across every sample.
    non_root_counts = muts_per_clone[1:][:, np.newaxis]
    M_k = np.repeat(non_root_counts, num_samples, axis=1)
    eta = _fix_eta(eta)
    assert eta.shape == M_k.shape

    log_ratio = ma.log2(eta) - np.log2(M_k)
    joint_entropy = -ma.sum(eta * log_ratio, axis=0)
    assert joint_entropy.shape == (num_samples, )
    return joint_entropy
Exemplo n.º 6
def _calc_num_pops(parents):
    """Calculate the number of populations in each subclone.

    Args:
        parents: tree structure accepted by `util.convert_parents_to_adjmatrix`.

    Returns:
        Integer array with one entry per node except the first, holding the
        row sums of the ancestry matrix for those nodes.
    """
    adj = util.convert_parents_to_adjmatrix(parents)
    K = len(adj)
    assert adj.shape == (K, K)
    anc = util.make_ancestral_from_adj(adj)
    C = np.sum(anc, axis=1)
    # The first node's row must cover every node in the tree.
    assert C[0] == K
    # `np.int` was only an alias for the builtin `int`; it was deprecated in
    # NumPy 1.20 and removed in 1.24, so use the builtin directly.
    return C[1:].astype(int)
Exemplo n.º 7
def compute_parent_dist(structs, weights):
    """Return the weighted average of the trees' adjacency matrices.

    Each structure is converted to an adjacency matrix, its diagonal is
    zeroed, and the matrices are summed using `weights` (which must be
    non-negative and sum to one). The first column of the sum must be all
    zero and is dropped; every remaining column must sum to one.
    """
    num_nodes = len(structs[0]) + 1
    accum = np.zeros((num_nodes, num_nodes))
    assert np.all(weights >= 0) and np.isclose(1, np.sum(weights))

    for parents, w in zip(structs, weights):
        adj = util.convert_parents_to_adjmatrix(parents)
        # Remove self-edges before accumulating.
        np.fill_diagonal(adj, 0)
        accum += w * adj

    # Nothing may point at the first node, so its column is empty and is
    # discarded.
    first_col = accum[:, 0]
    assert np.all(0 == first_col)
    dist = evalutil.fix_rounding_errors(accum[:, 1:])

    assert np.all(0 <= dist) and np.all(dist <= 1)
    col_totals = np.sum(dist, axis=0)
    assert np.allclose(1, col_totals)
    return dist
Exemplo n.º 8
def main():
    """Load a params file and pickle its tree structure and variant IDs.

    Reads the parameters named by the `params_fn` argument, converts
    `params['structure']` to an adjacency matrix, and writes a dict with the
    keys `adjm`, `clusters`, `vids_good` and `vids_garbage` to the file named
    by the `pickle_fn` argument.
    """
    # Imported locally so this block does not depend on a module-level import
    # that is not visible from here.
    import pickle

    parser = argparse.ArgumentParser(
        description='LOL HI THERE',
    )
    # NOTE(review): the argument definitions were missing from the extracted
    # source; both names are taken from their use below, and the positional
    # order (input first, output second) is an assumption -- confirm.
    parser.add_argument('params_fn')
    parser.add_argument('pickle_fn')
    args = parser.parse_args()

    params = inputparser.load_params(args.params_fn)
    adjm = util.convert_parents_to_adjmatrix(params['structure'])
    # Assemble the payload before opening the output so that a missing key
    # does not leave behind an empty or truncated file.
    payload = {
        'adjm': adjm,
        'clusters': params['clusters'],
        # Flatten the per-cluster ID lists into a single list.
        'vids_good': [V for C in params['clusters'] for V in C],
        'vids_garbage': params['garbage'],
    }
    with open(args.pickle_fn, 'wb') as outf:
        pickle.dump(payload, outf)
Exemplo n.º 9
def generate_tree(K, S, alpha, tree_type, eta_min=1e-30):
  '''
  Sample a tree together with per-node `eta` and `phi` values for `S` samples.

  Returns the tuple `(parents, phi, eta)`, where `parents` comes from
  `make_parents(K, tree_type)` and both `phi` and `eta` have shape
  `(K+1)xS`.
  '''
  parents = make_parents(K, tree_type)
  ancestry = util.make_ancestral_from_adj(
    util.convert_parents_to_adjmatrix(parents)
  ) # (K+1)x(K+1)

  # One symmetric Dirichlet draw per sample, transposed to (K+1)xS.
  raw_eta = np.random.dirichlet(alpha=[alpha] * (K + 1), size=S).T

  # (A disabled variant of this routine made etas on leaves more "peaked" by
  # adding a second Dirichlet draw restricted to the leaves; it is not
  # applied here.)

  # Given the true phis, we want enumeration to be able to recover the true
  # tree (as well as other trees, potentially). That requires a well-defined
  # ordering based on phis, so `eta = 0` exactly must be avoided. Without
  # this floor, especially with only one sample, two populations can end up
  # with exactly the same phi, making their ordering arbitrary.
  floored_eta = np.maximum(eta_min, raw_eta)

  # Renormalise so that every sample's etas sum to one again.
  eta = floored_eta / np.sum(floored_eta, axis=0)
  phi = np.dot(ancestry, eta) # (K+1)xS
  root_phi = phi[0]
  assert np.allclose(1, root_phi)
  return (parents, phi, eta)
Exemplo n.º 10
def _make_noderels(struct):
  '''Return the node relations of the tree described by `struct`.'''
  return util.compute_node_relations(
    util.convert_parents_to_adjmatrix(struct)
  )
Exemplo n.º 11
def _generate_cna_events(K, H, C, ploidy, struct):
  '''
  Randomly generate `C` distinct CNA events on the tree described by `struct`.

  Each event is a `Cna(pop, seg, phase, delta)`. Candidates are drawn by
  rejection sampling and discarded when they
    * repeat an already-used (segment, phase, population) triplet, or
    * are a deletion on a (segment, phase) that already has a deletion in a
      population on the same branch (an allele can only be lost once).
  All events on a given (segment, phase) share one direction (gain or loss).

  Args:
    K: number of non-root populations (`len(struct)`).
    H: number of segments.
    C: number of events to generate.
    ploidy: number of phases to choose from.
    struct: tree structure accepted by `util.convert_parents_to_adjmatrix`.

  Returns:
    List of `C` `Cna` events.

  Raises:
    TooManyAttemptsError: if more than `5000*C` candidates were drawn without
      reaching `C` accepted events.
  '''
  assert len(struct) == K
  adjm = util.convert_parents_to_adjmatrix(struct)
  anc = util.make_ancestral_from_adj(adjm)

  cn_seg_probs = np.random.dirichlet(alpha = H*[5])
  cn_phase_probs = np.random.dirichlet(alpha = ploidy*[5])
  cn_pop_probs = np.random.dirichlet(alpha = K*[5])
  # Directions: 0=deletion, 1=gain
  direction_probs = np.random.dirichlet(alpha = 2*[5])
  lam = 1.5

  attempts = 0
  max_attempts = 5000*C

  events = []
  # (seg, phase, pop) combinations already used by an accepted event.
  triplets = set()
  # (seg, phase) -> direction shared by every event on that allele.
  directions = {}
  # (seg, phase) -> populations in which that allele has been deleted.
  deletions = {}

  while len(events) < C:
    attempts += 1
    if attempts > max_attempts:
      raise TooManyAttemptsError('Could not generate configuration without duplicates in %s attempts' % max_attempts)

    cn_seg = np.random.choice(H, p=cn_seg_probs)
    cn_phase = np.random.choice(ploidy, p=cn_phase_probs)
    # Add one so that no CNAs are assigned to the root.
    cn_pop = np.random.choice(K, p=cn_pop_probs) + 1
    triplet = (cn_seg, cn_phase, cn_pop)
    doublet = (cn_seg, cn_phase)

    if triplet in triplets:
      # Duplicate event: reject and draw again.
      continue

    if doublet in directions:
      direction = directions[doublet]
    else:
      direction = np.random.choice(2, p=direction_probs)

    if direction == DIRECTION_GAIN:
      # `np.int` was removed in NumPy 1.24; it was an alias of builtin `int`.
      delta = np.ceil(np.random.exponential(scale=1/lam)).astype(int)
      assert delta >= 1
    else:
      # We only ever have one allele to lose, so can never lose more than one.
      delta = -1
      if doublet in deletions:
        # Row and column of `anc` together give every population on the same
        # root-to-leaf branch as `cn_pop`.
        same_branch_nodes = set(np.flatnonzero(anc[cn_pop])) | set(np.flatnonzero(anc[:,cn_pop]))
        same_branch_deletions = deletions[doublet] & same_branch_nodes
        if len(same_branch_deletions) > 0:
          # The allele is already lost somewhere on this branch: reject.
          continue
      else:
        deletions[doublet] = set()
      # Remember the deletion so later candidates on this branch are rejected.
      deletions[doublet].add(cn_pop)

    if doublet not in directions:
      directions[doublet] = direction
    # Record the accepted triplet so the duplicate check above can fire.
    triplets.add(triplet)
    events.append(Cna(cn_pop, cn_seg, cn_phase, delta))

  return events