def tree_threeway_counts(tree, lca_depths, alphabet=DnaPairs, attr='Sequence'):
    """Build Counts objects for every three-node combination from the tree.

    tree: object providing traverse() and outgroupLast(a, b, c). Each node
        from traverse() must carry an Id and the attribute named by attr
        (i.e. sequences must already be assigned).
    lca_depths: accepted for interface compatibility; this function does
        not read it.
    alphabet: pair alphabet handed through to Counts.fromTriple.
    attr: name of the node attribute holding the sequence.

    Returns a dict keyed by (Id, Id, Id). For each combination two entries
    are stored, (i, j, k) and (j, i, k), where k is the node that
    tree.outgroupLast puts in last position (the outgroup; per the original
    notes an arbitrary one is picked when there is a polytomy).
    """
    place_outgroup_last = tree.outgroupLast
    nodes = list(tree.traverse())
    counts_by_triple = {}
    for node_a, node_b, node_c in three_item_combos(nodes):
        in_a, in_b, out = place_outgroup_last(node_a, node_b, node_c)
        # pull the sequences in ingroup, ingroup, outgroup order
        seq_a, seq_b, seq_out = [getattr(node, attr) for node in (in_a, in_b, out)]
        forward = Counts.fromTriple(seq_a, seq_b, seq_out, alphabet)
        counts_by_triple[(in_a.Id, in_b.Id, out.Id)] = forward
        # the two ingroup members are also counted in swapped order
        swapped = Counts.fromTriple(seq_b, seq_a, seq_out, alphabet)
        counts_by_triple[(in_b.Id, in_a.Id, out.Id)] = swapped
    return counts_by_triple
def tree_threeway_counts_sample(tree, lca_depths, alphabet=DnaPairs,
    attr='Sequence', n=1000, check_rates=True, clean_f=None):
    """Like tree_threeway_counts, but takes random sample (w/o replacement).

    tree: object providing traverse() and outgroupLast(a, b, c); nodes must
        carry an Id and the attribute named by attr.
    lca_depths: only forwarded to tree_threeway_counts.
    alphabet: pair alphabet handed through to Counts.fromTriple.
    attr: name of the node attribute holding the sequence.
    n: number of triples to collect.
    check_rates: if true, a triple is kept only if it passes the zero-row
        test and its counts convert to probs and then rates without error.
    clean_f: optional callable applied to each Counts object before it is
        checked and stored.

    Returns a dict keyed by (Id, Id, Id) with the outgroup last. If fewer
    than n triples exist, the complete result of tree_threeway_counts is
    returned instead (passed through clean_f when given, without any rate
    checking).

    Assumes node Ids are unique (they are used as the dict keys).

    NOTE(review): this function is defined twice in a row in this file; the
    later definition replaces this one at import time.
    """
    leaves = list(tree.traverse())
    num_leaves = len(leaves)
    #do normal threeway counts if number of triples < n
    num_triples = num_leaves * (num_leaves - 1) * (num_leaves - 2) / 3
    if num_triples < n:
        counts = tree_threeway_counts(tree, lca_depths, alphabet, attr)
        if not clean_f:
            return counts
        result = {}
        for k, v in counts.items():
            result[k] = clean_f(v)
        return result
    #if we got here, need to sample
    outgroup_last = tree.outgroupLast
    i = 0
    seen = set()    #ordered Id triples that have already been drawn
    result = {}
    while i < n and len(seen) < num_triples:
        #bail out if same node picked twice, or if resampling same combo
        curr = choice(leaves), choice(leaves), choice(leaves)
        ids = tuple([c.Id for c in curr])
        if len(set(ids)) < len(curr):  #picked same thing twice
            continue
        if ids in seen:
            continue
        #record the draw before any further checks, so a rejected triple is
        #never retried and the len(seen) bound always ends the loop
        seen.add(ids)
        first, second, third = curr
        new_first, new_second, new_third = outgroup_last(first, second, third)
        key = (new_first.Id, new_second.Id, new_third.Id)
        #a different ordering of the same nodes can give an existing key;
        #don't let it use up another slot in the sample
        if key in result:
            continue
        seq_1 = getattr(new_first, attr)
        seq_2 = getattr(new_second, attr)
        seq_3 = getattr(new_third, attr)
        counts = Counts.fromTriple(seq_1, seq_2, seq_3, alphabet)
        if clean_f:
            counts = clean_f(counts)
        #check rates if we need to
        if check_rates:
            try:
                #skip probs with zero rows
                #NOTE(review): relies on whatever min/max are in scope in
                #this module -- confirm they are the array versions
                if not min(max(counts._data, 1)):
                    continue
                #called only to find out whether the conversion raises
                counts.toProbs().toRates()
            except (ZeroDivisionError, OverflowError, ValueError,
                    FloatingPointError):
                continue
        result[key] = counts
        i += 1
    return result
def tree_threeway_counts_sample(tree, lca_depths, alphabet=DnaPairs,
    attr='Sequence', n=1000, check_rates=True, clean_f=None):
    """Like tree_threeway_counts, but takes random sample (w/o replacement).

    tree: object providing traverse() and outgroupLast(a, b, c); nodes must
        carry an Id and the attribute named by attr.
    lca_depths: only forwarded to tree_threeway_counts.
    alphabet: pair alphabet handed through to Counts.fromTriple.
    attr: name of the node attribute holding the sequence.
    n: number of triples to collect.
    check_rates: if true, a triple is kept only if it passes the zero-row
        test and its counts convert to probs and then rates without error.
    clean_f: optional callable applied to each Counts object before it is
        checked and stored.

    Returns a dict keyed by (Id, Id, Id) with the outgroup last. If fewer
    than n triples exist, the complete result of tree_threeway_counts is
    returned instead (passed through clean_f when given, without any rate
    checking).

    Assumes node Ids are unique (they are used as the dict keys).

    NOTE(review): an identical definition of this function precedes this
    one in the file; this later definition is the one callers get.
    """
    leaves = list(tree.traverse())
    num_leaves = len(leaves)
    #do normal threeway counts if number of triples < n
    num_triples = num_leaves * (num_leaves - 1) * (num_leaves - 2) / 3
    if num_triples < n:
        counts = tree_threeway_counts(tree, lca_depths, alphabet, attr)
        if not clean_f:
            return counts
        result = {}
        for k, v in counts.items():
            result[k] = clean_f(v)
        return result
    #if we got here, need to sample
    outgroup_last = tree.outgroupLast
    i = 0
    seen = set()    #ordered Id triples that have already been drawn
    result = {}
    while i < n and len(seen) < num_triples:
        #bail out if same node picked twice, or if resampling same combo
        curr = choice(leaves), choice(leaves), choice(leaves)
        ids = tuple([c.Id for c in curr])
        if len(set(ids)) < len(curr):  #picked same thing twice
            continue
        if ids in seen:
            continue
        #record the draw before any further checks, so a rejected triple is
        #never retried and the len(seen) bound always ends the loop
        seen.add(ids)
        first, second, third = curr
        new_first, new_second, new_third = outgroup_last(first, second, third)
        key = (new_first.Id, new_second.Id, new_third.Id)
        #a different ordering of the same nodes can give an existing key;
        #don't let it use up another slot in the sample
        if key in result:
            continue
        seq_1 = getattr(new_first, attr)
        seq_2 = getattr(new_second, attr)
        seq_3 = getattr(new_third, attr)
        counts = Counts.fromTriple(seq_1, seq_2, seq_3, alphabet)
        if clean_f:
            counts = clean_f(counts)
        #check rates if we need to
        if check_rates:
            try:
                #skip probs with zero rows
                #NOTE(review): relies on whatever min/max are in scope in
                #this module -- confirm they are the array versions
                if not min(max(counts._data, 1)):
                    continue
                #called only to find out whether the conversion raises
                counts.toProbs().toRates()
            except (ZeroDivisionError, OverflowError, ValueError,
                    FloatingPointError):
                continue
        result[key] = counts
        i += 1
    return result
def tree_twoway_counts(tree, alphabet=DnaPairs, average=True, attr='Sequence'):
    """Build a dict of pairwise Counts objects from the nodes of a tree.

    tree: object providing traverse(); each node must carry an Id and the
        attribute named by attr.
    alphabet: pair alphabet handed through to Counts.fromPair.
    average: if true, one entry is stored per pair, under (i, j) only, and
        Counts.fromPair is called without its fourth argument. Otherwise
        both (i, j) and (j, i) are stored, each built with False passed as
        the fourth argument.
    attr: name of the node attribute holding the sequence.

    Returns a dict keyed by (Id, Id).
    """
    nodes = list(tree.traverse())
    # decide once, up front, which flavour of counts is wanted
    one_entry_per_pair = True if average else False
    pair_counts = {}
    for node_a, node_b in two_item_combos(nodes):
        seq_a = getattr(node_a, attr)
        seq_b = getattr(node_b, attr)
        if one_entry_per_pair:
            # a single entry per pair, so m[i,j] is filled but not m[j,i]
            merged = Counts.fromPair(seq_a, seq_b, alphabet)
            pair_counts[(node_a.Id, node_b.Id)] = merged
        else:
            forward = Counts.fromPair(seq_a, seq_b, alphabet, False)
            pair_counts[(node_a.Id, node_b.Id)] = forward
            backward = Counts.fromPair(seq_b, seq_a, alphabet, False)
            pair_counts[(node_b.Id, node_a.Id)] = backward
    return pair_counts
def dna_count_cleaner(counts):
    """Return a new Counts over DnaPairs holding the top-left 4x4 of counts.

    Only counts._data[:4, :4] is kept; everything outside that corner is
    dropped. Presumably the first four rows and columns are the four
    standard bases -- confirm against the alphabet ordering.
    """
    four_by_four = counts._data[:4, :4]
    return Counts(four_by_four, DnaPairs)