def grow_tree_recover(m, n, k, proba_bounds, verbose=True):
    """Generate a random discrete tree, simulate data, and try to recover it.

    Parameters
    ----------
    m : int -- number of leaves in the generated tree
    n : int -- number of observed samples (sites) per leaf
    k : int -- number of discrete character states
    proba_bounds : tuple -- (low, high) bounds forwarded to random_discrete_tree
    verbose : bool -- if True, draw both the reference and inferred trees as ASCII

    Returns
    -------
    (scores, correct) where `scores` is whatever NoahClade.tree_Fscore returns
    for (inferred, reference) and `correct` is NoahClade.equal_topology's result.
    """
    reference_tree = random_discrete_tree(m, n, k, proba_bounds=proba_bounds)
    if verbose:
        reference_tree.root.ascii()
    observations, labels = reference_tree.root.observe()
    inferred_tree = estimate_tree_topology_multiclass(observations, labels=labels)
    if verbose:
        Phylo.draw_ascii(inferred_tree)
    # BUG FIX: the F-score was computed and silently discarded and the function
    # returned None; return both comparison results so callers can use them.
    scores = NoahClade.tree_Fscore(inferred_tree, reference_tree)
    correct = NoahClade.equal_topology(inferred_tree, reference_tree)
    print(correct)
    return scores, correct
def comp_depths(depths, Ns, method, k, n_trees=12, proba_bounds=(0.75, 0.95),
                std_bounds=(0.1, 0.3), discrete=True, oldstats=None):
    """Benchmark `method` on complete binary trees over varying depth and n.

    Parameters
    ----------
    depths : iterable of int -- tree depths; each tree has m = 2**depth leaves
    Ns : iterable of int -- sample sizes (number of sites) to simulate per leaf
    method : callable -- topology inference routine, invoked as
        method(observations, labels=labels); its __name__ is recorded
    k : int -- number of discrete character states
    n_trees : int -- random trees generated per depth
    proba_bounds, std_bounds : tuple -- parameter ranges forwarded to the
        transition generators
    discrete : bool -- must be True; the continuous (Gaussian) branch is
        not implemented
    oldstats : pandas.DataFrame or None -- if given, the new rows are
        concatenated onto it

    Returns
    -------
    pandas.DataFrame with one row per (tree, n) trial.
    """
    if discrete:
        transition_maker = NoahClade.NoahClade.gen_symmetric_transition
        #transition_maker = NoahClade.NoahClade.jukes_cantor_transition
    else:
        transition_maker = NoahClade.NoahClade.gen_linear_transition
    stats = []
    itr = 1
    total_iters = len(depths) * n_trees * len(Ns)
    for depth in depths:
        m = 2**depth
        for _ in range(n_trees):
            if discrete:
                ref_tree = NoahClade.NoahClade.complete_binary(
                    depth, proba_bounds=proba_bounds, n=1, k=k)
            else:
                # BUG FIX: this branch was `assert False` followed by dead
                # code; asserts are stripped under -O, so fail explicitly.
                raise NotImplementedError(
                    "continuous (Gaussian) trees are not supported by comp_depths")
            for n in Ns:
                root_data = np.random.choice(a=k, size=n)
                ref_tree.root.gen_subtree_data(root_data, transition_maker,
                                               proba_bounds=proba_bounds,
                                               std_bounds=std_bounds)
                observations, labels = ref_tree.root.observe()
                inferred_tree = method(observations, labels=labels)
                #inferred_tree.root.ascii()
                F1, precision, recall, RF = NoahClade.tree_Fscore(
                    inferred_tree, ref_tree)
                stats.append({
                    "n": n, "m": m, "method": method.__name__,
                    "F1%": 100 * F1, "precision%": 100 * precision,
                    "recall%": 100 * recall, "RF": RF,
                    "correct": int(RF == 0)
                })
                print("{0} / {1}\t{2}\tRF:{3}\tF1 {4:.1f}%\tn {5}".format(
                    itr, total_iters, 2**depth, RF, 100 * F1, n))
                itr += 1
    if oldstats is None:
        return pd.DataFrame(stats)
    # BUG FIX: DataFrame.append was removed in pandas 2.0; concat is the
    # supported way to stack the old and new results.
    return pd.concat([oldstats, pd.DataFrame(stats)], ignore_index=True)
def test_correction(m, n, k, proba_bounds, n_trials=50):
    """Estimate how often Jukes-Cantor-corrected NJ beats plain Hamming NJ.

    Runs `n_trials` independent random JC trees; for each, infers a tree with
    NJ on raw Hamming distances (T2) and with the JC distance correction (T3),
    then scores both against the reference.

    Parameters
    ----------
    m, n, k : int -- leaves, samples per leaf, and state count for random_JC_tree
    proba_bounds : tuple -- forwarded to random_JC_tree
    n_trials : int -- number of random trees to run (was a hard-coded 50;
        generalized to a parameter with the same default)

    Returns
    -------
    float in [0, 1] -- fraction of trials where the JC-corrected F1 was
    strictly higher than the plain-Hamming F1.
    """
    score = 0
    for _ in range(n_trials):
        ref_tree = random_JC_tree(m=m, n=n, k=k, proba_bounds=proba_bounds)
        observations, labels = ref_tree.root.observe()
        T2 = NJ_hamming(observations, labels)
        T3 = NJ_JC(observations, labels)
        # Convert all three trees to NoahClade and rebuild their taxa sets so
        # tree_Fscore can compare splits.
        ref_tree.root = NoahClade.NoahClade.convertClade(ref_tree.root)
        ref_tree.root.reset_taxasets(labels)
        T2.root = NoahClade.NoahClade.convertClade(T2.root)
        T2.root.reset_taxasets(labels)
        T3.root = NoahClade.NoahClade.convertClade(T3.root)
        T3.root.reset_taxasets(labels)
        plain_F1, _, _, _ = NoahClade.tree_Fscore(T2, ref_tree)
        jc_F1, _, _, _ = NoahClade.tree_Fscore(T3, ref_tree)
        if jc_F1 > plain_F1:
            score += 1
        print("plain {0:.3f}%\t\tjc {1:.3f}%".format(100 * plain_F1, 100 * jc_F1))
    return score / n_trials
def load_tree_recover(reference_tree, n, k=None, proba_bounds=(0.50, 0.95),
                      verbose=True, format='newick'):
    """Simulate data down a given tree and try to recover its topology.

    Parameters
    ----------
    reference_tree : str or tree object -- if a path, the tree is loaded from
        that file (possibly carrying preexisting transition matrices); if a
        tree object, it is used directly
    n : int -- number of samples (sites) to generate at the root
    k : int or None -- number of states; inferred from the root's transition
        matrix when one is present
    proba_bounds : tuple -- forwarded to the symmetric-transition generator
    verbose : bool -- if True, draw both trees as ASCII
    format : str -- Phylo file format used when loading from a path
    """
    if isinstance(reference_tree, str):
        reference_tree = Phylo.read(reference_tree, format)
        reference_tree.root = NoahClade.NoahClade.convertClade(reference_tree.root)
    # Recover k from a preexisting transition matrix when one is attached.
    # BUG FIX: the guard used to test hasattr(transition, 'matrix') but the
    # code below reads transition.shape — check the attribute actually used.
    if hasattr(reference_tree.root, 'transition') and hasattr(reference_tree.root.transition, 'shape'):
        k = reference_tree.root.transition.shape[1]
    if k is None:
        # np.random.choice(a=None) fails with an opaque error; be explicit.
        raise ValueError("k was not given and could not be inferred from the tree")
    root_data = np.random.choice(a=k, size=n)
    transition_maker = NoahClade.NoahClade.gen_symmetric_transition
    reference_tree.root.gen_subtree_data(root_data, transition_maker,
                                         num_classes=k, proba_bounds=proba_bounds)
    if verbose:
        reference_tree.root.ascii()
    observations, labels = reference_tree.root.observe()
    reference_tree.root.reset_taxasets(labels=labels)
    inferred_tree = estimate_tree_topology_multiclass(observations, labels=labels)
    inferred_tree.root.reset_taxasets(labels=labels)
    if verbose:
        Phylo.draw_ascii(inferred_tree)
    NoahClade.tree_Fscore(inferred_tree, reference_tree)
    NoahClade.equal_topology(inferred_tree, reference_tree)
def load_observations_recover(reference_tree_path=None, observation_matrix_path=None,
                              comparison_tree_path=None, verbose=False,
                              format='newick', method=estimate_tree_topology_multiclass):
    """Load a reference tree and an observation matrix from disk, infer a tree
    from the observations with `method`, and score the inference against the
    reference — and, optionally, against an externally produced tree.

    Parameters
    ----------
    reference_tree_path : str -- path to the gold-standard tree file
    observation_matrix_path : str -- path to a CSV with one row per leaf
    comparison_tree_path : str or None -- optional path to an external
        (e.g. Matlab-produced) tree to compare against
    verbose : bool -- if True, draw each tree as ASCII art
    format : str -- Phylo file format of the tree files
    method : callable -- inference routine, called as method(observations, labels=labels)
    """
    reference_tree = Phylo.read(reference_tree_path, format)
    reference_tree.root = NoahClade.NoahClade.convertClade(reference_tree.root)
    if verbose:
        print("="*40)
        print("REFERENCE")
        even_branch_lengths(reference_tree)
        reference_tree.root.ascii()

    observations = np.genfromtxt(observation_matrix_path, delimiter=",")
    n_leaves = observations.shape[0]
    # one row per leaf
    assert n_leaves == reference_tree.count_terminals()
    # Single-digit labels get a leading underscore — presumably to keep all
    # label strings the same width; verify against the tree's taxon names.
    labels = ["_{0}".format(i) if i < 10 else str(i) for i in range(1, n_leaves + 1)]

    inferred_tree = method(observations, labels=labels)
    reference_tree.root.reset_taxasets(labels=labels)
    inferred_tree.root.reset_taxasets(labels=labels)
    if verbose:
        print("INFERRED")
        even_branch_lengths(inferred_tree)
        inferred_tree.root.ascii()

    print("=== Compare to gold standard ===")
    F1_gold, _, _, _ = NoahClade.tree_Fscore(inferred_tree, reference_tree)
    print(NoahClade.equal_topology(inferred_tree, reference_tree))
    print(F1_gold)

    if comparison_tree_path is not None:
        matlab_tree = Phylo.read(comparison_tree_path, format)
        matlab_tree.root = NoahClade.NoahClade.convertClade(matlab_tree.root)
        matlab_tree.root.reset_taxasets(labels=labels)
        print("=== Compare to Matlab version ===")
        F1_matlab, _, _, _ = NoahClade.tree_Fscore(inferred_tree, matlab_tree)
        print(NoahClade.equal_topology(inferred_tree, matlab_tree))
        print(F1_matlab)
        if verbose:
            print("MATLAB")
            even_branch_lengths(matlab_tree)
            matlab_tree.root.ascii()
def comp_scorers(m, Ns, k, scorers, n_trees=12, obs_per_tree=1,
                 proba_bounds=(0.75, 0.95), std_bounds=(0.1, 0.3),
                 baselines=None, discrete=True):
    """Benchmark several split-scoring functions (plus baseline methods)
    against random trees of m leaves over varying sample sizes.

    Parameters
    ----------
    m : int -- number of leaves per random tree
    Ns : iterable of int -- sample sizes (number of sites) to simulate
    k : int -- number of discrete states (used when discrete=True)
    scorers : iterable of callables -- each is wrapped into
        estimate_tree_topology(..., scorer=scorer); its __name__[6:] (the part
        after the "score_" prefix) is used as the method name
    n_trees : int -- number of random trees
    obs_per_tree : int -- independent data regenerations per tree
    proba_bounds, std_bounds : tuple -- forwarded to the transition generators
    baselines : list of callables or None -- extra inference methods, called
        as method(observations, labels=labels)
        (BUG FIX: was a mutable default argument `baselines=[]`)
    discrete : bool -- discrete vs. Gaussian data model

    Returns
    -------
    list of per-trial stat dicts.
    """
    if baselines is None:
        baselines = []
    if discrete:
        transition_maker = NoahClade.NoahClade.gen_symmetric_transition
        #transition_maker = NoahClade.NoahClade.jukes_cantor_transition
    else:
        transition_maker = NoahClade.NoahClade.gen_linear_transition
    # Hoisted out of the innermost loop: the wrapped scorer methods do not
    # depend on the tree or the data, so build them once.
    scorer_methods = []
    for scorer in scorers:
        scorer_method = partial(estimate_tree_topology, scorer=scorer, discrete=discrete)
        scorer_method.__name__ = scorer.__name__[6:]
        scorer_methods.append(scorer_method)
    stats = []
    itr = 1
    total_iters = n_trees*obs_per_tree*len(Ns)*(len(scorer_methods)+len(baselines))
    for _ in range(n_trees):
        if discrete:
            ref_tree = NoahClade.random_discrete_tree(m, 1, k, proba_bounds=proba_bounds)
        else:
            ref_tree = NoahClade.random_gaussian_tree(m, 1, std_bounds=std_bounds)
        for _ in range(obs_per_tree):
            for n in Ns:
                root_data = np.random.choice(a=k, size=n)
                ref_tree.root.gen_subtree_data(root_data, transition_maker,
                                               proba_bounds=proba_bounds,
                                               std_bounds=std_bounds)
                observations, labels = ref_tree.root.observe()
                #print("ref")
                #ref_tree.root.ascii()
                for method in scorer_methods+baselines:
                    inferred_tree = method(observations, labels=labels)
                    #inferred_tree.root.ascii()
                    F1, precision, recall, RF = NoahClade.tree_Fscore(inferred_tree, ref_tree)
                    stats.append({"n": n, "method": method.__name__,
                                  "F1%": 100*F1, "precision%": 100*precision,
                                  "recall%": 100*recall, "RF": RF,})
                    print("{0} / {1}\t{2}\tRF:{3}\tF1 {4:.1f}%\tn {5}".format(
                        itr, total_iters, method.__name__, RF, 100*F1, n))
                    itr += 1
    return stats
# Exploratory (REPL-style) comparison of three neighbor-joining variants on
# the same data. NOTE(review): `observations`, `labels`, and `ref_tree` are
# assumed to be in scope from earlier in the session/file — confirm.
T1 = neighbor_joining(observations, labels)   # plain NJ
T2 = NJ_hamming(observations, labels)         # NJ on raw Hamming distances
T3 = NJ_JC(observations, labels)              # NJ with Jukes-Cantor correction
# Pairwise Hamming distance matrix between leaf rows, then the JC correction
# for a 4-state alphabet.
hamming = scipy.spatial.distance.squareform(
    scipy.spatial.distance.pdist(observations, metric='hamming'))
corrected = distance_correction(hamming, 4)
#pd.DataFrame(corrected)
# Inspect the largest distances before/after correction (values shown in a
# REPL; the return values are deliberately unused here).
hamming.max()
corrected.max()
#distance_correction(0.74, k=4)
#1 - 0.9666666666666667*4/3
#-np.log(1e-16)
# how come the Jukes-Cantor correction does worse even under a Jukes Cantor model
# Convert all trees to NoahClade and rebuild taxa sets so tree_Fscore can
# compare splits; reset_taxasets' return value is discarded.
ref_tree.root = NoahClade.NoahClade.convertClade(ref_tree.root)
_ = ref_tree.root.reset_taxasets(labels)
T1.root = NoahClade.NoahClade.convertClade(T1.root)
_ = T1.root.reset_taxasets(labels)
T2.root = NoahClade.NoahClade.convertClade(T2.root)
_ = T2.root.reset_taxasets(labels)
T3.root = NoahClade.NoahClade.convertClade(T3.root)
_ = T3.root.reset_taxasets(labels)
# Score the trees against each other and against the reference (REPL-style,
# results displayed rather than captured).
NoahClade.tree_Fscore(T1, T2)
NoahClade.tree_Fscore(T2, T3)
NoahClade.tree_Fscore(T2, ref_tree)
NoahClade.tree_Fscore(T3, ref_tree)
T3.root.ascii()
    # Tail of an F-score computation whose enclosing function starts above
    # this chunk: `both` counts splits present in both trees (confirm against
    # the unseen function header).
    precision = float(both) / len(splits_inferred_sym)
    recall = float(both) / len(splits_reference_sym)
    F1 = 2 * (precision * recall) / (precision + recall)
    return F1, precision, recall, RF

def score_stable_rank(A, M):
    """Score the split A of M by the stable rank of the cross-block M[A, ~A].

    A is a boolean leaf mask; M_A is the submatrix of rows in A vs. columns
    outside A. The stable rank is ||M_A||_F^2 / ||M_A||_2^2 (sum of squared
    singular values over the largest squared singular value).
    """
    M_A = M[np.ix_(A, ~A)]
    s = np.linalg.svd(M_A, compute_uv=False)
    s_sq = s**2
    return s_sq.sum() / s_sq[0]  #frobenius norm sq / operator norm sq

if __name__ == "__main__":
    # Smoke test: recover a random 64-leaf, 4-state tree from 1000 sites,
    # once with the default scorer and once with score_stable_rank.
    ref_tree = NoahClade.random_discrete_tree(64, 1000, 4, proba_bounds=(0.75, 0.95))
    #ref_tree = NoahClade.NoahClade.complete_binary(6, n=500, k=4, proba_bounds=(0.75, 0.95))
    observations, labels = ref_tree.root.observe()
    inferred_tree = estimate_tree_topology_Jukes_Cantor(observations, labels=labels)
    F1, _, _, RF, = diagnose(inferred_tree, ref_tree, labels)
    print("F1: {0:.3f}\nRF: {1}".format(F1, RF))
    # Draw the inferred tree, highlighting clades whose names diagnose() has
    # prefixed with "***" (presumably the wrong splits — confirm in diagnose).
    Phylo.draw(inferred_tree, label_func=lambda x: None,
               branch_labels=lambda x: "***" if x.name[:3] == "***" else None)
    #Phylo.draw(ref_tree, label_func=lambda x: x.name[-2:] if x.is_terminal() else None)
    inferred_tree2 = estimate_tree_topology_Jukes_Cantor(
        observations, labels=labels, scorer=score_stable_rank)
    F12, _, _, RF2, = diagnose(inferred_tree2, ref_tree, labels)