Пример #1
0
def grow_tree_recover(m, n, k, proba_bounds, verbose=True):
    """Generate a random discrete tree, re-infer its topology, and compare.

    Parameters:
        m: number of leaves in the generated tree.
        n: number of observed characters per leaf.
        k: alphabet size of the discrete states.
        proba_bounds: transition-probability range used for data generation.
        verbose: if True, draw both the reference and inferred trees as ASCII.

    Returns:
        (fscore_stats, same_topology): the tuple produced by
        NoahClade.tree_Fscore and the boolean from NoahClade.equal_topology.
    """
    reference_tree = random_discrete_tree(m, n, k, proba_bounds=proba_bounds)
    if verbose:
        reference_tree.root.ascii()
    observations, labels = reference_tree.root.observe()
    inferred_tree = estimate_tree_topology_multiclass(observations, labels=labels)
    if verbose:
        Phylo.draw_ascii(inferred_tree)
    # Previously the F-score was computed and discarded; surface it to callers.
    scores = NoahClade.tree_Fscore(inferred_tree, reference_tree)
    same_topology = NoahClade.equal_topology(inferred_tree, reference_tree)
    print(same_topology)
    return scores, same_topology
Пример #2
0
def comp_depths(depths,
                Ns,
                method,
                k,
                n_trees=12,
                proba_bounds=(0.75, 0.95),
                std_bounds=(0.1, 0.3),
                discrete=True,
                oldstats=None):
    """Benchmark a reconstruction method on complete binary trees of
    varying depth.

    Parameters
    ----------
    depths : iterable of int
        Tree depths to test; depth d gives m = 2**d leaves.
    Ns : iterable of int
        Sample sizes (number of observed characters) tested per tree.
    method : callable
        Inference function called as method(observations, labels=labels).
    k : int
        Alphabet size for the discrete model.
    n_trees : int
        Number of random trees generated per depth.
    proba_bounds, std_bounds : tuple
        Parameter ranges forwarded to the transition generators.
    discrete : bool
        If True use the symmetric discrete model; the Gaussian branch is
        currently disabled (see the assert below).
    oldstats : pandas.DataFrame or None
        If given, new results are concatenated onto it.

    Returns
    -------
    pandas.DataFrame
        One row of accuracy statistics per (tree, n) run.
    """
    # Ns is array of n
    if discrete:
        transition_maker = NoahClade.NoahClade.gen_symmetric_transition
        #transition_maker = NoahClade.NoahClade.jukes_cantor_transition
    else:
        transition_maker = NoahClade.NoahClade.gen_linear_transition
    stats = []
    itr = 1
    total_iters = len(depths) * n_trees * len(Ns)

    for depth in depths:
        m = 2**depth
        for _ in range(n_trees):
            if discrete:
                ref_tree = NoahClade.NoahClade.complete_binary(
                    depth, proba_bounds=proba_bounds, n=1, k=k)
            else:
                # Continuous branch deliberately disabled.
                assert (False)
                ref_tree = NoahClade.NoahClade.random_gaussian_tree(
                    m, 1, std_bounds=std_bounds)

            for n in Ns:
                # Fresh data per sample size; the topology is reused.
                root_data = np.random.choice(a=k, size=n)
                ref_tree.root.gen_subtree_data(root_data,
                                               transition_maker,
                                               proba_bounds=proba_bounds,
                                               std_bounds=std_bounds)
                observations, labels = ref_tree.root.observe()

                inferred_tree = method(observations, labels=labels)
                #inferred_tree.root.ascii()
                F1, precision, recall, RF = NoahClade.tree_Fscore(
                    inferred_tree, ref_tree)
                stats.append({
                    "n": n,
                    "m": m,
                    "method": method.__name__,
                    "F1%": 100 * F1,
                    "precision%": 100 * precision,
                    "recall%": 100 * recall,
                    "RF": RF,
                    "correct": int(RF == 0)
                })
                print("{0} / {1}\t{2}\tRF:{3}\tF1 {4:.1f}%\tn {5}".format(
                    itr, total_iters, 2**depth, RF, 100 * F1, n))
                itr += 1
    new_stats = pd.DataFrame(stats)
    if oldstats is None:
        return new_stats
    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # supported replacement and produces the same stacked frame.
    return pd.concat([oldstats, new_stats])
Пример #3
0
def test_correction(m, n, k, proba_bounds):
    """Compare plain Hamming NJ against Jukes-Cantor-corrected NJ.

    Runs 50 random JC trees and returns the fraction of trials in which
    the JC-corrected reconstruction achieved a strictly higher F1 score
    than the uncorrected one.
    """
    NNN = 50
    score = 0
    for _ in range(NNN):
        ref_tree = random_JC_tree(m=m, n=n, k=k, proba_bounds=proba_bounds)
        observations, labels = ref_tree.root.observe()
        T2 = NJ_hamming(observations, labels)
        T3 = NJ_JC(observations, labels)
        # Normalize all three trees to NoahClade form so they can be scored.
        for tree in (ref_tree, T2, T3):
            tree.root = NoahClade.NoahClade.convertClade(tree.root)
            tree.root.reset_taxasets(labels)
        plain_F1, _, _, _ = NoahClade.tree_Fscore(T2, ref_tree)
        jc_F1, _, _, _ = NoahClade.tree_Fscore(T3, ref_tree)
        if jc_F1 > plain_F1:
            score += 1
        print("plain {0:.3f}%\t\tjc {1:.3f}%".format(100 * plain_F1,
                                                     100 * jc_F1))
    return score / NNN
Пример #4
0
def load_tree_recover(reference_tree, n, k=None, proba_bounds=(0.50, 0.95), verbose=True, format='newick'):
    """Simulate data down a given (or loaded) tree and re-infer its topology.

    Parameters:
        reference_tree: a path to a tree file (loaded via Bio.Phylo and
            converted to NoahClade form) or an existing tree object.
        n: number of characters to simulate.
        k: alphabet size; read from the root's transition matrix when one
            is present, otherwise must be supplied by the caller.
        proba_bounds: transition-probability range for data generation.
        verbose: if True, draw both trees as ASCII art.
        format: Phylo file format used when reference_tree is a path.

    Returns:
        (fscore_stats, same_topology) from NoahClade's comparators.
    """
    # if tree is a file, load the tree it contains
    # if tree is a tree object, generate data, possibly using preexisting transition matrices
    if isinstance(reference_tree, str):
        reference_tree = Phylo.read(reference_tree, format)
        reference_tree.root = NoahClade.NoahClade.convertClade(reference_tree.root)
    # NOTE(review): the guard tests for a 'matrix' attribute but then reads
    # .shape directly off the transition object — confirm which attribute the
    # NoahClade transition type actually exposes.
    if hasattr(reference_tree.root, 'transition') and hasattr(reference_tree.root.transition, 'matrix'):
        k = reference_tree.root.transition.shape[1]
    root_data = np.random.choice(a=k, size=n)
    transition_maker = NoahClade.NoahClade.gen_symmetric_transition
    reference_tree.root.gen_subtree_data(root_data, transition_maker, num_classes=k, proba_bounds=proba_bounds)

    if verbose:
        reference_tree.root.ascii()
    observations, labels = reference_tree.root.observe()
    reference_tree.root.reset_taxasets(labels=labels)
    inferred_tree = estimate_tree_topology_multiclass(observations, labels=labels)
    inferred_tree.root.reset_taxasets(labels=labels)
    if verbose:
        Phylo.draw_ascii(inferred_tree)
    # Previously both comparison results were computed and discarded;
    # return them so callers can actually use the scores.
    scores = NoahClade.tree_Fscore(inferred_tree, reference_tree)
    same_topology = NoahClade.equal_topology(inferred_tree, reference_tree)
    return scores, same_topology
Пример #5
0
def load_observations_recover(reference_tree_path=None, observation_matrix_path=None, comparison_tree_path=None, verbose=False, format='newick', method=estimate_tree_topology_multiclass):
    """Load a reference tree and an observation matrix, re-infer the
    topology, and score it against the reference — and optionally against
    an externally inferred tree (e.g. one produced by the Matlab code).
    """
    reference_tree = Phylo.read(reference_tree_path, format)
    reference_tree.root = NoahClade.NoahClade.convertClade(reference_tree.root)
    if verbose:
        print("="*40)
        print("REFERENCE")
        even_branch_lengths(reference_tree)
        reference_tree.root.ascii()

    observations = np.genfromtxt(observation_matrix_path, delimiter=",")
    num_leaves = observations.shape[0]
    assert num_leaves == reference_tree.count_terminals() # one row per leaf
    # Single-digit leaf labels get a leading underscore pad.
    labels = []
    for i in range(1, num_leaves + 1):
        labels.append("_" + str(i) if i < 10 else str(i))
    inferred_tree = method(observations, labels=labels)
    reference_tree.root.reset_taxasets(labels=labels)
    inferred_tree.root.reset_taxasets(labels=labels)
    if verbose:
        print("INFERRED")
        even_branch_lengths(inferred_tree)
        inferred_tree.root.ascii()

    print("=== Compare to gold standard ===")
    F1_gold, _, _, _ = NoahClade.tree_Fscore(inferred_tree, reference_tree)
    print(NoahClade.equal_topology(inferred_tree, reference_tree))
    print(F1_gold)

    if comparison_tree_path is None:
        return
    external_inferred = Phylo.read(comparison_tree_path, format)
    external_inferred.root = NoahClade.NoahClade.convertClade(external_inferred.root)
    external_inferred.root.reset_taxasets(labels=labels)
    print("=== Compare to Matlab version ===")
    F1_matlab, _, _, _ = NoahClade.tree_Fscore(inferred_tree, external_inferred)
    print(NoahClade.equal_topology(inferred_tree, external_inferred))
    print(F1_matlab)
    if verbose:
        print("MATLAB")
        even_branch_lengths(external_inferred)
        external_inferred.root.ascii()
def comp_scorers(m, Ns, k, scorers, n_trees=12, obs_per_tree=1, proba_bounds=(0.75, 0.95), std_bounds=(0.1, 0.3), baselines=None, discrete=True):
    """Benchmark several split-scoring functions (plus optional baseline
    methods) on random trees with m leaves.

    Parameters
    ----------
    m : int
        Number of leaves per generated tree.
    Ns : iterable of int
        Sample sizes to test.
    k : int
        Alphabet size (used by the discrete model).
    scorers : list of callables
        Split-scoring functions; each is wrapped into an
        estimate_tree_topology method via functools.partial.
    n_trees, obs_per_tree : int
        Number of random trees, and datasets generated per tree.
    proba_bounds, std_bounds : tuple
        Parameter ranges for the discrete / Gaussian data generators.
    baselines : list of callables or None
        Extra full inference methods run alongside the scorers. Defaults
        to an empty list (the original `baselines=[]` was a shared
        mutable default argument).
    discrete : bool
        Choose the discrete symmetric model vs. the Gaussian model.

    Returns
    -------
    list of dict
        One statistics record per (tree, dataset, n, method) run.
    """
    if baselines is None:
        baselines = []
    # Ns is array of n
    if discrete:
        transition_maker = NoahClade.NoahClade.gen_symmetric_transition
        #transition_maker = NoahClade.NoahClade.jukes_cantor_transition
    else:
        transition_maker = NoahClade.NoahClade.gen_linear_transition

    # Wrap each scorer once, up front; the original rebuilt these partials
    # on every (tree, observation, n) iteration even though they are
    # loop-invariant.
    scorer_methods = []
    for scorer in scorers:
        scorer_method = partial(estimate_tree_topology, scorer=scorer, discrete=discrete)
        scorer_method.__name__ = scorer.__name__[6:]  # drop the "score_" prefix
        scorer_methods.append(scorer_method)
    methods = scorer_methods + baselines

    stats = []
    itr = 1
    total_iters = n_trees*obs_per_tree*len(Ns)*len(methods)

    for _ in range(n_trees):
        if discrete:
            ref_tree = NoahClade.random_discrete_tree(m, 1, k, proba_bounds=proba_bounds)
        else:
            ref_tree = NoahClade.random_gaussian_tree(m, 1, std_bounds=std_bounds)
        for _ in range(obs_per_tree):
            for n in Ns:
                root_data = np.random.choice(a=k, size=n)
                ref_tree.root.gen_subtree_data(root_data, transition_maker, proba_bounds=proba_bounds, std_bounds=std_bounds)
                observations, labels = ref_tree.root.observe()
                for method in methods:
                    inferred_tree = method(observations, labels=labels)
                    #inferred_tree.root.ascii()
                    F1, precision, recall, RF = NoahClade.tree_Fscore(inferred_tree, ref_tree)
                    stats.append({"n": n, "method": method.__name__, "F1%":100*F1, "precision%":100*precision, "recall%":100*recall, "RF":RF,})
                    print("{0} / {1}\t{2}\tRF:{3}\tF1 {4:.1f}%\tn {5}".format(itr, total_iters, method.__name__, RF, 100*F1, n))
                    itr += 1
    return stats
Пример #7
0
    # NOTE(review): fragment — the enclosing definition starts before this
    # excerpt, so `observations`, `labels`, and `ref_tree` are bound upstream.
    # Build three NJ trees: plain, Hamming-distance, and JC-corrected.
    T1 = neighbor_joining(observations, labels)
    T2 = NJ_hamming(observations, labels)
    T3 = NJ_JC(observations, labels)

    # Pairwise normalized Hamming distances between observation rows.
    hamming = scipy.spatial.distance.squareform(
        scipy.spatial.distance.pdist(observations, metric='hamming'))
    corrected = distance_correction(hamming, 4)
    #pd.DataFrame(corrected)
    # The results of these max() calls are discarded — presumably leftovers
    # from an interactive session.
    hamming.max()
    corrected.max()
    #distance_correction(0.74, k=4)
    #1 - 0.9666666666666667*4/3
    #-np.log(1e-16)

    # how come the Jukes-Cantor correction does worse even under a Jukes Cantor model

    # Convert every tree to NoahClade form and rebuild its taxon sets so
    # that tree_Fscore can compare them.
    ref_tree.root = NoahClade.NoahClade.convertClade(ref_tree.root)
    _ = ref_tree.root.reset_taxasets(labels)
    T1.root = NoahClade.NoahClade.convertClade(T1.root)
    _ = T1.root.reset_taxasets(labels)
    T2.root = NoahClade.NoahClade.convertClade(T2.root)
    _ = T2.root.reset_taxasets(labels)
    T3.root = NoahClade.NoahClade.convertClade(T3.root)
    _ = T3.root.reset_taxasets(labels)
    # F-score results are also discarded; more interactive-session residue.
    NoahClade.tree_Fscore(T1, T2)
    NoahClade.tree_Fscore(T2, T3)
    NoahClade.tree_Fscore(T2, ref_tree)
    NoahClade.tree_Fscore(T3, ref_tree)

    T3.root.ascii()
Пример #8
0
    # NOTE(review): fragment — tail of a split-comparison function whose
    # header precedes this excerpt; `both`, the two split sets, and RF are
    # computed upstream.
    precision = float(both) / len(splits_inferred_sym)
    recall = float(both) / len(splits_reference_sym)
    # F1 is the harmonic mean of precision and recall.
    F1 = 2 * (precision * recall) / (precision + recall)
    return F1, precision, recall, RF


def score_stable_rank(A, M):
    """Score a split A by the stable rank of the cross-block M[A, ~A].

    The stable rank is the squared Frobenius norm divided by the squared
    operator norm — i.e. the sum of squared singular values over the
    largest squared singular value.
    """
    cross_block = M[np.ix_(A, ~A)]
    singular_values = np.linalg.svd(cross_block, compute_uv=False)
    largest_sq = singular_values[0] ** 2
    frobenius_sq = np.sum(singular_values ** 2)
    return frobenius_sq / largest_sq


if __name__ == "__main__":
    # Smoke test: generate a 64-leaf random discrete tree with 1000
    # characters over a 4-letter alphabet, then reconstruct and score it.
    ref_tree = NoahClade.random_discrete_tree(64,
                                              1000,
                                              4,
                                              proba_bounds=(0.75, 0.95))
    #ref_tree = NoahClade.NoahClade.complete_binary(6, n=500, k=4, proba_bounds=(0.75, 0.95))
    observations, labels = ref_tree.root.observe()
    inferred_tree = estimate_tree_topology_Jukes_Cantor(observations,
                                                        labels=labels)
    F1, _, _, RF, = diagnose(inferred_tree, ref_tree, labels)
    print("F1: {0:.3f}\nRF: {1}".format(F1, RF))
    # Draw the inferred tree, labelling only branches whose clade names are
    # tagged "***" (tagging convention set elsewhere — presumably by
    # diagnose; confirm).
    Phylo.draw(inferred_tree,
               label_func=lambda x: None,
               branch_labels=lambda x: "***" if x.name[:3] == "***" else None)
    #Phylo.draw(ref_tree, label_func=lambda x: x.name[-2:] if x.is_terminal() else None)

    # Rerun the reconstruction with the stable-rank scorer for comparison.
    inferred_tree2 = estimate_tree_topology_Jukes_Cantor(
        observations, labels=labels, scorer=score_stable_rank)
    F12, _, _, RF2, = diagnose(inferred_tree2, ref_tree, labels)