Пример #1
0
def ds1_phylo_model_demo(inst):
    """Walk through phylogenetic models and their likelihoods on DS1.

    The log likelihoods are first computed under SIMPLE_SPECIFICATION. The
    instance is then re-prepared with a GTR specification whose rates are all
    set to 1.0 and whose frequencies are all set to 0.25, and the resulting
    log likelihoods are asserted to match the first set.

    `inst` is the libsbn instance to run the demo on; it is modified in
    place (alignment loaded, trees erased, branch lengths overwritten).
    """
    inst.read_fasta_file("data/DS1.fasta")
    # Only the first tree is needed for the likelihood comparison.
    inst.tree_collection.erase(1, 10)
    first_tree = inst.tree_collection.trees[0]
    # copy=False: we want to write through to the tree's own branch lengths.
    lengths_view = np.array(first_tree.branch_lengths, copy=False)
    lengths_view[:] = 0.1

    inst.prepare_for_phylo_likelihood(SIMPLE_SPECIFICATION, 2)
    reference_log_likelihoods = np.array(inst.log_likelihoods())

    # Showing off phylo_model_param_block_map: each entry can be assigned to
    # in place, and the change shows up in the phylo_model_params matrix.
    gtr_spec = libsbn.PhyloModelSpecification(
        substitution="GTR", site="constant", clock="none"
    )
    inst.prepare_for_phylo_likelihood(gtr_spec, 2)
    block_map = inst.get_phylo_model_param_block_map()
    for block_name, fill_value in (("GTR rates", 1.0), ("frequencies", 0.25)):
        block_map[block_name][:] = fill_value
    print("\nHere's a look at phylo_model_param_block_map:")
    pprint.pprint(block_map)
    print("\nWe can see that we are changing the phylo_model_params matrix:")
    print(inst.get_phylo_model_params(), "\n")
    gtr_log_likelihoods = np.array(inst.log_likelihoods())
    assert reference_log_likelihoods == pytest.approx(gtr_log_likelihoods)
Пример #2
0
def log_prob_conditioned_branch_only(newick_file, fasta_file, subst_model, frequencies, rescaling=False, **subst_model_params):
    """Build a TensorFlow function giving the libsbn log likelihood of branch lengths.

    A rooted libsbn instance named 'treeflow' is loaded from `newick_file`
    and `fasta_file`, prepared with a specification built from the
    substitution model name, 'constant' and 'strict', and its "clock rate"
    block is set to 1.0.

    Args:
        newick_file: path handed to `inst.read_newick_file`.
        fasta_file: path handed to `inst.read_fasta_file`.
        subst_model: a treeflow JC, GTR or HKY substitution model instance;
            only its type is inspected.
        frequencies: written to the "frequencies" block for GTR and HKY
            (ignored for JC).
        rescaling: forwarded to `inst.set_rescaling`.
        **subst_model_params: must contain 'rates' for GTR or 'kappa' for HKY.

    Returns:
        A pair `(libsbn_tf_func, inst)`. `libsbn_tf_func(x)` writes `x` into
        every branch length of the first tree except the last one and returns
        the log likelihood as float32, with a custom gradient taken from
        `inst.gradients()`.

    Raises:
        ValueError: if `subst_model` is not one of the supported types.
    """
    if isinstance(subst_model, treeflow.substitution_model.JC):
        model_name, block_values = 'JC69', {}
    elif isinstance(subst_model, treeflow.substitution_model.GTR):
        model_name = 'GTR'
        block_values = {
            'GTR rates': np.array(subst_model_params['rates']),
            'frequencies': np.array(frequencies),
        }
    elif isinstance(subst_model, treeflow.substitution_model.HKY):
        # HKY is expressed through the GTR model: all six rates are 1 except
        # entries 1 and 4, which take kappa (presumably the two transition
        # rates -- confirm against libsbn's GTR rate ordering).
        model_name = 'GTR'
        kappa = subst_model_params['kappa']
        hky_as_gtr_rates = np.ones(6)
        for rate_index in (1, 4):
            hky_as_gtr_rates[rate_index] = kappa
        block_values = {
            'GTR rates': hky_as_gtr_rates,
            'frequencies': np.array(frequencies),
        }
    else:
        raise ValueError('Unsupported substitution model')

    inst = libsbn.rooted_instance('treeflow')
    inst.read_newick_file(newick_file)
    inst.read_fasta_file(fasta_file)
    inst.set_rescaling(rescaling)
    specification = libsbn.PhyloModelSpecification(model_name, 'constant', 'strict')
    inst.prepare_for_phylo_likelihood(specification, 1)

    block_map = inst.get_phylo_model_param_block_map()
    block_map["clock rate"][:] = 1.0
    for block_name, block_value in block_values.items():
        block_map[block_name][:] = block_value

    # Nodes whose parent id equals the length of the parent id vector are
    # treated as the children of the root.
    parent_ids = np.array(inst.tree_collection.trees[0].parent_id_vector())
    root_id = parent_ids.shape[0]
    root_child_index = np.nonzero(parent_ids == root_id)

    # copy=False: writes below must reach the tree's own branch lengths.
    branch_lengths = np.array(inst.tree_collection.trees[0].branch_lengths, copy=False)

    def loglik_and_gradient(x):
        """Set the branch lengths to x; return (log likelihood, gradient)."""
        branch_lengths[:-1] = x
        first_gradient = inst.gradients()[0]
        branch_gradient = np.array(first_gradient.branch_lengths, dtype=np.float32)[:-1]
        # Every root child receives the summed gradient of all root children.
        pooled_root_gradient = np.sum(branch_gradient[root_child_index])
        branch_gradient[root_child_index] = pooled_root_gradient
        log_likelihood = np.array(first_gradient.log_likelihood, dtype=np.float32)
        return log_likelihood, branch_gradient

    # Apply the function over any leading batch dimensions of x.
    batched_loglik_and_gradient = np.vectorize(
        loglik_and_gradient,
        otypes=[np.float32, np.float32],
        signature='(n)->(),(n)',
    )

    @tf.custom_gradient
    def libsbn_tf_func(x):
        log_likelihood, branch_gradient = tf.numpy_function(
            batched_loglik_and_gradient, [x], [tf.float32, tf.float32])

        def grad(upstream):
            # Chain rule: scale each branch gradient by the incoming gradient.
            return tf.expand_dims(upstream, -1) * branch_gradient

        return log_likelihood, grad

    return libsbn_tf_func, inst
Пример #3
0
def test_elbo_innards():
    """Check the ELBO components against reference numbers from Mathieu.

    Reference output:
    tree.mars.distance mu: -1.728809 sigma: 0.459529 sample: 0.184472
    tree.saturn.distance mu: -2.410943 sigma: 0.748569 sample: 0.027993
    tree.jupiter.distance mu: -2.410977 sigma: 0.748571 sample: 0.045583
    like: -81.446550 prior: 4.327275 logQ: 5.330697
    elbo: -82.449972
    """
    # Reference values, one entry per branch (mars, saturn, jupiter).
    theta_sample = np.array([0.184472, 0.027993, 0.045583])
    mathieu_q_params = np.array([
        [-1.728809, 0.459529],
        [-2.410943, 0.748569],
        [-2.410977, 0.748571],
    ])

    burrito_settings = dict(
        mcmc_nexus_path="data/hello_out.t",
        burn_in_fraction=0,
        fasta_path="data/hello.fasta",
        phylo_model_specification=libsbn.PhyloModelSpecification(
            substitution="JC69", site="constant", clock="strict"),
        branch_model_name="split",
        scalar_model_name="lognormal",
        optimizer_name="simple",
        particle_count=1,
        thread_count=1,
    )
    burro = vip.burrito.Burrito(**burrito_settings)
    branch_model = burro.branch_model

    # Overwrite the sampled tree's branch lengths with the reference sample.
    px_branch_lengths = burro.sample_topologies(1)
    sampled_lengths = np.array(px_branch_lengths[0], copy=False)
    sampled_lengths[:] = theta_sample
    px_theta_sample = np.array([theta_sample])

    px_branch_representation = branch_model.px_branch_representation()
    branch_rep = px_branch_representation[0]
    # Mathieu's parameters are indexed by branch. If entry 0 of branch_rep is
    # 1, then row 1 of our parameters receives row 0 of Mathieu's, and so on.
    branch_model.scalar_model.q_params[branch_rep, :] = mathieu_q_params

    log_likelihood = np.array(burro.inst.log_likelihoods())[0]
    assert log_likelihood == approx(-81.446550)
    log_prior = burro.branch_model.log_prior(px_theta_sample)[0]
    assert log_prior == approx(4.327275)
    log_q = burro.branch_model.log_prob(px_theta_sample, px_branch_representation)
    assert log_q == approx(5.330697, rel=1e-5)
Пример #4
0
def fixed(
    data_path,
    *,
    branch_model_name,
    scalar_model_name,
    optimizer_name,
    step_count,
    particle_count,
    thread_count
):
    """Fit a Burrito on one data directory and compare it with the MCMC run.

    `data_path` is a directory whose base name `<id>` identifies the inputs
    `<id>_out.t` (MCMC trees in Nexus format) and `<id>.fasta` inside it.
    The first 10% of the MCMC trees are discarded as burn-in.

    Returns a tuple `(run_details, opt_trace, fitting_results)`:
      * run_details: dict with "gradient_time" (seconds spent in
        `gradient_steps`) and "final_elbo" (estimated with 10000 particles).
      * opt_trace: DataFrame with an "elbo" column from the optimizer trace.
      * fitting_results: long-format DataFrame of split lengths with columns
        "type" ("vb" or "mcmc"), "variable" (as str) and "value".
    """
    burn_in_fraction = 0.1
    final_elbo_particle_count = 10000

    data_dir = os.path.normpath(data_path)
    data_id = os.path.basename(data_dir)
    mcmc_nexus_path = os.path.join(data_dir, data_id + "_out.t")
    fasta_path = os.path.join(data_dir, data_id + ".fasta")
    phylo_model_specification = libsbn.PhyloModelSpecification(
        substitution="JC69", site="constant", clock="strict"
    )

    # Load the MCMC run, drop the burn-in, and collect the split lengths.
    mcmc_inst = libsbn.unrooted_instance("mcmc_inst")
    mcmc_inst.read_nexus_file(mcmc_nexus_path)
    trees_to_discard = int(burn_in_fraction * mcmc_inst.tree_count())
    mcmc_inst.tree_collection.erase(0, trees_to_discard)
    mcmc_inst.process_loaded_trees()
    lengths_by_split = [np.array(lengths) for lengths in mcmc_inst.split_lengths()]
    per_split_frames = []
    for split_index, lengths in enumerate(lengths_by_split):
        per_split_frames.append(
            pd.DataFrame({"variable": split_index, "value": lengths})
        )
    mcmc_split_lengths = pd.concat(per_split_frames, sort=False)
    last_sampled_split_lengths = np.array(
        [lengths[-1] for lengths in lengths_by_split]
    )

    burrito_settings = dict(
        mcmc_nexus_path=mcmc_nexus_path,
        burn_in_fraction=burn_in_fraction,
        fasta_path=fasta_path,
        phylo_model_specification=phylo_model_specification,
        branch_model_name=branch_model_name,
        scalar_model_name=scalar_model_name,
        optimizer_name=optimizer_name,
        particle_count=particle_count,
        thread_count=thread_count,
    )
    burro = vip.burrito.Burrito(**burrito_settings)
    # Start the fit from the last split lengths sampled by the MCMC run.
    burro.branch_model.mode_match(last_sampled_split_lengths)

    # Time only the gradient steps.
    tick = timeit.default_timer()
    burro.gradient_steps(step_count)
    gradient_time = timeit.default_timer() - tick
    opt_trace = pd.DataFrame({"elbo": burro.opt.trace}).reset_index()

    # Draw as many samples from the fit model as there are (post burn-in)
    # trees in the MCMC sample.
    sample_count = mcmc_inst.tree_count()
    fit_sample = pd.DataFrame(burro.branch_model.sample_all(sample_count))
    fit_sample["type"] = "vb"
    mcmc_split_lengths["type"] = "mcmc"
    long_fit_sample = fit_sample.melt(id_vars="type")
    fitting_results = pd.concat([long_fit_sample, mcmc_split_lengths], sort=False)
    fitting_results["variable"] = fitting_results["variable"].astype(str)

    final_elbo = burro.estimate_elbo(particle_count=final_elbo_particle_count)

    return (
        {"gradient_time": gradient_time, "final_elbo": final_elbo},
        opt_trace,
        fitting_results,
    )
Пример #5
0
"""Some basic testing and demo code for the libsbn module.

If you want to see the results of the print statements, use `pytest -s`.
"""

import json
import pprint
import pytest
import numpy as np
import libsbn
import libsbn.beagle_flags as beagle_flags

# Baseline phylogenetic model specification: JC69 substitution model,
# "constant" site model and "none" for the clock model.
SIMPLE_SPECIFICATION = libsbn.PhyloModelSpecification(substitution="JC69",
                                                      site="constant",
                                                      clock="none")


def convert_dict_to_int(dictionary):
    """Return a new dict with the same keys and each value passed through int().

    The input dict is left untouched; key order is preserved.
    """
    converted = {}
    for key, value in dictionary.items():
        converted[key] = int(value)
    return converted


def hello_demo():
    """Demonstrate basic phylogenetic likelihood calculation using the "hello"
    data set."""
    inst = libsbn.unrooted_instance("charlie")
    inst.tree_collection = libsbn.UnrootedTreeCollection(
        [libsbn.UnrootedTree.of_parent_id_vector([3, 3, 3])],
        ["mars", "saturn", "jupiter"],
    )
    inst.read_fasta_file("data/hello.fasta")