def ds1_phylo_model_demo(inst):
    """Compare the JC69 likelihood on DS1 with a GTR model set to uniform values.

    The DS1 alignment is read from "data/DS1.fasta" into ``inst``, every branch
    length of the first tree is set to 0.1, and the log likelihood is computed
    twice: once under ``SIMPLE_SPECIFICATION`` and once under a GTR
    specification whose rates are all 1.0 and whose frequencies are all 0.25.
    The two results are asserted to be approximately equal.

    Along the way the parameter block map and the full parameter matrix are
    printed, so run with ``pytest -s`` to see them.
    """
    inst.read_fasta_file("data/DS1.fasta")

    # Just use the first tree for likelihood comparison.
    inst.tree_collection.erase(1, 10)
    first_tree = inst.tree_collection.trees[0]
    # copy=False is meant to give a view onto the tree's own branch lengths,
    # so the assignment below is what the likelihood computation sees.
    lengths_view = np.array(first_tree.branch_lengths, copy=False)
    lengths_view[:] = 0.1

    # Reference value under the simple specification.
    inst.prepare_for_phylo_likelihood(SIMPLE_SPECIFICATION, 2)
    jc69_likelihood = np.array(inst.log_likelihoods())

    # Showing off phylo_model_param_block_map.
    inst.prepare_for_phylo_likelihood(
        libsbn.PhyloModelSpecification(
            substitution="GTR", site="constant", clock="none"
        ),
        2,
    )
    block_map = inst.get_phylo_model_param_block_map()
    for block_name, fill_value in (("GTR rates", 1.0), ("frequencies", 0.25)):
        block_map[block_name][:] = fill_value

    print("\nHere's a look at phylo_model_param_block_map:")
    pprint.pprint(block_map)
    print("\nWe can see that we are changing the phylo_model_params matrix:")
    print(inst.get_phylo_model_params(), "\n")

    gtr_likelihood = np.array(inst.log_likelihoods())
    assert jc69_likelihood == pytest.approx(gtr_likelihood)
def log_prob_conditioned_branch_only(newick_file, fasta_file, subst_model, frequencies, rescaling=False, **subst_model_params):
    """Build a TensorFlow function for the libsbn log likelihood of branch lengths.

    A rooted libsbn instance is created from the given tree and alignment
    files and configured with a constant site model, a strict clock whose
    rate is fixed to 1.0, and a substitution model chosen from ``subst_model``:

    * ``treeflow.substitution_model.JC``  -> libsbn "JC69", no parameters set.
    * ``treeflow.substitution_model.GTR`` -> libsbn "GTR" with
      ``subst_model_params['rates']`` and ``frequencies``.
    * ``treeflow.substitution_model.HKY`` -> libsbn "GTR" with all six rates
      equal to 1 except indices 1 and 4, which are set to
      ``subst_model_params['kappa']``, plus ``frequencies``.

    Args:
        newick_file: Path passed to ``inst.read_newick_file``.
        fasta_file: Path passed to ``inst.read_fasta_file``.
        subst_model: Substitution model object used only for type dispatch.
        frequencies: Values for the "frequencies" block (unused for JC).
        rescaling: Passed to ``inst.set_rescaling``.
        **subst_model_params: ``rates`` for GTR, ``kappa`` for HKY.

    Returns:
        A pair ``(libsbn_tf_func, inst)``: the ``tf.custom_gradient``-wrapped
        function mapping branch lengths to log likelihood, and the libsbn
        instance it closes over.

    Raises:
        ValueError: If ``subst_model`` is not one of the supported types.
    """
    models = treeflow.substitution_model
    if isinstance(subst_model, models.JC):
        libsbn_model_name = 'JC69'
        block_values = {}
    elif isinstance(subst_model, models.GTR):
        libsbn_model_name = 'GTR'
        block_values = {
            'GTR rates': np.array(subst_model_params['rates']),
            'frequencies': np.array(frequencies),
        }
    elif isinstance(subst_model, models.HKY):
        # HKY is expressed through libsbn's GTR parameterisation.
        # NOTE(review): indices 1 and 4 are presumably the transition rates
        # in libsbn's rate ordering -- confirm against libsbn.
        libsbn_model_name = 'GTR'
        kappa = subst_model_params['kappa']
        hky_rates = np.ones(6)
        for rate_index in (1, 4):
            hky_rates[rate_index] = kappa
        block_values = {
            'GTR rates': hky_rates,
            'frequencies': np.array(frequencies),
        }
    else:
        raise ValueError('Unsupported substitution model')

    inst = libsbn.rooted_instance('treeflow')
    inst.read_newick_file(newick_file)
    inst.read_fasta_file(fasta_file)
    inst.set_rescaling(rescaling)
    inst.prepare_for_phylo_likelihood(
        libsbn.PhyloModelSpecification(libsbn_model_name, 'constant', 'strict'),
        1,
    )

    param_blocks = inst.get_phylo_model_param_block_map()
    param_blocks["clock rate"][:] = 1.0
    for block_name in block_values:
        param_blocks[block_name][:] = block_values[block_name]

    tree = inst.tree_collection.trees[0]
    parent_ids = np.array(tree.parent_id_vector())
    # The root id is taken to be the length of the parent-id vector; the
    # root's children are the nodes whose parent equals that id.
    root_children = np.nonzero(parent_ids == parent_ids.shape[0])
    # copy=False: writes to this array are meant to reach the tree itself.
    branch_lengths = np.array(tree.branch_lengths, copy=False)

    def evaluate_single(x):
        # The last branch-length slot is left untouched and dropped from the
        # gradient as well.
        branch_lengths[:-1] = x
        result = inst.gradients()[0]
        branch_grad = np.array(result.branch_lengths, dtype=np.float32)[:-1]
        # Every root child receives the summed gradient of all root children.
        branch_grad[root_children] = branch_grad[root_children].sum()
        log_likelihood = np.array(result.log_likelihood, dtype=np.float32)
        return log_likelihood, branch_grad

    # Lets the evaluation accept leading batch dimensions on the input.
    evaluate_batched = np.vectorize(
        evaluate_single,
        otypes=[np.float32, np.float32],
        signature='(n)->(),(n)',
    )

    @tf.custom_gradient
    def libsbn_tf_func(x):
        logp, grad_val = tf.numpy_function(
            evaluate_batched, [x], [tf.float32, tf.float32]
        )

        def backprop(dlogp):
            # Scale the per-branch gradient by the upstream gradient.
            return tf.expand_dims(dlogp, -1) * grad_val

        return logp, backprop

    return libsbn_tf_func, inst
def test_elbo_innards():
    """Check likelihood, prior and log q against reference values from Mathieu.

    Reference values:

        branch                   mu          sigma      sample
        tree.mars.distance      -1.728809    0.459529   0.184472
        tree.saturn.distance    -2.410943    0.748569   0.027993
        tree.jupiter.distance   -2.410977    0.748571   0.045583

        like: -81.446550
        prior: 4.327275
        logQ: 5.330697
        elbo: -82.449972
    """
    reference_sample = np.array([0.184472, 0.027993, 0.045583])
    reference_q_params = np.array(
        [[-1.728809, 0.459529], [-2.410943, 0.748569], [-2.410977, 0.748571]]
    )

    burro = vip.burrito.Burrito(
        mcmc_nexus_path="data/hello_out.t",
        burn_in_fraction=0,
        fasta_path="data/hello.fasta",
        phylo_model_specification=libsbn.PhyloModelSpecification(
            substitution="JC69", site="constant", clock="strict"
        ),
        branch_model_name="split",
        scalar_model_name="lognormal",
        optimizer_name="simple",
        particle_count=1,
        thread_count=1,
    )
    model = burro.branch_model

    # Overwrite the sampled branch lengths of the single particle in place.
    sampled_lengths = burro.sample_topologies(1)
    particle_lengths = np.array(sampled_lengths[0], copy=False)
    particle_lengths[:] = reference_sample
    px_theta_sample = np.array([reference_sample])

    px_branch_representation = model.px_branch_representation()
    first_particle_rep = px_branch_representation[0]
    # So if the 0th entry of the representation is 1, then we are setting the
    # 1th entry of our parameters to the 0th entry of mathieu's, which is in
    # terms of branches.
    model.scalar_model.q_params[first_particle_rep, :] = reference_q_params

    log_likelihood = np.array(burro.inst.log_likelihoods())[0]
    assert log_likelihood == approx(-81.446550)

    log_prior = burro.branch_model.log_prior(px_theta_sample)[0]
    assert log_prior == approx(4.327275)

    log_q = burro.branch_model.log_prob(px_theta_sample, px_branch_representation)
    assert log_q == approx(5.330697, rel=1e-5)
def fixed(
    data_path,
    *,
    branch_model_name,
    scalar_model_name,
    optimizer_name,
    step_count,
    particle_count,
    thread_count
):
    """Fit a variational branch-length model and pair it with the MCMC sample.

    ``data_path`` is a directory named ``<id>`` that is expected to contain
    ``<id>_out.t`` (the MCMC nexus output) and ``<id>.fasta``.  The first 10%
    of the MCMC trees are discarded as burn-in, the model is initialised from
    the last sampled split lengths, and ``step_count`` gradient steps are run.

    Args:
        data_path: Directory holding the nexus and fasta files.
        branch_model_name: Forwarded to ``vip.burrito.Burrito``.
        scalar_model_name: Forwarded to ``vip.burrito.Burrito``.
        optimizer_name: Forwarded to ``vip.burrito.Burrito``.
        step_count: Number of gradient steps to run.
        particle_count: Forwarded to ``vip.burrito.Burrito``.
        thread_count: Forwarded to ``vip.burrito.Burrito``.

    Returns:
        A tuple ``(run_details, opt_trace, fitting_results)``:

        * ``run_details``: dict with "gradient_time" (seconds spent in the
          gradient steps) and "final_elbo" (estimated with 10000 particles).
        * ``opt_trace``: DataFrame of the optimizer trace in an "elbo" column,
          with the step index as a regular column.
        * ``fitting_results``: long-format DataFrame with "type" ("vb" or
          "mcmc"), "variable" (as a string) and "value" columns.
    """
    burn_in_fraction = 0.1
    particle_count_for_final_elbo_estimate = 10000

    data_path = os.path.normpath(data_path)
    data_id = os.path.basename(data_path)
    mcmc_nexus_path = os.path.join(data_path, data_id + "_out.t")
    fasta_path = os.path.join(data_path, data_id + ".fasta")

    phylo_model_specification = libsbn.PhyloModelSpecification(
        substitution="JC69", site="constant", clock="strict"
    )

    # Read MCMC run and get split lengths.
    mcmc_inst = libsbn.unrooted_instance("mcmc_inst")
    mcmc_inst.read_nexus_file(mcmc_nexus_path)
    burn_in_count = int(burn_in_fraction * mcmc_inst.tree_count())
    mcmc_inst.tree_collection.erase(0, burn_in_count)
    mcmc_inst.process_loaded_trees()

    # One array per split; the arrays may differ in length.
    per_split_lengths = [np.array(lengths) for lengths in mcmc_inst.split_lengths()]
    per_split_frames = [
        pd.DataFrame({"variable": split_idx, "value": lengths})
        for split_idx, lengths in enumerate(per_split_lengths)
    ]
    mcmc_split_lengths = pd.concat(per_split_frames, sort=False)
    last_sampled_split_lengths = np.array(
        [lengths[-1] for lengths in per_split_lengths]
    )

    burro = vip.burrito.Burrito(
        mcmc_nexus_path=mcmc_nexus_path,
        burn_in_fraction=burn_in_fraction,
        fasta_path=fasta_path,
        phylo_model_specification=phylo_model_specification,
        branch_model_name=branch_model_name,
        scalar_model_name=scalar_model_name,
        optimizer_name=optimizer_name,
        particle_count=particle_count,
        thread_count=thread_count,
    )
    burro.branch_model.mode_match(last_sampled_split_lengths)

    # Time only the optimization itself.
    tick = timeit.default_timer()
    burro.gradient_steps(step_count)
    gradient_time = timeit.default_timer() - tick
    opt_trace = pd.DataFrame({"elbo": burro.opt.trace}).reset_index()

    # We sample from our fit model as many times as there were trees in our
    # MCMC sample.
    vb_draws = burro.branch_model.sample_all(mcmc_inst.tree_count())
    vb_long = pd.DataFrame(vb_draws).assign(type="vb").melt(id_vars="type")
    mcmc_long = mcmc_split_lengths.assign(type="mcmc")
    fitting_results = pd.concat([vb_long, mcmc_long], sort=False)
    fitting_results["variable"] = fitting_results["variable"].astype(str)

    final_elbo = burro.estimate_elbo(
        particle_count=particle_count_for_final_elbo_estimate
    )
    run_details = {"gradient_time": gradient_time, "final_elbo": final_elbo}
    return run_details, opt_trace, fitting_results
"""Some basic testing and demo code for the libsbn module.

If you want to see the results of the print statements, use `pytest -s`.
"""

import json
import pprint
import pytest
import numpy as np
import libsbn
import libsbn.beagle_flags as beagle_flags

# Model specification shared by the demos below: JC69 substitution, constant
# site model, no clock.
SIMPLE_SPECIFICATION = libsbn.PhyloModelSpecification(substitution="JC69",
                                                      site="constant",
                                                      clock="none")


def convert_dict_to_int(dictionary):
    """Return a new dict with the same keys and every value passed through int().

    The input dictionary is not modified.
    """
    return {k: int(v) for k, v in dictionary.items()}


def hello_demo():
    """Demonstrate basic phylogenetic likelihood calculation using the "hello" data
    set."""
    inst = libsbn.unrooted_instance("charlie")
    # Build a one-tree collection by hand: the parent-id vector [3, 3, 3] gives
    # each of the three named taxa the same parent, node 3.
    inst.tree_collection = libsbn.UnrootedTreeCollection(
        [libsbn.UnrootedTree.of_parent_id_vector([3, 3, 3])],
        ["mars", "saturn", "jupiter"],
    )
    inst.read_fasta_file("data/hello.fasta")