def get_sample_results(sequence_length, ntaxa, nj_like, branch_length_sampler):
    """
    Sample a tree and a distance matrix, tallying a rejection on failure.
    When the distance matrix cannot be sampled, the matching rejection
    attribute is incremented through incr_attribute and the result
    of that call is returned.
    NOTE(review): in the code visible here the success path has
    no return statement, so it yields None; confirm whether
    the rest of the function lives outside this view.
    @param sequence_length: the length of each sequence in the sampled alignment
    @param ntaxa: the number of sequences in the sampled tree
    @param nj_like: True to create subsequent distance matrices using
    a generalized neighbor-joining-like approach
    (not read by the code visible here)
    @param branch_length_sampler: the length of each branch
    is independently sampled by this function
    @return: a numpy array conformant to the global header list
    """
    # Initialize the array that will be returned.
    # The builtin int is what the np.int alias referred to;
    # the alias itself has been removed from recent NumPy releases.
    attribute_array = np.zeros((len(g_headers),), dtype=int)
    # first sample a tree and get its set of informative splits
    tree = TreeSampler.sample_agglomerated_tree(ntaxa)
    true_splits = tree.get_nontrivial_splits()
    # sample the branch lengths
    for branch in tree.get_branches():
        branch.length = branch_length_sampler()
    # sample a distance matrix, tallying the reason if the sample is rejected
    try:
        D = sample_distance_matrix(tree, sequence_length)
    except InfiniteDistanceError:
        return incr_attribute(attribute_array, 'nsamples.rejected.inf')
    except ZeroDistanceError:
        return incr_attribute(attribute_array, 'nsamples.rejected.zero')
    except BuildTreeTopology.InvalidSpectralSplitException:
        return incr_attribute(attribute_array, 'nsamples.rejected.fail')
def get_sample_results(sequence_length, ntaxa, nj_like, branch_length_sampler):
    """
    Sample a tree and a distance matrix, tallying a rejection on failure.
    When the distance matrix cannot be sampled, the matching rejection
    attribute is incremented through incr_attribute and the result
    of that call is returned.
    NOTE(review): in the code visible here the success path has
    no return statement, so it yields None; confirm whether
    the rest of the function lives outside this view.
    @param sequence_length: the length of each sequence in the sampled alignment
    @param ntaxa: the number of sequences in the sampled tree
    @param nj_like: True to create subsequent distance matrices using
    a generalized neighbor-joining-like approach
    (not read by the code visible here)
    @param branch_length_sampler: the length of each branch
    is independently sampled by this function
    @return: a numpy array conformant to the global header list
    """
    # Initialize the array that will be returned.
    # The builtin int is what the np.int alias referred to;
    # the alias itself has been removed from recent NumPy releases.
    attribute_array = np.zeros((len(g_headers),), dtype=int)
    # first sample a tree and get its set of informative splits
    tree = TreeSampler.sample_agglomerated_tree(ntaxa)
    true_splits = tree.get_nontrivial_splits()
    # sample the branch lengths
    for branch in tree.get_branches():
        branch.length = branch_length_sampler()
    # sample a distance matrix, tallying the reason if the sample is rejected
    try:
        D = sample_distance_matrix(tree, sequence_length)
    except InfiniteDistanceError:
        return incr_attribute(attribute_array, 'nsamples.rejected.inf')
    except ZeroDistanceError:
        return incr_attribute(attribute_array, 'nsamples.rejected.zero')
    except BuildTreeTopology.InvalidSpectralSplitException:
        return incr_attribute(attribute_array, 'nsamples.rejected.fail')
def process(ntaxa, nseconds, branch_length_sampler):
    """
    Collect an Atteson bound and a spectral bound for each sampled tree.
    Sampling continues until the time limit passes or ctrl-c is pressed.
    The sampling functor returns a branch length and has a string cast.
    NOTE(review): the code visible here gathers the rows but has
    no return statement; the summary may be built outside this view.
    @param ntaxa: the number of taxa in the sampled trees
    @param nseconds: allow this many seconds to run or None to run forever
    @param branch_length_sampler: a sampling functor
    @return: a multi-line string that summarizes the results
    """
    began = time.time()
    data_rows = []
    try:
        # a falsy nseconds disables the time limit
        while not (nseconds and time.time() - began > nseconds):
            sampled = TreeSampler.sample_agglomerated_tree(ntaxa)
            # the atteson bound is half of the shortest sampled branch length
            for branch in sampled.get_branches():
                branch.length = branch_length_sampler()
            shortest = min(b.length for b in sampled.get_branches())
            atteson_bound = 0.5 * shortest
            # the spectral bound is the stability scaled by the matrix order
            D = np.array(sampled.get_distance_matrix())
            spectral_bound = get_stability(D) / len(D)
            # keep both bounds next to the tree that produced them
            data_rows.append(
                    [atteson_bound, spectral_bound, sampled.get_newick_string()])
    except KeyboardInterrupt:
        # ctrl-c just ends the sampling early
        pass
def gen_trees():
    """
    Yield an endless stream of trees from TreeSampler.sample_tree.
    Every tree is sampled with the same three fixed arguments.
    """
    # (n_base_leaves, n_expected_extra_leaves, expected_branch_length)
    sampler_args = (4, 1, 1)
    while True:
        yield TreeSampler.sample_tree(*sampler_args)
def gen_trees():
    """
    Yield an endless stream of trees from TreeSampler.sample_tree.
    Every tree is sampled with the same three fixed arguments.
    """
    # (n_base_leaves, n_expected_extra_leaves, expected_branch_length)
    sampler_args = (4, 1, 1)
    while True:
        yield TreeSampler.sample_tree(*sampler_args)
def get_response_content(fs):
    """
    Sample trees for five seconds, stopping at the first rejected true tree.
    For each sampled tree the signs of the harmonic valuations at the
    leaves are passed to SeekEigenLacing.rec_eigen along with the topology.
    Trees with a near-zero leaf valuation are counted and not checked.
    @param fs: provides nleaves and whatever get_flags reads
    @return: a multi-line report string
    """
    flags = get_flags(fs)
    nseconds = 5
    deadline = time.time() + nseconds
    rejected_s = None
    nerrors = 0
    nchecked = 0
    while time.time() < deadline and not rejected_s:
        nchecked += 1
        # Sample a tree and round-trip it through its Newick string.
        sampled = TreeSampler.sample_tree(fs.nleaves, 0, 1.0)
        true_s = NewickIO.get_newick_string(sampled)
        true_tree = Newick.parse(true_s, Newick.NewickTree)
        # Vertices are identified by the python ids of the node objects.
        internal_ids = set(id(v) for v in true_tree.gen_internal_nodes())
        leaf_ids = set(id(v) for v in true_tree.gen_tips())
        nleaves = len(leaf_ids)
        # One valuation map per index 1 .. nleaves-1.
        id_to_full_val_list = []
        for i in range(1, nleaves):
            id_to_full_val_list.append(
                    Harmonic.get_harmonic_valuations(true_tree, i))
        # A leaf valuation that is nearly zero has no reliable sign.
        try:
            for valuation in id_to_full_val_list:
                if any(abs(valuation[leaf]) < 1e-8 for leaf in leaf_ids):
                    raise CheckTreeError('the true tree is too symmetric')
        except CheckTreeError:
            nerrors += 1
            continue
        # Keep only the sign at each leaf; internal values are unknown.
        id_to_val_list = []
        for valuation in id_to_full_val_list:
            signs = dict(
                    (leaf, -1 if valuation[leaf] < 0 else 1)
                    for leaf in leaf_ids)
            signs.update((node, None) for node in internal_ids)
            id_to_val_list.append(signs)
        # Check the tree against its own sign patterns.
        id_to_adj = get_id_to_adj(true_tree)
        id_to_vals = SeekEigenLacing.rec_eigen(
                id_to_adj, id_to_val_list, flags)
        if not id_to_vals:
            rejected_s = true_s
    # make the report
    out = StringIO()
    if not rejected_s:
        print >> out, 'no true tree was rejected'
    else:
        print >> out, 'rejected a true tree:'
        print >> out, rejected_s
    print >> out
    print >> out, nchecked, 'trees were sampled total'
    print >> out, nerrors, 'trees were too symmetric'
    return out.getvalue()
def get_response_content(fs):
    """
    Sample trees for five seconds, stopping at the first rejected true tree.
    For each sampled tree the signs of the harmonic valuations at the
    leaves are passed to SeekEigenLacing.rec_eigen along with the topology.
    Trees with a near-zero leaf valuation are counted and not checked.
    @param fs: provides nleaves and whatever get_flags reads
    @return: a multi-line report string
    """
    flags = get_flags(fs)
    nseconds = 5
    deadline = time.time() + nseconds
    rejected_s = None
    nerrors = 0
    nchecked = 0
    while time.time() < deadline and not rejected_s:
        nchecked += 1
        # Sample a tree and round-trip it through its Newick string.
        sampled = TreeSampler.sample_tree(fs.nleaves, 0, 1.0)
        true_s = NewickIO.get_newick_string(sampled)
        true_tree = Newick.parse(true_s, Newick.NewickTree)
        # Vertices are identified by the python ids of the node objects.
        internal_ids = set(id(v) for v in true_tree.gen_internal_nodes())
        leaf_ids = set(id(v) for v in true_tree.gen_tips())
        nleaves = len(leaf_ids)
        # One valuation map per index 1 .. nleaves-1.
        id_to_full_val_list = []
        for i in range(1, nleaves):
            id_to_full_val_list.append(
                    Harmonic.get_harmonic_valuations(true_tree, i))
        # A leaf valuation that is nearly zero has no reliable sign.
        try:
            for valuation in id_to_full_val_list:
                if any(abs(valuation[leaf]) < 1e-8 for leaf in leaf_ids):
                    raise CheckTreeError('the true tree is too symmetric')
        except CheckTreeError:
            nerrors += 1
            continue
        # Keep only the sign at each leaf; internal values are unknown.
        id_to_val_list = []
        for valuation in id_to_full_val_list:
            signs = dict(
                    (leaf, -1 if valuation[leaf] < 0 else 1)
                    for leaf in leaf_ids)
            signs.update((node, None) for node in internal_ids)
            id_to_val_list.append(signs)
        # Check the tree against its own sign patterns.
        id_to_adj = get_id_to_adj(true_tree)
        id_to_vals = SeekEigenLacing.rec_eigen(
                id_to_adj, id_to_val_list, flags)
        if not id_to_vals:
            rejected_s = true_s
    # make the report
    out = StringIO()
    if not rejected_s:
        print >> out, 'no true tree was rejected'
    else:
        print >> out, 'rejected a true tree:'
        print >> out, rejected_s
    print >> out
    print >> out, nchecked, 'trees were sampled total'
    print >> out, nerrors, 'trees were too symmetric'
    return out.getvalue()
def _sign_partition(ordered_tip_names, loadings):
    """
    Split the tips according to the signs of their loadings.
    @param ordered_tip_names: tip names conformant to the loadings
    @param loadings: one number per tip
    @return: a frozenset holding the nonnegative side and the negative side
    """
    pairs = list(zip(ordered_tip_names, loadings))
    nonneg_leaf_set = frozenset(tip for tip, v in pairs if v >= 0)
    neg_leaf_set = frozenset(tip for tip, v in pairs if v < 0)
    return frozenset([nonneg_leaf_set, neg_leaf_set])


def _write_invalid_partition(fout, headline, tree, part):
    """
    Write one report entry about an invalid partition, then a blank line.
    @param fout: an open file object for writing
    @param headline: the first line of the entry
    @param tree: the tree that was split
    @param part: the invalid partition
    """
    fout.write(headline + '\n')
    fout.write('tree: %s\n' % NewickIO.get_newick_string(tree))
    fout.write('invalid partition: %s\n' % partition_to_string(part))
    fout.write('\n')


def main():
    """
    Search random trees for principal coordinate splits that are invalid.
    For each sampled tree the split is computed once from the distance
    matrix and once from the elementwise squared distance matrix.
    Invalid splits are written to 'counterexamples.out'.
    The search runs until ctrl-c is pressed, and then a summary is printed.
    """
    filename = 'counterexamples.out'
    # The counters are defined before the try block so that the summary
    # in the interrupt handler can always read them.
    count = 0
    ncounterexamples = 0
    nerrors = 0
    # The with statement closes the output file however the search ends.
    with open(filename, 'wt') as fout:
        print('Does monotonically transforming the pairwise leaf distances affect the compatibility')
        print('of the split found using principal coordinate analysis?')
        print('I am looking through random trees for a tree that is split incompatibly')
        print('when distances are squared.')
        print('Use control-c to stop the program when you get bored.')
        try:
            while True:
                count += 1
                # get a random tree
                n_base_leaves = 4
                n_expected_extra_leaves = 1
                expected_branch_length = 1
                tree = TreeSampler.sample_tree(
                        n_base_leaves, n_expected_extra_leaves,
                        expected_branch_length)
                # get the set of valid partitions implied by the tree
                valid_parts = TreeComparison.get_partitions(tree)
                ordered_tip_names = [tip.get_name() for tip in tree.gen_tips()]
                # assert that the partition implied by the correct formula is valid
                D = np.array(tree.get_distance_matrix(ordered_tip_names))
                part = _sign_partition(
                        ordered_tip_names, get_principal_coordinate(D))
                if part not in valid_parts:
                    nerrors += 1
                    _write_invalid_partition(
                            fout,
                            'error: a partition that was supposed to be valid was found to be invalid',
                            tree, part)
                # check the validity of the partition implied by the incorrect formula
                Q = D * D
                part = _sign_partition(
                        ordered_tip_names, get_principal_coordinate(Q))
                if part not in valid_parts:
                    ncounterexamples += 1
                    _write_invalid_partition(
                            fout, 'found a counterexample!', tree, part)
        except KeyboardInterrupt:
            # Each message is a single string so that the output is the
            # same whether print is a statement or a function.
            print('trees examined: %s' % count)
            print('errors: %s' % nerrors)
            print('counterexamples: %s' % ncounterexamples)
def process(ntaxa, nseconds, nsamples, branch_length_sampler, use_pbar):
    """
    Count how often a perturbed distance matrix keeps the original split.
    Each sample compares the split of a tree distance matrix D with
    the split of D + E and with the split of D + E*200,
    where E is a sampled perturbation matrix.
    NOTE(review): the code visible here raises TimeoutError without
    handling it and has no return statement; the rest of the
    function may live outside this view.
    @param ntaxa: the number of taxa per tree
    @param nseconds: stop after this many seconds
    @param nsamples: stop after this many samples
    @param branch_length_sampler: this function samples branch lengths independently
    @param use_pbar: True iff a progress bar should be used
    @return: a multi-line string of the contents of an R table
    """
    # tallies for the smaller perturbation (a) and the larger perturbation (b)
    a_successes = 0
    a_failures = 0
    b_successes = 0
    b_failures = 0
    # Repeatedly analyze samples.
    # We might have to stop early if we run out of time or if ctrl-c is pressed.
    # If we have to stop early, then show the results of the progress so far.
    termination_reason = 'no reason for termination was given'
    start_time = time.time()
    pbar = Progress.Bar(nsamples) if use_pbar else None
    try:
        for sample_index in range(nsamples):
            # check the time; a falsy nseconds disables the limit
            if nseconds and time.time() - start_time > nseconds:
                raise TimeoutError()
            # sample a tree
            tree = TreeSampler.sample_agglomerated_tree(ntaxa)
            for branch in tree.get_branches():
                branch.length = branch_length_sampler()
            D = np.array(tree.get_distance_matrix())
            # get the split defined by the tree
            original_split = get_split(D)
            # get the stability of the split
            stability = get_stability(D)
            # sample a perturbation matrix that should not change the split
            E = sample_perturbation_matrix(ntaxa, stability/2)
            # evaluate the split induced by the underperturbed distance matrix
            perturbed_split = get_split(D + E)
            if original_split == perturbed_split:
                a_successes += 1
            else:
                a_failures += 1
            # evaluate the split induced by the overperturbed distance matrix
            perturbed_split = get_split(D + E*200)
            if original_split == perturbed_split:
                b_successes += 1
            else:
                b_failures += 1
            # update the progress bar
            if pbar:
                pbar.update(sample_index + 1)
        # NOTE(review): this else is read as belonging to the for loop,
        # so it runs only when every requested sample was analyzed;
        # the collapsed source does not show the indentation.
        else:
            termination_reason = 'the requested number of samples was attained'
    except KeyboardInterrupt, e:
        termination_reason = 'keyboard interrupt'
def get_response_content(fs):
    """
    Compare the rejection power of two flag sets on random tree pairs.
    Pairs of trees are sampled for five seconds, or until a difference
    is found when fs.halt_on_difference is set.
    @param fs: provides nleaves, halt_on_difference,
    and whatever get_flags_a and get_flags_b read
    @return: a multi-line report string
    """
    flags_a = get_flags_a(fs)
    flags_b = get_flags_b(fs)
    data = CheckTreeData(flags_a, flags_b)
    nseconds = 5
    tm = time.time()
    while time.time() < tm + nseconds:
        # Sample a pair of Newick trees.
        true_f = TreeSampler.sample_tree(fs.nleaves, 0, 1.0)
        test_f = TreeSampler.sample_tree(fs.nleaves, 0, 1.0)
        true_s = NewickIO.get_newick_string(true_f)
        test_s = NewickIO.get_newick_string(test_f)
        true_tree = Newick.parse(true_s, Newick.NewickTree)
        test_tree = Newick.parse(test_s, Newick.NewickTree)
        # Add the pairwise check to the data borg.
        # A pair whose check raises an error reports no difference.
        # Without this reset the name would be unbound below when the
        # very first check raises an error.
        found_difference = False
        try:
            found_difference = check_tree_pair(true_tree, test_tree, data)
        except CheckTreeError as e:
            data.add_error(e)
        # Check to see if we should stop early.
        if found_difference and fs.halt_on_difference:
            break
    # make the report
    out = StringIO()
    if data.report:
        print >> out, 'found a difference in rejection power'
        print >> out
        print >> out, data.report
        print >> out
    else:
        print >> out, 'failed to find a difference in rejection power'
        print >> out
    print >> out, 'search summary:'
    m = data.acceptance_matrix
    print >> out, 'A reject, B reject:', m[0, 0]
    print >> out, 'A reject, B accept:', m[0, 1]
    print >> out, 'A accept, B reject:', m[1, 0]
    print >> out, 'A accept, B accept:', m[1, 1]
    print >> out, data.nerrors, 'tree symmetry errors'
    return out.getvalue()
def get_response_content(fs):
    """
    Sample some trees and write one Newick string per line.
    @param fs: provides ntrees, leafbase, leafmean and branchmean
    @return: the response text as a single string
    """
    out = StringIO()
    for _ in range(fs.ntrees):
        # sample a tree and write it straight away
        sampled = TreeSampler.sample_tree(
                fs.leafbase, fs.leafmean, fs.branchmean)
        newick_string = NewickIO.get_newick_string(sampled)
        print >> out, newick_string
    response_text = out.getvalue()
    return response_text
def _sign_partition(ordered_tip_names, loadings):
    """
    Split the tips according to the signs of their loadings.
    @param ordered_tip_names: tip names conformant to the loadings
    @param loadings: one number per tip
    @return: a frozenset holding the nonnegative side and the negative side
    """
    pairs = list(zip(ordered_tip_names, loadings))
    nonneg_leaf_set = frozenset(tip for tip, v in pairs if v >= 0)
    neg_leaf_set = frozenset(tip for tip, v in pairs if v < 0)
    return frozenset([nonneg_leaf_set, neg_leaf_set])


def _write_invalid_partition(fout, headline, tree, part):
    """
    Write one report entry about an invalid partition, then a blank line.
    @param fout: an open file object for writing
    @param headline: the first line of the entry
    @param tree: the tree that was split
    @param part: the invalid partition
    """
    fout.write(headline + '\n')
    fout.write('tree: %s\n' % NewickIO.get_newick_string(tree))
    fout.write('invalid partition: %s\n' % partition_to_string(part))
    fout.write('\n')


def main():
    """
    Search random trees for principal coordinate splits that are invalid.
    For each sampled tree the split is computed once from the distance
    matrix and once from the elementwise squared distance matrix.
    Invalid splits are written to 'counterexamples.out'.
    The search runs until ctrl-c is pressed, and then a summary is printed.
    """
    filename = 'counterexamples.out'
    # The counters are defined before the try block so that the summary
    # in the interrupt handler can always read them.
    count = 0
    ncounterexamples = 0
    nerrors = 0
    # The with statement closes the output file however the search ends.
    with open(filename, 'wt') as fout:
        print('Does monotonically transforming the pairwise leaf distances affect the compatibility')
        print('of the split found using principal coordinate analysis?')
        print('I am looking through random trees for a tree that is split incompatibly')
        print('when distances are squared.')
        print('Use control-c to stop the program when you get bored.')
        try:
            while True:
                count += 1
                # get a random tree
                n_base_leaves = 4
                n_expected_extra_leaves = 1
                expected_branch_length = 1
                tree = TreeSampler.sample_tree(
                        n_base_leaves, n_expected_extra_leaves,
                        expected_branch_length)
                # get the set of valid partitions implied by the tree
                valid_parts = TreeComparison.get_partitions(tree)
                ordered_tip_names = [tip.get_name() for tip in tree.gen_tips()]
                # assert that the partition implied by the correct formula is valid
                D = np.array(tree.get_distance_matrix(ordered_tip_names))
                part = _sign_partition(
                        ordered_tip_names, get_principal_coordinate(D))
                if part not in valid_parts:
                    nerrors += 1
                    _write_invalid_partition(
                            fout,
                            'error: a partition that was supposed to be valid was found to be invalid',
                            tree, part)
                # check the validity of the partition implied by the incorrect formula
                Q = D * D
                part = _sign_partition(
                        ordered_tip_names, get_principal_coordinate(Q))
                if part not in valid_parts:
                    ncounterexamples += 1
                    _write_invalid_partition(
                            fout, 'found a counterexample!', tree, part)
        except KeyboardInterrupt:
            # Each message is a single string so that the output is the
            # same whether print is a statement or a function.
            print('trees examined: %s' % count)
            print('errors: %s' % nerrors)
            print('counterexamples: %s' % ncounterexamples)
def process(ntaxa, length, nseconds, builders, branch_length_sampler):
    """
    Tally the outcomes of two tree builders on sampled distance matrices.
    Each accepted sample adds one to a 2x2 table indexed by the two
    builder results; Atteson and non-Atteson samples use separate tables.
    NOTE(review): the code visible here has no return statement;
    the summary may be built outside this view.
    @param ntaxa: the number of taxa in the sampled trees
    @param length: the length of sequences used to sample the distance matrix
    @param nseconds: allow this many seconds to run
    @param builders: tree builder objects (unpacked into exactly two results)
    @param branch_length_sampler: called once per branch for its length
    @return: a multi-line string that summarizes the results
    """
    began = time.time()
    # counts of the samples that were rejected, by reason
    n_zero_errors = 0
    n_infinite_errors = 0
    n_failed_spectral_splits = 0
    # one 2x2 table of result pairs for each Atteson status
    # (a third table for a quartet based category is disabled)
    non_atteson_results = [[0, 0], [0, 0]]
    atteson_results = [[0, 0], [0, 0]]
    try:
        # a falsy nseconds disables the time limit
        while not (nseconds and time.time() - began > nseconds):
            # sample the topology and then the branch lengths
            tree = TreeSampler.sample_agglomerated_tree(ntaxa)
            true_splits = tree.get_nontrivial_splits()
            for branch in tree.get_branches():
                branch.length = branch_length_sampler()
            try:
                D = sample_distance_matrix(tree, length)
                a, b = [
                        builder.evaluate(true_splits, D)
                        for builder in builders]
                if BuildTreeTopology.is_atteson(tree, D):
                    table = atteson_results
                else:
                    table = non_atteson_results
                table[a][b] += 1
            except InfiniteDistanceError:
                n_infinite_errors += 1
            except ZeroDistanceError:
                n_zero_errors += 1
            except BuildTreeTopology.InvalidSpectralSplitException:
                n_failed_spectral_splits += 1
    except KeyboardInterrupt:
        # ctrl-c just ends the sampling early
        pass
def process(ntaxa, nseconds, branch_length_sampler):
    """
    Look for an Atteson distance matrix whose eigensplit is not a true split.
    The loop stops at the time limit or at the first such counterexample.
    NOTE(review): this definition is cut short in the text visible here;
    the outer try has no handler and there is no return statement.
    @param ntaxa: the number of taxa in the sampled trees
    @param nseconds: allow this many seconds to run or None to run forever
    @param branch_length_sampler: a functor that returns a branch length and has a string cast
    @return: a multi-line string that summarizes the results
    """
    start_time = time.time()
    # initialize some state that will be tracked over the entire run
    degenerate_count = 0
    invalid_split_count = 0
    valid_split_count = 0
    spectral_error_count = 0
    atteson_error_count = 0
    counterexample_D = None
    counterexample_tree = None
    # do a bunch of reconstructions from sampled distance matrices
    try:
        while True:
            # a falsy nseconds disables the time limit
            elapsed_time = time.time() - start_time
            if nseconds and elapsed_time > nseconds:
                break
            # sample the tree topology and get its set of implied full label splits
            tree = TreeSampler.sample_agglomerated_tree(ntaxa)
            true_splits = tree.get_nontrivial_splits()
            # sample the branch lengths
            for branch in tree.get_branches():
                branch.length = branch_length_sampler()
            # sample the atteson distance matrix
            D = sample_atteson_distance_matrix(tree)
            # assert that the atteson condition is true
            if not BuildTreeTopology.is_atteson(tree, D):
                atteson_error_count += 1
            else:
                try:
                    # see if the eigensplit is in the set of true splits
                    eigensplit = BuildTreeTopology.split_using_eigenvector(D)
                    if eigensplit in true_splits:
                        valid_split_count += 1
                    else:
                        # remember the counterexample and stop searching
                        invalid_split_count += 1
                        counterexample_D = D
                        counterexample_tree = tree
                        break
                # NOTE(review): both handlers are read as belonging to
                # the inner try; the collapsed source does not show
                # the indentation.
                except BuildTreeTopology.DegenerateSplitException, e:
                    degenerate_count += 1
                except BuildTreeTopology.InvalidSpectralSplitException, e:
                    spectral_error_count += 1
def process(ntaxa, length, nseconds, builders, branch_length_sampler):
    """
    Tally the outcomes of two tree builders on sampled distance matrices.
    Each accepted sample adds one to a 2x2 table indexed by the two
    builder results; Atteson and non-Atteson samples use separate tables.
    NOTE(review): the code visible here has no return statement;
    the summary may be built outside this view.
    @param ntaxa: the number of taxa in the sampled trees
    @param length: the length of sequences used to sample the distance matrix
    @param nseconds: allow this many seconds to run
    @param builders: tree builder objects (unpacked into exactly two results)
    @param branch_length_sampler: called once per branch for its length
    @return: a multi-line string that summarizes the results
    """
    began = time.time()
    # counts of the samples that were rejected, by reason
    n_zero_errors = 0
    n_infinite_errors = 0
    n_failed_spectral_splits = 0
    # one 2x2 table of result pairs for each Atteson status
    # (a third table for a quartet based category is disabled)
    non_atteson_results = [[0, 0], [0, 0]]
    atteson_results = [[0, 0], [0, 0]]
    try:
        # a falsy nseconds disables the time limit
        while not (nseconds and time.time() - began > nseconds):
            # sample the topology and then the branch lengths
            tree = TreeSampler.sample_agglomerated_tree(ntaxa)
            true_splits = tree.get_nontrivial_splits()
            for branch in tree.get_branches():
                branch.length = branch_length_sampler()
            try:
                D = sample_distance_matrix(tree, length)
                a, b = [
                        builder.evaluate(true_splits, D)
                        for builder in builders]
                if BuildTreeTopology.is_atteson(tree, D):
                    table = atteson_results
                else:
                    table = non_atteson_results
                table[a][b] += 1
            except InfiniteDistanceError:
                n_infinite_errors += 1
            except ZeroDistanceError:
                n_zero_errors += 1
            except BuildTreeTopology.InvalidSpectralSplitException:
                n_failed_spectral_splits += 1
    except KeyboardInterrupt:
        # ctrl-c just ends the sampling early
        pass
def process(ntaxa):
    """
    Report the S matrix and its pseudoinverse for one random weighted tree.
    The report also shows the tree, the random mass vector
    and the distance matrix that S was computed from.
    @param ntaxa: use this many taxa per tree
    @return: a multi-line string that summarizes the results
    """
    np.set_printoptions(linewidth=200)
    # sample an xtree topology and rebuild it as a FelTree from its string
    xtree = TreeSampler.sample_agglomerated_tree(ntaxa)
    tree = NewickIO.parse(xtree.get_newick_string(), FelTree.NewickTree)
    # get ordered ids and the number of leaves and some auxiliary variables
    ordered_ids = get_ordered_ids(tree)
    nleaves = len(list(tree.gen_tips()))
    id_to_index = dict((myid, i) for i, myid in enumerate(ordered_ids))
    # give the tree random branch lengths before reading anything weighted
    sample_branch_lengths(tree)
    weighted_tree_string = NewickIO.get_newick_string(tree)
    # get the distance matrix for the ordered ids
    D = np.array(tree.get_partial_distance_matrix(ordered_ids))
    # random integer masses in 1..9, normalized to sum to one
    m = np.array(
            [random.randrange(1, 10) for i in range(len(D))], dtype=float)
    m /= sum(m)
    # the S matrix and its pseudoinverse
    S = edm_to_S(D, m)
    S_pinv = np.linalg.pinv(S)
    # make the response; each section is a label, a value and a blank line
    out = StringIO()
    print >> out, 'newick tree:', weighted_tree_string
    print >> out
    sections = (
            ('m:', m),
            ('D:', D),
            ('S:', S),
            ('pseudoinverse of S:', S_pinv),
            )
    for label, value in sections:
        print >> out, label
        print >> out, value
        print >> out
    return out.getvalue().strip()
def process(ntaxa, length, nseconds, branch_length_sampler,
        use_nj, use_modified_nj, use_all_spectral, use_one_spectral):
    """
    Feed sampled trees and distance matrices to a single Builder object.
    Samples whose distance matrix cannot be used are counted by reason.
    NOTE(review): the code visible here has no return statement;
    the summary may be built outside this view.
    @param ntaxa: the number of taxa in the sampled trees
    @param length: the length of sequences used to sample the distance matrix
    @param nseconds: allow this many seconds to run or None to run forever
    @param branch_length_sampler: a functor that returns a branch length
    and has a string cast
    @param use_nj: passed through to builder.evaluate
    @param use_modified_nj: passed through to builder.evaluate
    @param use_all_spectral: passed through to builder.evaluate
    @param use_one_spectral: passed through to builder.evaluate
    @return: a multi-line string that summarizes the results
    """
    began = time.time()
    # the builder accumulates the information about the splits
    builder = Builder()
    # counts of the samples that were rejected, by reason
    n_zero_errors = 0
    n_infinite_errors = 0
    n_failed_spectral_splits = 0
    try:
        # a falsy nseconds disables the time limit
        while not (nseconds and time.time() - began > nseconds):
            # sample the topology and then the branch lengths
            tree = TreeSampler.sample_agglomerated_tree(ntaxa)
            true_splits = tree.get_nontrivial_splits()
            for branch in tree.get_branches():
                branch.length = branch_length_sampler()
            try:
                D = sample_distance_matrix(tree, length)
                # is the distance matrix Atteson with respect to the tree?
                atteson = BuildTreeTopology.is_atteson(tree, D)
                builder.evaluate(
                        true_splits, D, atteson,
                        use_nj, use_modified_nj,
                        use_all_spectral, use_one_spectral)
            except InfiniteDistanceError:
                n_infinite_errors += 1
            except ZeroDistanceError:
                n_zero_errors += 1
            except BuildTreeTopology.InvalidSpectralSplitException:
                n_failed_spectral_splits += 1
    except KeyboardInterrupt:
        # ctrl-c just ends the sampling early
        pass
def process(ntaxa, nseconds, seqlen, nsamples, branch_length_sampler, use_pbar): """ @param ntaxa: the number of taxa per tree @param nseconds: stop after this many seconds @param seqlen: use this sequence length @param nsamples: stop after this many samples per sequence length @param branch_length_sampler: this function samples branch lengths independently @param use_pbar: True iff a progress bar should be used @return: a multi-line string of the contents of an R table """ # initialize the global rejection counts nrejected_zero = 0 nrejected_inf = 0 nrejected_fail = 0 naccepted = 0 # Initialize the accumulation matrix. # The rows specify the size of the smaller side of the initial split. # The columns specify the compatibility status of the split. nsmall_sizes = (ntaxa / 2) + 1 accum = np.zeros((nsmall_sizes, 2), dtype=np.int) # Repeatedly analyze samples. # We might have to stop early if we run out of time or if ctrl-c is pressed. # If we have to stop early, then show the results of the progress so far. termination_reason = 'no reason for termination was given' start_time = time.time() pbar = Progress.Bar(nsamples) if use_pbar else None try: for sample_index in range(nsamples): # keep trying to get an accepted sample while True: # check the time if nseconds and time.time() - start_time > nseconds: raise TimeoutError() # first sample a tree and get its set of informative splits tree = TreeSampler.sample_agglomerated_tree(ntaxa) true_splits = tree.get_nontrivial_splits() # sample the branch lengths for branch in tree.get_branches(): branch.length = branch_length_sampler() # Attempt to sample a distance matrix. # If the sample was rejected then note the reason and go back to the drawing board. try: D = sample_distance_matrix(tree, seqlen) except InfiniteDistanceError as e: nrejected_inf += 1 continue except ZeroDistanceError as e: nrejected_zero += 1 continue # Attempt to estimate the primary split of the tree from the distance matrix. 
# If there was a technical failure then note it and go back to the drawing board. # Otherwise note the compatibility and balance of the split. try: eigensplit = BuildTreeTopology.split_using_eigenvector(D) small_size = min(len(side) for side in eigensplit) if eigensplit in true_splits: compatibility = 1 else: compatibility = 0 except BuildTreeTopology.DegenerateSplitException, e: small_size = 0 compatibility = 1 except BuildTreeTopology.InvalidSpectralSplitException, e: nrejected_fail += 1 continue
def process(ntaxa):
    """
    Report several projections of the vertices of one random weighted tree.
    The points come from the full distance matrix of the tree and
    each intermediate matrix is printed into the returned report.
    @param ntaxa: the number of taxa passed to the tree sampler
    @return: a multi-line string, stripped of surrounding whitespace
    """
    np.set_printoptions(linewidth=200)
    out = StringIO()
    # sample an xtree topology
    xtree = TreeSampler.sample_agglomerated_tree(ntaxa)
    # sample an xtree with exponentially distributed branch lengths
    mu = 2.0
    for branch in xtree.get_branches():
        branch.length = random.expovariate(1/mu)
    # convert the xtree to a FelTree so we can use the internal vertices
    tree_string = xtree.get_newick_string()
    tree = NewickIO.parse(tree_string, FelTree.NewickTree)
    # get ordered ids and the number of leaves and some auxiliary variables
    # NOTE(review): the slices below treat the first nleaves rows as the
    # leaves; this presumably matches the order from get_ordered_ids.
    ordered_ids = get_ordered_ids(tree)
    nleaves = len(list(tree.gen_tips()))
    id_to_index = dict((myid, i) for i, myid in enumerate(ordered_ids))
    # get the distance matrix relating all of the points
    D_full = np.array(tree.get_full_distance_matrix(ordered_ids))
    # Now do the projection so that
    # the resulting points are in the subspace whose basis vectors are the axes of the leaf ellipsoid.
    # First get the points such that the n rows in X are points in n-1 dimensional space.
    X = Euclid.edm_to_points(D_full)
    print >> out, 'points with centroid at origin:'
    print >> out, X
    print >> out
    # Translate all of the points so that the origin is at the centroid of the leaves.
    X -= np.mean(X[:nleaves], 0)
    print >> out, 'points with centroid of leaves at origin:'
    print >> out, X
    print >> out
    # Extract the subset of points that define the leaves.
    L = X[:nleaves]
    # Find the orthogonal transformation of the leaves onto their MDS axes.
    # According to the python svd documentation, singular values are sorted most important to least important.
    U, s, Vt = np.linalg.svd(L)
    # Transform all of the points (including the internal vertices) according to this orthogonal transformation.
    # The axes are now the axes of the Steiner circumscribed ellipsoid of the leaf vertices.
    # I am using M.T[:k].T to get the first k columns of M.
    Z = np.dot(X, Vt.T)
    print >> out, 'orthogonally transformed points (call this Z):'
    print >> out, Z
    print >> out
    Y = Z.T[:(nleaves-1)].T
    print >> out, 'projection of the points onto the axes of the leaf ellipsoid,'
    print >> out, '(these are the first columns of Z; call this projected matrix Y):'
    print >> out, Y
    print >> out
    # Show the inner products.
    inner_products_of_columns = np.dot(Y.T, Y)
    print >> out, "pairwise inner products of the columns of Y (that is, Y'Y)"
    print >> out, inner_products_of_columns
    print >> out
    # Show other inner products.
    # NOTE(review): Y[:5] selects the first five rows of Y,
    # although the printed label speaks of the first few columns.
    inner_products_of_columns = np.dot(Y[:5].T, Y[:5])
    print >> out, "pairwise inner products of the first few columns of Y"
    print >> out, inner_products_of_columns
    print >> out
    # Extract the subset of points that define the points of articulation.
    # Note that the origin is the centroid of the leaves.
    R = X[nleaves:]
    Y_leaves = Y[:nleaves]
    # W maps the leaf points onto their projection through the pseudoinverse of L
    W = np.dot(np.linalg.pinv(L), Y_leaves)
    print >> out, 'leaf projection using pseudoinverse (first few rows of Y):'
    print >> out, np.dot(L, W)
    print >> out
    print >> out, 'projection of points of articulation using pseudoinverse (remaining rows of Y):'
    print >> out, np.dot(R, W)
    print >> out
    # Get all of the points in high dimensional space.
    X = Euclid.edm_to_points(D_full)
    # Get the MDS onto the lower dimensional space.
    X = X.T[:(nleaves-1)].T
    assert np.allclose(sum(X, 0), 0)
    print >> out, 'all points projected onto the first principal axes of the full ellipsoid:'
    print >> out, X
    print >> out
    # Look at only the leaves in this space.
    # NOTE(review): L is a basic slice of X, so the in-place centering
    # below also changes the leaf rows of X, and X is used again later.
    L = X[:nleaves]
    L -= np.mean(L, 0)
    print >> out, 'leaves projected onto the first principal axes of the full ellipsoid and then centered:'
    print >> out, L
    print >> out
    # Re-project the leaves onto the axes of leaf ellipsoid.
    D_leaves = Euclid.dccov_to_edm(np.dot(L, L.T))
    Y = Euclid.edm_to_points(D_leaves)
    print >> out, 'leaves further projected onto principal axes of their own ellipsoid:'
    print >> out, Y
    print >> out
    # Try something else
    D_all = Euclid.dccov_to_edm(np.dot(X, X.T))
    Y = Euclid.edm_to_points(D_all).T[:(nleaves-1)].T
    print >> out, 'all points further projected onto their own principal axes of inertia:'
    print >> out, Y
    print >> out
    # Try the same thing some more
    D_again = Euclid.dccov_to_edm(np.dot(Y, Y.T))
    Z = Euclid.edm_to_points(D_again).T[:(nleaves-1)].T
    print >> out, 'all points further projected onto their own principal axes of inertia (second iteration):'
    print >> out, Z
    print >> out
    return out.getvalue().strip()
def _tally_axis_cuts(projection, axes, neighbor_index_pairs, bad_axis_dict, axis_to_ncuts_dict):
    """
    Update the per-axis tallies for one projection of the tree vertices.
    An axis is tallied as bad when some point has a coordinate on that axis
    whose absolute value is below g_loading_epsilon.
    Otherwise the number of branches whose endpoints have coordinates
    of opposite sign on that axis is recorded as an observation.
    @param projection: a 2D array indexed as projection[point_index, axis]
    @param axes: the axis indices to analyze
    @param neighbor_index_pairs: pairs of point indices that are joined by a branch
    @param bad_axis_dict: maps an axis to its count of bad observations; updated in place
    @param axis_to_ncuts_dict: maps an axis to a dict that maps a number of cuts to its count; updated in place
    """
    npoints = projection.shape[0]
    for axis in axes:
        # a near-zero coordinate makes the sign ambiguous so the axis is not analyzed further
        if any(abs(projection[i, axis]) < g_loading_epsilon for i in range(npoints)):
            bad_axis_dict[axis] = bad_axis_dict.get(axis, 0) + 1
            continue
        # a branch is cut by the hyperplane when its endpoints have opposite signs
        ncuts = sum(
                1 for indexa, indexb in neighbor_index_pairs
                if projection[indexa, axis] * projection[indexb, axis] < 0)
        ncuts_dict = axis_to_ncuts_dict.setdefault(axis, {})
        ncuts_dict[ncuts] = ncuts_dict.get(ncuts, 0) + 1


def process(nseconds=None):
    """
    Sample random trees and tally how many branches each projection axis cuts.
    Each sampled tree has between 3 and 11 leaves and independent
    exponentially distributed branch lengths with mean 2.0.
    Tallies are kept separately for the low and high axes of the projection
    returned by do_internal_projection and for the axes of the projection
    returned by do_external_projection.
    Sampling stops at the time limit or when control-C is pressed.
    NOTE: the tallies are only accumulated in local variables;
    no summary is built here and the function returns None.
    @param nseconds: stop after this many seconds; None or 0 means run until interrupted
    """
    start_time = time.time()
    nsampled_trees = 0
    # track the number of observations of each number of cuts on each axis for each hyperellipse
    internal_important_axis_to_ncuts_dict = {}
    internal_unimportant_axis_to_ncuts_dict = {}
    external_axis_to_ncuts_dict = {}
    # track the number of bad axes of each principality for each hyperellipse
    internal_important_bad_axis_dict = {}
    internal_unimportant_bad_axis_dict = {}
    external_bad_axis_dict = {}
    try:
        while True:
            elapsed_time = time.time() - start_time
            if nseconds and elapsed_time > nseconds:
                break
            # pick a random number of taxa to use as leaves in the tree
            ntaxa = random.randrange(3, 12)
            # sample an xtree topology
            xtree = TreeSampler.sample_agglomerated_tree(ntaxa)
            # sample exponentially distributed branch lengths
            mu = 2.0
            for branch in xtree.get_branches():
                branch.length = random.expovariate(1 / mu)
            # convert the xtree to a FelTree so we can use the internal vertices
            tree_string = xtree.get_newick_string()
            tree = NewickIO.parse(tree_string, FelTree.NewickTree)
            # get ordered ids and the number of leaves and some auxiliary variables
            ordered_ids = get_ordered_ids(tree)
            nleaves = len(list(tree.gen_tips()))
            id_to_index = dict((myid, i) for i, myid in enumerate(ordered_ids))
            # compute the set of pairs of indices corresponding to branches
            neighbor_index_pairs = set(
                    frozenset((id_to_index[id(parent)], id_to_index[id(child)]))
                    for parent in tree.preorder()
                    for child in parent.gen_children())
            # get the distance matrix relating all of the points
            D_full = np.array(tree.get_full_distance_matrix(ordered_ids))
            # analyze the intersections of the axes of the ellipsoid that includes internal points
            internal_projection = do_internal_projection(D_full)
            _, naxes = internal_projection.shape
            # analyze low axes
            _tally_axis_cuts(
                    internal_projection, range(0, nleaves - 1), neighbor_index_pairs,
                    internal_important_bad_axis_dict,
                    internal_important_axis_to_ncuts_dict)
            # analyze high axes
            _tally_axis_cuts(
                    internal_projection, range(nleaves - 1, naxes), neighbor_index_pairs,
                    internal_unimportant_bad_axis_dict,
                    internal_unimportant_axis_to_ncuts_dict)
            # analyze the intersections of the axes of the ellipsoid that includes only leaf points
            external_projection = do_external_projection(D_full, nleaves)
            _, naxes = external_projection.shape
            _tally_axis_cuts(
                    external_projection, range(naxes), neighbor_index_pairs,
                    external_bad_axis_dict,
                    external_axis_to_ncuts_dict)
            # increment the count of sampled trees
            nsampled_trees += 1
    except KeyboardInterrupt:
        # control-C is the expected way to stop an unbounded run
        pass
def process(ntaxa, nseconds):
    """
    Search for two different topologies whose leaf embeddings share a sign pattern.
    Trees are sampled repeatedly.
    The topology of each tree is summarized by its leaf distance matrix
    computed after reset_branch_lengths has been applied,
    and the sign pattern is taken from the points that
    Euclid.edm_to_points gives for the leaf distance matrix computed
    after sample_branch_lengths has been applied.
    Samples with any coordinate smaller in absolute value than g_epsilon are rejected.
    Sampling stops at the time limit or when control-C is pressed.
    NOTE: the counters are only accumulated in local variables;
    no summary is built here and the function returns None.
    @param ntaxa: the number of taxa passed to the tree sampler
    @param nseconds: stop after this many seconds; None or 0 means run until interrupted
    @raise CounterexampleError: when a sign pattern recurs with a different topology
    """
    start_time = time.time()
    nsamples_rejected = 0
    nsamples_accepted = 0
    pattern_to_topo_surrogate = {}
    pattern_to_tree_string = {}
    counterexample_message = 'no counterexample was found'
    try:
        while True:
            elapsed_time = time.time() - start_time
            if nseconds and elapsed_time > nseconds:
                break
            # sample an xtree topology
            xtree = TreeSampler.sample_agglomerated_tree(ntaxa)
            # convert the xtree to a FelTree, although I guess this might not be necessary
            tree_string = xtree.get_newick_string()
            tree = NewickIO.parse(tree_string, FelTree.NewickTree)
            # get the ordered ids used to index the distance matrices
            ordered_ids = get_ordered_ids(tree)
            # force every branch length to be the unit length
            reset_branch_lengths(tree)
            # get the unweighted distance matrix among tips in convenient hashable form
            D_unit = np.array(tree.get_partial_distance_matrix(ordered_ids))
            topo_surrogate = tuple(tuple(row.tolist()) for row in D_unit)
            # sample random branch lengths
            sample_branch_lengths(tree)
            # get the weighted tree string
            weighted_tree_string = NewickIO.get_newick_string(tree)
            # get the distance matrix relating the leaves
            D = np.array(tree.get_partial_distance_matrix(ordered_ids))
            # get the projections onto the MDS axes of the leaves
            X = Euclid.edm_to_points(D)
            # if any coordinate is near zero then reject the sample
            if np.min(np.abs(X)) < g_epsilon:
                nsamples_rejected += 1
                continue
            # flip axes so that the first point lands in the positive orthant
            # (assumes point_to_orthant yields one sign per coordinate)
            canonizing_vector = np.array(point_to_orthant(X[0]))
            X *= canonizing_vector
            # get the canonical sign pattern
            sign_pattern = tuple(point_to_orthant(row) for row in X)
            # compare the topo surrogate of this sign pattern to the one in memory
            expected_topo_surrogate = pattern_to_topo_surrogate.get(sign_pattern)
            if expected_topo_surrogate is None:
                # first time this sign pattern has been seen, so remember it
                pattern_to_topo_surrogate[sign_pattern] = topo_surrogate
                pattern_to_tree_string[sign_pattern] = weighted_tree_string
            elif topo_surrogate != expected_topo_surrogate:
                remembered_tree_string = pattern_to_tree_string[sign_pattern]
                msg = 'these trees have the same sign pattern but different topologies: {%s, %s}' % (
                        weighted_tree_string, remembered_tree_string)
                raise CounterexampleError(msg)
            # increment the count of accepted samples
            nsamples_accepted += 1
    except KeyboardInterrupt:
        # control-C is the expected way to stop an unbounded run
        pass
def process(nseconds=None):
    """
    Search for a tree whose vertices in some principal orthant are disconnected.
    Each sampled tree has between 3 and 11 leaves and independent
    exponentially distributed branch lengths with mean 2.0.
    The vertices are split recursively by the sign of each axis of the
    projection returned by do_projection, and after each split every
    group of more than one vertex is checked with is_connected.
    The same analysis is run on the elementwise square root of the
    distance matrix as a control, for which failures are only counted.
    A histogram of the number of axes that cut each branch is also kept
    for the real and for the control distance matrix.
    Sampling stops at the time limit or when control-C is pressed.
    NOTE: the counters are only accumulated in local variables;
    no summary is built here and the function returns None.
    @param nseconds: stop after this many seconds; None or 0 means run until interrupted
    @raise CounterexampleError: when a disconnected group is found for the real distance matrix
    """
    start_time = time.time()
    nsampled_trees = 0
    counterexample_message = 'no counterexample was found'
    northants_passed = 0
    northants_failed = 0
    ncontrol_orthants_passed = 0
    ncontrol_orthants_failed = 0
    branch_cut_hist = {}
    control_branch_cut_hist = {}
    try:
        while True:
            elapsed_time = time.time() - start_time
            if nseconds and elapsed_time > nseconds:
                break
            # pick a random number of taxa to use as leaves in the tree
            ntaxa = random.randrange(3, 12)
            # sample an xtree topology
            xtree = TreeSampler.sample_agglomerated_tree(ntaxa)
            # sample exponentially distributed branch lengths
            mu = 2.0
            for branch in xtree.get_branches():
                branch.length = random.expovariate(1 / mu)
            # convert the xtree to a FelTree so we can use the internal vertices
            tree_string = xtree.get_newick_string()
            tree = NewickIO.parse(tree_string, FelTree.NewickTree)
            # get ordered ids and the number of leaves and some auxiliary variables
            ordered_ids = get_ordered_ids(tree)
            nleaves = len(list(tree.gen_tips()))
            id_to_index = dict((myid, i) for i, myid in enumerate(ordered_ids))
            # compute the set of pairs of indices corresponding to branches
            neighbor_index_pairs = set(
                    frozenset((id_to_index[id(parent)], id_to_index[id(child)]))
                    for parent in tree.preorder()
                    for child in parent.gen_children())
            # get the distance matrix relating all of the points
            D_full = np.array(tree.get_full_distance_matrix(ordered_ids))
            # the square rooted distance matrix is the control
            D_bad = np.sqrt(D_full)
            for D in (D_full, D_bad):
                is_control = D is not D_full
                # get the projections onto the MDS axes of the leaves
                projection = do_projection(D, nleaves)
                naxes = projection.shape[1]
                # recursively split the points by hyperplanes of principal axes
                next_id_set_list = [set(ordered_ids)]
                for axis in range(naxes):
                    id_set_list = next_id_set_list
                    # create the list of sets of points in principal orthants
                    next_id_set_list = []
                    for id_set in id_set_list:
                        neg_id_set = set(
                                myid for myid in id_set
                                if projection[id_to_index[myid], axis] < 0)
                        nonneg_id_set = set(
                                myid for myid in id_set
                                if projection[id_to_index[myid], axis] >= 0)
                        # groups with fewer than two points cannot be disconnected
                        for next_set in (neg_id_set, nonneg_id_set):
                            if len(next_set) > 1:
                                next_id_set_list.append(next_set)
                    # each set of points should be connected
                    for id_set in next_id_set_list:
                        bconnected = is_connected(tree, id_set)
                        if is_control:
                            if bconnected:
                                ncontrol_orthants_passed += 1
                            else:
                                ncontrol_orthants_failed += 1
                        elif bconnected:
                            northants_passed += 1
                        else:
                            northants_failed += 1
                            msg = 'found a counterexample in principal orthant %d of the tree %s' % (
                                    axis + 1, tree_string)
                            raise CounterexampleError(msg)
                # define the applicable histogram
                hist = control_branch_cut_hist if is_control else branch_cut_hist
                # check the number of cuts per branch
                for i, j in neighbor_index_pairs:
                    ncuts = sum(
                            1 for axis in range(naxes)
                            if projection[i, axis] * projection[j, axis] < 0)
                    hist[ncuts] = hist.get(ncuts, 0) + 1
            # increment the count of sampled trees
            nsampled_trees += 1
    except KeyboardInterrupt:
        # control-C is the expected way to stop an unbounded run
        pass
def _split_by_axis_sign(ordered_ids, projected_points, id_set, axis):
    """
    Get the bipartition of ids induced by the sign of one coordinate.
    @param ordered_ids: ids that are ordered conformantly with the rows of the projected points
    @param projected_points: a sequence of points, one per ordered id
    @param id_set: the set of all ids
    @param axis: the index of the coordinate whose sign defines the split
    @return: a frozenset of two frozensets; ids with a negative coordinate and all remaining ids
    """
    left_ids = set(
            myid for myid, point in zip(ordered_ids, projected_points)
            if point[axis] < 0)
    right_ids = id_set - left_ids
    return frozenset((frozenset(left_ids), frozenset(right_ids)))


def examine_mds_splits():
    """
    Examine properties of the hyperplane orthogonal to the MDS axis of a hyperellipse.
    The hyperellipse is the Steiner circumscribed hyperellipse that intersects
    points of the embedded leaves of a tree.
    Earlier results show that the hyperplane orthogonal to the principal axis of
    this hyperellipse should separate the leaves in a way that is compatible
    with the topology of the tree.
    Here we investigate the conjecture that this same hyperplane also splits
    internal vertices in a way that is compatible with the topology of the tree.
    Random trees are sampled until a counterexample is found,
    in which case the tree is printed and the search stops,
    or until control-C is pressed, in which case the counts are printed.
    Two controls are tallied for each tree; one uses the second axis
    and the other uses the elementwise square root of the distance matrix.
    """
    count = 0
    ncontrol_noneuclidean_counterexamples = 0
    ncontrol_secondary_counterexamples = 0
    # each print call takes a single string so the output is the same on python 2 and python 3
    print('Does the principal hyperplane of the leaves always intersect the tree at exactly one point?')
    print('Press control-C to stop looking for a counterexample...')
    # this is the mean of the exponentially distributed branch lengths
    mu = 2.0
    try:
        while True:
            # pick a random number of taxa to use as leaves in the tree
            ntaxa = random.randrange(3, 12)
            # sample an xtree with exponentially distributed branch lengths
            xtree = TreeSampler.sample_agglomerated_tree(ntaxa)
            for branch in xtree.get_branches():
                branch.length = random.expovariate(1 / mu)
            # convert the xtree to a FelTree so we can use the internal vertices
            tree_string = xtree.get_newick_string()
            tree = NewickIO.parse(tree_string, FelTree.NewickTree)
            # get the full id splits of the tree, including internal nodes
            id_set = set(id(node) for node in tree.preorder())
            d = TreeComparison._get_branch_id_to_node_id_set(tree)
            full_id_splits = set(
                    frozenset((frozenset(x), frozenset(id_set - x)))
                    for x in d.values())
            # get ordered ids and the number of leaves
            ordered_ids = get_ordered_ids(tree)
            nleaves = len(list(tree.gen_tips()))
            # get the projection
            D_full = np.array(tree.get_full_distance_matrix(ordered_ids))
            projected_points = do_projection(D_full, nleaves)
            # get the split implied by the principal hyperplane of the leaves
            split = _split_by_axis_sign(ordered_ids, projected_points, id_set, 0)
            # if the split is not compatible with the tree then we have found a counterexample
            if split not in full_id_splits:
                print('counterexample:')
                print(tree_string)
                break
            # now do a control where I look at the wrong eigenvector
            split = _split_by_axis_sign(ordered_ids, projected_points, id_set, 1)
            if split not in full_id_splits:
                ncontrol_secondary_counterexamples += 1
            # now do a control that should provide the occasional counterexample
            D_control = np.sqrt(D_full)
            projected_points = do_projection(D_control, nleaves)
            split = _split_by_axis_sign(ordered_ids, projected_points, id_set, 0)
            if split not in full_id_splits:
                ncontrol_noneuclidean_counterexamples += 1
            # increment the count
            count += 1
    except KeyboardInterrupt:
        print('Checked %d trees and found no counterexample.' % count)
        print('Found %d control counterexamples where I use the wrong eigenvector.' % ncontrol_secondary_counterexamples)
        print('Found %d control counterexamples where I use the wrong distance matrix.' % ncontrol_noneuclidean_counterexamples)
def _split_by_axis_sign(ordered_ids, projected_points, id_set, axis):
    """
    Get the bipartition of ids induced by the sign of one coordinate.
    @param ordered_ids: ids that are ordered conformantly with the rows of the projected points
    @param projected_points: a sequence of points, one per ordered id
    @param id_set: the set of all ids
    @param axis: the index of the coordinate whose sign defines the split
    @return: a frozenset of two frozensets; ids with a negative coordinate and all remaining ids
    """
    left_ids = set(
            myid for myid, point in zip(ordered_ids, projected_points)
            if point[axis] < 0)
    right_ids = id_set - left_ids
    return frozenset((frozenset(left_ids), frozenset(right_ids)))


def examine_mds_splits():
    """
    Examine properties of the hyperplane orthogonal to the MDS axis of a hyperellipse.
    The hyperellipse is the Steiner circumscribed hyperellipse that intersects
    points of the embedded leaves of a tree.
    Earlier results show that the hyperplane orthogonal to the principal axis of
    this hyperellipse should separate the leaves in a way that is compatible
    with the topology of the tree.
    Here we investigate the conjecture that this same hyperplane also splits
    internal vertices in a way that is compatible with the topology of the tree.
    Random trees are sampled until a counterexample is found,
    in which case the tree is printed and the search stops,
    or until control-C is pressed, in which case the counts are printed.
    Two controls are tallied for each tree; one uses the second axis
    and the other uses the elementwise square root of the distance matrix.
    """
    count = 0
    ncontrol_noneuclidean_counterexamples = 0
    ncontrol_secondary_counterexamples = 0
    # each print call takes a single string so the output is the same on python 2 and python 3
    print('Does the principal hyperplane of the leaves always intersect the tree at exactly one point?')
    print('Press control-C to stop looking for a counterexample...')
    # this is the mean of the exponentially distributed branch lengths
    mu = 2.0
    try:
        while True:
            # pick a random number of taxa to use as leaves in the tree
            ntaxa = random.randrange(3, 12)
            # sample an xtree with exponentially distributed branch lengths
            xtree = TreeSampler.sample_agglomerated_tree(ntaxa)
            for branch in xtree.get_branches():
                branch.length = random.expovariate(1 / mu)
            # convert the xtree to a FelTree so we can use the internal vertices
            tree_string = xtree.get_newick_string()
            tree = NewickIO.parse(tree_string, FelTree.NewickTree)
            # get the full id splits of the tree, including internal nodes
            id_set = set(id(node) for node in tree.preorder())
            d = TreeComparison._get_branch_id_to_node_id_set(tree)
            full_id_splits = set(
                    frozenset((frozenset(x), frozenset(id_set - x)))
                    for x in d.values())
            # get ordered ids and the number of leaves
            ordered_ids = get_ordered_ids(tree)
            nleaves = len(list(tree.gen_tips()))
            # get the projection
            D_full = np.array(tree.get_full_distance_matrix(ordered_ids))
            projected_points = do_projection(D_full, nleaves)
            # get the split implied by the principal hyperplane of the leaves
            split = _split_by_axis_sign(ordered_ids, projected_points, id_set, 0)
            # if the split is not compatible with the tree then we have found a counterexample
            if split not in full_id_splits:
                print('counterexample:')
                print(tree_string)
                break
            # now do a control where I look at the wrong eigenvector
            split = _split_by_axis_sign(ordered_ids, projected_points, id_set, 1)
            if split not in full_id_splits:
                ncontrol_secondary_counterexamples += 1
            # now do a control that should provide the occasional counterexample
            D_control = np.sqrt(D_full)
            projected_points = do_projection(D_control, nleaves)
            split = _split_by_axis_sign(ordered_ids, projected_points, id_set, 0)
            if split not in full_id_splits:
                ncontrol_noneuclidean_counterexamples += 1
            # increment the count
            count += 1
    except KeyboardInterrupt:
        print('Checked %d trees and found no counterexample.' % count)
        print('Found %d control counterexamples where I use the wrong eigenvector.' % ncontrol_secondary_counterexamples)
        print('Found %d control counterexamples where I use the wrong distance matrix.' % ncontrol_noneuclidean_counterexamples)
def process(nseconds=None):
    """
    Search for a tree that violates an orthant occupancy property of its embedding.
    Each sampled tree has between 3 and 11 leaves and independent
    exponentially distributed branch lengths with mean 2.0.
    For every number of leading axes of the projection returned by
    do_projection, three conditions are checked:
    no two branches block a common orthant,
    no vertex lies in an orthant blocked by a branch,
    and the vertices that share an orthant are connected in the tree.
    The same analysis is run on the elementwise square root of the
    distance matrix as a control, for which violations are only counted.
    Sampling stops at the time limit or when control-C is pressed.
    NOTE: the counters are only accumulated in local variables;
    no summary is built here and the function returns None.
    @param nseconds: stop after this many seconds; None or 0 means run until interrupted
    @raise CounterexampleError: when a condition is violated for the real distance matrix
    """
    start_time = time.time()
    nsampled_trees = 0
    counterexample_message = 'no counterexample was found'
    nvertex_connectivity_failures = 0
    nfragment_fragment_collisions = 0
    nfragment_vertex_collisions = 0
    ncontrol_vertex_connectivity_failures = 0
    ncontrol_fragment_fragment_collisions = 0
    ncontrol_fragment_vertex_collisions = 0
    try:
        while True:
            elapsed_time = time.time() - start_time
            if nseconds and elapsed_time > nseconds:
                break
            # pick a random number of taxa to use as leaves in the tree
            ntaxa = random.randrange(3, 12)
            # sample an xtree topology
            xtree = TreeSampler.sample_agglomerated_tree(ntaxa)
            # sample exponentially distributed branch lengths
            mu = 2.0
            for branch in xtree.get_branches():
                branch.length = random.expovariate(1 / mu)
            # convert the xtree to a FelTree so we can use the internal vertices
            tree_string = xtree.get_newick_string()
            tree = NewickIO.parse(tree_string, FelTree.NewickTree)
            # get ordered ids and the number of leaves and some auxiliary variables
            ordered_ids = get_ordered_ids(tree)
            nleaves = len(list(tree.gen_tips()))
            id_to_index = dict((myid, i) for i, myid in enumerate(ordered_ids))
            # compute the set of pairs of indices corresponding to branches
            neighbor_index_pairs = set(
                    frozenset((id_to_index[id(parent)], id_to_index[id(child)]))
                    for parent in tree.preorder()
                    for child in parent.gen_children())
            # get the distance matrix relating all of the points
            D_full = np.array(tree.get_full_distance_matrix(ordered_ids))
            # the square rooted distance matrix is the control
            D_bad = np.sqrt(D_full)
            for D in (D_full, D_bad):
                is_control = D is not D_full
                # get the projections onto the MDS axes of the leaves
                projection = do_projection(D, nleaves)
                npoints, naxes_total = projection.shape
                # look for a counterexample for each possible number of principal hyperplanes
                for naxes in range(1, naxes_total + 1):
                    # some orthants are occupied by a fragment of an edge
                    forbidden_orthants = set()
                    for indexa, indexb in neighbor_index_pairs:
                        # get the endpoints of the edge in the Euclidean subspace
                        pta = projection[indexa][:naxes]
                        ptb = projection[indexb][:naxes]
                        # look at the orthants blocked by the fragments of this edge
                        orthants = get_blocked_orthants(pta, ptb)
                        if orthants & forbidden_orthants:
                            if is_control:
                                ncontrol_fragment_fragment_collisions += 1
                            else:
                                nfragment_fragment_collisions += 1
                                msg = 'two edge fragments occupy the same orthant in %d dimensions in the tree %s' % (
                                        naxes, tree_string)
                                raise CounterexampleError(msg)
                        forbidden_orthants.update(orthants)
                    # no vertex should share an orthant with an edge fragment
                    for i in range(npoints):
                        orthant = point_to_orthant(projection[i][:naxes])
                        if orthant in forbidden_orthants:
                            if is_control:
                                ncontrol_fragment_vertex_collisions += 1
                            else:
                                nfragment_vertex_collisions += 1
                                msg = 'a vertex occupies the same orthant as an edge fragment in %d dimensions in the tree %s' % (
                                        naxes, tree_string)
                                raise CounterexampleError(msg)
                    # now partition the vertices by orthant and check their connectivity
                    orthant_to_id_set = {}
                    for i in range(npoints):
                        orthant = point_to_orthant(projection[i][:naxes])
                        orthant_to_id_set.setdefault(orthant, set()).add(ordered_ids[i])
                    for id_set in orthant_to_id_set.values():
                        if not is_connected(tree, id_set):
                            if is_control:
                                ncontrol_vertex_connectivity_failures += 1
                            else:
                                nvertex_connectivity_failures += 1
                                msg = 'found disconnected vertices in an orthant in %d dimensions in the tree %s' % (
                                        naxes, tree_string)
                                raise CounterexampleError(msg)
            # increment the count of sampled trees
            nsampled_trees += 1
    except KeyboardInterrupt:
        # control-C is the expected way to stop an unbounded run
        pass
def process(nseconds=None):
    """
    Search for a tree that violates an orthant occupancy property of its embedding.
    Each sampled tree has between 3 and 11 leaves and independent
    exponentially distributed branch lengths with mean 2.0.
    For every number of leading axes of the projection returned by
    do_projection, three conditions are checked:
    no two branches block a common orthant,
    no vertex lies in an orthant blocked by a branch,
    and the vertices that share an orthant are connected in the tree.
    The same analysis is run on the elementwise square root of the
    distance matrix as a control, for which violations are only counted.
    Sampling stops at the time limit or when control-C is pressed.
    NOTE: the counters are only accumulated in local variables;
    no summary is built here and the function returns None.
    @param nseconds: stop after this many seconds; None or 0 means run until interrupted
    @raise CounterexampleError: when a condition is violated for the real distance matrix
    """
    start_time = time.time()
    nsampled_trees = 0
    counterexample_message = 'no counterexample was found'
    nvertex_connectivity_failures = 0
    nfragment_fragment_collisions = 0
    nfragment_vertex_collisions = 0
    ncontrol_vertex_connectivity_failures = 0
    ncontrol_fragment_fragment_collisions = 0
    ncontrol_fragment_vertex_collisions = 0
    try:
        while True:
            elapsed_time = time.time() - start_time
            if nseconds and elapsed_time > nseconds:
                break
            # pick a random number of taxa to use as leaves in the tree
            ntaxa = random.randrange(3, 12)
            # sample an xtree topology
            xtree = TreeSampler.sample_agglomerated_tree(ntaxa)
            # sample exponentially distributed branch lengths
            mu = 2.0
            for branch in xtree.get_branches():
                branch.length = random.expovariate(1 / mu)
            # convert the xtree to a FelTree so we can use the internal vertices
            tree_string = xtree.get_newick_string()
            tree = NewickIO.parse(tree_string, FelTree.NewickTree)
            # get ordered ids and the number of leaves and some auxiliary variables
            ordered_ids = get_ordered_ids(tree)
            nleaves = len(list(tree.gen_tips()))
            id_to_index = dict((myid, i) for i, myid in enumerate(ordered_ids))
            # compute the set of pairs of indices corresponding to branches
            neighbor_index_pairs = set(
                    frozenset((id_to_index[id(parent)], id_to_index[id(child)]))
                    for parent in tree.preorder()
                    for child in parent.gen_children())
            # get the distance matrix relating all of the points
            D_full = np.array(tree.get_full_distance_matrix(ordered_ids))
            # the square rooted distance matrix is the control
            D_bad = np.sqrt(D_full)
            for D in (D_full, D_bad):
                is_control = D is not D_full
                # get the projections onto the MDS axes of the leaves
                projection = do_projection(D, nleaves)
                npoints, naxes_total = projection.shape
                # look for a counterexample for each possible number of principal hyperplanes
                for naxes in range(1, naxes_total + 1):
                    # some orthants are occupied by a fragment of an edge
                    forbidden_orthants = set()
                    for indexa, indexb in neighbor_index_pairs:
                        # get the endpoints of the edge in the Euclidean subspace
                        pta = projection[indexa][:naxes]
                        ptb = projection[indexb][:naxes]
                        # look at the orthants blocked by the fragments of this edge
                        orthants = get_blocked_orthants(pta, ptb)
                        if orthants & forbidden_orthants:
                            if is_control:
                                ncontrol_fragment_fragment_collisions += 1
                            else:
                                nfragment_fragment_collisions += 1
                                msg = 'two edge fragments occupy the same orthant in %d dimensions in the tree %s' % (
                                        naxes, tree_string)
                                raise CounterexampleError(msg)
                        forbidden_orthants.update(orthants)
                    # no vertex should share an orthant with an edge fragment
                    for i in range(npoints):
                        orthant = point_to_orthant(projection[i][:naxes])
                        if orthant in forbidden_orthants:
                            if is_control:
                                ncontrol_fragment_vertex_collisions += 1
                            else:
                                nfragment_vertex_collisions += 1
                                msg = 'a vertex occupies the same orthant as an edge fragment in %d dimensions in the tree %s' % (
                                        naxes, tree_string)
                                raise CounterexampleError(msg)
                    # now partition the vertices by orthant and check their connectivity
                    orthant_to_id_set = {}
                    for i in range(npoints):
                        orthant = point_to_orthant(projection[i][:naxes])
                        orthant_to_id_set.setdefault(orthant, set()).add(ordered_ids[i])
                    for id_set in orthant_to_id_set.values():
                        if not is_connected(tree, id_set):
                            if is_control:
                                ncontrol_vertex_connectivity_failures += 1
                            else:
                                nvertex_connectivity_failures += 1
                                msg = 'found disconnected vertices in an orthant in %d dimensions in the tree %s' % (
                                        naxes, tree_string)
                                raise CounterexampleError(msg)
            # increment the count of sampled trees
            nsampled_trees += 1
    except KeyboardInterrupt:
        # control-C is the expected way to stop an unbounded run
        pass
def _tally_axis_cuts(projection, axes, neighbor_index_pairs, bad_axis_dict, axis_to_ncuts_dict):
    """
    Update the per-axis tallies for one projection of the tree vertices.
    An axis is tallied as bad when some point has a coordinate on that axis
    whose absolute value is below g_loading_epsilon.
    Otherwise the number of branches whose endpoints have coordinates
    of opposite sign on that axis is recorded as an observation.
    @param projection: a 2D array indexed as projection[point_index, axis]
    @param axes: the axis indices to analyze
    @param neighbor_index_pairs: pairs of point indices that are joined by a branch
    @param bad_axis_dict: maps an axis to its count of bad observations; updated in place
    @param axis_to_ncuts_dict: maps an axis to a dict that maps a number of cuts to its count; updated in place
    """
    npoints = projection.shape[0]
    for axis in axes:
        # a near-zero coordinate makes the sign ambiguous so the axis is not analyzed further
        if any(abs(projection[i, axis]) < g_loading_epsilon for i in range(npoints)):
            bad_axis_dict[axis] = bad_axis_dict.get(axis, 0) + 1
            continue
        # a branch is cut by the hyperplane when its endpoints have opposite signs
        ncuts = sum(
                1 for indexa, indexb in neighbor_index_pairs
                if projection[indexa, axis] * projection[indexb, axis] < 0)
        ncuts_dict = axis_to_ncuts_dict.setdefault(axis, {})
        ncuts_dict[ncuts] = ncuts_dict.get(ncuts, 0) + 1


def process(nseconds=None):
    """
    Sample random trees and tally how many branches each projection axis cuts.
    Each sampled tree has between 3 and 11 leaves and independent
    exponentially distributed branch lengths with mean 2.0.
    Tallies are kept separately for the low and high axes of the projection
    returned by do_internal_projection and for the axes of the projection
    returned by do_external_projection.
    Sampling stops at the time limit or when control-C is pressed.
    NOTE: the tallies are only accumulated in local variables;
    no summary is built here and the function returns None.
    @param nseconds: stop after this many seconds; None or 0 means run until interrupted
    """
    start_time = time.time()
    nsampled_trees = 0
    # track the number of observations of each number of cuts on each axis for each hyperellipse
    internal_important_axis_to_ncuts_dict = {}
    internal_unimportant_axis_to_ncuts_dict = {}
    external_axis_to_ncuts_dict = {}
    # track the number of bad axes of each principality for each hyperellipse
    internal_important_bad_axis_dict = {}
    internal_unimportant_bad_axis_dict = {}
    external_bad_axis_dict = {}
    try:
        while True:
            elapsed_time = time.time() - start_time
            if nseconds and elapsed_time > nseconds:
                break
            # pick a random number of taxa to use as leaves in the tree
            ntaxa = random.randrange(3, 12)
            # sample an xtree topology
            xtree = TreeSampler.sample_agglomerated_tree(ntaxa)
            # sample exponentially distributed branch lengths
            mu = 2.0
            for branch in xtree.get_branches():
                branch.length = random.expovariate(1 / mu)
            # convert the xtree to a FelTree so we can use the internal vertices
            tree_string = xtree.get_newick_string()
            tree = NewickIO.parse(tree_string, FelTree.NewickTree)
            # get ordered ids and the number of leaves and some auxiliary variables
            ordered_ids = get_ordered_ids(tree)
            nleaves = len(list(tree.gen_tips()))
            id_to_index = dict((myid, i) for i, myid in enumerate(ordered_ids))
            # compute the set of pairs of indices corresponding to branches
            neighbor_index_pairs = set(
                    frozenset((id_to_index[id(parent)], id_to_index[id(child)]))
                    for parent in tree.preorder()
                    for child in parent.gen_children())
            # get the distance matrix relating all of the points
            D_full = np.array(tree.get_full_distance_matrix(ordered_ids))
            # analyze the intersections of the axes of the ellipsoid that includes internal points
            internal_projection = do_internal_projection(D_full)
            _, naxes = internal_projection.shape
            # analyze low axes
            _tally_axis_cuts(
                    internal_projection, range(0, nleaves - 1), neighbor_index_pairs,
                    internal_important_bad_axis_dict,
                    internal_important_axis_to_ncuts_dict)
            # analyze high axes
            _tally_axis_cuts(
                    internal_projection, range(nleaves - 1, naxes), neighbor_index_pairs,
                    internal_unimportant_bad_axis_dict,
                    internal_unimportant_axis_to_ncuts_dict)
            # analyze the intersections of the axes of the ellipsoid that includes only leaf points
            external_projection = do_external_projection(D_full, nleaves)
            _, naxes = external_projection.shape
            _tally_axis_cuts(
                    external_projection, range(naxes), neighbor_index_pairs,
                    external_bad_axis_dict,
                    external_axis_to_ncuts_dict)
            # increment the count of sampled trees
            nsampled_trees += 1
    except KeyboardInterrupt:
        # control-C is the expected way to stop an unbounded run
        pass
def process(ntaxa, nseconds):
    """
    Search for two different topologies whose leaf embeddings share a sign pattern.
    Trees are sampled repeatedly.
    The topology of each tree is summarized by its leaf distance matrix
    computed after reset_branch_lengths has been applied,
    and the sign pattern is taken from the points that
    Euclid.edm_to_points gives for the leaf distance matrix computed
    after sample_branch_lengths has been applied.
    Samples with any coordinate smaller in absolute value than g_epsilon are rejected.
    Sampling stops at the time limit or when control-C is pressed.
    NOTE: the counters are only accumulated in local variables;
    no summary is built here and the function returns None.
    @param ntaxa: the number of taxa passed to the tree sampler
    @param nseconds: stop after this many seconds; None or 0 means run until interrupted
    @raise CounterexampleError: when a sign pattern recurs with a different topology
    """
    start_time = time.time()
    nsamples_rejected = 0
    nsamples_accepted = 0
    pattern_to_topo_surrogate = {}
    pattern_to_tree_string = {}
    counterexample_message = 'no counterexample was found'
    try:
        while True:
            elapsed_time = time.time() - start_time
            if nseconds and elapsed_time > nseconds:
                break
            # sample an xtree topology
            xtree = TreeSampler.sample_agglomerated_tree(ntaxa)
            # convert the xtree to a FelTree, although I guess this might not be necessary
            tree_string = xtree.get_newick_string()
            tree = NewickIO.parse(tree_string, FelTree.NewickTree)
            # get the ordered ids used to index the distance matrices
            ordered_ids = get_ordered_ids(tree)
            # force every branch length to be the unit length
            reset_branch_lengths(tree)
            # get the unweighted distance matrix among tips in convenient hashable form
            D_unit = np.array(tree.get_partial_distance_matrix(ordered_ids))
            topo_surrogate = tuple(tuple(row.tolist()) for row in D_unit)
            # sample random branch lengths
            sample_branch_lengths(tree)
            # get the weighted tree string
            weighted_tree_string = NewickIO.get_newick_string(tree)
            # get the distance matrix relating the leaves
            D = np.array(tree.get_partial_distance_matrix(ordered_ids))
            # get the projections onto the MDS axes of the leaves
            X = Euclid.edm_to_points(D)
            # if any coordinate is near zero then reject the sample
            if np.min(np.abs(X)) < g_epsilon:
                nsamples_rejected += 1
                continue
            # flip axes so that the first point lands in the positive orthant
            # (assumes point_to_orthant yields one sign per coordinate)
            canonizing_vector = np.array(point_to_orthant(X[0]))
            X *= canonizing_vector
            # get the canonical sign pattern
            sign_pattern = tuple(point_to_orthant(row) for row in X)
            # compare the topo surrogate of this sign pattern to the one in memory
            expected_topo_surrogate = pattern_to_topo_surrogate.get(sign_pattern)
            if expected_topo_surrogate is None:
                # first time this sign pattern has been seen, so remember it
                pattern_to_topo_surrogate[sign_pattern] = topo_surrogate
                pattern_to_tree_string[sign_pattern] = weighted_tree_string
            elif topo_surrogate != expected_topo_surrogate:
                remembered_tree_string = pattern_to_tree_string[sign_pattern]
                msg = 'these trees have the same sign pattern but different topologies: {%s, %s}' % (
                        weighted_tree_string, remembered_tree_string)
                raise CounterexampleError(msg)
            # increment the count of accepted samples
            nsamples_accepted += 1
    except KeyboardInterrupt:
        # control-C is the expected way to stop an unbounded run
        pass
def process(ntaxa):
    """
    Sample one random tree and report several projections of its vertices.
    The tree gets exponentially distributed branch lengths, and the
    intermediate point matrices of each projection are written to the report.
    Note that this changes the global numpy print options.
    @param ntaxa: the number of taxa passed to the tree sampler
    @return: a multi-line string showing the intermediate matrices
    """
    np.set_printoptions(linewidth=200)
    out = StringIO()
    # sample an xtree topology
    xtree = TreeSampler.sample_agglomerated_tree(ntaxa)
    # sample an xtree with exponentially distributed branch lengths
    mu = 2.0
    for branch in xtree.get_branches():
        branch.length = random.expovariate(1 / mu)
    # convert the xtree to a FelTree so we can use the internal vertices
    tree_string = xtree.get_newick_string()
    tree = NewickIO.parse(tree_string, FelTree.NewickTree)
    # get ordered ids and the number of leaves
    ordered_ids = get_ordered_ids(tree)
    nleaves = len(list(tree.gen_tips()))
    # get the distance matrix relating all of the points
    D_full = np.array(tree.get_full_distance_matrix(ordered_ids))
    # Now do the projection so that
    # the resulting points are in the subspace whose basis vectors are the axes of the leaf ellipsoid.
    # First get the points such that the n rows in X are points in n-1 dimensional space.
    X = Euclid.edm_to_points(D_full)
    print >> out, 'points with centroid at origin:'
    print >> out, X
    print >> out
    # Translate all of the points so that the origin is at the centroid of the leaves.
    X -= np.mean(X[:nleaves], 0)
    print >> out, 'points with centroid of leaves at origin:'
    print >> out, X
    print >> out
    # Extract the subset of points that define the leaves.
    L = X[:nleaves]
    # Find the orthogonal transformation of the leaves onto their MDS axes.
    # According to the python svd documentation, singular values are sorted most important to least important.
    U, s, Vt = np.linalg.svd(L)
    # Transform all of the points (including the internal vertices) according to this orthogonal transformation.
    # The axes are now the axes of the Steiner circumscribed ellipsoid of the leaf vertices.
    Z = np.dot(X, Vt.T)
    print >> out, 'orthogonally transformed points (call this Z):'
    print >> out, Z
    print >> out
    # I am using M.T[:k].T to get the first k columns of M.
    Y = Z.T[:(nleaves - 1)].T
    print >> out, 'projection of the points onto the axes of the leaf ellipsoid,'
    print >> out, '(these are the first columns of Z; call this projected matrix Y):'
    print >> out, Y
    print >> out
    # Show the inner products.
    inner_products_of_columns = np.dot(Y.T, Y)
    print >> out, "pairwise inner products of the columns of Y (that is, Y'Y)"
    print >> out, inner_products_of_columns
    print >> out
    # Show other inner products.
    # NOTE(review): the 5 is hard-coded and Y[:5] selects the first five ROWS
    # of Y even though the label below says columns; presumably this was meant
    # to restrict to the leaf rows for a five-taxon tree -- confirm.
    inner_products_of_columns = np.dot(Y[:5].T, Y[:5])
    print >> out, "pairwise inner products of the first few columns of Y"
    print >> out, inner_products_of_columns
    print >> out
    # Extract the subset of points that define the points of articulation.
    # Note that the origin is the centroid of the leaves.
    R = X[nleaves:]
    Y_leaves = Y[:nleaves]
    W = np.dot(np.linalg.pinv(L), Y_leaves)
    print >> out, 'leaf projection using pseudoinverse (first few rows of Y):'
    print >> out, np.dot(L, W)
    print >> out
    print >> out, 'projection of points of articulation using pseudoinverse (remaining rows of Y):'
    print >> out, np.dot(R, W)
    print >> out
    # Get all of the points in high dimensional space.
    X = Euclid.edm_to_points(D_full)
    # Get the MDS onto the lower dimensional space.
    X = X.T[:(nleaves - 1)].T
    assert np.allclose(sum(X, 0), 0)
    print >> out, 'all points projected onto the first principal axes of the full ellipsoid:'
    print >> out, X
    print >> out
    # Look at only the leaves in this space.
    # The centering is done out of place on purpose: X[:nleaves] is a view,
    # so an in-place subtraction would also shift the leaf rows of X,
    # and X is reused below as a centered configuration of all points.
    L = X[:nleaves] - np.mean(X[:nleaves], 0)
    print >> out, 'leaves projected onto the first principal axes of the full ellipsoid and then centered:'
    print >> out, L
    print >> out
    # Re-project the leaves onto the axes of leaf ellipsoid.
    D_leaves = Euclid.dccov_to_edm(np.dot(L, L.T))
    Y = Euclid.edm_to_points(D_leaves)
    print >> out, 'leaves further projected onto principal axes of their own ellipsoid:'
    print >> out, Y
    print >> out
    # Try something else
    D_all = Euclid.dccov_to_edm(np.dot(X, X.T))
    Y = Euclid.edm_to_points(D_all).T[:(nleaves - 1)].T
    print >> out, 'all points further projected onto their own principal axes of inertia:'
    print >> out, Y
    print >> out
    # Try the same thing some more
    D_again = Euclid.dccov_to_edm(np.dot(Y, Y.T))
    Z = Euclid.edm_to_points(D_again).T[:(nleaves - 1)].T
    print >> out, 'all points further projected onto their own principal axes of inertia (second iteration):'
    print >> out, Z
    print >> out
    return out.getvalue().strip()