Пример #1
0
def get_response_content(fs):
    """
    Compute the loss between a query tree and a reference tree.
    The trees are parsed from fs.query and fs.reference.
    The loss is the split distance (fs.uniform) or the weighted
    split distance (fs.weighted), optionally divided by the matching
    split count of the reference tree when fs.normalize is set.
    @param fs: an object with query, reference, uniform, weighted,
    and normalize attributes
    @return: the loss followed by a newline, or a failure message
    when the normalizing denominator is zero
    """
    # NOTE(review): if neither fs.uniform nor fs.weighted is set then the
    # loss is never assigned; presumably the caller guarantees one of them.
    query = NewickIO.parse(fs.query, FelTree.NewickTree)
    reference = NewickIO.parse(fs.reference, FelTree.NewickTree)
    # pick the numerator according to the requested loss function
    if fs.uniform:
        numerator = TreeComparison.get_split_distance(query, reference)
    elif fs.weighted:
        numerator = TreeComparison.get_weighted_split_distance(
            query, reference)
    # pick the denominator; without normalization it is simply one
    if not fs.normalize:
        denominator = 1
    elif fs.uniform:
        denominator = float(
            TreeComparison.get_nontrivial_split_count(reference))
    elif fs.weighted:
        denominator = float(
            TreeComparison.get_weighted_split_count(reference))
    # a zero denominator means the reference tree cannot normalize the loss
    if not denominator:
        return 'normalization failed\n'
    return str(numerator / denominator) + '\n'
Пример #2
0
def get_response_content(fs):
    """
    Report how far a query tree is from a reference tree.
    @param fs: an object whose query and reference attributes are parsed
    as newick trees, and whose uniform, weighted, and normalize
    attributes select the loss function and the normalization
    @return: the (possibly normalized) loss as a newline terminated string,
    or 'normalization failed' when the denominator is zero
    """
    # NOTE(review): when neither fs.uniform nor fs.weighted is set
    # the loss stays unassigned; confirm that the caller prevents this.
    tree_q = NewickIO.parse(fs.query, FelTree.NewickTree)
    tree_r = NewickIO.parse(fs.reference, FelTree.NewickTree)
    # the unnormalized loss
    if fs.uniform:
        loss = TreeComparison.get_split_distance(tree_q, tree_r)
    elif fs.weighted:
        loss = TreeComparison.get_weighted_split_distance(tree_q, tree_r)
    # the scale by which the loss is divided
    if fs.normalize:
        if fs.uniform:
            count = TreeComparison.get_nontrivial_split_count(tree_r)
            scale = float(count)
        elif fs.weighted:
            count = TreeComparison.get_weighted_split_count(tree_r)
            scale = float(count)
    else:
        scale = 1
    # a zero scale cannot be used to normalize
    if scale:
        response = str(loss / scale) + '\n'
    else:
        response = 'normalization failed\n'
    return response
Пример #3
0
 def test_mito_matrix(self):
     """
     Check the Q matrix and the reconstructed tree for the mito data.
     The observed Q matrix is compared with the stored expected Q matrix,
     and the tree built by make_tree is compared with the stored expected
     tree, first by induced partitions and then by branch lengths.
     """
     D = g_mito_matrix
     n = len(D)
     observed_Q = get_Q_matrix(D)
     expected_Q = g_mito_matrix_q
     # assert that the diagonal elements of the observed Q matrix are exactly zero
     for i, row in enumerate(observed_Q):
         self.assertEqual(row[i], 0)
     # assert that the observed Q matrix is approximately equal to the expected Q matrix
     q_abs_tol = .001
     for i in range(n):
         for j in range(n):
             abs_delta = abs(observed_Q[i][j] - expected_Q[i][j])
             # assertTrue replaces the deprecated failUnless alias
             self.assertTrue(
                 abs_delta < q_abs_tol,
                 'Q matrix mismatch at (%d, %d): abs delta %s' % (
                     i, j, abs_delta))
     # use neighbor joining to reconstruct the tree
     observed_tree = make_tree(D, g_mito_states)
     # load the expected tree
     expected_tree = NewickIO.parse(g_mito_tree_string, FelTree.NewickTree)
     # for the observed and expected trees calculate the induced partitions and corresponding branch lengths
     observed_partitions_and_lengths = TreeComparison.get_partitions_and_branch_lengths(
         observed_tree)
     expected_partitions_and_lengths = TreeComparison.get_partitions_and_branch_lengths(
         expected_tree)
     # the number of partitions should be the same
     self.assertEqual(len(observed_partitions_and_lengths),
                      len(expected_partitions_and_lengths))
     # the partitions should be the same
     observed_partitions = set(
         part for part, length in observed_partitions_and_lengths)
     expected_partitions = set(
         part for part, length in expected_partitions_and_lengths)
     observed_only = observed_partitions - expected_partitions
     expected_only = expected_partitions - observed_partitions
     lines = [
         'observed partitions include: ' + str(observed_only),
         'expected partitions include: ' + str(expected_only)
     ]
     self.assertEqual(observed_partitions, expected_partitions,
                      '\n'.join(lines))
     # corresponding partitions should have the same lengths
     observed_part_to_length = dict(observed_partitions_and_lengths)
     expected_part_to_length = dict(expected_partitions_and_lengths)
     length_abs_tol = .00001
     lines = []
     for part in observed_partitions:
         observed_length = observed_part_to_length[part]
         expected_length = expected_part_to_length[part]
         abs_delta = abs(observed_length - expected_length)
         if abs_delta > length_abs_tol:
             lines.append('partition:' + str(part))
             lines.append('observed branch length:' + str(observed_length))
             lines.append('expected branch length:' + str(expected_length))
     # an empty message means that every branch length matched;
     # assertFalse replaces the deprecated failIf alias
     error_message = '\n'.join(lines)
     self.assertFalse(error_message, error_message)
Пример #4
0
 def __init__(self, tree, epsilon):
     """
     Try to reconstruct the tree and record how the attempt went.
     The outcome is stored in the is_negligible, is_incomplete, and
     is_conflicting flags and in the reconstructed_tree attribute.
     @param tree: a newick tree in the felsenstein-inspired format
     @param epsilon: determines whether loadings are considered negligible
     """
     # clear some flags that describe events that occur during reconstruction
     self.is_negligible = False
     self.is_incomplete = False
     self.is_conflicting = False
     # define the trees
     self.tree = tree
     self.reconstructed_tree = None
     # set the threshold for loading negligibility
     self.epsilon = epsilon
     # define some arbitrary ordering of tip names
     self.ordered_names = [node.get_name() for node in tree.gen_tips()]
     # get the distance matrix with respect to this ordering
     D = tree.get_distance_matrix(self.ordered_names)
     # get the Gower doubly centered matrix
     G = MatrixUtil.double_centered(np.array(D))
     # get the eigendecomposition of the Gower matrix;
     # eigh returns the eigenvectors as columns so transpose to get rows
     eigenvalues, eigenvector_transposes = np.linalg.eigh(G)
     eigenvectors = eigenvector_transposes.T
     # Order the eigensystem by decreasing absolute eigenvalue.
     # Sort on the scalar only: comparing whole (value, vector) pairs
     # would compare the numpy vectors when two values tie,
     # and that comparison raises a ValueError.
     eigensystem = [(abs(w), v) for w, v in zip(eigenvalues, eigenvectors)]
     eigensystem.sort(key=lambda pair: pair[0], reverse=True)
     self.sorted_eigensystem = eigensystem
     # build the tree recursively using the sorted eigensystem
     indices = set(range(len(self.ordered_names)))
     try:
         # try to reconstruct the tree
         root = self._build_tree(indices, 0)
         root.set_branch_length(None)
         output_tree = Newick.NewickTree(root)
         # convert the tree to the FelTree format
         newick_string = NewickIO.get_newick_string(output_tree)
         self.reconstructed_tree = NewickIO.parse(newick_string,
                                                  FelTree.NewickTree)
     except NegligibleError:
         self.is_negligible = True
     except IncompleteError:
         self.is_incomplete = True
     else:
         # compare the splits defined by the reconstructed tree
         # to splits in the original tree
         expected_partitions = TreeComparison.get_nontrivial_partitions(
             self.tree)
         observed_partitions = TreeComparison.get_nontrivial_partitions(
             self.reconstructed_tree)
         # any observed split missing from the original tree is a conflict
         invalid_partitions = observed_partitions - expected_partitions
         if invalid_partitions:
             self.is_conflicting = True
0
 def __init__(self, tree, epsilon):
     """
     Try to reconstruct the tree and record how the attempt went.
     The outcome is stored in the is_negligible, is_incomplete, and
     is_conflicting flags and in the reconstructed_tree attribute.
     @param tree: a newick tree in the felsenstein-inspired format
     @param epsilon: determines whether loadings are considered negligible
     """
     # clear some flags that describe events that occur during reconstruction
     self.is_negligible = False
     self.is_incomplete = False
     self.is_conflicting = False
     # define the trees
     self.tree = tree
     self.reconstructed_tree = None
     # set the threshold for loading negligibility
     self.epsilon = epsilon
     # define some arbitrary ordering of tip names
     self.ordered_names = [node.get_name() for node in tree.gen_tips()]
     # get the distance matrix with respect to this ordering
     D = tree.get_distance_matrix(self.ordered_names)
     # get the Gower doubly centered matrix
     G = MatrixUtil.double_centered(np.array(D))
     # get the eigendecomposition of the Gower matrix;
     # eigh returns the eigenvectors as columns so transpose to get rows
     eigenvalues, eigenvector_transposes = np.linalg.eigh(G)
     eigenvectors = eigenvector_transposes.T
     # Order the eigensystem by decreasing absolute eigenvalue.
     # The sort key is the scalar alone because sorting the pairs directly
     # would fall back to comparing the numpy vectors on tied values,
     # and that comparison raises a ValueError.
     eigensystem = [(abs(w), v) for w, v in zip(eigenvalues, eigenvectors)]
     eigensystem.sort(key=lambda pair: pair[0], reverse=True)
     self.sorted_eigensystem = eigensystem
     # build the tree recursively using the sorted eigensystem
     indices = set(range(len(self.ordered_names)))
     try:
         # try to reconstruct the tree
         root = self._build_tree(indices, 0)
         root.set_branch_length(None)
         output_tree = Newick.NewickTree(root)
         # convert the tree to the FelTree format
         newick_string = NewickIO.get_newick_string(output_tree)
         self.reconstructed_tree = NewickIO.parse(
                 newick_string, FelTree.NewickTree)
     except NegligibleError:
         self.is_negligible = True
     except IncompleteError:
         self.is_incomplete = True
     else:
         # compare the splits defined by the reconstructed tree
         # to splits in the original tree
         expected_partitions = TreeComparison.get_nontrivial_partitions(
                 self.tree)
         observed_partitions = TreeComparison.get_nontrivial_partitions(
                 self.reconstructed_tree)
         # any observed split missing from the original tree is a conflict
         invalid_partitions = observed_partitions - expected_partitions
         if invalid_partitions:
             self.is_conflicting = True
Пример #6
0
 def run(self, distance_matrices, ordered_names):
     """
     This function stores the losses for each reconstruction.
     @param distance_matrices: a sequence of distance matrices
     @param ordered_names: order of taxa in the distance matrix
     @raise HandlingError: if the object has already been run,
     if no distance matrices were provided, if the names do not match
     the tips of the original tree, or if a tree could not be built
     """
     if self.start_time is not None:
         msg = 'each simulation object should be run only once'
         raise HandlingError(msg)
     if not distance_matrices:
         # the exception name was previously misspelled here,
         # which caused a NameError instead of the intended error
         raise HandlingError('no distance matrices were provided')
     tip_name_set = set(node.name for node in self.original_tree.gen_tips())
     if tip_name_set != set(ordered_names):
         raise HandlingError('leaf name mismatch')
     # the start time doubles as the marker that the object has been run
     self.start_time = time.time()
     # Define the reference tree and its maximum cost
     # under different loss functions.
     reference_tree = self.original_tree
     max_error_count = TreeComparison.get_nontrivial_split_count(
         reference_tree)
     max_loss_value = TreeComparison.get_weighted_split_count(
         reference_tree)
     for distance_matrix in distance_matrices:
         # create the tree builder
         tree_builder = NeighborhoodJoining.TreeBuilder(
             distance_matrix, ordered_names, self.splitter)
         # set parameters of the validating tree builder
         tree_builder.set_fallback_name(self.fallback_name)
         # build the tree
         try:
             query_tree = tree_builder.build()
         except NeighborhoodJoining.NeighborhoodJoiningError as e:
             raise HandlingError(e)
         # Note the number and weight of partition errors
         # during the reconstruction.
         error_count = TreeComparison.get_split_distance(
             query_tree, reference_tree)
         loss_value = TreeComparison.get_weighted_split_distance(
             query_tree, reference_tree)
         # make sure that the summary is internally consistent
         assert error_count <= max_error_count, (error_count,
                                                 max_error_count)
         assert loss_value <= max_loss_value, (loss_value, max_loss_value)
         # save the reconstruction characteristics to use later
         self.error_counts.append(error_count)
         self.loss_values.append(loss_value)
         self.max_error_counts.append(max_error_count)
         self.max_loss_values.append(max_loss_value)
     self.stop_time = time.time()
Пример #7
0
 def test_mito_matrix(self):
     """
     Check the Q matrix and the reconstructed tree for the mito data.
     The observed Q matrix is compared with the stored expected Q matrix,
     and the tree built by make_tree is compared with the stored expected
     tree, first by induced partitions and then by branch lengths.
     """
     D = g_mito_matrix
     n = len(D)
     observed_Q = get_Q_matrix(D)
     expected_Q = g_mito_matrix_q
     # assert that the diagonal elements of the observed Q matrix are exactly zero
     for i, row in enumerate(observed_Q):
         self.assertEqual(row[i], 0)
     # assert that the observed Q matrix is approximately equal to the expected Q matrix
     q_abs_tol = .001
     for i in range(n):
         for j in range(n):
             abs_delta = abs(observed_Q[i][j] - expected_Q[i][j])
             # assertTrue replaces the deprecated failUnless alias
             self.assertTrue(
                     abs_delta < q_abs_tol,
                     'Q matrix mismatch at (%d, %d): abs delta %s' % (
                         i, j, abs_delta))
     # use neighbor joining to reconstruct the tree
     observed_tree = make_tree(D, g_mito_states)
     # load the expected tree
     expected_tree = NewickIO.parse(g_mito_tree_string, FelTree.NewickTree)
     # for the observed and expected trees calculate the induced partitions and corresponding branch lengths
     observed_partitions_and_lengths = TreeComparison.get_partitions_and_branch_lengths(observed_tree)
     expected_partitions_and_lengths = TreeComparison.get_partitions_and_branch_lengths(expected_tree)
     # the number of partitions should be the same
     self.assertEqual(len(observed_partitions_and_lengths), len(expected_partitions_and_lengths))
     # the partitions should be the same
     observed_partitions = set(part for part, length in observed_partitions_and_lengths)
     expected_partitions = set(part for part, length in expected_partitions_and_lengths)
     observed_only = observed_partitions - expected_partitions
     expected_only = expected_partitions - observed_partitions
     lines = [
             'observed partitions include: ' + str(observed_only),
             'expected partitions include: ' + str(expected_only)
             ]
     self.assertEqual(observed_partitions, expected_partitions, '\n'.join(lines))
     # corresponding partitions should have the same lengths
     observed_part_to_length = dict(observed_partitions_and_lengths)
     expected_part_to_length = dict(expected_partitions_and_lengths)
     length_abs_tol = .00001
     lines = []
     for part in observed_partitions:
         observed_length = observed_part_to_length[part]
         expected_length = expected_part_to_length[part]
         abs_delta = abs(observed_length - expected_length)
         if abs_delta > length_abs_tol:
             lines.append('partition:' + str(part))
             lines.append('observed branch length:' + str(observed_length))
             lines.append('expected branch length:' + str(expected_length))
     # an empty message means that every branch length matched;
     # assertFalse replaces the deprecated failIf alias
     error_message = '\n'.join(lines)
     self.assertFalse(error_message, error_message)
Пример #8
0
 def __call__(self, tree):
     """
     Check whether the sign split of the tips is a partition of the tree.
     When the split is not among the partitions implied by the tree,
     a report is appended to the counterexample file, echoed to stdout,
     and counted in self.ncounterexamples.
     @param tree: a tree that provides gen_tips
     and get_partial_distance_matrix
     @return: always False so that the search continues
     """
     # get the partitions implied by the tree
     valid_partitions = TreeComparison.get_partitions(tree)
     # Get the partition implied by the Fiedler split
     # of the graph derived from the tree.
     tip_nodes = list(tree.gen_tips())
     D = tree.get_partial_distance_matrix(
             [id(node) for node in tip_nodes])
     y = get_vector(D).tolist()
     # tips with a positive element go on one side, all others on the other
     name_selection = frozenset(node.get_name()
             for node, elem in zip(tip_nodes, y) if elem > 0)
     name_complement = frozenset(node.get_name()
             for node, elem in zip(tip_nodes, y) if elem <= 0)
     name_partition = frozenset((name_selection, name_complement))
     if name_partition not in valid_partitions:
         # the partition is a frozenset so it must be converted explicitly;
         # join raises a TypeError for anything that is not a string
         msg = '\n'.join([
             'invalid partition found:',
             'tree:', NewickIO.get_newick_string(tree),
             'invalid partition:', str(name_partition)])
         # the counterexample file is opened only when it is first needed
         if not self.fout:
             self.fout = open(self.counterexample_filename, 'wt')
         self.fout.write(msg + '\n')
         print(msg)
         self.ncounterexamples += 1
     # do not stop looking, even if a counterexample is found
     return False
Пример #9
0
 def __call__(self, tree):
     """
     Check whether the sign split of the tips is a partition of the tree.
     When the split is not among the partitions implied by the tree,
     a report is appended to the counterexample file, echoed to stdout,
     and counted in self.ncounterexamples.
     @param tree: a tree that provides gen_tips
     and get_partial_distance_matrix
     @return: always False so that the search continues
     """
     # get the partitions implied by the tree
     valid_partitions = TreeComparison.get_partitions(tree)
     # Get the partition implied by the Fiedler split
     # of the graph derived from the tree.
     tip_nodes = list(tree.gen_tips())
     D = tree.get_partial_distance_matrix([id(node) for node in tip_nodes])
     y = get_vector(D).tolist()
     # tips with a positive element go on one side, all others on the other
     name_selection = frozenset(node.get_name()
                                for node, elem in zip(tip_nodes, y)
                                if elem > 0)
     name_complement = frozenset(node.get_name()
                                 for node, elem in zip(tip_nodes, y)
                                 if elem <= 0)
     name_partition = frozenset((name_selection, name_complement))
     if name_partition not in valid_partitions:
         # the partition is a frozenset so it must be converted explicitly;
         # join raises a TypeError for anything that is not a string
         msg = '\n'.join([
             'invalid partition found:', 'tree:',
             NewickIO.get_newick_string(tree), 'invalid partition:',
             str(name_partition)
         ])
         # the counterexample file is opened only when it is first needed
         if not self.fout:
             self.fout = open(self.counterexample_filename, 'wt')
         self.fout.write(msg + '\n')
         print(msg)
         self.ncounterexamples += 1
     # do not stop looking, even if a counterexample is found
     return False
Пример #10
0
def get_response_content(fs):
    """
    Report trees whose principal coordinate split is not a valid partition.
    Each tree is checked twice: once using its distance matrix and once
    using the elementwise squared distance matrix.
    @param fs: an object whose trees attribute has one newick tree per line
    @return: a multi-line report that ends with the two counts
    @raise HandlingError: if a tree has fewer than four tips,
    an unlabeled tip, or a repeated tip label
    """
    # get the newick trees.
    trees = []
    for tree_string in iterutils.stripped_lines(StringIO(fs.trees)):
        # parse each tree and make sure that it conforms to various requirements
        tree = NewickIO.parse(tree_string, FelTree.NewickTree)
        tip_names = [tip.get_name() for tip in tree.gen_tips()]
        if len(tip_names) < 4:
            raise HandlingError('expected at least four tips but found ' +
                                str(len(tip_names)))
        if any(name is None for name in tip_names):
            raise HandlingError('each terminal node must be labeled')
        if len(set(tip_names)) != len(tip_names):
            raise HandlingError('each terminal node label must be unique')
        trees.append(tree)
    # begin the response;
    # it is built with write calls rather than print statements
    # so that the function parses under both python 2 and python 3
    out = StringIO()
    # look at each tree
    nerrors = 0
    ncounterexamples = 0
    for tree in trees:
        # get the set of valid partitions implied by the tree
        valid_parts = TreeComparison.get_partitions(tree)
        ordered_tip_names = [tip.get_name() for tip in tree.gen_tips()]
        # assert that the partition implied by the correct formula is valid
        D = np.array(tree.get_distance_matrix(ordered_tip_names))
        loadings = get_principal_coordinate(D)
        nonneg_leaf_set = frozenset(
            tip for tip, v in zip(ordered_tip_names, loadings) if v >= 0)
        neg_leaf_set = frozenset(
            tip for tip, v in zip(ordered_tip_names, loadings) if v < 0)
        part = frozenset([nonneg_leaf_set, neg_leaf_set])
        if part not in valid_parts:
            nerrors += 1
            out.write('error: a partition that was supposed to be valid was found to be invalid\n')
            out.write('tree: %s\n' % (NewickIO.get_newick_string(tree),))
            out.write('invalid partition: %s\n' % (partition_to_string(part),))
            out.write('\n')
        # check the validity of the partition implied by the incorrect formula
        Q = D * D
        loadings = get_principal_coordinate(Q)
        nonneg_leaf_set = frozenset(
            tip for tip, v in zip(ordered_tip_names, loadings) if v >= 0)
        neg_leaf_set = frozenset(
            tip for tip, v in zip(ordered_tip_names, loadings) if v < 0)
        part = frozenset([nonneg_leaf_set, neg_leaf_set])
        if part not in valid_parts:
            ncounterexamples += 1
            out.write('found a counterexample!\n')
            out.write('tree: %s\n' % (NewickIO.get_newick_string(tree),))
            out.write('invalid partition: %s\n' % (partition_to_string(part),))
            out.write('\n')
    out.write('errors found: %s\n' % (nerrors,))
    out.write('counterexamples found: %s\n' % (ncounterexamples,))
    # return the response
    return out.getvalue()
Пример #11
0
def main():
    """
    Search random trees for splits that break when distances are squared.
    Findings are written to 'counterexamples.out'.
    The search runs until it is interrupted from the keyboard,
    at which point a summary of the counts is printed.
    """
    filename = 'counterexamples.out'
    fout = open(filename, 'wt')
    # the try/finally makes sure that the output file is closed on every exit path
    try:
        print('Does monotonically transforming the pairwise leaf distances affect the compatibility')
        print('of the split found using principal coordinate analysis?')
        print('I am looking through random trees for a tree that is split incompatibly')
        print('when distances are squared.')
        print('Use control-c to stop the program when you get bored.')
        # the sampling parameters are the same for every tree
        n_base_leaves = 4
        n_expected_extra_leaves = 1
        expected_branch_length = 1
        # initialize the counters outside of the interruptible block
        # so that the summary can always be printed
        count = 0
        ncounterexamples = 0
        nerrors = 0
        try:
            while True:
                count += 1
                # get a random tree
                tree = TreeSampler.sample_tree(n_base_leaves,
                                               n_expected_extra_leaves,
                                               expected_branch_length)
                # get the set of valid partitions implied by the tree
                valid_parts = TreeComparison.get_partitions(tree)
                ordered_tip_names = [
                    tip.get_name() for tip in tree.gen_tips()]
                # assert that the partition implied by the correct formula is valid
                D = np.array(tree.get_distance_matrix(ordered_tip_names))
                loadings = get_principal_coordinate(D)
                nonneg_leaf_set = frozenset(
                    tip for tip, v in zip(ordered_tip_names, loadings)
                    if v >= 0)
                neg_leaf_set = frozenset(
                    tip for tip, v in zip(ordered_tip_names, loadings)
                    if v < 0)
                part = frozenset([nonneg_leaf_set, neg_leaf_set])
                if part not in valid_parts:
                    nerrors += 1
                    fout.write('error: a partition that was supposed to be valid was found to be invalid\n')
                    fout.write('tree: %s\n' % (
                        NewickIO.get_newick_string(tree),))
                    fout.write('invalid partition: %s\n' % (
                        partition_to_string(part),))
                    fout.write('\n')
                # check the validity of the partition implied by the incorrect formula
                Q = D * D
                loadings = get_principal_coordinate(Q)
                nonneg_leaf_set = frozenset(
                    tip for tip, v in zip(ordered_tip_names, loadings)
                    if v >= 0)
                neg_leaf_set = frozenset(
                    tip for tip, v in zip(ordered_tip_names, loadings)
                    if v < 0)
                part = frozenset([nonneg_leaf_set, neg_leaf_set])
                if part not in valid_parts:
                    ncounterexamples += 1
                    fout.write('found a counterexample!\n')
                    fout.write('tree: %s\n' % (
                        NewickIO.get_newick_string(tree),))
                    fout.write('invalid partition: %s\n' % (
                        partition_to_string(part),))
                    fout.write('\n')
        except KeyboardInterrupt:
            print('trees examined: %s' % (count,))
            print('errors: %s' % (nerrors,))
            print('counterexamples: %s' % (ncounterexamples,))
    finally:
        fout.close()
Пример #12
0
 def run(self, distance_matrices, ordered_names):
     """
     This function stores the losses for each reconstruction.
     @param distance_matrices: a sequence of distance matrices
     @param ordered_names: order of taxa in the distance matrix
     @raise HandlingError: if the object has already been run,
     if no distance matrices were provided, if the names do not match
     the tips of the original tree, or if a tree could not be built
     """
     if self.start_time is not None:
         msg = "each simulation object should be run only once"
         raise HandlingError(msg)
     if not distance_matrices:
         # the exception name was previously misspelled here,
         # which caused a NameError instead of the intended error
         raise HandlingError("no distance matrices were provided")
     tip_name_set = set(node.name for node in self.original_tree.gen_tips())
     if tip_name_set != set(ordered_names):
         raise HandlingError("leaf name mismatch")
     # the start time doubles as the marker that the object has been run
     self.start_time = time.time()
     # Define the reference tree and its maximum cost
     # under different loss functions.
     reference_tree = self.original_tree
     max_error_count = TreeComparison.get_nontrivial_split_count(reference_tree)
     max_loss_value = TreeComparison.get_weighted_split_count(reference_tree)
     for distance_matrix in distance_matrices:
         # create the tree builder
         tree_builder = NeighborhoodJoining.TreeBuilder(distance_matrix, ordered_names, self.splitter)
         # set parameters of the validating tree builder
         tree_builder.set_fallback_name(self.fallback_name)
         # build the tree
         try:
             query_tree = tree_builder.build()
         except NeighborhoodJoining.NeighborhoodJoiningError as e:
             raise HandlingError(e)
         # Note the number and weight of partition errors
         # during the reconstruction.
         error_count = TreeComparison.get_split_distance(query_tree, reference_tree)
         loss_value = TreeComparison.get_weighted_split_distance(query_tree, reference_tree)
         # make sure that the summary is internally consistent
         assert error_count <= max_error_count, (error_count, max_error_count)
         assert loss_value <= max_loss_value, (loss_value, max_loss_value)
         # save the reconstruction characteristics to use later
         self.error_counts.append(error_count)
         self.loss_values.append(loss_value)
         self.max_error_counts.append(max_error_count)
         self.max_loss_values.append(max_loss_value)
     self.stop_time = time.time()
Пример #13
0
def get_response_content(fs):
    """
    Report trees whose principal coordinate split is not a valid partition.
    Each tree is checked twice: once using its distance matrix and once
    using the elementwise squared distance matrix.
    @param fs: an object whose trees attribute has one newick tree per line
    @return: a multi-line report that ends with the two counts
    @raise HandlingError: if a tree has fewer than four tips,
    an unlabeled tip, or a repeated tip label
    """
    # get the newick trees.
    trees = []
    for tree_string in iterutils.stripped_lines(StringIO(fs.trees)):
        # parse each tree and make sure that it conforms to various requirements
        tree = NewickIO.parse(tree_string, FelTree.NewickTree)
        tip_names = [tip.get_name() for tip in tree.gen_tips()]
        if len(tip_names) < 4:
            raise HandlingError('expected at least four tips but found ' + str(len(tip_names)))
        if any(name is None for name in tip_names):
            raise HandlingError('each terminal node must be labeled')
        if len(set(tip_names)) != len(tip_names):
            raise HandlingError('each terminal node label must be unique')
        trees.append(tree)
    # begin the response;
    # it is built with write calls rather than print statements
    # so that the function parses under both python 2 and python 3
    out = StringIO()
    # look at each tree
    nerrors = 0
    ncounterexamples = 0
    for tree in trees:
        # get the set of valid partitions implied by the tree
        valid_parts = TreeComparison.get_partitions(tree)
        ordered_tip_names = [tip.get_name() for tip in tree.gen_tips()]
        # assert that the partition implied by the correct formula is valid
        D = np.array(tree.get_distance_matrix(ordered_tip_names))
        loadings = get_principal_coordinate(D)
        nonneg_leaf_set = frozenset(tip for tip, v in zip(ordered_tip_names, loadings) if v >= 0)
        neg_leaf_set = frozenset(tip for tip, v in zip(ordered_tip_names, loadings) if v < 0)
        part = frozenset([nonneg_leaf_set, neg_leaf_set])
        if part not in valid_parts:
            nerrors += 1
            out.write('error: a partition that was supposed to be valid was found to be invalid\n')
            out.write('tree: %s\n' % (NewickIO.get_newick_string(tree),))
            out.write('invalid partition: %s\n' % (partition_to_string(part),))
            out.write('\n')
        # check the validity of the partition implied by the incorrect formula
        Q = D * D
        loadings = get_principal_coordinate(Q)
        nonneg_leaf_set = frozenset(tip for tip, v in zip(ordered_tip_names, loadings) if v >= 0)
        neg_leaf_set = frozenset(tip for tip, v in zip(ordered_tip_names, loadings) if v < 0)
        part = frozenset([nonneg_leaf_set, neg_leaf_set])
        if part not in valid_parts:
            ncounterexamples += 1
            out.write('found a counterexample!\n')
            out.write('tree: %s\n' % (NewickIO.get_newick_string(tree),))
            out.write('invalid partition: %s\n' % (partition_to_string(part),))
            out.write('\n')
    out.write('errors found: %s\n' % (nerrors,))
    out.write('counterexamples found: %s\n' % (ncounterexamples,))
    # return the response
    return out.getvalue()
Пример #14
0
def main():
    """
    Search random trees for splits that break when distances are squared.
    Findings are written to 'counterexamples.out'.
    The search runs until it is interrupted from the keyboard,
    at which point a summary of the counts is printed.
    """
    filename = 'counterexamples.out'
    fout = open(filename, 'wt')
    # the try/finally makes sure that the output file is closed on every exit path
    try:
        print('Does monotonically transforming the pairwise leaf distances affect the compatibility')
        print('of the split found using principal coordinate analysis?')
        print('I am looking through random trees for a tree that is split incompatibly')
        print('when distances are squared.')
        print('Use control-c to stop the program when you get bored.')
        # the sampling parameters are the same for every tree
        n_base_leaves = 4
        n_expected_extra_leaves = 1
        expected_branch_length = 1
        # initialize the counters outside of the interruptible block
        # so that the summary can always be printed
        count = 0
        ncounterexamples = 0
        nerrors = 0
        try:
            while True:
                count += 1
                # get a random tree
                tree = TreeSampler.sample_tree(n_base_leaves, n_expected_extra_leaves, expected_branch_length)
                # get the set of valid partitions implied by the tree
                valid_parts = TreeComparison.get_partitions(tree)
                ordered_tip_names = [tip.get_name() for tip in tree.gen_tips()]
                # assert that the partition implied by the correct formula is valid
                D = np.array(tree.get_distance_matrix(ordered_tip_names))
                loadings = get_principal_coordinate(D)
                nonneg_leaf_set = frozenset(tip for tip, v in zip(ordered_tip_names, loadings) if v >= 0)
                neg_leaf_set = frozenset(tip for tip, v in zip(ordered_tip_names, loadings) if v < 0)
                part = frozenset([nonneg_leaf_set, neg_leaf_set])
                if part not in valid_parts:
                    nerrors += 1
                    fout.write('error: a partition that was supposed to be valid was found to be invalid\n')
                    fout.write('tree: %s\n' % (NewickIO.get_newick_string(tree),))
                    fout.write('invalid partition: %s\n' % (partition_to_string(part),))
                    fout.write('\n')
                # check the validity of the partition implied by the incorrect formula
                Q = D * D
                loadings = get_principal_coordinate(Q)
                nonneg_leaf_set = frozenset(tip for tip, v in zip(ordered_tip_names, loadings) if v >= 0)
                neg_leaf_set = frozenset(tip for tip, v in zip(ordered_tip_names, loadings) if v < 0)
                part = frozenset([nonneg_leaf_set, neg_leaf_set])
                if part not in valid_parts:
                    ncounterexamples += 1
                    fout.write('found a counterexample!\n')
                    fout.write('tree: %s\n' % (NewickIO.get_newick_string(tree),))
                    fout.write('invalid partition: %s\n' % (partition_to_string(part),))
                    fout.write('\n')
        except KeyboardInterrupt:
            print('trees examined: %s' % (count,))
            print('errors: %s' % (nerrors,))
            print('counterexamples: %s' % (ncounterexamples,))
    finally:
        fout.close()
Пример #15
0
def examine_mds_splits():
    """
    Examine properties of the hyperplane orthogonal to the MDS axis of a hyperellipse.
    The hyperellipse is the Steiner circumscribed hyperellipse that intersects
    points of the embedded leaves of a tree.
    Earlier results show that the hyperplane orthogonal to the principal
    axis of this hyperellipse should separate the leaves in a way that is compatible
    with the topology of the tree.
    Here we investigate the conjecture that this same hyperplane
    also splits internal vertices in a way that is compatible with the topology of the tree.
    """
    count = 0
    ncontrol_noneuclidean_counterexamples = 0
    ncontrol_secondary_counterexamples = 0
    print 'Does the principal hyperplane of the leaves always intersect the tree at exactly one point?'
    print 'Press control-C to stop looking for a counterexample...'
    try:
        while True:
            # pick a random number of taxa to use as leaves in the tree
            ntaxa = random.randrange(3, 12)
            # sample an xtree with exponentially distributed branch lengths
            xtree = TreeSampler.sample_agglomerated_tree(ntaxa)
            for branch in xtree.get_branches():
                mu = 2.0
                branch.length = random.expovariate(1/mu)
            # convert the xtree to a FelTree so we can use the internal vertices
            tree_string = xtree.get_newick_string()
            tree = NewickIO.parse(tree_string, FelTree.NewickTree)
            # get the full id splits of the tree, including internal nodes
            id_set = set(id(node) for node in tree.preorder())
            d = TreeComparison._get_branch_id_to_node_id_set(tree)
            full_id_splits = set(frozenset((frozenset(x), frozenset(id_set-x))) for x in d.values())
            # get ordered ids and the number of leaves
            ordered_ids = get_ordered_ids(tree)
            nleaves = len(list(tree.gen_tips()))
            # get the projection
            D_full = np.array(tree.get_full_distance_matrix(ordered_ids))
            projected_points = do_projection(D_full, nleaves)
            # get the split implied by the principal hyperplane of the leaves
            left_ids = set(i for i, point in zip(ordered_ids, projected_points) if point[0] < 0)
            right_ids = id_set - left_ids
            split = frozenset((frozenset(left_ids), frozenset(right_ids)))
            # if the split is not compatible with the tree then we have found a counterexample
            if split not in full_id_splits:
                print 'counterexample:'
                print tree_string
                break
            # now do a control where I look at the wrong eigenvector
            left_ids = set(i for i, point in zip(ordered_ids, projected_points) if point[1] < 0)
            right_ids = id_set - left_ids
            split = frozenset((frozenset(left_ids), frozenset(right_ids)))
            if split not in full_id_splits:
                ncontrol_secondary_counterexamples += 1
            # now do a control that should provide the occasional counterexample
            D_control = np.sqrt(D_full)
            projected_points = do_projection(D_control, nleaves)
            left_ids = set(i for i, point in zip(ordered_ids, projected_points) if point[0] < 0)
            right_ids = id_set - left_ids
            split = frozenset((frozenset(left_ids), frozenset(right_ids)))
            if split not in full_id_splits:
                ncontrol_noneuclidean_counterexamples += 1
            # increment the count
            count += 1
    except KeyboardInterrupt, e:
        print 'Checked', count, 'trees and found no counterexample.'
        print 'Found', ncontrol_secondary_counterexamples, 'control counterexamples where I use the wrong eigenvector.'
        print 'Found', ncontrol_noneuclidean_counterexamples, 'control counterexamples where I use the wrong distance matrix.'
Пример #16
0
def _partition_tip_names(tip_nodes, values):
    """
    Partition tip names according to the sign of the paired values.
    Names paired with a value greater than zero form one side,
    and names paired with a value less than or equal to zero form the other.
    Nodes and values are paired positionally; surplus values are ignored.
    The return value is a frozenset that contains both sides as frozensets.
    """
    positive_names = frozenset(
        node.get_name() for node, value in zip(tip_nodes, values)
        if value > 0)
    nonpositive_names = frozenset(
        node.get_name() for node, value in zip(tip_nodes, values)
        if value <= 0)
    return frozenset((positive_names, nonpositive_names))


def get_response_content(fs):
    """
    Compare two ways of splitting the tips of each submitted tree.
    The fs.trees attribute is read as text with one newick tree per line.
    Each tree must have at least four tips, and the tips must carry
    unique labels; otherwise a HandlingError is raised.
    For each tree one split is derived from the distance matrix over all
    nodes and another from the distance matrix over the tips only.
    A paragraph is reported for every tree where either split is not among
    the partitions implied by the tree or where the two splits differ.
    The returned string holds these paragraphs followed by a summary count.
    """
    # parse and validate the trees
    trees = []
    for line in iterutils.stripped_lines(fs.trees.splitlines()):
        tree = NewickIO.parse(line, FelTree.NewickTree)
        names = [tip.get_name() for tip in tree.gen_tips()]
        ntips = len(names)
        if ntips < 4:
            raise HandlingError('expected at least four tips '
                                'but found ' + str(ntips))
        if any(name is None for name in names):
            raise HandlingError('each terminal node must be labeled')
        if len(set(names)) != ntips:
            raise HandlingError('each terminal node label must be unique')
        trees.append(tree)
    # build the report
    out = StringIO()
    same_count = 0
    diff_count = 0
    for tree in trees:
        # this paragraph is shown only if something notable happens
        paragraph = StringIO()
        print >> paragraph, NewickIO.get_newick_string(tree)
        # separate the tips from the internal nodes, preserving preorder
        tip_nodes = []
        internal_nodes = []
        for node in tree.preorder():
            bucket = tip_nodes if node.is_tip() else internal_nodes
            bucket.append(node)
        ntips = len(tip_nodes)
        # these are the tip name partitions implied by the topology
        valid_partitions = TreeComparison.get_partitions(tree)
        # split using the distance matrix over all nodes, tips first,
        # so that the leading entries of the vector belong to the tips
        augmented_ids = [id(node) for node in tip_nodes + internal_nodes]
        D_full = tree.get_partial_distance_matrix(augmented_ids)
        y_full = get_vector(D_full).tolist()
        name_partition_a = _partition_tip_names(tip_nodes, y_full[:ntips])
        a_is_valid = name_partition_a in valid_partitions
        if not a_is_valid:
            print >> paragraph, 'augmented distance matrix split fail:', name_partition_a
        # split using the distance matrix over the tips only
        tip_ids = [id(node) for node in tip_nodes]
        D = tree.get_partial_distance_matrix(tip_ids)
        name_partition_b = _partition_tip_names(
            tip_nodes, get_vector(D).tolist())
        b_is_valid = name_partition_b in valid_partitions
        if not b_is_valid:
            print >> paragraph, 'not-augmented distance matrix split fail:', name_partition_b
        # tally whether the two methods agree
        splits_agree = (name_partition_a == name_partition_b)
        if splits_agree:
            same_count += 1
        else:
            diff_count += 1
            print >> paragraph, 'this tree was split differently '
            print >> paragraph, 'by the different methods:'
            print >> paragraph, 'augmented distance matrix split:', name_partition_a
            print >> paragraph, 'not-augmented distance matrix split:', name_partition_b
        # the print adds a blank line after each reported paragraph
        if not (a_is_valid and b_is_valid and splits_agree):
            print >> out, paragraph.getvalue()
    # write the summary
    print >> out, 'for this many trees the same split was found:', same_count
    print >> out, 'for this many trees different splits were found:', diff_count
    return out.getvalue()
Пример #17
0
def get_response_content(fs):
    """
    Report trees whose tips are split differently by two distance matrices.
    The newick trees are read from fs.trees, one per line.
    A HandlingError is raised if a tree has fewer than four tips,
    if a tip is unlabeled, or if tip labels are repeated.
    For every tree the tips are split by the sign of a vector computed from
    the distance matrix over all nodes (the augmented matrix) and again from
    the distance matrix over the tips alone (the not-augmented matrix).
    Trees with an invalid split or with disagreeing splits are written out,
    and the response ends with counts of agreeing and disagreeing trees.
    """
    # read and check the newick trees
    trees = []
    for tree_string in iterutils.stripped_lines(fs.trees.splitlines()):
        tree = NewickIO.parse(tree_string, FelTree.NewickTree)
        tip_names = [tip.get_name() for tip in tree.gen_tips()]
        tip_count = len(tip_names)
        if tip_count < 4:
            msg = 'expected at least four tips ' 'but found ' + str(tip_count)
            raise HandlingError(msg)
        if any(name is None for name in tip_names):
            raise HandlingError('each terminal node must be labeled')
        if len(set(tip_names)) != tip_count:
            raise HandlingError('each terminal node label must be unique')
        trees.append(tree)
    # create the response
    out = StringIO()
    same_count = 0
    diff_count = 0
    for tree in trees:
        # text accumulated here is emitted only when an event occurs
        local_out = StringIO()
        event_count = 0
        print >> local_out, NewickIO.get_newick_string(tree)
        # list the tips and the internal nodes separately
        preorder_nodes = list(tree.preorder())
        tip_nodes = [node for node in preorder_nodes if node.is_tip()]
        internal_nodes = [
            node for node in preorder_nodes if not node.is_tip()]
        # get all tip name partitions implied by the tree topology
        valid_partitions = TreeComparison.get_partitions(tree)
        # describe each method by the nodes of its distance matrix
        # and by the labels used when reporting its split
        methods = (
            (tip_nodes + internal_nodes,
                'augmented distance matrix split fail:',
                'augmented distance matrix split:'),
            (tip_nodes,
                'not-augmented distance matrix split fail:',
                'not-augmented distance matrix split:'),
            )
        results = []
        for nodes, fail_label, split_label in methods:
            D = tree.get_partial_distance_matrix([id(node) for node in nodes])
            loadings = get_vector(D).tolist()
            # the tips come first in both node lists, so pairing the tips
            # with the loadings uses only the tip entries of the vector
            name_selection = frozenset(
                node.get_name()
                for node, elem in zip(tip_nodes, loadings) if elem > 0)
            name_complement = frozenset(
                node.get_name()
                for node, elem in zip(tip_nodes, loadings) if elem <= 0)
            name_partition = frozenset((name_selection, name_complement))
            if name_partition not in valid_partitions:
                print >> local_out, fail_label, name_partition
                event_count += 1
            results.append((split_label, name_partition))
        (label_a, name_partition_a), (label_b, name_partition_b) = results
        # compare the name partitions
        if name_partition_a != name_partition_b:
            diff_count += 1
            print >> local_out, 'this tree was split differently '
            print >> local_out, 'by the different methods:'
            print >> local_out, label_a, name_partition_a
            print >> local_out, label_b, name_partition_b
            event_count += 1
        else:
            same_count += 1
        # the print appends a newline, leaving a blank line between trees
        if event_count:
            print >> out, local_out.getvalue()
    # write the summary
    print >> out, 'for this many trees the same split was found:', same_count
    print >> out, 'for this many trees different splits were found:', diff_count
    # write the response
    return out.getvalue()
Пример #18
0
def examine_mds_splits():
    """
    Examine properties of the hyperplane orthogonal to the MDS axis of a hyperellipse.
    The hyperellipse is the Steiner circumscribed hyperellipse that intersects
    points of the embedded leaves of a tree.
    Earlier results show that the hyperplane orthogonal to the principal
    axis of this hyperellipse should separate the leaves in a way that is compatible
    with the topology of the tree.
    Here we investigate the conjecture that this same hyperplane
    also splits internal vertices in a way that is compatible with the topology of the tree.
    """
    count = 0
    ncontrol_noneuclidean_counterexamples = 0
    ncontrol_secondary_counterexamples = 0
    print 'Does the principal hyperplane of the leaves always intersect the tree at exactly one point?'
    print 'Press control-C to stop looking for a counterexample...'
    try:
        while True:
            # pick a random number of taxa to use as leaves in the tree
            ntaxa = random.randrange(3, 12)
            # sample an xtree with exponentially distributed branch lengths
            xtree = TreeSampler.sample_agglomerated_tree(ntaxa)
            for branch in xtree.get_branches():
                mu = 2.0
                branch.length = random.expovariate(1 / mu)
            # convert the xtree to a FelTree so we can use the internal vertices
            tree_string = xtree.get_newick_string()
            tree = NewickIO.parse(tree_string, FelTree.NewickTree)
            # get the full id splits of the tree, including internal nodes
            id_set = set(id(node) for node in tree.preorder())
            d = TreeComparison._get_branch_id_to_node_id_set(tree)
            full_id_splits = set(
                frozenset((frozenset(x), frozenset(id_set - x)))
                for x in d.values())
            # get ordered ids and the number of leaves
            ordered_ids = get_ordered_ids(tree)
            nleaves = len(list(tree.gen_tips()))
            # get the projection
            D_full = np.array(tree.get_full_distance_matrix(ordered_ids))
            projected_points = do_projection(D_full, nleaves)
            # get the split implied by the principal hyperplane of the leaves
            left_ids = set(i for i, point in zip(ordered_ids, projected_points)
                           if point[0] < 0)
            right_ids = id_set - left_ids
            split = frozenset((frozenset(left_ids), frozenset(right_ids)))
            # if the split is not compatible with the tree then we have found a counterexample
            if split not in full_id_splits:
                print 'counterexample:'
                print tree_string
                break
            # now do a control where I look at the wrong eigenvector
            left_ids = set(i for i, point in zip(ordered_ids, projected_points)
                           if point[1] < 0)
            right_ids = id_set - left_ids
            split = frozenset((frozenset(left_ids), frozenset(right_ids)))
            if split not in full_id_splits:
                ncontrol_secondary_counterexamples += 1
            # now do a control that should provide the occasional counterexample
            D_control = np.sqrt(D_full)
            projected_points = do_projection(D_control, nleaves)
            left_ids = set(i for i, point in zip(ordered_ids, projected_points)
                           if point[0] < 0)
            right_ids = id_set - left_ids
            split = frozenset((frozenset(left_ids), frozenset(right_ids)))
            if split not in full_id_splits:
                ncontrol_noneuclidean_counterexamples += 1
            # increment the count
            count += 1
    except KeyboardInterrupt, e:
        print 'Checked', count, 'trees and found no counterexample.'
        print 'Found', ncontrol_secondary_counterexamples, 'control counterexamples where I use the wrong eigenvector.'
        print 'Found', ncontrol_noneuclidean_counterexamples, 'control counterexamples where I use the wrong distance matrix.'