def write_tree(child_lists, name_map, rank_map, options, branch_length=1):
    """Build a tree from parent->children lists and write it as Newick.

    The root is the first child listed under the key ``"0"`` in
    ``child_lists``. Every clade gets the same ``branch_length``.

    :Parameters:
        child_lists : dict
            maps a parent node id to the list of its child node ids.
        name_map : dict
            maps a node id to its display name (unused when
            ``options.name_id`` is truthy, in which case the id is the name).
        rank_map : dict
            maps a node id to its rank; only consulted when
            ``options.cluster`` is not None.
        options : object
            must expose ``name_id``, ``cluster`` and ``output_tree``.
        branch_length : number
            branch length assigned to every clade.
    """
    # Biopython is imported lazily so it is only required when a tree is written.
    import Bio.Phylo
    from Bio.Phylo import BaseTree

    def _label(node_id):
        return node_id if options.name_id else name_map[node_id]

    def _new_clade(node_id):
        return BaseTree.Clade(name=_label(node_id), branch_length=branch_length)

    root_id = child_lists["0"][0]
    clades_by_id = {root_id: _new_clade(root_id)}

    def _attach_descendants(parent_id):
        clustering = options.cluster is not None
        if clustering and rank_map[parent_id] == options.cluster:
            # Stop at the requested rank so that lower "no rank" nodes
            # (e.g. strains below a species) are not emitted.
            return
        if parent_id not in clades_by_id:
            clades_by_id[parent_id] = _new_clade(parent_id)
        for child_id in child_lists.get(parent_id, []):
            if clustering and not (rank_map[child_id] <= options.cluster):
                continue
            if child_id not in clades_by_id:
                clades_by_id[child_id] = _new_clade(child_id)
            clades_by_id[parent_id].clades.append(clades_by_id[child_id])
            _attach_descendants(child_id)

    _attach_descendants(root_id)
    Bio.Phylo.write(
        [BaseTree.Tree(root=clades_by_id[root_id])],
        options.output_tree,
        'newick',
    )
Пример #2
0
def create_upgma_tree(matrix, is_distance=True):
    """Merge clusters pairwise and return the resulting tree.

    ``matrix`` must provide ``create_adjacency_map()`` and ``size``. When
    ``is_distance`` is true the pair with the smallest value is merged at
    each step, otherwise the pair with the largest value. The input matrix
    is attached to the root clade as ``root.matrix``.
    """
    adj_map = matrix.create_adjacency_map()
    closest_pairs = create_closest_pairs(adj_map, is_distance=is_distance)
    clade_map = create_clade_map(adj_map)

    # Distances are minimised, other scores are maximised.
    select_best = min if is_distance else max

    for _ in range(matrix.size - 2):
        best_source, best_edge = select_best(closest_pairs.items(),
                                             key=lambda entry: entry[1])
        merge_closest_edge(adj_map,
                           clade_map,
                           closest_pairs,
                           (best_source, best_edge[0]),
                           best_edge[1],
                           is_distance=is_distance)

    remaining_clusters = list(clade_map.keys())
    remaining_clades = list(clade_map.values())
    if len(remaining_clusters) <= 1:
        root = remaining_clades[0]
    else:
        # Join whatever is left under one root; the branch length is the
        # adjacency value between the first two remaining clusters.
        first, second = remaining_clusters[0], remaining_clusters[1]
        root = BaseTree.Clade(branch_length=adj_map[first][second],
                              clades=remaining_clades)

    root.matrix = matrix
    return BaseTree.Tree(root=root, rooted=False)
Пример #3
0
    def upgma(self, distance_matrix):
        """Construct and return an UPGMA tree.

        Constructs and returns an Unweighted Pair Group Method
        with Arithmetic mean (UPGMA) tree.

        :Parameters:
            distance_matrix : DistanceMatrix
                The distance matrix for tree construction.

        :Returns:
            A ``BaseTree.Tree`` whose root has ``branch_length`` 0. A matrix
            with a single taxon yields a tree made of that one terminal clade.

        :Raises:
            TypeError if ``distance_matrix`` is not a ``DistanceMatrix``;
            ValueError if it contains no taxa.

        """
        if not isinstance(distance_matrix, DistanceMatrix):
            raise TypeError("Must provide a DistanceMatrix object.")

        # make a copy of the distance matrix to be used
        dm = copy.deepcopy(distance_matrix)
        # init terminal clades
        clades = [BaseTree.Clade(None, name) for name in dm.names]

        # Degenerate inputs never enter the merge loop below, which used to
        # leave ``inner_clade`` unbound and fail with UnboundLocalError.
        if len(dm) == 0:
            raise ValueError("Cannot build a tree from an empty distance matrix.")
        if len(dm) == 1:
            root = clades[0]
            root.branch_length = 0
            return BaseTree.Tree(root)

        # init minimum index
        min_i = 0
        min_j = 0
        inner_count = 0
        while len(dm) > 1:
            min_dist = dm[1, 0]
            # find minimum index (lower triangle only; ties keep the last pair)
            for i in range(1, len(dm)):
                for j in range(0, i):
                    if min_dist >= dm[i, j]:
                        min_dist = dm[i, j]
                        min_i = i
                        min_j = j

            # create clade
            clade1 = clades[min_i]
            clade2 = clades[min_j]
            inner_count += 1
            inner_clade = BaseTree.Clade(None, "Inner" + str(inner_count))
            inner_clade.clades.append(clade1)
            inner_clade.clades.append(clade2)
            # assign branch length: half the pair distance minus the height
            # already accumulated below each child
            clade1.branch_length = min_dist * 1.0 / 2 - self._height_of(clade1)

            clade2.branch_length = min_dist * 1.0 / 2 - self._height_of(clade2)

            # update node list
            clades[min_j] = inner_clade
            del clades[min_i]

            # rebuild distance matrix,
            # set the distances of new node at the index of min_j
            for k in range(0, len(dm)):
                if k != min_i and k != min_j:
                    dm[min_j, k] = (dm[min_i, k] + dm[min_j, k]) * 1.0 / 2

            dm.names[min_j] = "Inner" + str(inner_count)

            del dm[min_i]
        inner_clade.branch_length = 0
        return BaseTree.Tree(inner_clade)
Пример #4
0
 def create_ntree(tree):
     """Convert a nested mapping into a ``BaseTree.Clade`` hierarchy.

     Each key of ``tree`` becomes one child clade: keys whose value is an
     empty mapping become named leaves (named after the first element of
     the key), all others are converted recursively.
     """
     parent = BaseTree.Clade()
     for key in tree:
         subtree = tree[key]
         if len(subtree.values()) > 0:
             child = create_ntree(subtree)
         else:
             child = BaseTree.Clade(name=list(key)[0])
         parent.clades.append(child)
     return parent
Пример #5
0
    def fit(self):
        """Build ``self.tree`` from ``self.dist_matrix``.

        While more than two nodes remain, a pair is chosen from the Q matrix
        (via the ``_update_q_matrix`` / ``_get_pos_min_from_q_matrix``
        helpers), merged into a new node named after both members, and the
        working distance matrix is rebuilt with the new node appended as the
        last row and column. The final two nodes are joined directly.
        NOTE(review): this looks like neighbor joining -- confirm against the
        helper implementations.

        Returns False (and does nothing) when ``self.dist_matrix`` is None,
        True once ``self.tree`` has been set. The matrix must be square with
        an all-zero diagonal (checked with ``assert``).
        """
        if self.dist_matrix is None:
            return False

        assert (self.dist_matrix.shape[0] == self.dist_matrix.shape[1])
        assert (not any(
            [self.dist_matrix[i][i] for i in self.dist_matrix.index]))

        self.tree = None

        # One terminal clade per matrix column, keyed by the column label.
        self._nodes = {
            n: BaseTree.Clade(None, str(n))
            for n in self.dist_matrix.columns
        }
        self._d_matrix = self.dist_matrix

        while self._d_matrix.shape[0] > 2:

            self._update_q_matrix()

            raw_min, col_min = self._get_pos_min_from_q_matrix()
            range_raw, range_col = self._get_dist_for_neighborhood(
                raw_min, col_min)
            new_name = "{}{}".format(str(raw_min), str(col_min))

            self._update_nodes(raw_min, col_min, range_raw, range_col,
                               new_name)

            new_dist_matrix = self._get_new_dist_matrix(raw_min, col_min)
            dists_node = [
                self._get_dist_for_nodes(raw_min, col_min, index)
                for index in new_dist_matrix.index
            ]
            new_item = pd.Series(dists_node,
                                 name=new_name,
                                 index=new_dist_matrix.index)
            # DataFrame.append / Series.append were removed in pandas 2.0;
            # pd.concat is the equivalent that works on old and new versions.
            # Step 1: add the merged node as a row, then transpose so it
            # becomes a column.
            new_dist_matrix = pd.concat(
                [new_dist_matrix, new_item.to_frame().T]).T
            # Step 2: extend the vector with the node's zero self-distance
            # and add it again as the final row.
            new_item = pd.concat(
                [new_item, pd.Series(0, name=new_name, index=[new_name])])
            new_dist_matrix = pd.concat(
                [new_dist_matrix, new_item.to_frame().T])
            self._d_matrix = new_dist_matrix

        assert (len(self._nodes) == 2)
        name1 = self._d_matrix.index[0]
        name2 = self._d_matrix.index[1]
        node1 = self._nodes.pop(name1)
        node2 = self._nodes.pop(name2)
        # The last two nodes are joined by hanging the first under the second.
        node1.branch_length = self._d_matrix[name1][name2]
        node2.clades.append(node1)
        self.tree = BaseTree.Tree(node2, rooted=False)

        # Drop the working state; only self.tree is kept.
        self._nodes = None
        self._q_matrix = None
        self._d_matrix = None

        return True
Пример #6
0
def _part(clades):
    """Recursive step of the Adam consensus algorithm.

    ``clades`` holds one clade per input tree. The terminals of the first
    clade define the taxon order used for all bit strings. Returns the
    consensus clade for this group of taxa.
    """
    terms = clades[0].get_terminals()
    if len(terms) in (1, 2):
        # Nothing left to partition.
        return clades[0]

    term_names = [term.name for term in terms]
    partitions = {_BitString('1' * len(terms))}
    for clade in clades:
        for child in clade.clades:
            child_bits = _clade_to_bitstr(child, term_names)
            additions = set()
            removals = set()
            for existing in partitions:
                if existing == child_bits:
                    continue
                if existing.contains(child_bits):
                    additions.add(child_bits)
                    additions.add(existing ^ child_bits)
                    removals.add(existing)
                elif child_bits.contains(existing):
                    additions.add(existing ^ child_bits)
                elif not existing.independent(child_bits):
                    additions.add(existing & child_bits)
                    additions.add(existing & child_bits ^ child_bits)
                    additions.add(existing & child_bits ^ existing)
                    removals.add(existing)
            partitions ^= removals
            # Smallest candidates first; keep only those that are
            # independent of everything accepted so far.
            for candidate in sorted(additions, key=lambda bits: bits.count('1')):
                if all(candidate.independent(kept) for kept in partitions):
                    partitions.add(candidate)

    merged = BaseTree.Clade()
    for bits in sorted(partitions):
        ones = bits.index_one()
        group_size = len(ones)
        if group_size == 1:
            merged.clades.append(terms[ones[0]])
        elif group_size == 2:
            pair = BaseTree.Clade()
            pair.clades.extend([terms[ones[0]], terms[ones[1]]])
            merged.clades.append(pair)
        elif group_size > 2:
            # Larger groups are resolved by recursing on the matching
            # sub-clade of every input tree.
            group_names = [term_names[i] for i in ones]
            restricted = [_sub_clade(clade, group_names) for clade in clades]
            merged.clades.append(_part(restricted))
    return merged
Пример #7
0
def strict_consensus(trees, mcmc=False):
    """Search strict consensus tree from multiple trees.

    Only clades present in every input tree are kept.

    :Parameters:
        trees : iterable
            trees to produce the consensus from, or, when ``mcmc`` is True,
            tuples like (tree, number of occurrences in MCMC).

        mcmc : Boolean
            True if ``trees`` holds MCMC output tuples.

    Raises ValueError when the largest shared clade does not cover all
    terminals of the first tree.
    """
    if mcmc:
        trees = [entry[0] for entry in trees]
    tree_stream = iter(trees)
    head = next(tree_stream)

    leaves = head.get_terminals()
    counts, n_trees = _count_clades(itertools.chain([head], tree_stream))

    # Bit strings seen in every tree, biggest clade first.
    shared = sorted(
        (bits for bits, stats in counts.items() if stats[0] == n_trees),
        key=lambda bits: bits.count("1"),
        reverse=True,
    )

    if shared[0].count("1") != len(leaves):
        raise ValueError("Taxons in provided trees should be consistent")
    root = BaseTree.Clade()
    root.clades.extend(leaves)

    # Maps the bit string of still-unassigned terminals to its owning clade.
    owner_by_bits = {shared[0]: root}
    for bits in shared[1:]:
        members = [leaves[i] for i in bits.index_one()]
        inner = BaseTree.Clade()
        inner.clades.extend(members)

        # First registered clade that contains the new one becomes its parent.
        parent = next(
            ((key, owner) for key, owner in owner_by_bits.items()
             if key.contains(bits)),
            None,
        )
        if parent is not None:
            parent_bits, parent_clade = parent
            del owner_by_bits[parent_bits]
            # Move the members out of the parent and hang the new clade there.
            parent_clade.clades = [
                child for child in parent_clade.clades if child not in members
            ]
            parent_clade.clades.append(inner)
            owner_by_bits[parent_bits ^ bits] = parent_clade
        owner_by_bits[bits] = inner
    return BaseTree.Tree(root=root)
 def recurse_children( parent_id ):
     """Attach the descendants of ``parent_id`` to its clade, depth first.

     Relies on names from the enclosing scope: ``options``, ``rank_map``,
     ``child_lists``, ``nodes``, ``branch_length`` and ``_get_name``.
     """
     clustering = options.cluster is not None
     if clustering and rank_map[parent_id] == options.cluster:
         # Stop at the requested rank so that lower "no rank" nodes
         # (e.g. strains below a species) are not emitted.
         return
     if parent_id not in nodes:
         nodes[parent_id] = BaseTree.Clade(name=_get_name(parent_id), branch_length=branch_length)
     for child_id in child_lists.get(parent_id, []):
         if clustering and not (rank_map[child_id] <= options.cluster):
             continue
         if child_id not in nodes:
             nodes[child_id] = BaseTree.Clade(name=_get_name(child_id), branch_length=branch_length)
         nodes[parent_id].clades.append(nodes[child_id])
         recurse_children(child_id)
Пример #9
0
def prettyprint_tree(tree, file=None):
    """Print ``tree`` as ASCII art and, if possible, draw it graphically.

    ``tree`` is a collection of clades; it is converted to a BioPython tree
    to take advantage of BioPython's output methods.

    :Parameters:
        tree : iterable
            clades (sized iterables) describing the tree.
        file : file-like or None
            passed through to ``Phylo.draw_ascii``.
    """
    def create_ntree(tree):
        # Recursively turn the nested dictionary into BaseTree clades;
        # keys with an empty value become named leaves.
        ntree = BaseTree.Clade()
        for key in tree:
            el = tree[key]
            if len(el.values()) > 0:
                ntree.clades.append(create_ntree(el))
            else:
                ntree.clades.append(BaseTree.Clade(name=list(key)[0]))
        return ntree

    # Sort the clades from largest to smallest
    new_tree = sorted(tree, key=lambda x: -len(x))
    # Build a dictionary representation of the tree
    tree_dict = {}
    for clade in new_tree:
        tree_dict = create_tree_dict(tree_dict, clade)
    # Convert the dictionary representation to a BioPython Tree object
    ntree = BaseTree.Tree(create_ntree(tree_dict))
    # Use the BioPython print method
    Phylo.draw_ascii(ntree, file=file)
    try:
        Phylo.draw(ntree)
    except Exception:
        # Graphical drawing is best-effort (presumably it fails when no
        # plotting backend is available). Catch Exception rather than using
        # a bare except so Ctrl-C and SystemExit still propagate.
        pass
Пример #10
0
    def _update_nodes(self, n1, n2, d1, d2, new_n):
        """Replace nodes ``n1`` and ``n2`` by a parent node named ``new_n``.

        Both nodes are removed from ``self._nodes``, get branch lengths
        ``d1`` and ``d2`` (converted to float), and become the children of a
        new clade stored under ``new_n``.
        """
        first_child = self._nodes.pop(n1)
        second_child = self._nodes.pop(n2)

        for child, distance in ((first_child, d1), (second_child, d2)):
            child.branch_length = float(distance)

        merged = BaseTree.Clade(None, new_n, [first_child, second_child])
        self._nodes[new_n] = merged
def strict_consensus(trees):
    """Search strict consensus tree from multiple trees.

    Only clades present in every input tree are kept.

    :Parameters:
        trees: list
            list of trees to produce consensus tree.

    Raises ValueError when the largest shared clade does not cover all
    terminals of the first tree.
    """
    leaves = trees[0].get_terminals()
    counts = _count_clades(trees)
    n_trees = len(trees)

    # Bit strings seen in every tree, biggest clade first.
    shared = sorted(
        (bits for bits, stats in counts.items() if stats[0] == n_trees),
        key=lambda bits: bits.count('1'),
        reverse=True,
    )

    if shared[0].count('1') != len(leaves):
        raise ValueError('Taxons in provided trees should be consistent')
    root = BaseTree.Clade()
    root.clades.extend(leaves)

    # Maps the bit string of still-unassigned terminals to its owning clade.
    owner_by_bits = {shared[0]: root}
    for bits in shared[1:]:
        members = [leaves[i] for i in bits.index_one()]
        inner = BaseTree.Clade()
        inner.clades.extend(members)

        # First registered clade that contains the new one becomes its parent.
        parent = next(
            ((key, owner) for key, owner in owner_by_bits.items()
             if key.contains(bits)),
            None,
        )
        if parent is not None:
            parent_bits, parent_clade = parent
            del owner_by_bits[parent_bits]
            # Move the members out of the parent and hang the new clade there.
            parent_clade.clades = [
                child for child in parent_clade.clades if child not in members
            ]
            parent_clade.clades.append(inner)
            owner_by_bits[parent_bits ^ bits] = parent_clade
        owner_by_bits[bits] = inner
    return BaseTree.Tree(root=root)
Пример #12
0
    def create_tree(self, root: ParsimonyClade):
        """
        Wrap the given clade in a rooted tree.

        root: clade to use as the root of the tree

        returns: a ``BaseTree.Tree`` created with ``rooted=True``
        """
        rooted_tree = BaseTree.Tree(root, rooted=True)
        return rooted_tree
Пример #13
0
def create_clade_map(adj_map):
    """Return a dict mapping every cluster in ``adj_map`` to a fresh clade.

    Each clade is named ``str(cluster)`` and gets a ``matrix`` attribute
    initialised to None.
    """
    clade_map = {}
    for cluster in adj_map.keys():
        leaf = BaseTree.Clade(name=str(cluster))
        leaf.matrix = None
        clade_map[cluster] = leaf
    return clade_map
Пример #14
0
def adam_consensus(trees):
    """Search Adam Consensus tree from multiple trees.

    :Parameters:
        trees : list
            list of trees to produce consensus tree.

    Returns a rooted ``BaseTree.Tree`` built from the root clades of
    ``trees``.
    """
    root_clades = []
    for tree in trees:
        root_clades.append(tree.root)
    consensus_root = _part(root_clades)
    return BaseTree.Tree(root=consensus_root, rooted=True)
Пример #15
0
def get_node_by_id(tree: "Phylo.BaseTree.Tree",
                   postorder_node_id: int) -> "Phylo.BaseTree.Clade":
    """
    Finds a tree node by its post-order DFS id.
    These IDs are used in .jplace formatted files.

    tree: object exposing ``find_elements(order=...)``; elements are
        numbered from 0 in the order that method yields them.
    postorder_node_id: zero-based position of the wanted node.

    Raises RuntimeError if the traversal ends before reaching that id.

    The annotations are written as strings so they are not evaluated when
    the function is defined; the previous annotation named the
    ``Phylo.BaseTree`` module instead of the ``Tree`` class.
    """
    for current_id, node in enumerate(tree.find_elements(order='postorder')):
        if current_id == postorder_node_id:
            return node
    raise RuntimeError(str(postorder_node_id) + " not found.")
Пример #16
0
	def create_tree(self):
		"""Build the UPGMA tree from ``self.distances`` and cache it.

		Returns the cached ``self.tree`` when it already exists, and None
		(also stored in ``self.tree``) when ``self.distances`` is falsy.
		"""
		if hasattr(self, 'tree'):
			return self.tree

		if not self.distances:
			self.tree = None
			return None

		clades = [BaseTree.Clade(None, n) for n in self.distances.names]

		def index_of(name):
			# Position of the clade called `name` in the working list.
			return [pos for pos, el in enumerate(clades) if el.name == name][0]

		while len(self.distances.names) > 1:
			dist, i, j = _find_min(self.distances)
			first_pos, second_pos = index_of(i), index_of(j)
			first, second = clades[first_pos], clades[second_pos]
			merged = BaseTree.Clade(0, str(i) + str(j))

			_recalc_height(first, dist)
			_recalc_height(second, dist)
			merged.clades.extend([first, second])

			# Remove the higher index first so the lower one stays valid.
			for pos in (max(first_pos, second_pos), min(first_pos, second_pos)):
				clades.pop(pos)

			clades.append(merged)
			self.distances = _join_clades(i, j, self.distances)

		self.tree = BaseTree.Tree(clades[0])
		return self.tree
Пример #17
0
def _sub_clade(clade, term_names):
    """Extract a compatible subclade that only contains the given terminal names (PRIVATE).

    Looks up the terminals named in ``term_names`` inside ``clade`` and takes
    their common ancestor. If that ancestor has exactly as many terminals as
    names were requested it is returned unchanged; otherwise a new clade is
    built that contains only the requested terminals, grouped according to
    the inner clades of the ancestor.
    """
    term_clades = [clade.find_any(name) for name in term_names]
    sub_clade = clade.common_ancestor(term_clades)
    if len(term_names) != sub_clade.count_terminals():
        # The ancestor holds extra terminals: start from a flat clade with
        # just the requested terminals and re-create the grouping below.
        temp_clade = BaseTree.Clade()
        temp_clade.clades.extend(term_clades)
        for c in sub_clade.find_clades(terminal=False, order="preorder"):
            if c == sub_clade.root:
                continue
            # Requested terminals that sit under this inner clade.
            childs = set(c.find_clades(terminal=True)) & set(term_clades)
            if childs:
                for tc in temp_clade.find_clades(terminal=False, order="preorder"):
                    tc_childs = set(tc.clades)
                    tc_new_clades = tc_childs - childs
                    # Only regroup when all of them are direct children of tc
                    # and at least one other child would remain.
                    if childs.issubset(tc_childs) and tc_new_clades:
                        # NOTE: child order comes from set iteration here.
                        tc.clades = list(tc_new_clades)
                        child_clade = BaseTree.Clade()
                        child_clade.clades.extend(list(childs))
                        tc.clades.append(child_clade)
        sub_clade = temp_clade
    return sub_clade
Пример #18
0
def adam_consensus(trees, mcmc=False):
    """Search Adam Consensus tree from multiple trees.

    :Parameters:
        trees : list
            list of trees to produce consensus tree, or, when ``mcmc`` is
            True, a list of tuples like (tree, number of occurrences in MCMC).

        mcmc : Boolean
            True if ``trees`` holds MCMC output tuples.

    Returns a rooted ``BaseTree.Tree``.
    """
    candidate_trees = [entry[0] for entry in trees] if mcmc else trees
    root_clades = [candidate.root for candidate in candidate_trees]
    consensus_root = _part(root_clades)
    return BaseTree.Tree(root=consensus_root, rooted=True)
Пример #19
0
def merge_closest_edge(adj_map,
                       clade_map,
                       closest_pairs,
                       closest_pair,
                       pair_value,
                       is_distance=True):
    """Merge the two clusters in ``closest_pair`` in place.

    The smaller of the two cluster keys survives as the merged cluster; the
    other one is removed from ``adj_map``, ``closest_pairs`` and
    ``clade_map``. Every other cluster's value to the merged cluster becomes
    the mean of its values to the two originals. The merged clade gets
    ``pair_value`` as branch length. ``is_distance`` selects whether the
    best neighbour is the minimum (True) or the maximum (False).
    """
    first, second = closest_pair[0], closest_pair[1]
    if second < first:
        source, merging = second, first
    else:
        source, merging = first, second

    pick_best = min if is_distance else max

    def _edge_value(entry):
        return entry[1]

    source_dict = {}
    for cluster, adj_dict in adj_map.items():
        if cluster in closest_pair:
            continue

        # Average the two edges into the surviving key, drop the other one.
        adj_dict[source] = (adj_dict[source] + adj_dict[merging]) / 2
        del adj_dict[merging]
        source_dict[cluster] = adj_dict[source]

        # Re-pick this cluster's best neighbour if it pointed at either of
        # the merged clusters.
        ccp = (cluster, closest_pairs[cluster][0])
        if source in ccp or merging in ccp:
            closest_pairs[cluster] = pick_best(adj_dict.items(),
                                               key=_edge_value)

    del adj_map[merging]
    adj_map[source] = source_dict

    del closest_pairs[merging]
    closest_pairs[source] = pick_best(source_dict.items(), key=_edge_value)

    absorbed_clade = clade_map.pop(merging)
    merged_clade = BaseTree.Clade(branch_length=pair_value,
                                  clades=[clade_map[source], absorbed_clade])
    clade_map[source] = merged_clade
    merged_clade.matrix = None
Пример #20
0
    def nj(self, distance_matrix):
        """Construct and return a Neighbor Joining tree.

        :Parameters:
            distance_matrix : DistanceMatrix
                The distance matrix for tree construction.

        Returns an unrooted ``BaseTree.Tree`` (``rooted=False``). The input
        matrix is deep-copied and left untouched. Matrices with one or two
        taxa are handled as special cases without running the joining loop.

        Raises TypeError if ``distance_matrix`` is not a ``DistanceMatrix``.

        """
        if not isinstance(distance_matrix, DistanceMatrix):
            raise TypeError("Must provide a DistanceMatrix object.")

        # make a copy of the distance matrix to be used
        dm = copy.deepcopy(distance_matrix)
        # init terminal clades
        clades = [BaseTree.Clade(None, name) for name in dm.names]
        # init node distance
        node_dist = [0] * len(dm)
        # init minimum index
        min_i = 0
        min_j = 0
        inner_count = 0
        # special cases for Minimum Alignment Matrices
        if len(dm) == 1:
            # a single taxon: the tree is just that terminal clade
            root = clades[0]

            return BaseTree.Tree(root, rooted=False)
        elif len(dm) == 2:
            # minimum distance will always be [1,0]
            min_i = 1
            min_j = 0
            clade1 = clades[min_i]
            clade2 = clades[min_j]
            # split the only distance between the two branches
            clade1.branch_length = dm[min_i, min_j] / 2.0
            clade2.branch_length = dm[min_i, min_j] - clade1.branch_length
            inner_clade = BaseTree.Clade(None, "Inner")
            inner_clade.clades.append(clade1)
            inner_clade.clades.append(clade2)
            clades[0] = inner_clade
            root = clades[0]

            return BaseTree.Tree(root, rooted=False)
        while len(dm) > 2:
            # calculate nodeDist: row sum divided by (number of nodes - 2)
            for i in range(0, len(dm)):
                node_dist[i] = 0
                for j in range(0, len(dm)):
                    node_dist[i] += dm[i, j]
                node_dist[i] = node_dist[i] / (len(dm) - 2)

            # find minimum distance pair, i.e. the pair minimising
            # dm[i, j] - node_dist[i] - node_dist[j]; only a strictly smaller
            # value replaces the current minimum
            min_dist = dm[1, 0] - node_dist[1] - node_dist[0]
            min_i = 0
            min_j = 1
            for i in range(1, len(dm)):
                for j in range(0, i):
                    temp = dm[i, j] - node_dist[i] - node_dist[j]
                    if min_dist > temp:
                        min_dist = temp
                        min_i = i
                        min_j = j
            # create clade
            clade1 = clades[min_i]
            clade2 = clades[min_j]
            inner_count += 1
            inner_clade = BaseTree.Clade(None, "Inner" + str(inner_count))
            inner_clade.clades.append(clade1)
            inner_clade.clades.append(clade2)
            # assign branch length; the second branch takes whatever is left
            # of the pair distance
            clade1.branch_length = (dm[min_i, min_j] + node_dist[min_i] -
                                    node_dist[min_j]) / 2.0
            clade2.branch_length = dm[min_i, min_j] - clade1.branch_length

            # update node list: the new inner clade takes the slot of min_j
            clades[min_j] = inner_clade
            del clades[min_i]

            # rebuild distance matrix,
            # set the distances of new node at the index of min_j
            for k in range(0, len(dm)):
                if k != min_i and k != min_j:
                    dm[min_j, k] = (dm[min_i, k] + dm[min_j, k] -
                                    dm[min_i, min_j]) / 2.0

            dm.names[min_j] = "Inner" + str(inner_count)
            del dm[min_i]

        # set the last clade as one of the child of the inner_clade
        # (two entries remain; the most recently created inner clade becomes
        # the root and the other entry hangs off it with the remaining
        # distance as its branch length)
        root = None
        if clades[0] == inner_clade:
            clades[0].branch_length = 0
            clades[1].branch_length = dm[1, 0]
            clades[0].clades.append(clades[1])
            root = clades[0]
        else:
            clades[0].branch_length = dm[1, 0]
            clades[1].branch_length = 0
            clades[1].clades.append(clades[0])
            root = clades[1]

        return BaseTree.Tree(root, rooted=False)
Пример #21
0
    def main(self, tree_filename, tree_format='newick'):
        """Download the GBIF checklist taxonomy and write it out as a tree.

        The archive is fetched with ``self.download_file``; ``taxon.txt`` is
        extracted into ``self.data_dir`` unless a copy already exists there.
        Each accepted taxon becomes a clade attached to its parent; taxa whose
        parent cannot be resolved are left out of the tree. Synonyms are only
        recorded (as ``tu_attributes`` on the accepted taxon) when
        ``tree_format`` is ``'cdao'``.

        tree_filename: destination file for the tree.
        tree_format: output format name passed to ``bp.write``.

        Uses ``print(...)`` with a single argument and ``dict.items()`` so the
        method runs unchanged on both Python 2 and Python 3.
        """
        col_delimiter = '\t'
        url = 'http://ecat-dev.gbif.org/repository/export/checklist1.zip'

        # download the taxonomy archive
        filename = self.download_file(url)

        # extract the tables
        extract = 'taxon.txt'
        if os.path.exists(os.path.join(self.data_dir, extract)):
            print('Using existing copy of %s' % extract)
        else:
            print('Extracting %s from %s...' % (extract, filename))
            # The context manager closes the archive even if extraction fails.
            with zipfile.ZipFile(filename, mode='r') as archive:
                archive.extract(extract, path=self.data_dir)

        # build BioPython clades
        print('Reading taxonomy...')
        nodes = {}
        with open(os.path.join(self.data_dir, 'taxon.txt')) as taxonomy_file:
            for line in taxonomy_file:
                values = line.strip().split(col_delimiter)
                taxon_id, parent_id, syn_id, _, name, _, status = values[:7]

                # skip incertae sedis taxa
                if taxon_id == '0':
                    continue

                if syn_id:
                    # Rows pointing at another taxon are kept only when they
                    # are flagged as synonyms, and only for CDAO output.
                    if 'synonym' in status and tree_format == 'cdao':
                        nodes[taxon_id] = ('synonym', name, syn_id)
                else:
                    nodes[taxon_id] = BaseTree.Clade(name=name)
                    nodes[taxon_id].parent_id = parent_id

        print('Found %s OTUs.' % len(nodes))
        # Taxa with an empty parent id hang directly off this root.
        nodes[''] = root_node = BaseTree.Clade()

        # create tree from nodes dictionary
        print('Building tree...')
        for node_id, this_node in nodes.items():
            if not node_id:
                continue

            if isinstance(this_node, BaseTree.Clade):
                try:
                    parent_node = nodes[this_node.parent_id]
                    parent_node.clades.append(this_node)
                    del this_node.parent_id
                except (KeyError, AttributeError):
                    # Unknown parent, or the parent entry is not a clade:
                    # leave this taxon unattached.
                    pass

            elif this_node[0] == 'synonym':
                _, name, syn_id = this_node
                try:
                    accepted_node = nodes[syn_id]
                except KeyError:
                    continue

                if not isinstance(accepted_node, BaseTree.Clade):
                    continue

                if not hasattr(accepted_node, 'tu_attributes'):
                    accepted_node.tu_attributes = []
                accepted_node.tu_attributes.append(
                    ('<http://www.w3.org/2004/02/skos/core#altLabel>',
                     Taxonomy.format_rdf_string(name)))

        tree = BaseTree.Tree(root=root_node)

        # write tree to file
        print('Writing %s tree to %s...' % (tree_format, tree_filename))
        bp.write([tree], tree_filename, tree_format)

        print('Done!')
Пример #22
0
def majority_consensus(trees, cutoff=0):
    """Search majority rule consensus tree from multiple trees.

    This is an extended majority rule method, which means that you can set
    any cutoff between 0 ~ 1 instead of the fixed 0.5. The default value of
    cutoff is 0 to create a relaxed binary consensus tree in any condition
    (as long as one of the provided trees is a binary tree). The branch
    length of each consensus clade in the result consensus tree is the
    average length of all counts for that clade.

    :Parameters:
        trees : iterable
            iterable of trees to produce consensus tree.
        cutoff : number
            threshold between 0 ~ 1; candidate clades are processed in
            descending order of occurrence and processing stops at the first
            one whose confidence (in percent) is below ``cutoff * 100``.

    Returns a ``BaseTree.Tree`` rooted at a new clade. Every inner clade that
    is created gets ``confidence`` set to ``100 * count / tree_count`` and
    ``branch_length`` set to ``branch_length_sum / count``.

    Raises ValueError if the first bitstring after sorting does not have one
    set bit per terminal of the first tree.
    """
    tree_iter = iter(trees)
    first_tree = next(tree_iter)

    # The terminals of the first tree define the tip order that the
    # bitstrings are indexed against below.
    terms = first_tree.get_terminals()
    # The first tree was consumed from the iterator, so chain it back in.
    bitstr_counts, tree_count = _count_clades(
        itertools.chain([first_tree], tree_iter))

    # Sort bitstrs by descending #occurrences, then #tips, then tip order
    bitstrs = sorted(
        bitstr_counts.keys(),
        key=lambda bitstr:
        (bitstr_counts[bitstr][0], bitstr.count('1'), str(bitstr)),
        reverse=True)
    root = BaseTree.Clade()
    # The top-ranked bitstring is expected to be the all-tips clade; it
    # becomes the root and initially holds every terminal directly.
    if bitstrs[0].count('1') == len(terms):
        root.clades.extend(terms)
    else:
        raise ValueError('Taxons in provided trees should be consistent')
    # Make a bitstr-to-clades dict and store root clade
    bitstr_clades = {bitstrs[0]: root}
    # create inner clades
    for bitstr in bitstrs[1:]:
        # apply majority rule
        count_in_trees, branch_length_sum = bitstr_counts[bitstr]
        confidence = 100.0 * count_in_trees / tree_count
        # bitstrs is sorted by descending count, so every remaining candidate
        # is below the cutoff as well.
        if confidence < cutoff * 100.0:
            break
        clade_terms = [terms[i] for i in bitstr.index_one()]
        clade = BaseTree.Clade()
        clade.clades.extend(clade_terms)
        clade.confidence = confidence
        clade.branch_length = branch_length_sum / count_in_trees
        # Already accepted clades, largest (most tips) first.
        bsckeys = sorted(bitstr_clades,
                         key=lambda bs: bs.count('1'),
                         reverse=True)

        # check if current clade is compatible with previous clades and
        # record its possible parent and child clades.
        compatible = True
        parent_bitstr = None
        child_bitstrs = []  # multiple independent children
        for bs in bsckeys:
            if not bs.iscompatible(bitstr):
                compatible = False
                break
            # assign the closest ancestor as its parent
            # as bsckeys is sorted, it should be the last one
            if bs.contains(bitstr):
                parent_bitstr = bs
            # assign the closest descendant as its child
            # the largest and independent clades
            if (bitstr.contains(bs) and bs != bitstr
                    and all(c.independent(bs) for c in child_bitstrs)):
                child_bitstrs.append(bs)
        # A conflicting candidate is dropped; lower-ranked ones are still tried.
        if not compatible:
            continue

        if parent_bitstr:
            # insert current clade; remove old bitstring
            parent_clade = bitstr_clades.pop(parent_bitstr)
            # update parent clade children: the tips now owned by the new
            # clade are detached from the parent
            parent_clade.clades = [
                c for c in parent_clade.clades if c not in clade_terms
            ]
            # set current clade as child of parent_clade
            parent_clade.clades.append(clade)
            # update bitstring
            # parent = parent ^ bitstr
            # update clade
            bitstr_clades[parent_bitstr] = parent_clade

        if child_bitstrs:
            # Move previously accepted sub-clades from the parent under the
            # new clade, then drop the tips they already cover.
            # NOTE(review): parent_clade is only bound when parent_bitstr was
            # found above; this relies on the root bitstring containing every
            # candidate - confirm.
            remove_list = []
            for c in child_bitstrs:
                remove_list.extend(c.index_one())
                child_clade = bitstr_clades[c]
                parent_clade.clades.remove(child_clade)
                clade.clades.append(child_clade)
            remove_terms = [terms[i] for i in remove_list]
            clade.clades = [c for c in clade.clades if c not in remove_terms]
        # put new clade
        bitstr_clades[bitstr] = clade
        # Stop early once the number of stored clades reaches the limits
        # below (presumably a fully resolved tree - TODO confirm).
        if ((len(bitstr_clades) == len(terms) - 1) or
            (len(bitstr_clades) == len(terms) - 2 and len(root.clades) == 3)):
            break
    return BaseTree.Tree(root=root)
Пример #23
0
    def nj_full_gpu(self, distance_matrix):
        """Construct a neighbor-joining tree, running the searches on the GPU.

        Every round launches two CUDA kernels through PyCUDA:
        ``DeviceNodeDist`` adds up each row of the distance matrix and divides
        the sum by ``N - 2``; ``findMin`` scans all pairs ``(x, y)`` for the
        smallest ``dm[x, y] - node_dist[x] - node_dist[y]``.  The winning pair
        is joined on the host and the distance matrix is shrunk by one.

        :Parameters:
            distance_matrix : DistanceMatrix
                The distance matrix for tree construction.  It is deep-copied,
                so the caller's matrix is not modified.

        Returns a ``BaseTree.Tree`` created with ``rooted=False``.

        Raises TypeError if ``distance_matrix`` is not a ``DistanceMatrix``.
        """
        if not isinstance(distance_matrix, DistanceMatrix):
            raise TypeError("Must provide a DistanceMatrix object.")

        # make a copy of the distance matrix to be used
        dm = copy.deepcopy(distance_matrix)
        # init terminal clades
        clades = [BaseTree.Clade(None, name) for name in dm.names]
        # init node distance
        node_dist = [0] * len(dm)
        # init minimum index
        min_i = 0
        min_j = 0
        inner_count = 0
        # special cases for Minimum Alignment Matrices
        if len(dm) == 1:
            root = clades[0]

            return BaseTree.Tree(root, rooted=False)
        elif len(dm) == 2:
            # minimum distance will always be [1,0]
            min_i = 1
            min_j = 0
            clade1 = clades[min_i]
            clade2 = clades[min_j]
            clade1.branch_length = dm[min_i, min_j] / 2.0
            clade2.branch_length = dm[min_i, min_j] - clade1.branch_length
            inner_clade = BaseTree.Clade(None, "Inner")
            inner_clade.clades.append(clade1)
            inner_clade.clades.append(clade2)
            clades[0] = inner_clade
            root = clades[0]

            return BaseTree.Tree(root, rooted=False)

        mod = SourceModule("""
          #include <stdio.h>
          #include <stdlib.h>
          __global__ void DeviceNodeDist(double *device_dm, double *device_node_dist, int N)
          {
              const int tid = threadIdx.y + blockIdx.y* blockDim.y;
              if (tid >= N) return;
              for(int i = 0; i< N; i++){
                if(tid< i){
                    device_node_dist[tid] += device_dm[(i*(i+1))/2 + tid];

                }else{
                    device_node_dist[tid] += device_dm[(tid*(tid+1))/2 + i];

                }

               }

          device_node_dist[tid]= (double)(device_node_dist[tid]/ (N-2));
          }""")

        mod1 = SourceModule("""
         __global__ void findMin(double *dm, double *node_dist, long long *index_x, long long *index_y, double *local_min, int c, int l, int dm_length)
        {
            int k = threadIdx.y + blockIdx.y*blockDim.y;

            double min_dist = 0.0;  

            int min_x =0;
            int min_y =0;
            int x = 0;
            int y = 0;

            for(int i= k*c ; i< (k+1)*c; i++){
                if(i<l)
                {
                    for(int j=0; j<dm_length; j++){
                        if(i==0){
                            x=1;
                            y=0;
                            break;
                        }else{
                            int t_val = ((j+1)*(j+2))/2 ;
                            if(i < t_val){
                                x=j+1;
                                y= i-(t_val-j-1);
                                break;
                            }else if(i== t_val){
                                x = j+2;
                                y = 0;
                                break;
                            }

                        }
                    }


                    double temp = dm[i] - (node_dist[x] + node_dist[y] );
                    if(min_dist > temp){
                        min_dist = temp;
                        min_x = x;
                        min_y = y;
                    }
                } 
            }
            local_min[k]=min_dist;
            index_x[k]= min_x;
            index_y[k]= min_y;

        }""")

        # The kernel handles do not change between rounds; look them up once.
        node_dist_kernel = mod.get_function("DeviceNodeDist")
        find_min_kernel = mod1.get_function("findMin")

        while len(dm) > 2:
            length = len(dm)

            # ---- kernel 1: per-node distance sums --------------------------
            # Flatten the lower triangle (diagonal included) row by row.  The
            # kernel reads ``double``, so force float64 even when the matrix
            # holds Python ints.
            host_dm = np.array(
                [value for row in dm.matrix for value in row],
                dtype=np.float64)
            host_node_dist = np.zeros(length, dtype=np.float64)

            # get the optimum block size based on dataset size
            if length < 128:
                BLOCKSIZE = 128
            elif length < 256:
                BLOCKSIZE = 256
            elif length < 512:
                BLOCKSIZE = 512
            else:
                BLOCKSIZE = 1024

            device_dm = cuda.mem_alloc(host_dm.nbytes)
            device_node_dist = cuda.mem_alloc(host_node_dist.nbytes)

            cuda.memcpy_htod(device_dm, host_dm)
            # The kernel accumulates with "+=", and mem_alloc does not clear
            # the buffer, so the accumulator must be zeroed explicitly.
            cuda.memcpy_htod(device_node_dist, host_node_dist)

            # Grid dimensions must be integers: use floor division so this
            # also works when "/" is true division.
            node_dist_kernel(device_dm, device_node_dist, np.int32(length),
                             block=(1, BLOCKSIZE, 1),
                             grid=(1, length // BLOCKSIZE + 1, 1))
            finished = cuda.Event()
            finished.record()
            finished.synchronize()

            cuda.memcpy_dtoh(host_node_dist, device_node_dist)
            node_dist[0:length] = host_node_dist.tolist()

            device_dm.free()

            # ---- kernel 2: minimum of dm[x, y] - node_dist[x] - node_dist[y]
            # Strict lower triangle (each row without its diagonal entry).
            dm_cpu = np.array(
                [value for row in dm.matrix[1:] for value in row[:-1]],
                dtype=np.float64)

            combinations = length * (length - 1) // 2

            if combinations < 1024 * 128:
                block_size = length // 2
            else:
                block_size = 512

            # Each of the block_size threads scans local_count consecutive
            # pairs.  Round up so that the last pairs are never skipped; the
            # kernel ignores indices >= l.
            local_count = (combinations + block_size - 1) // block_size

            # The kernel writes ``long long`` indices and ``double`` minima.
            index_x = np.zeros(block_size, dtype=np.int64)
            index_y = np.zeros(block_size, dtype=np.int64)
            min_val = np.zeros(block_size, dtype=np.float64)

            device_pairs = cuda.mem_alloc(dm_cpu.nbytes)
            device_index_x = cuda.mem_alloc(index_x.nbytes)
            device_index_y = cuda.mem_alloc(index_y.nbytes)
            device_min = cuda.mem_alloc(min_val.nbytes)

            cuda.memcpy_htod(device_pairs, dm_cpu)

            find_min_kernel(device_pairs, device_node_dist,
                            device_index_x, device_index_y, device_min,
                            np.int32(local_count), np.int32(dm_cpu.size),
                            np.int32(length),
                            block=(1, block_size, 1))
            finished = cuda.Event()
            finished.record()
            finished.synchronize()

            cuda.memcpy_dtoh(min_val, device_min)
            cuda.memcpy_dtoh(index_x, device_index_x)
            cuda.memcpy_dtoh(index_y, device_index_y)

            device_pairs.free()
            device_min.free()
            device_index_x.free()
            device_index_y.free()
            device_node_dist.free()

            # Reduce the per-thread minima on the host; argmin returns the
            # first occurrence, matching a first-match linear scan.
            best = int(np.argmin(min_val))
            # Plain ints: the matrix indexing below may reject NumPy integers.
            min_i = int(index_x[best])
            min_j = int(index_y[best])

            # create clade
            clade1 = clades[min_i]
            clade2 = clades[min_j]
            inner_count += 1
            inner_clade = BaseTree.Clade(None, "Inner" + str(inner_count))
            inner_clade.clades.append(clade1)
            inner_clade.clades.append(clade2)
            # assign branch length
            clade1.branch_length = (dm[min_i, min_j] + node_dist[min_i] -
                                    node_dist[min_j]) / 2.0
            clade2.branch_length = dm[min_i, min_j] - clade1.branch_length

            # update node list
            clades[min_j] = inner_clade
            del clades[min_i]

            # rebuild distance matrix,
            # set the distances of new node at the index of min_j
            for k in range(0, len(dm)):
                if k != min_i and k != min_j:
                    dm[min_j, k] = (dm[min_i, k] + dm[min_j, k] -
                                    dm[min_i, min_j]) / 2.0

            dm.names[min_j] = "Inner" + str(inner_count)
            del dm[min_i]

        # set the last clade as one of the child of the inner_clade
        root = None
        if clades[0] == inner_clade:
            clades[0].branch_length = 0
            clades[1].branch_length = dm[1, 0]
            clades[0].clades.append(clades[1])
            root = clades[0]
        else:
            clades[0].branch_length = dm[1, 0]
            clades[1].branch_length = 0
            clades[1].clades.append(clades[0])
            root = clades[1]

        return BaseTree.Tree(root, rooted=False)
Пример #24
0
    def main(self, tree_filename, tree_format='newick'):
        """Build a taxonomy tree from the ITIS table dump and write it out.

        Downloads the ITIS archive via ``self.download_file``, extracts the
        needed tables into ``self.data_dir`` (tables already present there are
        reused), links every accepted/valid taxonomic unit to its parent, and
        writes the resulting tree with ``bp.write``.

        :Parameters:
            tree_filename : str
                path of the tree file to write.
            tree_format : str
                format name handed to ``bp.write``.  For ``'cdao'`` synonyms
                and vernacular names are attached to the clades as
                ``skos:altLabel`` entries in ``tu_attributes``.
        """
        col_delimiter = '|'
        url = 'http://www.itis.gov/downloads/itisMySQLTables.tar.gz'

        # download the taxonomy archive
        filename = self.download_file(url)

        # extract the tables
        for extract in ('taxonomic_units', 'longnames', 'synonym_links',
                        'vernaculars'):
            if os.path.exists(os.path.join(self.data_dir, extract)):
                print('Using existing copy of %s' % extract)
            else:
                print('Extracting %s from %s...' % (extract, filename))
                archive = tarfile.open(name=filename, mode='r:gz')
                # Close the archive even when the member lookup or the
                # extraction fails.
                try:
                    full_extract = [
                        x for x in archive.getnames()
                        if x.split('/')[-1] == extract
                    ][0]
                    member = archive.getmember(full_extract)
                    # Rename the member so it lands directly in data_dir
                    # instead of inside the archive's own directory layout.
                    member.name = extract
                    archive.extract(member, path=self.data_dir)
                finally:
                    archive.close()

        # get names for all ITIS TSNs from longnames table
        print('Getting names...')
        names = {}
        with open(os.path.join(self.data_dir, 'longnames')) as names_file:
            for line in names_file:
                line = line.strip()
                values = line.split(col_delimiter)
                tax_id, name = values
                names[tax_id] = name

        # read all node info from taxonomic_units
        print('Reading taxonomy...')
        nodes = {}
        with open(os.path.join(self.data_dir,
                               'taxonomic_units')) as nodes_file:
            for line in nodes_file:
                line = line.strip()
                values = line.split(col_delimiter)

                (tax_id, usage, parent_id,
                 uncertain_parent) = [values[n] for n in (0, 10, 17, 23)]

                #if uncertain_parent: continue
                if usage not in ('accepted', 'valid'):
                    continue

                this_node = BaseTree.Clade(name=names[tax_id])
                nodes[tax_id] = this_node
                # Temporary attribute; removed once the node has been linked.
                this_node.parent_id = parent_id

        other_names = defaultdict(set)
        if tree_format == 'cdao':
            # get synonym definitions
            print('Getting synonyms...')
            with open(os.path.join(self.data_dir,
                                   'synonym_links')) as synonym_file:
                for line in synonym_file:
                    line = line.strip()
                    values = line.split(col_delimiter)
                    node_id, syn_id, _ = values
                    # Synonyms are stored as tuples, not clades; they are
                    # resolved against their accepted node further down.
                    nodes[node_id] = ('synonym', names[node_id], syn_id)
            with open(os.path.join(self.data_dir,
                                   'vernaculars')) as vernacular_file:
                for line in vernacular_file:
                    line = line.strip()
                    values = line.split(col_delimiter)
                    tax_id, name = values[:2]
                    other_names[tax_id].add(name)

        print('Found %s OTUs.' % len(nodes))
        nodes['0'] = root_node = BaseTree.Clade()

        # create tree from nodes dictionary
        print('Building tree...')
        alt_label = '<http://www.w3.org/2004/02/skos/core#altLabel>'
        # Iterating the keys works on both Python 2 and 3 and does not copy
        # the dictionary; the loop never adds or removes keys.
        for node_id in nodes:
            if node_id == '0':
                continue
            this_node = nodes[node_id]

            if isinstance(this_node, BaseTree.Clade):
                try:
                    parent_node = nodes[this_node.parent_id]
                    parent_node.clades.append(this_node)
                except (KeyError, AttributeError):
                    # Unknown parent, or the parent is a synonym tuple:
                    # leave this node out of the tree.
                    continue

                del this_node.parent_id

                if not hasattr(this_node, 'tu_attributes'):
                    this_node.tu_attributes = []
                # .get avoids inserting an empty set for every node.
                for name in other_names.get(node_id, ()):
                    this_node.tu_attributes.append(
                        (alt_label, Taxonomy.format_rdf_string(name)))

            elif this_node[0] == 'synonym':
                _, name, syn_id = this_node
                try:
                    accepted_node = nodes[syn_id]
                except KeyError:
                    continue

                if not isinstance(accepted_node, BaseTree.Clade):
                    continue

                if not hasattr(accepted_node, 'tu_attributes'):
                    accepted_node.tu_attributes = []
                accepted_node.tu_attributes.append(
                    (alt_label, Taxonomy.format_rdf_string(name)))
                #print 'Synonym: %s -> %s' % (name, nodes[syn_id].name)

        tree = BaseTree.Tree(root=root_node)

        # write tree to file
        print('Writing %s tree to %s...' % (tree_format, tree_filename))
        bp.write([tree], tree_filename, tree_format)

        print('Done!')
Пример #25
0
    def upgma(self, distance_matrix):
        """Construct a tree by repeatedly merging the closest pair of clusters.

        The time spent searching for the minimum distance is added to
        ``self.gap``.

        :Parameters:
            distance_matrix : DistanceMatrix
                The distance matrix for tree construction.  It is deep-copied,
                so the caller's matrix is not modified.

        Returns a ``BaseTree.Tree`` rooted at the last clade created.  For a
        matrix with a single entry the tree consists of that one terminal.

        Raises ValueError if the matrix has no entries.
        """
        # make a copy of the distance matrix to be used
        dm = copy.deepcopy(distance_matrix)
        # dm_count holds the weight of each entry of dm: every entry starts
        # at 1 and the weights of two merged rows are added together.
        dm_count = copy.deepcopy(dm)
        for i in range(1, len(dm_count)):
            for j in range(0, i):
                dm_count[i, j] = 1

        # init terminal clades
        clades = [BaseTree.Clade(None, name) for name in dm.names]

        # Without at least two entries the merge loop never runs and there
        # would be no inner clade to return.
        if not clades:
            raise ValueError("Distance matrix must contain at least one entry.")
        if len(clades) == 1:
            return BaseTree.Tree(clades[0])

        # init minimum index
        min_i = 0
        min_j = 0
        inner_count = 0

        while len(dm) > 1:
            min_dist = dm[1, 0]

            # find minimum index; ">=" means the last of several equal
            # minima wins
            search_started = time.time()
            for i in range(1, len(dm)):
                for j in range(0, i):
                    if min_dist >= dm[i, j]:
                        min_dist = dm[i, j]
                        min_i = i
                        min_j = j
            self.gap += time.time() - search_started

            # create clade
            clade1 = clades[min_i]
            clade2 = clades[min_j]
            inner_count += 1
            inner_name = "Inner" + str(inner_count)
            inner_clade = BaseTree.Clade(None, inner_name)
            inner_clade.clades.append(clade1)
            inner_clade.clades.append(clade2)

            # assign branch length: half the merge distance, minus the height
            # already accumulated below a non-terminal child
            for child in (clade1, clade2):
                if child.is_terminal():
                    child.branch_length = min_dist * 1.0 / 2
                else:
                    child.branch_length = min_dist * \
                        1.0 / 2 - self._height_of(child)

            # update node list
            clades[min_j] = inner_clade
            del clades[min_i]

            # rebuild distance matrix,
            # set the distances of new node at the index of min_j
            for k in range(0, len(dm)):
                if k != min_i and k != min_j:
                    r = dm_count[min_i, k] + dm_count[min_j, k]
                    dm[min_j, k] = ((dm[min_i, k] * dm_count[min_i, k]) +
                                    (dm[min_j, k] * dm_count[min_j, k])) / r
                    dm_count[min_j, k] = r

            dm_count.names[min_j] = inner_name
            del dm_count[min_i]

            dm.names[min_j] = inner_name
            del dm[min_i]

            # Overwritten in a later round if this clade becomes a child.
            inner_clade.branch_length = 0

        return BaseTree.Tree(inner_clade)
Пример #26
0
    def create_tree(self, names, matrix):
        """Build a neighbor-joining tree and store it in ``self.tree``.

        :Parameters:
            names : list
                taxon names, passed to ``DistanceMatrix``.
            matrix : list
                distances, passed to ``DistanceMatrix``.

        Returns the tree, created with ``rooted=False``; it is also stored in
        ``self.tree``.  If ``names`` or ``matrix`` is empty the current
        ``self.tree`` is returned unchanged.

        Raises ValueError if no pair with a finite Q value can be found.
        """
        if not names or not matrix:
            return self.tree

        # Work on a deep copy so that the caller's lists are never modified.
        dm = copy.deepcopy(DistanceMatrix(names, matrix))
        clades = [BaseTree.Clade(None, name) for name in dm.names]

        while len(clades) > 1:
            size = len(dm)

            if size == 2:
                # Final join: hang the second clade below the first one.
                last_clade = clades.pop(1)
                last_clade.branch_length = dm[1, 0]
                clades[0].clades.append(last_clade)
                break

            # Row sums are needed for every pair; compute them once per round.
            row_sums = [sum(dm[i]) for i in range(size)]

            # Find the pair (min_i < min_j) with the smallest
            # Q = (n - 2) * d(i, j) - sum(row_i) - sum(row_j).  Only
            # off-diagonal pairs are candidates, so a node is never joined
            # with itself.
            q_min = float('Inf')
            min_i = None
            min_j = None
            for i in range(size):
                for j in range(i + 1, size):
                    q_value = ((size - 2) * dm.matrix[j][i] -
                               row_sums[j] - row_sums[i])
                    if q_value < q_min:
                        q_min = q_value
                        min_i = i
                        min_j = j
            if min_i is None:
                raise ValueError('No finite Q value found in distance matrix')

            # Take the rows before the matrix is modified below.
            row_i = dm[min_i]
            row_j = dm[min_j]
            pair_dist = row_i[min_j]

            dist_i = 0.5 * pair_dist + (
                row_sums[min_i] - row_sums[min_j]) / (2 * (size - 2))
            dist_j = pair_dist - dist_i

            clade_i = clades[min_i]
            clade_j = clades[min_j]
            clade_i.branch_length = dist_i
            clade_j.branch_length = dist_j

            joined_name = dm.names[min_i] + dm.names[min_j]
            joined_clade = BaseTree.Clade(None, joined_name)
            joined_clade.clades.append(clade_i)
            joined_clade.clades.append(clade_j)

            # The joined node takes slot min_j; slot min_i is removed.
            clades[min_j] = joined_clade
            del clades[min_i]

            # Distance from the joined node to every remaining node k.  The
            # entries for min_i (about to be deleted) and min_j (the node
            # itself) are placeholders.
            joined_row = [
                0 if k == min_i or k == min_j
                else 0.5 * (row_i[k] + row_j[k] - pair_dist)
                for k in range(size)
            ]
            dm[min_j] = joined_row

            dm.names[min_j] = joined_name
            del dm[min_i]

        self.tree = BaseTree.Tree(clades[0], rooted=False)
        return self.tree
Пример #27
0
    def full_gpu_upgma(self, distance_matrix):
        """Construct a tree by closest-pair merging, searching on the GPU.

        Each round copies the strict lower triangle of the distance matrix to
        the device, where the ``findMin`` kernel lets every thread scan a
        contiguous slice for its local minimum.  The host then picks the
        overall minimum, merges the corresponding pair and shrinks the matrix.

        :Parameters:
            distance_matrix : DistanceMatrix
                The distance matrix for tree construction.  It is deep-copied,
                so the caller's matrix is not modified.

        Returns a ``BaseTree.Tree`` rooted at the last clade created.  For a
        matrix with a single entry the tree consists of that one terminal.

        Raises ValueError if the matrix has no entries.
        """
        # make a copy of the distance matrix to be used
        dm = copy.deepcopy(distance_matrix)

        # dm_count holds the weight of each entry of dm: every entry starts
        # at 1 and the weights of two merged rows are added together.
        dm_count = copy.deepcopy(dm)
        for i in range(1, len(dm_count)):
            for j in range(0, i):
                dm_count[i, j] = 1

        # init terminal clades
        clades = [BaseTree.Clade(None, name) for name in dm.names]

        # Without at least two entries the merge loop never runs and there
        # would be no inner clade to return.
        if not clades:
            raise ValueError("Distance matrix must contain at least one entry.")
        if len(clades) == 1:
            return BaseTree.Tree(clades[0])

        # init minimum index
        min_i = 0
        min_j = 0
        inner_count = 0

        # GPU kernel to find the minimum index and minimum distance
        mod = SourceModule("""
         __global__ void findMin(double *dm, long long *index, double *local_min, int c, int l)
        {
            int k = threadIdx.y + blockIdx.y*blockDim.y;  
            double min_dist = dm[k*c];  
            int id = 0 ;               
            for(int i= k*c ; i< (k+1)*c; i++){
                if(i<l)
                {
                    if(min_dist >= dm[i])
                    {
                        min_dist = dm[i];
                        id = i;
                    }
                } 
            }
            local_min[k]=min_dist;
            index[k]= id;

        }""")
        # The kernel handle does not change between rounds; look it up once.
        find_min_kernel = mod.get_function("findMin")

        while len(dm) > 1:
            size = len(dm)

            # host array creation: strict lower triangle, row by row.  The
            # kernel reads ``double``, so force float64 even when the matrix
            # holds Python ints.
            dm_cpu = np.array(
                [value for row in dm.matrix[1:] for value in row[:-1]],
                dtype=np.float64)

            combinations = size * (size - 1) // 2

            if combinations < 1024 * 256:
                block_size = size // 2
            elif combinations < 1024 * 1024:
                block_size = 512
            else:
                block_size = 1024

            # Each thread scans local_count consecutive entries.  Rounding up
            # guarantees that block_size threads cover every entry, so one
            # block is enough; the output buffers hold exactly block_size
            # results, and any further blocks would write past their end.
            local_count = (combinations + block_size - 1) // block_size

            # The kernel writes ``long long`` indices and ``double`` minima.
            index = np.zeros(block_size, dtype=np.int64)
            min_val = np.zeros(block_size, dtype=np.float64)

            local_min_array_gpu = drv.mem_alloc(dm_cpu.nbytes)
            local_index_gpu = drv.mem_alloc(index.nbytes)
            local_min_gpu = drv.mem_alloc(min_val.nbytes)

            drv.memcpy_htod(local_min_array_gpu, dm_cpu)
            drv.memcpy_htod(local_index_gpu, index)
            drv.memcpy_htod(local_min_gpu, min_val)

            find_min_kernel(local_min_array_gpu,
                            local_index_gpu,
                            local_min_gpu,
                            np.int32(local_count),
                            np.int32(dm_cpu.size),
                            block=(1, block_size, 1),
                            grid=(1, 1, 1))

            drv.memcpy_dtoh(min_val, local_min_gpu)
            drv.memcpy_dtoh(index, local_index_gpu)

            local_min_array_gpu.free()
            local_min_gpu.free()
            local_index_gpu.free()

            # Reduce the per-thread minima on the host; argmin returns the
            # first occurrence, matching a first-match linear scan.
            best = int(np.argmin(min_val))
            min_dist = float(min_val[best])
            global_id = int(index[best])

            # Translate the flat index back to (row, column): row r of the
            # strict lower triangle holds r entries.  Integer arithmetic only,
            # so the results are usable as list indices.
            min_i = 1
            row_start = 0
            while global_id >= row_start + min_i:
                row_start += min_i
                min_i += 1
            min_j = global_id - row_start

            # create clade
            clade1 = clades[min_i]
            clade2 = clades[min_j]
            inner_count += 1
            inner_name = "Inner" + str(inner_count)
            inner_clade = BaseTree.Clade(None, inner_name)
            inner_clade.clades.append(clade1)
            inner_clade.clades.append(clade2)

            # assign branch length: half the merge distance, minus the height
            # already accumulated below a non-terminal child
            for child in (clade1, clade2):
                if child.is_terminal():
                    child.branch_length = min_dist * 1.0 / 2
                else:
                    child.branch_length = min_dist * \
                        1.0 / 2 - self._height_of(child)

            # update node list
            clades[min_j] = inner_clade
            del clades[min_i]

            # rebuild distance matrix,
            # set the distances of new node at the index of min_j
            for k in range(0, len(dm)):
                if k != min_i and k != min_j:
                    r = dm_count[min_i, k] + dm_count[min_j, k]
                    dm[min_j, k] = ((dm[min_i, k] * dm_count[min_i, k]) +
                                    (dm[min_j, k] * dm_count[min_j, k])) / r
                    dm_count[min_j, k] = r

            dm_count.names[min_j] = inner_name
            del dm_count[min_i]

            dm.names[min_j] = inner_name
            del dm[min_i]

            # Overwritten in a later round if this clade becomes a child.
            inner_clade.branch_length = 0

        return BaseTree.Tree(inner_clade)
Пример #28
0
    def main(self, tree_filename, tree_format='newick', ids=None):
        """Build a taxonomy tree from the NCBI taxonomy dump and write it out.

        Downloads the taxdump archive via ``self.download_file``, extracts
        ``nodes.dmp`` and ``names.dmp`` into ``self.data_dir`` (files already
        present there are reused), links every node to its parent, and writes
        the resulting tree with ``bp.write``.

        :Parameters:
            tree_filename : str
                path of the tree file to write.
            tree_format : str
                format name handed to ``bp.write``.  For ``'cdao'`` all
                non-scientific names are attached to the clades as
                ``skos:altLabel`` entries in ``tu_attributes``.
            ids : any
                if truthy, clades are named by tax id instead of by
                scientific name.

        Raises ValueError if no node is its own parent, i.e. no root exists.
        """
        col_delimiter = '\t|\t'
        row_delimiter = '\t|\n'
        url = 'ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz'

        # download the taxonomy archive
        filename = self.download_file(url)

        # extract the text dump
        for extract in ('nodes.dmp', 'names.dmp'):
            if os.path.exists(os.path.join(self.data_dir, extract)):
                print('Using existing copy of %s' % extract)
            else:
                print('Extracting %s from %s...' % (extract, filename))
                archive = tarfile.open(name=filename, mode='r:gz')
                # Close the archive even when the extraction fails.
                try:
                    archive.extract(extract, path=self.data_dir)
                finally:
                    archive.close()

        # get names for all tax_ids from names.dmp
        print('Getting names...')
        scientific_names = {}
        other_names = defaultdict(set)
        with open(os.path.join(self.data_dir, 'names.dmp')) as names_file:
            for line in names_file:
                # NOTE: rstrip treats its argument as a set of characters, so
                # this removes any trailing run of tabs, pipes and newlines.
                line = line.rstrip(row_delimiter)
                values = line.split(col_delimiter)
                tax_id, name_txt, _, name_type = values[:4]
                if name_type == 'scientific name':
                    scientific_names[tax_id] = name_txt
                else:
                    other_names[tax_id].add(name_txt)

        # read all node info from nodes.dmp
        print('Reading taxonomy...')
        alt_label = '<http://www.w3.org/2004/02/skos/core#altLabel>'
        nodes = {}
        with open(os.path.join(self.data_dir, 'nodes.dmp')) as nodes_file:
            for line in nodes_file:
                line = line.rstrip(row_delimiter)
                values = line.split(col_delimiter)
                tax_id, parent_id = values[:2]
                if ids:
                    this_node = BaseTree.Clade(name=tax_id)
                else:
                    this_node = BaseTree.Clade(name=scientific_names[tax_id])

                nodes[tax_id] = this_node
                # Temporary attribute; removed once the node has been linked.
                this_node.parent_id = parent_id

                if tree_format == 'cdao':
                    # add common names, synonyms, misspellings, etc. as skos:altLabels
                    if not hasattr(this_node, 'tu_attributes'):
                        this_node.tu_attributes = []
                    for x in other_names[tax_id]:
                        this_node.tu_attributes.append(
                            (alt_label, Taxonomy.format_rdf_string(x)))

        print('Found %s OTUs.' % len(nodes))

        # create tree from nodes dictionary
        print('Building tree...')
        root_node = None
        # Iterating the keys works on both Python 2 and 3 and does not copy
        # the dictionary; the loop never adds or removes keys.
        for node_id in nodes:
            this_node = nodes[node_id]
            if node_id == this_node.parent_id:
                # The root is the node that lists itself as its parent.
                root_node = this_node
                print('Found root.')
            else:
                parent_node = nodes[this_node.parent_id]
                parent_node.clades.append(this_node)

            del this_node.parent_id

        if root_node is None:
            raise ValueError('No root node (a node that is its own parent) '
                             'found in nodes.dmp')

        tree = BaseTree.Tree(root=root_node)

        # write tree to file
        print('Writing %s tree to %s...' % (tree_format, tree_filename))
        bp.write([tree], tree_filename, tree_format)

        print('Done!')
def makeNj(score, otuName):
    """Build an unrooted Neighbor-Joining tree from a distance matrix.

    Parameters:
        score: square matrix (list of row lists) of pairwise distances,
            indexed as score[i][j] and expected to be symmetric. Values are
            rounded to 6 decimal places before use.
        otuName: sequence of OTU labels, one per matrix row, in row order.

    Returns:
        A BaseTree.Tree (rooted=False). Terminal clades carry the OTU
        labels; internal clades are named "Inner1", "Inner2", ...

    Raises:
        ValueError: if score is empty.

    Neither argument is modified: the algorithm works on a private copy of
    the matrix, so the same inputs can safely be reused by the caller.
    """
    size = len(score)
    if size == 0:
        raise ValueError("makeNj requires a non-empty distance matrix")

    # Private, rounded working copy; it is shrunk by one row/column per join.
    dist = [[round(value, 6) for value in row[:size]] for row in score]
    clades = [BaseTree.Clade(None, name) for name in otuName]

    # Degenerate inputs: the join loop below needs at least three OTUs.
    if size == 1:
        return BaseTree.Tree(clades[0], rooted=False)
    if size == 2:
        # Two OTUs: hang both under one inner node, splitting the distance.
        pair_root = BaseTree.Clade(None, "Inner1")
        clades[1].branch_length = dist[1][0] / 2.0
        clades[0].branch_length = dist[1][0] - clades[1].branch_length
        pair_root.clades.append(clades[1])
        pair_root.clades.append(clades[0])
        return BaseTree.Tree(pair_root, rooted=False)

    inner_count = 0
    inner_clade = None
    while len(dist) > 2:
        current = len(dist)

        # Per-node net divergence: row sum scaled by (n - 2).
        node_dist = []
        for row in dist:
            total = 0
            for value in row:
                total += value
            node_dist.append(total / (current - 2))

        # Find the pair minimising d(i, j) - r(i) - r(j) over the lower
        # triangle. The strict comparison keeps the first minimum found.
        min_dist = dist[1][0] - node_dist[1] - node_dist[0]
        min_i = 0
        min_j = 1
        for i in range(1, current):
            for j in range(0, i):
                candidate = dist[i][j] - node_dist[i] - node_dist[j]
                if min_dist > candidate:
                    min_dist = candidate
                    min_i = i
                    min_j = j

        # Join the pair under a new internal node.
        clade1 = clades[min_i]
        clade2 = clades[min_j]
        inner_count += 1
        inner_clade = BaseTree.Clade(None, "Inner" + str(inner_count))
        inner_clade.clades.append(clade1)
        inner_clade.clades.append(clade2)

        # Branch lengths from each joined node to the new internal node.
        clade1.branch_length = (dist[min_i][min_j] + node_dist[min_i] -
                                node_dist[min_j]) / 2.0
        clade2.branch_length = dist[min_i][min_j] - clade1.branch_length

        # The new node takes over slot min_j; slot min_i is removed.
        clades[min_j] = inner_clade
        del clades[min_i]

        # Distances from the new node to every remaining node, kept symmetric.
        for k in range(0, current):
            if k != min_i and k != min_j:
                dist[min_j][k] = (dist[min_i][k] + dist[min_j][k] -
                                  dist[min_i][min_j]) / 2.0
                dist[k][min_j] = dist[min_j][k]

        # Drop row and column min_i from the working matrix.
        del dist[min_i]
        for row in dist:
            del row[min_i]

    # Two nodes remain and one of them is the most recent inner node; make
    # that one the root and attach the other as its child.
    if clades[0] is inner_clade:
        root, last = clades[0], clades[1]
    else:
        root, last = clades[1], clades[0]
    root.branch_length = 0
    last.branch_length = dist[1][0]
    root.clades.append(last)

    return BaseTree.Tree(root, rooted=False)
def makeUpgma(score, otuName):
    """Build a rooted tree by agglomerative average-linkage clustering.

    At each step the closest pair of clusters is merged under a new internal
    clade, and the merged cluster's distance to every other cluster is the
    plain mean of the two merged rows.

    Parameters:
        score: square matrix (list of row lists) of pairwise distances,
            indexed as score[i][j] and expected to be symmetric. Values are
            rounded to 6 decimal places before use.
        otuName: sequence of OTU labels, one per matrix row, in row order.

    Returns:
        A BaseTree.Tree rooted at the last internal clade created (or at the
        single OTU when only one is given). Internal clades are named
        "Inner1", "Inner2", ...

    Raises:
        ValueError: if score is empty.

    Neither argument is modified: the algorithm works on a private copy of
    the matrix, so the same inputs can safely be reused by the caller.
    """
    size = len(score)
    if size == 0:
        raise ValueError("makeUpgma requires a non-empty distance matrix")

    # Private, rounded working copy; it is shrunk by one row/column per merge.
    dist = [[round(value, 6) for value in row[:size]] for row in score]
    clades = [BaseTree.Clade(None, name) for name in otuName]

    # A single OTU has nothing to merge with: it is the whole tree.
    if size == 1:
        clades[0].branch_length = 0
        return BaseTree.Tree(clades[0])

    inner_count = 0
    inner_clade = None
    while len(dist) > 1:
        current = len(dist)

        # Closest pair in the lower triangle. ">=" means that among equal
        # minima the last one scanned wins.
        min_dist = dist[1][0]
        min_i = 1
        min_j = 0
        for i in range(1, current):
            for j in range(0, i):
                if min_dist >= dist[i][j]:
                    min_dist = dist[i][j]
                    min_i = i
                    min_j = j

        # Merge the pair under a new internal node.
        clade1 = clades[min_i]
        clade2 = clades[min_j]
        inner_count += 1
        inner_clade = BaseTree.Clade(None, "Inner" + str(inner_count))
        inner_clade.clades.append(clade1)
        inner_clade.clades.append(clade2)

        # The new node sits at height min_dist / 2. A leaf gets that full
        # length; an internal child gets it minus its own height.
        # height_of is a module-level helper defined earlier in this file
        # (per the original note, a port of the upstream _height_of method).
        half = min_dist * 1.0 / 2
        for child in (clade1, clade2):
            if child.is_terminal():
                child.branch_length = half
            else:
                child.branch_length = half - height_of(child)

        # The new node takes over slot min_j; slot min_i is removed.
        clades[min_j] = inner_clade
        del clades[min_i]

        # Distance from the merged cluster to every other cluster, kept
        # symmetric.
        for k in range(0, current):
            if k != min_i and k != min_j:
                dist[min_j][k] = (dist[min_i][k] + dist[min_j][k]) * 1.0 / 2
                dist[k][min_j] = dist[min_j][k]

        # Drop row and column min_i from the working matrix.
        del dist[min_i]
        for row in dist:
            del row[min_i]

    # The last internal node created is the root.
    inner_clade.branch_length = 0
    return BaseTree.Tree(inner_clade)