def majority_consensus(trees, cutoff=0):
    """Build an extended majority-rule consensus tree from several trees.

    Unlike the classic majority rule, the cutoff is not fixed at 0.5: any
    value between 0 and 1 may be given. With the default cutoff of 0 the
    result is a relaxed binary consensus tree in any condition (as long as
    one of the provided trees is a binary tree).

    Every consensus clade receives, as branch length, the average length
    over the trees that contain it and, as confidence, the percentage of
    trees that contain it.

    :Parameters:
        trees : iterable
            iterable of trees to produce consensus tree.
        cutoff : float
            minimum fraction of trees a clade has to appear in to be
            considered for the consensus tree.

    Raises ValueError when the most frequent, largest clade does not cover
    all terminals of the first tree.

    """
    remaining_trees = iter(trees)
    first_tree = next(remaining_trees)
    terms = first_tree.get_terminals()
    bitstr_counts, tree_count = _count_clades(
        itertools.chain([first_tree], remaining_trees)
    )

    def rank(bs):
        # descending #occurrences, then #tips, then tip order
        return (bitstr_counts[bs][0], bs.count('1'), str(bs))

    bitstrs = sorted(bitstr_counts.keys(), key=rank, reverse=True)

    root = BaseTree.Clade()
    root_bitstr = bitstrs[0]
    if root_bitstr.count('1') != len(terms):
        raise ValueError('Taxons in provided trees should be consistent')
    root.clades.extend(terms)

    # bitstring -> clade for everything already placed in the tree
    bitstr_clades = {root_bitstr: root}

    def relatives_of(candidate):
        """Return (parent, children) bitstrings, or None on a conflict."""
        parent = None
        children = []  # several mutually independent children are possible
        # Largest clades first; the sort is stable, so equally sized clades
        # keep their insertion order.
        for known in sorted(bitstr_clades, key=lambda bs: -bs.count('1')):
            if not known.iscompatible(candidate):
                return None
            # the last (smallest) container seen is the closest ancestor
            if known.contains(candidate):
                parent = known
            # keep only the largest descendants independent of each other
            if (candidate.contains(known) and known != candidate and
                    all(c.independent(known) for c in children)):
                children.append(known)
        return parent, children

    for bitstr in bitstrs[1:]:
        # majority rule: candidates are ordered by frequency, so the first
        # one below the cutoff ends the search
        occurrences, length_total = bitstr_counts[bitstr]
        confidence = 100.0 * occurrences / tree_count
        if confidence < cutoff * 100.0:
            break

        members = [terms[i] for i in bitstr.index_one()]
        clade = BaseTree.Clade()
        clade.clades.extend(members)
        clade.confidence = confidence
        clade.branch_length = length_total / occurrences

        relatives = relatives_of(bitstr)
        if relatives is None:
            continue
        parent_bitstr, child_bitstrs = relatives

        if parent_bitstr:
            # Re-insert the parent so that it moves to the end of the dict.
            parent_clade = bitstr_clades.pop(parent_bitstr)
            # the new clade's terminals now hang below the new clade
            parent_clade.clades = [
                sub for sub in parent_clade.clades if sub not in members
            ]
            parent_clade.clades.append(clade)
            bitstr_clades[parent_bitstr] = parent_clade

        if child_bitstrs:
            absorbed = []
            for child_bitstr in child_bitstrs:
                absorbed.extend(terms[i] for i in child_bitstr.index_one())
                child_clade = bitstr_clades[child_bitstr]
                # move the child from the parent to the new clade
                parent_clade.clades.remove(child_clade)
                clade.clades.append(child_clade)
            # terminals covered by a moved child must not stay direct children
            clade.clades = [sub for sub in clade.clades if sub not in absorbed]

        bitstr_clades[bitstr] = clade

        # stop once the tree cannot take any further inner clade
        placed = len(bitstr_clades)
        if placed == len(terms) - 1:
            break
        if placed == len(terms) - 2 and len(root.clades) == 3:
            break

    return BaseTree.Tree(root=root)
def majority_consensus(trees, cutoff=0, mcmc=False, n=1):
    """Search majority rule consensus tree(s) from multiple trees.

    This is an extended majority rule method, which means that you can set
    any cutoff between 0 and 1 instead of 0.5. The default value of cutoff
    is 0 to create a relaxed binary consensus tree in any condition (as long
    as one of the provided trees is a binary tree). The branch length of
    each consensus clade in the result consensus tree is the average length
    of all counts for that clade; its confidence is the fraction of trees
    containing that clade.

    :Parameters:
        trees : iterable
            iterable of trees to produce consensus tree, or, if mcmc=True,
            an iterable of tuples like (tree, number of occurrences in MCMC).
        cutoff : float
            Must be between 0 and 1. cutoff=0.5 means that all clades in the
            consensus tree must occur in at least 50% of trees, cutoff=1 is
            the same as strict consensus.
        mcmc : bool
            True if the items of ``trees`` are (tree, occurrences) tuples.
        n : int
            Maximum number of consensus trees returned. Fewer are returned
            when not enough distinct starting clades exist.

    Returns a single tree when ``n == 1``, otherwise a list of trees.

    Raises ValueError if cutoff is outside [0, 1] or if the most frequent,
    largest clade does not cover all terminals of the first tree.

    """
    if not (0 <= cutoff <= 1):
        raise ValueError("Cutoff must be a number between 0 and 1")

    tree_iter = iter(trees)
    first_tree = next(tree_iter)
    all_trees = itertools.chain([first_tree], tree_iter)
    # with mcmc=True every item is a (tree, occurrences) tuple
    terms = (first_tree[0] if mcmc else first_tree).get_terminals()
    term_names = [term.name for term in terms]
    if mcmc:
        bitstr_counts, tree_count = _count_clades_mcmc(all_trees, term_names)
    else:
        bitstr_counts, tree_count = _count_clades(all_trees, term_names)

    # Sort bitstrs by descending #occurrences, then #tips, then tip order
    bitstrs = sorted(
        bitstr_counts.keys(),
        key=lambda bitstr: (bitstr_counts[bitstr][0], bitstr.count("1"), str(bitstr)),
        reverse=True,
    )
    root_bitstr = bitstrs[0]
    if root_bitstr.count("1") != len(terms):
        raise ValueError("Taxons in provided trees should be consistent")

    # Each queued bitstring seeds one consensus tree: it is inserted first,
    # ahead of the frequency order. The root bitstring seeds the plain
    # majority-rule tree.
    possible_starts = queue.Queue()
    possible_starts.put(root_bitstr)
    # Everything ever enqueued. The tree built from a given start is fully
    # determined by that start, so enqueuing one twice would only yield an
    # identical duplicate tree.
    queued = {root_bitstr}
    clades_used = set()
    consensus_trees = []

    while len(consensus_trees) < n and not possible_starts.empty():
        start = possible_starts.get()
        root = BaseTree.Clade()
        root.clades.extend(terms)
        bitstr_clades = {root_bitstr: root}

        # The start goes first, then every other non-root bitstring in rank
        # order; the start's own position in that order is skipped.
        if start == root_bitstr:
            candidates = bitstrs[1:]
        else:
            candidates = itertools.chain(
                [start], (bs for bs in bitstrs[1:] if bs != start)
            )

        for bitstr in candidates:
            # apply majority rule
            count_in_trees, branch_length_sum = bitstr_counts[bitstr]
            confidence = count_in_trees / tree_count
            if confidence < cutoff:
                break
            clade_terms = [terms[i] for i in bitstr.index_one()]
            clade = BaseTree.Clade()
            clade.clades.extend(clade_terms)
            clade.confidence = confidence
            clade.branch_length = branch_length_sum / count_in_trees

            # Check that the clade is compatible with the clades placed so
            # far and record its possible parent and child clades.
            bsckeys = sorted(bitstr_clades, key=lambda bs: bs.count("1"), reverse=True)
            compatible = True
            parent_bitstr = None
            child_bitstrs = []  # multiple independent children
            for bs in bsckeys:
                if not bs.iscompatible(bitstr):
                    # a clade that never made it into a tree may seed a
                    # later alternative tree
                    if bitstr not in clades_used and bitstr not in queued:
                        possible_starts.put(bitstr)
                        queued.add(bitstr)
                    compatible = False
                    break
                # closest ancestor: bsckeys is sorted by size, so the last
                # container found is the smallest one
                if bs.contains(bitstr):
                    parent_bitstr = bs
                # closest descendants: the largest mutually independent ones
                if (
                    bitstr.contains(bs)
                    and bs != bitstr
                    and all(c.independent(bs) for c in child_bitstrs)
                ):
                    child_bitstrs.append(bs)
            if not compatible:
                continue

            if parent_bitstr:
                # Re-insert the parent so that it moves to the end of the dict.
                parent_clade = bitstr_clades.pop(parent_bitstr)
                # the new clade's terminals now hang below the new clade
                parent_clade.clades = [
                    c for c in parent_clade.clades if c not in clade_terms
                ]
                parent_clade.clades.append(clade)
                bitstr_clades[parent_bitstr] = parent_clade

            if child_bitstrs:
                remove_list = []
                for c in child_bitstrs:
                    remove_list.extend(c.index_one())
                    child_clade = bitstr_clades[c]
                    # move the child from the parent to the new clade
                    parent_clade.clades.remove(child_clade)
                    clade.clades.append(child_clade)
                remove_terms = [terms[i] for i in remove_list]
                clade.clades = [c for c in clade.clades if c not in remove_terms]

            # put new clade
            bitstr_clades[bitstr] = clade
            clades_used.add(bitstr)

        consensus_trees.append(BaseTree.Tree(root=root))

    if n == 1:
        return consensus_trees[0]
    return consensus_trees
def nj(self, distance_matrix):
    """Construct and return a Neighbor Joining tree.

    The input matrix is deep-copied, so the caller's object is not modified.

    :Parameters:
        distance_matrix : DistanceMatrix
            The distance matrix for tree construction.

    Returns an unrooted tree (``rooted=False``). A single-taxon matrix
    yields a tree consisting of that one terminal clade; a two-taxon matrix
    yields one inner clade whose two children split the distance evenly.

    Raises TypeError if ``distance_matrix`` is not a DistanceMatrix and
    ValueError if it contains no taxa.

    """
    if not isinstance(distance_matrix, DistanceMatrix):
        raise TypeError("Must provide a DistanceMatrix object.")

    # make a copy of the distance matrix to be used
    dm = copy.deepcopy(distance_matrix)
    # init terminal clades
    clades = [BaseTree.Clade(None, name) for name in dm.names]

    # Matrices with fewer than three taxa never enter the joining loop,
    # which would leave inner_clade undefined below.
    if len(dm) == 0:
        raise ValueError("Distance matrix must contain at least one taxon.")
    if len(dm) == 1:
        return BaseTree.Tree(clades[0], rooted=False)
    if len(dm) == 2:
        # the only possible pair is (1, 0)
        clade1 = clades[1]
        clade2 = clades[0]
        clade1.branch_length = dm[1, 0] / 2.0
        clade2.branch_length = dm[1, 0] - clade1.branch_length
        inner_clade = BaseTree.Clade(None, "Inner")
        inner_clade.clades.append(clade1)
        inner_clade.clades.append(clade2)
        return BaseTree.Tree(inner_clade, rooted=False)

    # init node distance
    node_dist = [0] * len(dm)
    inner_count = 0
    while len(dm) > 2:
        size = len(dm)
        # node distance: row sum divided by (number of nodes - 2)
        for i in range(size):
            node_dist[i] = sum(dm[i, j] for j in range(size)) / (size - 2)

        # find the pair minimising the corrected distance; the comparison is
        # strict, so the first pair stays selected on ties
        min_dist = dm[1, 0] - node_dist[1] - node_dist[0]
        min_i = 0
        min_j = 1
        for i in range(1, size):
            for j in range(i):
                temp = dm[i, j] - node_dist[i] - node_dist[j]
                if min_dist > temp:
                    min_dist = temp
                    min_i = i
                    min_j = j

        # create clade
        clade1 = clades[min_i]
        clade2 = clades[min_j]
        inner_count += 1
        inner_clade = BaseTree.Clade(None, "Inner" + str(inner_count))
        inner_clade.clades.append(clade1)
        inner_clade.clades.append(clade2)
        # assign branch length
        clade1.branch_length = (
            dm[min_i, min_j] + node_dist[min_i] - node_dist[min_j]
        ) / 2.0
        clade2.branch_length = dm[min_i, min_j] - clade1.branch_length

        # update node list: the new node takes the slot of min_j
        clades[min_j] = inner_clade
        del clades[min_i]

        # rebuild distance matrix,
        # set the distances of new node at the index of min_j
        for k in range(size):
            if k != min_i and k != min_j:
                dm[min_j, k] = (
                    dm[min_i, k] + dm[min_j, k] - dm[min_i, min_j]
                ) / 2.0

        dm.names[min_j] = "Inner" + str(inner_count)
        del dm[min_i]

    # Two nodes remain: hang the other one below the most recent inner clade
    # and use that inner clade as the root of the returned tree.
    if clades[0] == inner_clade:
        clades[0].branch_length = 0
        clades[1].branch_length = dm[1, 0]
        clades[0].clades.append(clades[1])
        root = clades[0]
    else:
        clades[0].branch_length = dm[1, 0]
        clades[1].branch_length = 0
        clades[1].clades.append(clades[0])
        root = clades[1]

    return BaseTree.Tree(root, rooted=False)
def nj(self, distance_matrix):
    """Construct and return a Neighbor Joining tree, using CUDA for row sums.

    The per-node distance (row sum divided by N - 2) is computed by the
    ``DeviceNodeDist`` kernel; the search for the closest pair and the tree
    construction run on the host. The input matrix is deep-copied, so the
    caller's object is not modified.

    :Parameters:
        distance_matrix : DistanceMatrix
            The distance matrix for tree construction.

    Returns an unrooted tree (``rooted=False``). A single-taxon matrix
    yields a tree consisting of that one terminal clade; a two-taxon matrix
    yields one inner clade whose two children split the distance evenly.

    Raises TypeError if ``distance_matrix`` is not a DistanceMatrix and
    ValueError if it contains no taxa.

    """
    if not isinstance(distance_matrix, DistanceMatrix):
        raise TypeError("Must provide a DistanceMatrix object.")

    # make a copy of the distance matrix to be used
    dm = copy.deepcopy(distance_matrix)
    # init terminal clades
    clades = [BaseTree.Clade(None, name) for name in dm.names]

    # special cases for Minimum Alignment Matrices
    if len(dm) == 0:
        raise ValueError("Distance matrix must contain at least one taxon.")
    if len(dm) == 1:
        return BaseTree.Tree(clades[0], rooted=False)
    if len(dm) == 2:
        # minimum distance will always be [1,0]
        clade1 = clades[1]
        clade2 = clades[0]
        clade1.branch_length = dm[1, 0] / 2.0
        clade2.branch_length = dm[1, 0] - clade1.branch_length
        inner_clade = BaseTree.Clade(None, "Inner")
        inner_clade.clades.append(clade1)
        inner_clade.clades.append(clade2)
        return BaseTree.Tree(inner_clade, rooted=False)

    # One thread per node, indexed along y. The indexing treats device_dm as
    # a flattened lower-triangular matrix including the diagonal: entry
    # (r, c) with c <= r is read from r*(r+1)/2 + c.
    # NOTE(review): assumes dm.matrix has exactly that row layout - confirm.
    # The kernel accumulates into device_node_dist with +=, so the buffer
    # must be zeroed before every launch.
    mod = SourceModule("""
    #include <stdio.h>
    #include <stdlib.h>
    __global__ void DeviceNodeDist(float *device_dm, float *device_node_dist, float N)
    {
        const int tid = threadIdx.y + blockIdx.y* blockDim.y;
        if (tid >= N) return;
        for(int i = 0; i< N; i++){
            if(tid< i){
                device_node_dist[tid] += device_dm[(i*(i+1))/2 + tid];
            }else{
                device_node_dist[tid] += device_dm[(tid*(tid+1))/2 + i];
            }
        }
        device_node_dist[tid]= device_node_dist[tid]/ (N-2.0);
    }""")
    device_node_dist_kernel = mod.get_function("DeviceNodeDist")

    # init node distance
    node_dist = [0] * len(dm)
    inner_count = 0
    while len(dm) > 2:
        length = len(dm)
        # flatten the matrix rows into one float32 array for the device
        host_dm = np.array(
            [value for row in dm.matrix for value in row], dtype=np.float32
        )
        host_node_dist = np.zeros(length, dtype=np.float32)

        # pick the block size from the dataset size
        if length < 128:
            block_size = 128
        elif length < 256:
            block_size = 256
        elif length < 512:
            block_size = 512
        else:
            block_size = 1024
        block_dim = (1, block_size, 1)
        # grid dimensions must be integers, hence floor division
        grid_dim = (1, length // block_size + 1, 1)

        device_dm = cuda.mem_alloc(host_dm.nbytes)
        try:
            device_node_dist = cuda.mem_alloc(host_node_dist.nbytes)
            try:
                cuda.memcpy_htod(device_dm, host_dm)
                # mem_alloc returns uninitialised memory; upload the zeros
                # the kernel accumulates onto
                cuda.memcpy_htod(device_node_dist, host_node_dist)
                device_node_dist_kernel(
                    device_dm,
                    device_node_dist,
                    np.float32(length),
                    block=block_dim,
                    grid=grid_dim,
                )
                # wait for the kernel before reading the result back
                done = cuda.Event()
                done.record()
                done.synchronize()
                cuda.memcpy_dtoh(host_node_dist, device_node_dist)
            finally:
                device_node_dist.free()
        finally:
            device_dm.free()

        node_dist[:length] = host_node_dist.tolist()
        del host_dm
        del host_node_dist

        # find the pair minimising the corrected distance; the comparison is
        # strict, so the first pair stays selected on ties
        min_dist = dm[1, 0] - node_dist[1] - node_dist[0]
        min_i = 0
        min_j = 1
        for i in range(1, length):
            for j in range(i):
                temp = dm[i, j] - node_dist[i] - node_dist[j]
                if min_dist > temp:
                    min_dist = temp
                    min_i = i
                    min_j = j

        # create clade
        clade1 = clades[min_i]
        clade2 = clades[min_j]
        inner_count += 1
        inner_clade = BaseTree.Clade(None, "Inner" + str(inner_count))
        inner_clade.clades.append(clade1)
        inner_clade.clades.append(clade2)
        # assign branch length
        clade1.branch_length = (
            dm[min_i, min_j] + node_dist[min_i] - node_dist[min_j]
        ) / 2.0
        clade2.branch_length = dm[min_i, min_j] - clade1.branch_length

        # update node list: the new node takes the slot of min_j
        clades[min_j] = inner_clade
        del clades[min_i]

        # rebuild distance matrix,
        # set the distances of new node at the index of min_j
        for k in range(length):
            if k != min_i and k != min_j:
                dm[min_j, k] = (
                    dm[min_i, k] + dm[min_j, k] - dm[min_i, min_j]
                ) / 2.0

        dm.names[min_j] = "Inner" + str(inner_count)
        del dm[min_i]

    # Two nodes remain: hang the other one below the most recent inner clade
    # and use that inner clade as the root of the returned tree.
    if clades[0] == inner_clade:
        clades[0].branch_length = 0
        clades[1].branch_length = dm[1, 0]
        clades[0].clades.append(clades[1])
        root = clades[0]
    else:
        clades[0].branch_length = dm[1, 0]
        clades[1].branch_length = 0
        clades[1].clades.append(clades[0])
        root = clades[1]

    return BaseTree.Tree(root, rooted=False)
def upgma(self, distance_matrix):
    """Construct and return an UPGMA tree.

    Constructs and returns an Unweighted Pair Group Method
    with Arithmetic mean (UPGMA) tree. The input matrix is deep-copied, so
    the caller's object is not modified.

    :Parameters:
        distance_matrix : DistanceMatrix
            The distance matrix for tree construction.

    Returns a tree whose root clade has branch length 0. A single-taxon
    matrix yields a tree consisting of that one terminal clade.

    Raises TypeError if ``distance_matrix`` is not a DistanceMatrix and
    ValueError if it contains no taxa.

    """
    if not isinstance(distance_matrix, DistanceMatrix):
        raise TypeError("Must provide a DistanceMatrix object.")

    # make a copy of the distance matrix to be used
    dm = copy.deepcopy(distance_matrix)
    # init terminal clades
    clades = [BaseTree.Clade(None, name) for name in dm.names]

    # Fewer than two taxa never enter the joining loop, which would leave
    # inner_clade undefined at the end.
    if len(dm) == 0:
        raise ValueError("Distance matrix must contain at least one taxon.")
    if len(dm) == 1:
        root = clades[0]
        root.branch_length = 0
        return BaseTree.Tree(root)

    inner_count = 0
    while len(dm) > 1:
        # start every search from the pair (1, 0) instead of relying on
        # indices left over from the previous iteration
        min_i, min_j = 1, 0
        min_dist = dm[1, 0]
        # find minimum index; ">=" makes the last minimal pair win on ties
        for i in range(1, len(dm)):
            for j in range(i):
                if min_dist >= dm[i, j]:
                    min_dist = dm[i, j]
                    min_i = i
                    min_j = j

        # create clade
        clade1 = clades[min_i]
        clade2 = clades[min_j]
        inner_count += 1
        inner_clade = BaseTree.Clade(None, "Inner" + str(inner_count))
        inner_clade.clades.append(clade1)
        inner_clade.clades.append(clade2)

        # assign branch length: the new node sits at half the joined
        # distance, minus the height an inner child already has
        half_dist = min_dist / 2.0
        for child in (clade1, clade2):
            if child.is_terminal():
                child.branch_length = half_dist
            else:
                child.branch_length = half_dist - self._height_of(child)

        # update node list: the new node takes the slot of min_j
        clades[min_j] = inner_clade
        del clades[min_i]

        # rebuild distance matrix,
        # set the distances of new node at the index of min_j
        for k in range(len(dm)):
            if k != min_i and k != min_j:
                dm[min_j, k] = (dm[min_i, k] + dm[min_j, k]) / 2.0

        dm.names[min_j] = "Inner" + str(inner_count)
        del dm[min_i]

    inner_clade.branch_length = 0
    return BaseTree.Tree(inner_clade)