def mapDendro(self, tree, G):
    print "MAP"
    pdm = treemeasure.PatristicDistanceMatrix(tree)
    tns = tree.taxon_namespace
    #~ tree.edge_mrca = {}
    node_map = {}
    self.edge_count = {}
    self.mrca_count = Counter([])
    for i in xrange(self.N):
        v = self.namesLUTr[str(i)]
        nodes = pdm._mrca[tns.get_taxon(v)]
        self.mrca_count += Counter(
            [ancestor for taxon, ancestor in nodes.iteritems()])
    n_internals = len(self.mrca_count.keys())
    node_map = dict(
        (node, id) for id, node in enumerate(self.mrca_count.iterkeys()))
    for node in self.mrca_count.keys():
        self.mrca_count[node_map[node]] = self.mrca_count.pop(node)
        self.edge_count[node_map[node]] = np.zeros(len(G))
    for gi, edgeList in enumerate(G):
        for edge in edgeList:
            emrca = pdm.mrca(tns.get_taxon(edge[0]),
                             tns.get_taxon(edge[1]))
            self.edge_count[node_map[emrca]][gi] += 1.
    print "DENDRO"
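# Toy illustration (not from the original class) of the pdm.mrca() lookup used
# above, assuming DendroPy 4, where treemeasure is importable from
# dendropy.calculate; the tree and labels are made up:
import dendropy
from dendropy.calculate import treemeasure

demo_tree = dendropy.Tree.get(data="((A:1,B:2):1,C:3);", schema="newick")
demo_pdm = treemeasure.PatristicDistanceMatrix(demo_tree)
demo_tns = demo_tree.taxon_namespace
mrca_ab = demo_pdm.mrca(demo_tns.get_taxon("A"), demo_tns.get_taxon("B"))
# mrca_ab is the internal node ancestral to both "A" and "B"; mapDendro tallies
# one count per edge against exactly this kind of node.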
def pairwise_sequence(X, Y):
    '''
    input:
        X: recordings of all cells in one cluster in pandas DataFrame format;
           contains two columns: 'cell' and 'state'
        Y: ground truth (tree) of the cell cluster
    output: permuted pairwise combinations of all cells and their distances
        X_out: a list of 2-by-10 matrices of the recordings from two cells
        Y_out: a list of distances between two cells
    '''
    pdm = treemeasure.PatristicDistanceMatrix(Y)
    X_out = []
    Y_out = []
    for i, tx1 in enumerate(Y.taxon_namespace):
        for j, tx2 in enumerate(Y.taxon_namespace):
            Y_pair = pdm(tx1, tx2)
            state1 = np.array(
                [float(a) for a in list(tx1.label.split(' ')[1])])
            state2 = np.array(
                [float(a) for a in list(tx2.label.split(' ')[1])])
            #X_pair = np.vstack((state1, state2))
            # random forest only takes 1-d feature array
            X_pair = np.hstack((state1, state2))
            X_out.append(X_pair)
            Y_out.append(Y_pair)
            #print(X_pair, Y_pair)
    return X_out, Y_out
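# Hypothetical usage of pairwise_sequence (not from the original project).
# It assumes DendroPy 4 and taxon labels of the form
# "<cell> <state string>", since the function splits each label on a space
# and digitizes the second token:
import dendropy

demo_gt_tree = dendropy.Tree.get(
    data="('c0 0110010011':1.0,'c1 1100101001':1.0);",
    schema="newick")
X_pairs, dists = pairwise_sequence(X=None, Y=demo_gt_tree)  # X is unused in the body
# each X_pairs[k] is a length-20 feature vector; dists[k] is a patristic distance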
def bdict_from_tree(tree, pairs):
    """
    get pairwise distance dictionary from tree
    input:
        tree: tree object
        pairs: list of all pairs of sequences
    output:
        bDict: dict, pair -> distance
    """
    bDict = {}
    pdm = treemeasure.PatristicDistanceMatrix(tree)
    for pair in pairs:
        seq1Name, seq2Name = pair
        # print pdm(tree.find_node_with_taxon_label(seq1Name).taxon,
        #           tree.find_node_with_taxon_label(seq2Name).taxon)
        bDict[pair] = pdm(tree.find_node_with_taxon_label(seq1Name).taxon,
                          tree.find_node_with_taxon_label(seq2Name).taxon)
    return bDict
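# Minimal usage sketch for bdict_from_tree (illustrative only; assumes
# DendroPy 4, where treemeasure is importable from dendropy.calculate):
import dendropy
from dendropy.calculate import treemeasure

toy_tree = dendropy.Tree.get(data="((A:1,B:1):1,C:2);", schema="newick")
toy_pairs = [("A", "B"), ("A", "C"), ("B", "C")]
toy_bdict = bdict_from_tree(toy_tree, toy_pairs)
# for this toy tree: A-B = 2.0, A-C = 4.0, B-C = 4.0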
def calculatePvals(self, tree, edgeList=None, ng=1, setPriors=False):
    st = time.time()
    #~ print "Pvals"
    pdm = treemeasure.PatristicDistanceMatrix(tree)
    tns = tree.taxon_namespace
    #~ print "Init pvals",len(tree.nodes()),time.time()-st
    for node in tree.nodes():
        if node.is_internal():
            node.ei = 0.0
    tree.edge_mrca = {}
    if edgeList is not None:
        #~ print "Calc ei",len(edgeList),time.time()-st
        for edge in edgeList:
            emrca = pdm.mrca(tns.get_taxon(edge[0]), tns.get_taxon(edge[1]))
            emrca.ei += 1
            tree.edge_mrca[str(edge)] = emrca
            #~ self.mrca(pdm,tns.get_taxon(edge[0]),tns.get_taxon(edge[1])).ei += 1
    #~ print "Calc p",len(tree.nodes()),time.time()-st
    for node in tree.nodes():
        if node.is_internal():
            split_sizes = [len(b.leaf_nodes()) for b in node.child_nodes()]
            # number of possible links (split_sizes vector times reverse cumsum split_sizes)
            node.ni = np.sum(split_sizes * (np.sum(split_sizes) - split_sizes))
            #~ node.p = (node.ei+1) / (node.ni+2)  # expected value, not actually used
            node.alpha = 1  # set priors - NB: these are overwritten for the null model
            node.beta = 1
            if setPriors:
                node.alpha += node.ei
                node.beta += (node.ni * ng - node.ei)
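# Worked example of the node.ni computation above (illustrative): for an
# internal node whose children subtend 2 and 3 leaves, split_sizes = [2, 3]
# and node.ni = 2*(5-2) + 3*(5-3) = 12, i.e. each of the 6 cross-subtree
# leaf pairs is counted once in each direction.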
try:
    clusterF = open(args.c, 'r')
except IOError:
    print "\n cluster file not found in directory."
    sys.exit()

# create save file
try:
    save = open("ClustersExtendedOnTree.txt", 'w')
except IOError:
    print 'no room for save file'
    sys.exit()

# read in the tree, keeping underscores in the names
tree = dendropy.Tree.get(path=tree, schema='newick', preserve_underscores=True)
pdm = treemeasure.PatristicDistanceMatrix(tree)
taxa = tree.taxon_namespace

# go cluster by cluster, get the MRCA of them all, then output all the tip
# child nodes of that MRCA as the new cluster
clusters = []
while 1:
    s = clusterF.readline()
    if not s:
        break
    s = s.rstrip()
    sections = s.split("\t")
    mrca = tree.mrca(taxon_labels=sections)
    newClusterNodes = mrca.leaf_nodes()
    newCluster = []
    for leaf in newClusterNodes:
        newCluster.append(re.sub("\'", "", str(leaf.taxon)))
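# Hypothetical illustration of the cluster file the loop above expects
# (one cluster per line, tab-separated taxon labels; the names are made up):
#
#   sampleA<TAB>sampleB<TAB>sampleC
#   sampleD<TAB>sampleE
#
# Each line is expanded to all leaf labels descending from the MRCA of its taxa.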
def summarize_trees(self, trees, trees_outf=None, params=None, summaries=None):
    trees = self.tree_postprocessor.process_trees(trees)
    stats_fields = set()

    # crucial assumption here is all trees from same landscape wrt to
    # number of islands and habitats
    representative_taxon = trees[0].taxon_namespace[0]
    community_by_disturbed_vs_interior_habitat = {}
    num_islands = len(representative_taxon.island_code)
    num_habitats = len(representative_taxon.habitat_code)
    # community_by_island = {}
    # community_by_habitat = {}
    # for i in num_islands:
    #     community_by_island[i] = {}
    # for i in num_habitats:
    #     community_by_habitat[i] = {}
    # community_by_disturbed_vs_interior_habitat[0] = {}
    # community_by_disturbed_vs_interior_habitat[1] = {}

    for tree in list(trees):
        num_tips = 0
        total_length = 0.0
        total_edges = 0
        nodes_by_island = collections.defaultdict(list)
        nodes_by_habitat = collections.defaultdict(list)
        disturbed_habitat_nodes = []
        interior_habitat_nodes = []
        all_tips = []
        for nd in tree:
            # colorize
            if nd.taxon is None and nd.label is None:
                continue
            if nd.label is not None:
                self.tree_postprocessor.decode_labeled_item_biogeography(nd)
            # stats
            total_edges += 1
            num_tips += 1
            total_length += nd.edge.length
            if nd.is_leaf():
                all_tips.append(nd)
                island_code = nd.taxon.island_code
                for idx, i in enumerate(island_code):
                    # island_idx = len(island_code) - idx
                    island_idx = idx
                    if i == "1":
                        nodes_by_island[island_idx].append(nd)
                habitat_code = nd.taxon.habitat_code
                for idx, i in enumerate(habitat_code):
                    # habitat_idx = len(habitat_code) - idx
                    habitat_idx = idx
                    if i == "1":
                        nodes_by_habitat[habitat_idx].append(nd)
                        if habitat_idx == 0:
                            disturbed_habitat_nodes.append(nd)
                        else:
                            if nd not in interior_habitat_nodes:
                                interior_habitat_nodes.append(nd)
        if len(nodes_by_island) < num_islands and self.drop_trees_not_occupying_all_islands:
            trees.remove(tree)
            continue
        if len(nodes_by_habitat) < num_habitats and self.drop_trees_not_occupying_all_habitats:
            trees.remove(tree)
            continue
        pdm = treemeasure.PatristicDistanceMatrix(tree=tree)
        tree.stats = collections.defaultdict(lambda: "NA")
        if params is not None:
            tree.params = params.copy()
        tree.stats["size"] = num_tips
        tree.stats["length"] = total_length
        tree.stats["edges"] = total_edges
        # node_ages = tree.internal_node_ages()
        # node_ages = [n/total_length for n in node_ages]
        # tree.stats["est.birth.rate"] = birthdeath.fit_pure_birth_model(internal_node_ages=node_ages)["birth_rate"]
        tree.stats["est.birth.rate"] = birthdeath.fit_pure_birth_model(tree=tree)["birth_rate"]
        weighted_disturbed, unweighted_disturbed = self.get_mean_patristic_distance(pdm, disturbed_habitat_nodes)
        weighted_interior, unweighted_interior = self.get_mean_patristic_distance(pdm, interior_habitat_nodes)
        tree.stats["weighted.disturbed.habitat.pd"] = weighted_disturbed
        tree.stats["unweighted.disturbed.habitat.pd"] = unweighted_disturbed
        tree.stats["weighted.interior.habitat.pd"] = weighted_interior
        tree.stats["unweighted.interior.habitat.pd"] = unweighted_interior
        try:
            tree.stats["weighted.disturbed.to.interior.habitat.pd"] = weighted_disturbed / weighted_interior
            tree.stats["unweighted.disturbed.to.interior.habitat.pd"] = unweighted_disturbed / unweighted_interior
        except (ZeroDivisionError, TypeError):
            tree.stats["weighted.disturbed.to.interior.habitat.pd"] = "NA"
            tree.stats["unweighted.disturbed.to.interior.habitat.pd"] = "NA"
        rstats = self.rcalc.calc_ecological_stats(
            tree=tree,
            patristic_distance_matrix=pdm,
            total_tree_length=total_length,
            total_tree_edges=total_edges,
            nodes_by_island=nodes_by_island,
            nodes_by_habitat=nodes_by_habitat,
            disturbed_habitat_nodes=disturbed_habitat_nodes,
            interior_habitat_nodes=interior_habitat_nodes,
        )
        stats_fields.update(tree.stats.keys())
        if summaries is not None:
            sss = tree.stats.copy()
            sss.update(tree.params)
            summaries.append(sss)
    if trees_outf is not None:
        try:
            trees.write_to_stream(trees_outf, "nexus")
        except AttributeError:
            self.write_nexus(trees, trees_outf)
    return trees, stats_fields
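# A hedged sketch of what the get_mean_patristic_distance() helper called in
# summarize_trees() might look like; the real implementation is not shown in
# this snippet. The "weighted" value here is the mean branch-length (patristic)
# distance over all pairs of the given nodes; the "unweighted" variant would
# normally use per-path edge counts, which this sketch does not attempt.
import itertools

def get_mean_patristic_distance_sketch(pdm, nodes):
    # nodes: leaf nodes with taxa; pdm: a PatristicDistanceMatrix for the tree
    if len(nodes) < 2:
        return "NA", "NA"  # caller treats "NA" as missing (TypeError on division)
    dists = [pdm(n1.taxon, n2.taxon)
             for n1, n2 in itertools.combinations(nodes, 2)]
    weighted = sum(dists) / len(dists)
    unweighted = "NA"  # would require edge counts along each path
    return weighted, unweighted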
def leaf_data_from_sim_tree(tree):
    """
    summarizing results from simulated tree
    output: seqs, alignInSeg, alignSeg, segRateDict,
            ts (pairwise patristic distances), multiAlignSeqs, lenSegs
    """
    seqs = {}
    alignInSeg = {}
    alignSeg = {}
    segRateDict = {}
    # notAlignSeqs = {}
    ts = {}
    treeTem = deepcopy(tree)
    pdm = treemeasure.PatristicDistanceMatrix(treeTem)
    leafNodes = treeTem.leaf_nodes()
    lanNames = [leafNode.taxon.label for leafNode in leafNodes]
    lanNames.sort()
    lanPairsIter = itertools.combinations(lanNames, 2)
    segRateDict = get_seg_rate_dict(lanNames, treeTem)
    segIds = segRateDict.keys()
    segIds.sort()
    nSegs = len(segIds)
    segIdsNew = range(nSegs)
    segIdsTrans = dict(zip(segIds, segIdsNew))
    for segId in segIds:
        segIdNew = segIdsTrans[segId]
        segRateDict[segIdNew] = segRateDict.pop(segId)
    # get simple sequences without alignment
    for lanName in lanNames:
        node = treeTem.find_node_with_taxon_label(lanName)
        seqs[lanName] = node.value
        # notAlignSeqs[lanName] = get_dna_seq(node.value)
    # get multiple alignment
    segIdLocIdDict = get_all_locid_within_all_segs(seqs, segIds)
    multiAlignSeqs = get_dna_multi_align(segIds, segIdLocIdDict, seqs, lanNames)
    for lanName, values in seqs.iteritems():
        segIdsInLan = values.keys()
        for segId in segIdsInLan:
            segIdNew = segIdsTrans[segId]
            seqs[lanName][segIdNew] = seqs[lanName][segId]
            del seqs[lanName][segId]
    # get pairwise alignment
    for lanPair in lanPairsIter:
        lanName1 = lanPair[0]
        lanName2 = lanPair[1]
        node1 = treeTem.find_node_with_taxon_label(lanName1)
        node2 = treeTem.find_node_with_taxon_label(lanName2)
        segs1 = node1.value
        segs2 = node2.value
        alignInSeg[lanPair] = get_dna_align(segs1, segs2)
        alignSeg[lanPair] = get_seg_align(segs1, segs2)
        ts[lanPair] = pdm(node1.taxon, node2.taxon)  # very small computational error in pdm method
    # get segment lengths
    seq0 = deepcopy(multiAlignSeqs[multiAlignSeqs.keys()[0]])
    lenSegs = []
    while seq0 != '':
        next = seq0.index('*')
        if next != 0:
            lenSegs.append(next)
            seq0 = seq0[(next + 1):]
        else:
            seq0 = seq0[1:]
    return seqs, alignInSeg, alignSeg, segRateDict, ts, multiAlignSeqs, lenSegs