def read_phyloxml(input_file):
    """
    Parse a phylogenetic tree file in PhyloXML format.

    :param str input_file: path to the PhyloXML file
    :return: the value of ``Phyloxml.get_phylogeny()`` for the parsed file.
        NOTE(review): this was previously documented as a single ete2.Tree
        object, but the result is returned unmodified and may be a
        collection of trees -- confirm against the ete2 documentation.
    """
    # The import lives inside the function, so ete2 is only needed when
    # this function is actually called.
    from ete2 import Phyloxml

    phyloxml_project = Phyloxml()
    phyloxml_project.build_from_file(input_file)
    phylogenies = phyloxml_project.get_phylogeny()
    return phylogenies
def getRelevantEdges(adjGraph, t1, t2):
    """
    Restrict a graph to the non-lost leaves of two PhyloXML trees.

    Both tree files are loaded into one Phyloxml project; the first
    phylogeny is taken from ``t1`` and the second from ``t2``.  Leaves whose
    name contains ``'LOST'`` are ignored.

    :param adjGraph: graph object exposing ``edges()`` that yields
        ``(u, v)`` pairs
    :param t1: path to the first PhyloXML tree file
    :param t2: path to the second PhyloXML tree file
    :return: ``(newGraph, crossValidationEdges)`` where ``newGraph`` is an
        ``nx.Graph`` holding the kept leaves of both trees plus every edge
        of ``adjGraph`` whose two endpoints are kept leaves, and
        ``crossValidationEdges`` is the list of edges of ``adjGraph`` with
        one endpoint among the kept leaves of each tree
    """
    pxml = Phyloxml()
    pxml.build_from_file(t1)
    pxml.build_from_file(t2)

    la = [name for name in pxml.phylogeny[0].get_leaf_names()
          if 'LOST' not in name]
    lb = [name for name in pxml.phylogeny[1].get_leaf_names()
          if 'LOST' not in name]

    # Sets give constant-time membership tests; the lists are kept because
    # their order and duplicates are what gets handed to add_nodes_from.
    inA = set(la)
    inB = set(lb)
    inEither = inA | inB

    # Read the edge list once and derive both selections from it.
    allEdges = list(adjGraph.edges())

    crossValidationEdges = [
        (x, y) for (x, y) in allEdges
        if (x in inA and y in inB) or (y in inA and x in inB)
    ]
    relevantEdges = [
        (x, y) for (x, y) in allEdges
        if x in inEither and y in inEither
    ]

    newGraph = nx.Graph()
    newGraph.add_nodes_from(la + lb)
    newGraph.add_edges_from(relevantEdges)
    return newGraph, crossValidationEdges
def getTreeFromPhyloxml(xml, saveToFile="default.xml", delFile=True):
    """
    Read phylogeny trees from a PhyloXML string.

    The string is written to ``saveToFile`` because the Phyloxml project is
    populated through ``build_from_file``; every phylogeny found is then
    converted with ``TreeClass.import_from_PhyloxmlTree``.

    :param xml: PhyloXML document as a string
    :param saveToFile: path of the scratch file the document is written to
        (overwritten if it already exists)
    :param delFile: when true, the scratch file is removed afterwards, also
        when parsing or conversion raises
    :return: a single TreeClass object when the document holds exactly one
        phylogeny, otherwise a list of TreeClass objects (possibly empty)
    """
    project = Phyloxml()
    try:
        # The context manager closes the handle even if the write fails.
        with open(saveToFile, "w+") as handle:
            handle.write(xml)
        project.build_from_file(saveToFile)
        treeList = [TreeClass.import_from_PhyloxmlTree(tree)
                    for tree in project.get_phylogeny()]
    finally:
        # Clean up on success and on failure; the existence check keeps a
        # failed open() from being masked by an error from os.remove().
        if delFile and os.path.exists(saveToFile):
            os.remove(saveToFile)

    if len(treeList) == 1:
        return treeList[0]
    return treeList
def readScoreFile(fname, noself, randomize=False):
    """
    Rank the held-out target edge among the scored candidate edges of a file.

    The file name carries the inputs: the second-to-last ``@``-separated
    field of ``fname`` holds the two target protein names joined by ``#``,
    and the first two ``@``-separated fields of the base name are the two
    orthology group names used to locate the tree files.  Each line of the
    score file is whitespace-separated, with the two protein names in
    columns 0 and 1 and the score in column 3.

    Candidate edges are the same-species pairs (same suffix after the last
    ``_``) among the non-lost leaves of the two trees that are not edges of
    the module-level ``ExtantNetwork``, plus the target edge itself.

    :param fname: path of the score file
    :param noself: when true, lines with identical endpoints are ignored
    :param randomize: when true, every kept score is replaced by a value
        drawn from ``random.uniform(0.0, 1.0)``
    :return: ``(None, None)`` when either tree file does not exist;
        otherwise ``(rank, total)`` where ``rank`` is the 0-based position
        of the target edge after sorting the kept edges by descending score
        and ``total`` is ``float(number_of_kept_edges - 1)``
    :raises ValueError: when the target edge is not among the kept edges
    """
    import random

    # The proteins whose interaction was removed in this input, upper-cased
    # and turned into an edge.
    targetNames = fname.split('@')[-2].split("#")
    tedge = (targetNames[0].upper(), targetNames[1].upper())

    # Locate the phylogenies of the two orthology groups.
    treeDir = "../../Parana2Data/HerpesPPIs/trees/rearranged"
    groupFields = fname.split('/')[-1].split('@')
    orthoGroup1 = groupFields[0]
    orthoGroup2 = groupFields[1]
    t1 = "{0}/{1}.xml.rooting.0.ntg.reconciled.0.ntg.rearrange.0.ntg".format(
        treeDir, orthoGroup1)
    t2 = "{0}/{1}.xml.rooting.0.ntg.reconciled.0.ntg.rearrange.0.ntg".format(
        treeDir, orthoGroup2)
    if not (os.path.exists(t1) and os.path.exists(t2)):
        return None, None

    pxml = Phyloxml()
    pxml.build_from_file(t1)
    pxml.build_from_file(t2)

    def extantLeaves(tree):
        # Upper-case first, then drop every name containing 'LOST'.
        upperNames = [name.upper() for name in tree.get_leaf_names()]
        return [name for name in upperNames if 'LOST' not in name]

    la = extantLeaves(pxml.phylogeny[0])
    lb = extantLeaves(pxml.phylogeny[1])

    # All endpoint pairs within and between the two homology groups; a
    # group paired with itself only contributes its own combinations.
    if orthoGroup1 == orthoGroup2:
        possibleEndpoints = combinationsWithSelf(la)
    else:
        possibleEndpoints = (combinationsWithSelf(la)
                             + list(itertools.product(la, lb))
                             + combinationsWithSelf(lb))

    # Keep same-species pairs that are absent from the input network, then
    # add the target edge.  A set makes the per-line lookups below cheap.
    nonPresentEdges = set(
        pair for pair in possibleEndpoints
        if pair[0].split('_')[-1] == pair[1].split('_')[-1]
        and not ExtantNetwork.has_edge(pair[0], pair[1]))
    nonPresentEdges.add(tedge)

    def inPotentialEdges(u, v):
        # Is u,v (in either orientation) one of the edges to consider?
        if noself and u == v:
            return False
        return (u, v) in nonPresentEdges or (v, u) in nonPresentEdges

    def isEdge(se, p1, p2):
        # Orientation-insensitive endpoint comparison.
        return ((se.p1 == p1 and se.p2 == p2)
                or (se.p1 == p2 and se.p2 == p1))

    scoredEdges = []
    nonEdgesWithProb = set(nonPresentEdges)
    with open(fname, 'rb') as ifile:
        for line in ifile:
            toks = line.rstrip().split()
            p1 = toks[0].upper()
            p2 = toks[1].upper()
            s = float(toks[3])
            if inPotentialEdges(p1, p2):
                if randomize:
                    s = random.uniform(0.0, 1.0)
                scoredEdges.append(ScoredEdge(p1, p2, s))
                nonEdgesWithProb.discard((p1, p2))
                nonEdgesWithProb.discard((p2, p1))

    # NOTE(review): nonEdgesWithProb starts as a copy of nonPresentEdges and
    # only ever shrinks, so this difference is always empty and the loop
    # body never runs -- candidates missing from the score file are NOT
    # given a default score.  Iterating nonEdgesWithProb directly may have
    # been intended; left unchanged because it would alter the returned
    # rank and total.  Confirm before changing.
    for u, v in (nonEdgesWithProb - nonPresentEdges):
        s = random.uniform(0.0, 1.0) if randomize else 0.0
        scoredEdges.append(ScoredEdge(u, v, s))

    # Shuffle before the (stable) sort so that ties are ordered randomly.
    random.shuffle(scoredEdges)
    scoredEdges = list(enumerate(
        sorted(scoredEdges, key=lambda e: e.score, reverse=True)))

    res = [x for x in scoredEdges if isEdge(x[1], tedge[0], tedge[1])]
    if not res:
        # A string cannot be raised as an exception; report the real cause.
        raise ValueError(
            "target edge {0} was not scored in {1}".format(tedge, fname))

    print(res)
    return (res[0][0], float(len(scoredEdges) - 1))
def main():
    """
    Render a newick tree, annotated with taxonomy labels, to a PNG file.

    Reads the module-level ``options`` and ``args``: ``args[0]`` is the
    newick tree file, ``args[1]`` the taxonomy file, ``options.verbose``
    enables progress messages and ``options.depth`` optionally names a file
    of whitespace-separated ``<id> <depth>`` lines used to colour leaves.
    The picture is written to ``args[0] + ".png"``.
    """
    global options, args

    if options.verbose:
        print("{0} load and parse newick file".format(time.asctime()))

    # Read the newick file and convert it to phyloxml in memory.
    tree = Phylo.read(args[0], 'newick')
    treeXML = StringIO()
    Phylo.write(tree, treeXML, 'phyloxml')

    # Load the phyloxml text through a temporary file and take the first
    # phylogeny of the project.
    hPhylotree = Phyloxml()
    with tempinput(treeXML.getvalue()) as tempfilename:
        hPhylotree.build_from_file(tempfilename)
    tree2 = hPhylotree.get_phylogeny()[0]

    if options.verbose:
        print("{0} load and parse taxonomy file".format(time.asctime()))

    # Read the taxonomy and annotate the internal branches with it.
    tax = get_taxonomy(args[1])
    tree2 = add_taxonomy_for_internal_branch(tree2, tax)

    # Label every internal node with the last rank, in the order listed,
    # whose name is longer than three characters ("null" if none is).
    ranks = ['kingdom', 'phylum', 'class', 'order',
             'family', 'genus', 'species']
    for node in tree2.traverse():
        if not node.is_leaf():
            label = "null"
            for rank in ranks:
                if len(tax[node.id][rank]) > 3:
                    label = tax[node.id][rank]
            node.add_feature("mylabel", label)

    if options.depth:
        # Parse the per-node depth table.
        depth = {}
        with open(options.depth) as f:
            for line in f:
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue
                nodeId, dep = line.split()
                depth[nodeId] = float(dep)

        # Colour each leaf by its depth bucket, checked from the highest
        # threshold down; values below 10 get the fallback colour.
        for node in tree2.iter_leaves():
            leafDepth = depth[node.id]
            if leafDepth >= 5000:
                color = "#DA70D6"
            elif leafDepth >= 1000:
                color = "#EE82EE"
            elif leafDepth >= 100:
                color = "#DDA0DD"
            elif leafDepth >= 10:
                color = "#D8BFD8"
            else:
                color = "#E6E6FA"
            node.add_feature("color", color)

    # Tree style: hide the default leaf names and use the custom layout.
    ts = TreeStyle()
    ts.show_leaf_name = False
    ts.layout_fn = tree_layout

    # Render to an image next to the input file.
    tree2.render(args[0] + ".png", dpi=2048, tree_style=ts)
def readScoreFile(fname, noself, randomize=False):
    """
    Rank the held-out target edge among the scored candidate edges of a file.

    The file name carries the inputs: the second-to-last ``@``-separated
    field of ``fname`` holds the two target protein names joined by ``#``,
    and the first two ``@``-separated fields of the base name are the two
    orthology group names used to locate the tree files.  Each line of the
    score file is whitespace-separated, with the two protein names in
    columns 0 and 1 and the score in column 3.

    Candidate edges are the same-species pairs (same suffix after the last
    ``_``) among the non-lost leaves of the two trees that are not edges of
    the module-level ``ExtantNetwork``, plus the target edge itself.

    :param fname: path of the score file
    :param noself: when true, lines with identical endpoints are ignored
    :param randomize: when true, every kept score is replaced by a value
        drawn from ``random.uniform(0.0, 1.0)``
    :return: ``(None, None)`` when either tree file does not exist;
        otherwise ``(rank, total)`` where ``rank`` is the 0-based position
        of the target edge after sorting the kept edges by descending score
        and ``total`` is ``float(number_of_kept_edges - 1)``
    :raises ValueError: when the target edge is not among the kept edges
    """
    import random

    # The proteins whose interaction was removed in this input, upper-cased
    # and turned into an edge.
    targetNames = fname.split('@')[-2].split("#")
    tedge = (targetNames[0].upper(), targetNames[1].upper())

    # Locate the phylogenies of the two orthology groups.
    treeDir = "../../Parana2Data/HerpesPPIs/trees/rearranged"
    groupFields = fname.split('/')[-1].split('@')
    orthoGroup1 = groupFields[0]
    orthoGroup2 = groupFields[1]
    t1 = "{0}/{1}.xml.rooting.0.ntg.reconciled.0.ntg.rearrange.0.ntg".format(
        treeDir, orthoGroup1)
    t2 = "{0}/{1}.xml.rooting.0.ntg.reconciled.0.ntg.rearrange.0.ntg".format(
        treeDir, orthoGroup2)
    if not (os.path.exists(t1) and os.path.exists(t2)):
        return None, None

    pxml = Phyloxml()
    pxml.build_from_file(t1)
    pxml.build_from_file(t2)

    def extantLeaves(tree):
        # Upper-case first, then drop every name containing 'LOST'.
        upperNames = [name.upper() for name in tree.get_leaf_names()]
        return [name for name in upperNames if 'LOST' not in name]

    la = extantLeaves(pxml.phylogeny[0])
    lb = extantLeaves(pxml.phylogeny[1])

    # All endpoint pairs within and between the two homology groups; a
    # group paired with itself only contributes its own combinations.
    if orthoGroup1 == orthoGroup2:
        possibleEndpoints = combinationsWithSelf(la)
    else:
        possibleEndpoints = (combinationsWithSelf(la)
                             + list(itertools.product(la, lb))
                             + combinationsWithSelf(lb))

    # Keep same-species pairs that are absent from the input network, then
    # add the target edge.  A set makes the per-line lookups below cheap.
    nonPresentEdges = set(
        pair for pair in possibleEndpoints
        if pair[0].split('_')[-1] == pair[1].split('_')[-1]
        and not ExtantNetwork.has_edge(pair[0], pair[1]))
    nonPresentEdges.add(tedge)

    def inPotentialEdges(u, v):
        # Is u,v (in either orientation) one of the edges to consider?
        if noself and u == v:
            return False
        return (u, v) in nonPresentEdges or (v, u) in nonPresentEdges

    def isEdge(se, p1, p2):
        # Orientation-insensitive endpoint comparison.
        return ((se.p1 == p1 and se.p2 == p2)
                or (se.p1 == p2 and se.p2 == p1))

    scoredEdges = []
    nonEdgesWithProb = set(nonPresentEdges)
    with open(fname, 'rb') as ifile:
        for line in ifile:
            toks = line.rstrip().split()
            p1 = toks[0].upper()
            p2 = toks[1].upper()
            s = float(toks[3])
            if inPotentialEdges(p1, p2):
                if randomize:
                    s = random.uniform(0.0, 1.0)
                scoredEdges.append(ScoredEdge(p1, p2, s))
                nonEdgesWithProb.discard((p1, p2))
                nonEdgesWithProb.discard((p2, p1))

    # NOTE(review): nonEdgesWithProb starts as a copy of nonPresentEdges and
    # only ever shrinks, so this difference is always empty and the loop
    # body never runs -- candidates missing from the score file are NOT
    # given a default score.  Iterating nonEdgesWithProb directly may have
    # been intended; left unchanged because it would alter the returned
    # rank and total.  Confirm before changing.
    for u, v in (nonEdgesWithProb - nonPresentEdges):
        s = random.uniform(0.0, 1.0) if randomize else 0.0
        scoredEdges.append(ScoredEdge(u, v, s))

    # Shuffle before the (stable) sort so that ties are ordered randomly.
    random.shuffle(scoredEdges)
    scoredEdges = list(enumerate(
        sorted(scoredEdges, key=lambda e: e.score, reverse=True)))

    res = [x for x in scoredEdges if isEdge(x[1], tedge[0], tedge[1])]
    if not res:
        # A string cannot be raised as an exception; report the real cause.
        raise ValueError(
            "target edge {0} was not scored in {1}".format(tedge, fname))

    print(res)
    return (res[0][0], float(len(scoredEdges) - 1))
from ete2 import Phyloxml

project = Phyloxml()
project.build_from_file("apaf.xml")

# Every phylogeny in the project offers the same methods as a PhyloTree
# object.
for tree in project.get_phylogeny():
    print(tree)
    # Rendering options are available as well.
    tree.show()
    # The PhyloXML features of a node live in its phyloxml_clade attribute.
    for node in tree:
        print("Node name: %s" % (node.name,))
        for sequence in node.phyloxml_clade.get_sequence():
            for domain in sequence.domain_architecture.get_domain():
                fields = (domain.valueOf_, domain.get_from(), domain.get_to())
                print(" Domain: " + '\t'.join(str(field) for field in fields))
from ete2 import Phyloxml

project = Phyloxml()
project.build_from_file("testTree.xml")

# Every phylogeny in the project offers the same methods as a PhyloTree
# object.
for tree in project.get_phylogeny():
    print(tree)
    # Rendering options are available as well.
    tree.show()
    # The PhyloXML features of a node live in its phyloxml_clade attribute.
    for node in tree:
        print("Node name: %s" % (node.name,))
        for sequence in node.phyloxml_clade.get_sequence():
            for domain in sequence.domain_architecture.get_domain():
                fields = (domain.valueOf_, domain.get_from(), domain.get_to())
                print(" Domain: " + '\t'.join(str(field) for field in fields))