예제 #1
0
def read_phyloxml(input_file):
    """
    Load the phylogenies stored in a PhyloXML file.

    :param str input_file: path to the PhyloXML file
    :return: the value of ``Phyloxml.get_phylogeny()`` for the parsed file.
        NOTE(review): other users of this API iterate over / index that
        value, so it looks like a collection of trees rather than a single
        tree -- confirm before relying on a single-tree result.
    """
    # Function-local import: ete2 is only loaded when this reader is called.
    from ete2 import Phyloxml

    parser = Phyloxml()
    parser.build_from_file(input_file)
    phylogenies = parser.get_phylogeny()
    return phylogenies
예제 #2
0
 def getRelevantEdges( adjGraph, t1, t2 ):
   """
   Restrict a network to the non-lost leaves of two PhyloXML trees.

   :param adjGraph: graph whose ``edges()`` yields 2-element edges of node
       names (presumably a networkx graph -- confirm against the caller)
   :param t1: path of the first PhyloXML tree file
   :param t2: path of the second PhyloXML tree file
   :return: ``(newGraph, crossValidationEdges)``. ``newGraph`` is an
       ``nx.Graph`` whose nodes are the kept leaves of both trees and whose
       edges are the edges of ``adjGraph`` with both endpoints among those
       leaves. ``crossValidationEdges`` is the list of edges of ``adjGraph``
       with one endpoint in the first tree's leaves and the other endpoint
       in the second tree's leaves.
   """
   pxml = Phyloxml()
   pxml.build_from_file(t1)
   pxml.build_from_file(t2)

   # Leaves whose name contains 'LOST' are dropped. Real lists (not filter
   # objects) are needed because they are concatenated below.
   la = [ name for name in pxml.phylogeny[0].get_leaf_names() if 'LOST' not in name ]
   lb = [ name for name in pxml.phylogeny[1].get_leaf_names() if 'LOST' not in name ]

   # Sets give constant-time membership tests for the per-edge checks.
   inA = set( la )
   inB = set( lb )
   inEither = inA | inB

   crossValidationEdges = []
   relevantEdges = []
   for edge in adjGraph.edges():
     x, y = edge
     # one endpoint in each tree (either orientation)
     if (x in inA and y in inB) or (y in inA and x in inB):
       crossValidationEdges.append( edge )
     # both endpoints among the kept leaves of either tree
     if x in inEither and y in inEither:
       relevantEdges.append( edge )

   newGraph = nx.Graph()
   newGraph.add_nodes_from( la + lb )
   newGraph.add_edges_from( relevantEdges )
   return newGraph, crossValidationEdges
예제 #3
0
def getTreeFromPhyloxml(xml, saveToFile="default.xml", delFile=True):
    """
    Read phylogeny tree(s) from a PhyloXML string.

    The string is first written to ``saveToFile`` because the parser is fed
    through ``Phyloxml.build_from_file``, which takes a file path.

    :param xml: the PhyloXML document as a string
    :param saveToFile: path of the intermediate file (overwritten if present)
    :param delFile: when true, the intermediate file is removed afterwards,
        including when parsing or conversion raises
    :return: a single TreeClass object when the document holds exactly one
        phylogeny, otherwise a list of TreeClass objects
    """
    project = Phyloxml()

    # The context manager closes (and flushes) the handle even if the write
    # fails, so the parser never sees a half-flushed file.
    with open(saveToFile, "w") as fo:
        fo.write(xml)

    try:
        project.build_from_file(saveToFile)
        treeList = [TreeClass.import_from_PhyloxmlTree(tree)
                    for tree in project.get_phylogeny()]
    finally:
        # Clean up on the error path too; otherwise a malformed document
        # would leave the intermediate file behind.
        if delFile:
            os.remove(saveToFile)

    if len(treeList) == 1:
        return treeList[0]
    return treeList
예제 #4
0
def readScoreFile(fname, noself, randomize=False):
  """
  Rank the held-out target edge among the scored candidate edges of a file.

  The file name encodes the inputs: the second-to-last '@'-separated field
  of ``fname`` holds the two target protein names joined by '#', and the
  first two '@'-separated fields of the base name are the orthology group
  names used to locate the two tree files under ``treeDir``.

  Candidate edges are the same-species pairs (names sharing the text after
  their last '_') of non-lost leaves that are absent from the module-level
  ``ExtantNetwork``, plus the target edge itself.

  :param fname: path of the score file; every non-blank line is whitespace
      separated with the two protein names in columns 0 and 1 and the score
      in column 3
  :param noself: when true, self pairs (u == v) are never considered
  :param randomize: when true, each score read from the file is replaced by
      ``random.uniform(0.0, 1.0)``
  :return: ``(None, None)`` when either tree file does not exist; otherwise
      ``(rank, maxRank)`` where ``rank`` is the 0-based position of the
      target edge after sorting by descending score and ``maxRank`` is
      ``float(len(scoredEdges) - 1)``
  :raises ValueError: if the target edge is not among the scored edges
  """
  import random

  # The strings naming the proteins whose interaction was removed in
  # this input
  tstring = fname.split('@')[-2].split("#")
  # Convert to upper case and make into an edge name
  tedge = (tstring[0].upper(), tstring[1].upper())

  # Locate the phylogenies for the orthology groups
  treeDir = "../../Parana2Data/HerpesPPIs/trees/rearranged"
  baseFile = fname.split('/')[-1]
  orthoGroup1 = baseFile.split('@')[0]
  orthoGroup2 = baseFile.split('@')[1]
  t1 = "{0}/{1}.xml.rooting.0.ntg.reconciled.0.ntg.rearrange.0.ntg".format(treeDir,orthoGroup1)
  t2 = "{0}/{1}.xml.rooting.0.ntg.reconciled.0.ntg.rearrange.0.ntg".format(treeDir,orthoGroup2)

  if not (os.path.exists(t1) and os.path.exists(t2)):
    return None, None

  def extantLeaves(tree):
    # Upper-cased leaf names, minus those marked as lost
    names = [ x.upper() for x in tree.get_leaf_names() ]
    return [ x for x in names if 'LOST' not in x ]

  pxml = Phyloxml()
  pxml.build_from_file(t1)
  pxml.build_from_file(t2)
  la = extantLeaves(pxml.phylogeny[0])
  lb = extantLeaves(pxml.phylogeny[1])

  # The set of all possible interactions among the two homology groups
  if orthoGroup1 == orthoGroup2:
    possibleEndpoints = combinationsWithSelf(la)
  else:
    possibleEndpoints = combinationsWithSelf(la) + list(itertools.product(la,lb)) + combinationsWithSelf(lb)

  # From among all possible endpoints, only those protein pairs that reside
  # in the same species (same text after the last '_') are potential edges
  allPossibleEdges = [ e for e in possibleEndpoints
                       if e[0].split('_')[-1] == e[1].split('_')[-1] ]

  # The potential edges that don't appear in the input network, plus our
  # target edge. Kept as a set: it is only used for membership tests.
  nonPresentEdges = set( e for e in allPossibleEdges
                         if not ExtantNetwork.has_edge(e[0], e[1]) )
  nonPresentEdges.add(tedge)

  # Is u,v one of the edges we wish to consider?
  def inPotentialEdges(u,v):
    contains = (u,v) in nonPresentEdges or (v,u) in nonPresentEdges
    if noself:
      return u != v and contains
    return contains

  # Does the scored edge se join p1 and p2 (in either orientation)?
  def isEdge(se, p1, p2):
    return (se.p1 == p1 and se.p2 == p2) or (se.p1 == p2 and se.p2 == p1)

  scoredEdges = []
  nonEdgesWithProb = set(nonPresentEdges)
  # Text mode: the tokens are compared against the str names built above.
  with open(fname, 'r') as ifile:
    for l in ifile:
      toks = l.split()
      if not toks:
        # tolerate blank lines instead of failing with IndexError
        continue
      p1 = toks[0].upper()
      p2 = toks[1].upper()
      s = float(toks[3])
      if inPotentialEdges(p1,p2):
        if randomize: s = random.uniform(0.0,1.0)
        scoredEdges.append( ScoredEdge(p1,p2,s) )
        nonEdgesWithProb.discard((p1,p2))
        nonEdgesWithProb.discard((p2,p1))

  # NOTE(review): nonEdgesWithProb starts as a copy of nonPresentEdges and
  # only ever shrinks, so this difference is always empty and the loop body
  # never runs. If candidate edges missing from the file were meant to get a
  # default score, iterate over nonEdgesWithProb instead -- confirm the
  # intent first, because that changes the returned rank.
  for u,v in (nonEdgesWithProb - nonPresentEdges):
    s = random.uniform(0.0,1.0) if randomize else 0.0
    scoredEdges.append(ScoredEdge(u, v, s))

  # Shuffle before the (stable) sort so that edges with equal scores end up
  # in random relative order.
  random.shuffle(scoredEdges)
  scoredEdges = list(enumerate(sorted( scoredEdges, key=lambda x: x.score, reverse=True )))

  res = [ x for x in scoredEdges if isEdge(x[1], tedge[0], tedge[1]) ]

  if not res:
    # A string cannot be raised; it would surface as an unrelated TypeError.
    raise ValueError("target edge {0} not found among the scored edges of {1}".format(tedge, fname))

  print(res)
  # Previous (ISMB) normalisation used float(len(nonPresentEdges)-1); the
  # current one uses the number of edges that were actually scored.
  return (res[0][0], float(len(scoredEdges)-1))
예제 #5
0
def main ():
    """
    Render the Newick tree named by ``args[0]`` to ``args[0] + ".png"``.

    Steps: read the Newick file, convert it to PhyloXML in memory, load that
    through ``Phyloxml`` (via a temporary file), annotate the tree with the
    taxonomy read from ``args[1]``, label internal nodes, optionally colour
    the leaves by the depth values in the ``options.depth`` file, and render.

    Reads the module-level ``options`` and ``args``; returns nothing.
    """
    global options, args

    def _announce(message):
        # Timestamped progress line, emitted only in verbose mode.
        if options.verbose:
            print("%s %s" % (time.asctime(), message))

    _announce("load and parse newick file")
    # Newick -> PhyloXML text held in memory
    newick_tree = Phylo.read(args[0], 'newick')
    xml_buffer = StringIO()
    Phylo.write(newick_tree, xml_buffer, 'phyloxml')
    # PhyloXML text -> ete tree; the loader reads from a file path
    loader = Phyloxml()
    with tempinput(xml_buffer.getvalue()) as tempfilename:
        loader.build_from_file(tempfilename)
    tree2 = loader.get_phylogeny()[0]

    _announce("load and parse taxonomy file")
    tax = get_taxonomy(args[1])
    tree2 = add_taxonomy_for_internal_branch(tree2, tax)

    # Label every internal node with the last rank in this list whose name
    # is longer than three characters; "null" when none qualifies.
    ranks = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
    for node in tree2.traverse():
        if node.is_leaf():
            continue
        label = "null"
        for rank in ranks:
            rank_name = tax[node.id][rank]
            if len(rank_name) > 3:
                label = rank_name
        node.add_feature("mylabel", label)

    if options.depth:
        # Depth file: two whitespace-separated fields per line, id and value.
        depth = {}
        with open(options.depth) as handle:
            for line in handle:
                leaf_id, value = line.split()
                depth[leaf_id] = float(value)

        # Colour per depth band; first matching lower bound wins, values
        # below 10 fall back to the default colour.
        palette = (
            (5000, "#DA70D6"),
            (1000, "#EE82EE"),
            (100, "#DDA0DD"),
            (10, "#D8BFD8"),
        )
        for leaf in tree2.iter_leaves():
            leaf_depth = depth[leaf.id]
            color = "#E6E6FA"
            for lower_bound, shade in palette:
                if leaf_depth >= lower_bound:
                    color = shade
                    break
            leaf.add_feature("color", color)

    # Tree style: leaf names are drawn by the layout function instead
    ts = TreeStyle()
    ts.show_leaf_name = False
    ts.layout_fn = tree_layout
    tree2.render(args[0]+".png", dpi=2048, tree_style=ts)
예제 #6
0
def readScoreFile(fname, noself, randomize=False):
    """
    Rank the held-out target edge among the scored candidate edges of a file.

    The file name encodes the inputs: the second-to-last '@'-separated field
    of ``fname`` holds the two target protein names joined by '#', and the
    first two '@'-separated fields of the base name are the orthology group
    names used to locate the two tree files under ``treeDir``.

    Candidate edges are the same-species pairs (names sharing the text after
    their last '_') of non-lost leaves that are absent from the module-level
    ``ExtantNetwork``, plus the target edge itself.

    :param fname: path of the score file; every non-blank line is whitespace
        separated with the two protein names in columns 0 and 1 and the
        score in column 3
    :param noself: when true, self pairs (u == v) are never considered
    :param randomize: when true, each score read from the file is replaced
        by ``random.uniform(0.0, 1.0)``
    :return: ``(None, None)`` when either tree file does not exist;
        otherwise ``(rank, maxRank)`` where ``rank`` is the 0-based position
        of the target edge after sorting by descending score and ``maxRank``
        is ``float(len(scoredEdges) - 1)``
    :raises ValueError: if the target edge is not among the scored edges
    """
    import random

    # The strings naming the proteins whose interaction was removed in
    # this input
    tstring = fname.split('@')[-2].split("#")
    # Convert to upper case and make into an edge name
    tedge = (tstring[0].upper(), tstring[1].upper())

    # Locate the phylogenies for the orthology groups
    treeDir = "../../Parana2Data/HerpesPPIs/trees/rearranged"
    baseFile = fname.split('/')[-1]
    orthoGroup1 = baseFile.split('@')[0]
    orthoGroup2 = baseFile.split('@')[1]
    t1 = "{0}/{1}.xml.rooting.0.ntg.reconciled.0.ntg.rearrange.0.ntg".format(
        treeDir, orthoGroup1)
    t2 = "{0}/{1}.xml.rooting.0.ntg.reconciled.0.ntg.rearrange.0.ntg".format(
        treeDir, orthoGroup2)

    if not (os.path.exists(t1) and os.path.exists(t2)):
        return None, None

    def extantLeaves(tree):
        # Upper-cased leaf names, minus those marked as lost
        names = [x.upper() for x in tree.get_leaf_names()]
        return [x for x in names if 'LOST' not in x]

    pxml = Phyloxml()
    pxml.build_from_file(t1)
    pxml.build_from_file(t2)
    la = extantLeaves(pxml.phylogeny[0])
    lb = extantLeaves(pxml.phylogeny[1])

    # The set of all possible interactions among the two homology groups
    if orthoGroup1 == orthoGroup2:
        possibleEndpoints = combinationsWithSelf(la)
    else:
        possibleEndpoints = (combinationsWithSelf(la) +
                             list(itertools.product(la, lb)) +
                             combinationsWithSelf(lb))

    # From among all possible endpoints, only those protein pairs that
    # reside in the same species (same text after the last '_') are
    # potential edges
    allPossibleEdges = [
        e for e in possibleEndpoints
        if e[0].split('_')[-1] == e[1].split('_')[-1]
    ]

    # The potential edges that don't appear in the input network, plus our
    # target edge. Kept as a set: it is only used for membership tests.
    nonPresentEdges = set(
        e for e in allPossibleEdges
        if not ExtantNetwork.has_edge(e[0], e[1]))
    nonPresentEdges.add(tedge)

    # Is u,v one of the edges we wish to consider?
    def inPotentialEdges(u, v):
        contains = (u, v) in nonPresentEdges or (v, u) in nonPresentEdges
        if noself:
            return u != v and contains
        return contains

    # Does the scored edge se join p1 and p2 (in either orientation)?
    def isEdge(se, p1, p2):
        return ((se.p1 == p1 and se.p2 == p2) or
                (se.p1 == p2 and se.p2 == p1))

    scoredEdges = []
    nonEdgesWithProb = set(nonPresentEdges)
    # Text mode: the tokens are compared against the str names built above.
    with open(fname, 'r') as ifile:
        for l in ifile:
            toks = l.split()
            if not toks:
                # tolerate blank lines instead of failing with IndexError
                continue
            p1 = toks[0].upper()
            p2 = toks[1].upper()
            s = float(toks[3])
            if inPotentialEdges(p1, p2):
                if randomize:
                    s = random.uniform(0.0, 1.0)
                scoredEdges.append(ScoredEdge(p1, p2, s))
                nonEdgesWithProb.discard((p1, p2))
                nonEdgesWithProb.discard((p2, p1))

    # NOTE(review): nonEdgesWithProb starts as a copy of nonPresentEdges and
    # only ever shrinks, so this difference is always empty and the loop
    # body never runs. If candidate edges missing from the file were meant
    # to get a default score, iterate over nonEdgesWithProb instead --
    # confirm the intent first, because that changes the returned rank.
    for u, v in (nonEdgesWithProb - nonPresentEdges):
        s = random.uniform(0.0, 1.0) if randomize else 0.0
        scoredEdges.append(ScoredEdge(u, v, s))

    # Shuffle before the (stable) sort so that edges with equal scores end
    # up in random relative order.
    random.shuffle(scoredEdges)
    scoredEdges = list(
        enumerate(sorted(scoredEdges, key=lambda x: x.score, reverse=True)))

    res = [x for x in scoredEdges if isEdge(x[1], tedge[0], tedge[1])]

    if not res:
        # A string cannot be raised; it would surface as an unrelated
        # TypeError.
        raise ValueError(
            "target edge {0} not found among the scored edges of {1}".format(
                tedge, fname))

    print(res)
    # Previous (ISMB) normalisation used float(len(nonPresentEdges)-1); the
    # current one uses the number of edges that were actually scored.
    return (res[0][0], float(len(scoredEdges) - 1))
예제 #7
0
from ete2 import Phyloxml
project = Phyloxml()
project.build_from_file("apaf.xml")

# Each tree contains the same methods as a PhyloTree object
for tree in project.get_phylogeny():
    print tree
    # you can even use rendering options
    tree.show()
    # PhyloXML features are stored in the phyloxml_clade attribute
    for node in tree: 
        print "Node name:", node.name
        for seq in node.phyloxml_clade.get_sequence(): 
            for domain in seq.domain_architecture.get_domain():
                domain_data = [domain.valueOf_, domain.get_from(), domain.get_to()]
                print "  Domain:", '\t'.join(map(str, domain_data))
예제 #8
0
from ete2 import Phyloxml
project = Phyloxml()
project.build_from_file("testTree.xml")

# Each tree contains the same methods as a PhyloTree object
for tree in project.get_phylogeny():
    print tree
    # you can even use rendering options
    tree.show()
    # PhyloXML features are stored in the phyloxml_clade attribute
    for node in tree: 
        print "Node name:", node.name
        for seq in node.phyloxml_clade.get_sequence(): 
            for domain in seq.domain_architecture.get_domain():
                domain_data = [domain.valueOf_, domain.get_from(), domain.get_to()]
                print "  Domain:", '\t'.join(map(str, domain_data))