示例#1
0
    def __init__(self, newick=None, text_array=None,
                 fdist=clustvalidation.default_dist):
        """Set up a cluster node, optionally loading a tree and array data.

        :argument newick: passed to ``TreeNode.__init__``; when truthy,
          the distance function is also installed.
        :argument text_array: when truthy, handed to
          ``link_to_arraytable``.
        :argument fdist: distance function given to
          ``set_distance_function``.  Per the original note, the default
          is spearman_dist when scipy is loaded and euclidean_dist
          otherwise.
        """
        # The base initializer sets up the basic tree features and loads
        # the newick (if any).
        TreeNode.__init__(self, newick)

        # Cluster-related private values all start out unset.
        self._fdist = self._silhouette = None
        self._intercluster_dist = self._intracluster_dist = None
        self._profile = self._std_profile = None

        # Cluster-specific features registered on this node.
        for feature_name in ("intercluster_dist", "intracluster_dist",
                             "silhouette", "profile", "deviation"):
            self.features.add(feature_name)

        # Attach the array data first, then the distance function.
        if text_array:
            self.link_to_arraytable(text_array)
        if newick:
            self.set_distance_function(fdist)
示例#2
0
文件: clustertree.py 项目: a1an77/ete
    def __init__(self, newick=None, text_array=None,
                 fdist=clustvalidation.default_dist):
        """Set up a cluster node, optionally loading a tree and array data.

        :argument newick: passed to ``TreeNode.__init__``; when truthy,
          the distance function is also installed.
        :argument text_array: when truthy, handed to
          ``link_to_arraytable``.
        :argument fdist: distance function given to
          ``set_distance_function``.  Per the original note, the default
          is spearman_dist when scipy is loaded and euclidean_dist
          otherwise.
        """
        # The base initializer sets up the basic tree features and loads
        # the newick (if any).
        TreeNode.__init__(self, newick)

        # Cluster-related private values all start out unset.
        self._fdist = self._silhouette = None
        self._intercluster_dist = self._intracluster_dist = None
        self._profile = self._std_profile = None

        # Cluster-specific features registered on this node.
        for feature_name in ("intercluster_dist", "intracluster_dist",
                             "silhouette", "profile", "deviation"):
            self.features.add(feature_name)

        # Attach the array data first, then the distance function.
        if text_array:
            self.link_to_arraytable(text_array)
        if newick:
            self.set_distance_function(fdist)
    def __init__(self, newick=None, alignment=None, alg_format="fasta",
                 sp_naming_function=_parse_species, format=0):
        """Set up a phylogenetic node, optionally loading a tree and alignment.

        :argument newick: forwarded to ``TreeNode.__init__``; when truthy,
          the species naming function is also installed.
        :argument alignment: when truthy, linked through
          ``link_to_alignment``.
        :argument alg_format: format passed along with ``alignment``.
        :argument sp_naming_function: function given to
          ``set_species_naming_function``.
        :argument format: forwarded to ``TreeNode.__init__``.
        """
        # _update names?
        self._name = "NoName"
        self._species = "Unknown"
        # Caution: _speciesFunction has to be None *before* the native
        # __init__ is called, so keep this assignment above that call.
        self._speciesFunction = None
        TreeNode.__init__(self, newick=newick, format=format)

        # Only reached once the whole tree has been read: 'alignment' is
        # not passed to the PhyloNode constructor during parsing.
        if alignment:
            self.link_to_alignment(alignment, alg_format)

        if not newick:
            return
        self.set_species_naming_function(sp_naming_function)
示例#4
0
    def __init__(self, newick=None, alignment=None, alg_format="fasta",
                 sp_naming_function=_parse_species, format=0, **kargs):
        """Set up a phylogenetic node, optionally loading a tree and alignment.

        :argument newick: forwarded to ``TreeNode.__init__``; when truthy,
          the species naming function is also installed.
        :argument alignment: when truthy, linked through
          ``link_to_alignment``.
        :argument alg_format: format passed along with ``alignment``.
        :argument sp_naming_function: function given to
          ``set_species_naming_function``.
        :argument format: forwarded to ``TreeNode.__init__``.
        :argument kargs: extra keyword arguments forwarded untouched to
          ``TreeNode.__init__``.
        """
        # _update names?
        self._name = "NoName"
        self._species = "Unknown"
        # Caution: _speciesFunction has to be None *before* the native
        # __init__ is called, so keep this assignment above that call.
        self._speciesFunction = None
        TreeNode.__init__(self, newick=newick, format=format, **kargs)

        # Only reached once the whole tree has been read: 'alignment' is
        # not passed to the PhyloNode constructor during parsing.
        if alignment:
            self.link_to_alignment(alignment, alg_format)

        if not newick:
            return
        self.set_species_naming_function(sp_naming_function)
  def buildDSAndSpeciesTree(self):
    """Build a DS-tree for the stored relationships plus a species tree consistent with it.

      The DS-tree built prioritizes dups first.

        :returns: the list [dstree, speciesTree] when the recursive builder
          succeeds, False when no such pair of trees can be built.
        """
    geneSet = self.genes.copy()

    # Root of the DS-tree; its 'set' attribute holds every gene.
    dsRoot = TreeNode()
    dsRoot.set = set(geneSet)

    # Root of the species tree; its 'set' holds the species of those genes.
    speciesRoot = TreeNode()
    speciesRoot.set = set(self.treelessGeneSpeciesMapping[gene] for gene in geneSet)

    # The recursive helper receives the DS root wrapped in a one-element list.
    if self._buildDSAndSpeciesTree(geneSet, [dsRoot], speciesRoot):
      return [dsRoot, speciesRoot]
    return False
示例#6
0
def validate_tree(tree_path, msa_path):
    '''Load a tree and a FASTA MSA, and link every leaf to its MSA row.

    A leaf matches the first MSA row whose description contains the leaf
    name.

    Returns the tuple (tree, msa, leaf_information), where leaf_information
    maps each leaf name to its matching MSA row.

    Raises Exception when a leaf name matches an MSA row but was already
    linked (names must be unique), or when a leaf name matches no row.'''
    tree = TreeNode(newick=tree_path, format=0)
    msa = AlignIO.read(msa_path, 'fasta')

    # leaf accession -> row of the MSA holding that accession
    rows_by_leaf = {}
    for leaf in tree.get_leaves():
        accession = leaf.name
        for record in msa:
            if accession not in record.description:
                continue
            if accession in rows_by_leaf:
                raise Exception("%s is found in the tree/MSA twice, accessions must be unique" % (accession))
            rows_by_leaf[accession] = record
            break
        else:
            # The scan finished without a break: no row mentions this leaf.
            raise Exception("%s is in the tree but not found in the MSA" % (accession))
    return (tree, msa, rows_by_leaf)
def generateRandomProblem(nbGenes, nbSpecies, orthologProb = 0.5, paralogProb = 0.4):
  """ Generate a random problem (genes, orthologs, paralogs, species tree) ready for the ConstraintGraph class.
      The returned dict has the form
      { "genes" : geneset, "orthologs" : orthologs, "paralogs" : paralogs, "speciesTree" : speciesTree }
      Each gene is a string of the form [GENENAME]:[SPECIESNAME]
      Every species gets one gene first; the remaining genes get a random species.
      Two genes of the same species always end up as a pair in paralogs.

        :argument nbGenes: Number of genes to generate
        :argument nbSpecies: Number of species; species are named "0" .. str(nbSpecies - 1)
        :argument orthologProb: chances for 2 genes of different species to be a pair in orthologs
        :argument paralogProb: chances for 2 genes of different species to be a pair in paralogs
        """
  speciesTree = TreeNode()
  speciesTree.populate(nbSpecies, range(0, nbSpecies))

  # The names handed to populate() are integers; store them back as strings.
  for treeNode in speciesTree:
    if treeNode.name is not None:
      treeNode.name = str(treeNode.name)

  # One gene per species first: gene number i belongs to species i.
  genes = ["g%s:%s" % (i, i) for i in range(nbSpecies)]

  # Then fill in the rest with genes of randomly drawn species.
  for _ in range(nbGenes - nbSpecies):
    genes.append("g%s:%s" % (len(genes), random.randint(0, nbSpecies - 1)))

  # Species part of every gene string, parsed once up front.
  speciesOf = [gene.split(":")[1] for gene in genes]

  # One random draw p decides each cross-species pair:
  # p < orthologProb -> orthologs, p < orthologProb + paralogProb -> paralogs,
  # anything above that leaves the pair unrelated.
  paralogLimit = paralogProb + orthologProb
  orthologs = set()
  paralogs = set()

  for i in range(nbGenes):
    for j in range(i + 1, nbGenes):
      pair = (genes[i], genes[j])

      if speciesOf[i] == speciesOf[j]:
        paralogs.add(pair)
        continue

      p = random.random()
      if p < orthologProb:
        orthologs.add(pair)
      elif p < paralogLimit:
        paralogs.add(pair)

  return { "genes" : set(genes), "orthologs" : orthologs, "paralogs" : paralogs, "speciesTree" : speciesTree }
    orthologsStr = pz[1]
  elif arg.startswith("--paralogs="):
    pz = arg.split("=")
    paralogsStr = pz[1]
  elif not arg.startswith('-'):
    if graphfile1 == '':
      graphfile1 = arg
    elif graphfile2 == '':
      graphfile2 = arg

# Build the species tree once, at module level: it is used in every mode.
# Its leaves are also indexed by name, which avoids using search_nodes (slow).
speciesTree = None

# NOTE: speciesLeavesList and speciesLeaves are only defined when a species
# tree string was supplied; speciesTree stays None otherwise.
if speciesTreeStr != '':
  speciesTree = TreeNode(speciesTreeStr)
  speciesLeavesList = speciesTree.get_leaves()
  # leaf name -> leaf; a later leaf with the same name replaces an earlier one
  speciesLeaves = {}
  for leaf in speciesLeavesList:
    speciesLeaves[leaf.name] = leaf
 


class ConstraintGraph:
  """ Given a set of genes, and and two set of tuples that represent orthology and paralogy relationships between genes,
      builds a graph of orthologies, and a graph of paralogies.  It then becomes possible to build a DS-tree that satisifies
      all these constraints (or detect that it can't be done).
  """
  
  def __init__(self, genes, orthology_relations, paralogy_relations, speciesTree = None, geneSpeciesMapping = None, treelessGeneSpeciesMapping  = None):
    """Constructor.  The 2 graphs are built here, as adjacency lists (self.orthologs and self.paralogs, two dicts with key = gene-string, value = neighbors as a set of gene-strings). 
	def __init__(self, newick=None, format=0, dist=None, support=None,name=None):
		"""Default init for the TreeClass.

		Every argument is forwarded unchanged, by keyword, to
		``TreeNode.__init__``. Per the original author, this works better
		than wrapping the entire class.
		"""
		TreeNode.__init__(self, newick=newick, format=format, dist=dist, support=support, name=name)
示例#10
0
      #treeid = "ENSGT00390000013823"
      server = "http://beta.rest.ensembl.org"
      ext = "/genetree/id/" + treeid + "?"
      resp, content = http.request(server+ext, method="GET", headers={"Content-Type":"application/json"})
      
      if not resp.status == 200:
	print "Invalid response: ", resp.status
	continue	#evil continue


      decoded = json.loads(content)

      tree = decoded['tree']

      geneNodeMap = {}
      root = TreeNode()
      visit_json_elem(tree, root, geneNodeMap)
      
      orthologs = set()
      paralogs = set()
      paralogs_dubious = set()
      
      for g1 in geneNodeMap:
	for g2 in geneNodeMap:
	  if g1 != g2:
	    n1 = geneNodeMap[g1]
	    n2 = geneNodeMap[g2]
	    
	    lca = n1.get_common_ancestor(n2)
	    
	    if lca.name == 'duplication' or lca.name == 'gene_split':
##################################
# Child -> parent links restricted to the lineages of the taxa in `taxa`.
# Seeded with "1" mapped to itself so that node always gets an entry.
ass_node2parent = {"1": "1"}
for nodeid in taxa:
    parentid = node2parent[nodeid]
    while nodeid != parentid:  # build a new dictionary for only the taxa we identified in the sample
        # Record the link, then climb one level; the walk stops at a node
        # that is its own parent in node2parent.
        ass_node2parent[nodeid] = parentid
        nodeid = parentid
        parentid = node2parent[nodeid]

# TreeNode -> id of its parent, used to wire up the topology afterwards.
node2parentid = {}
for nodeid in ass_node2parent.keys():
    parentid = ass_node2parent[nodeid]
    # Stores node connections
    all_ids.update([nodeid, parentid])
    # Creates a new TreeNode instance for each retained node
    n = TreeNode()
    # Sets some TreeNode attributes (looked up in node2name / node2order)
    n.add_feature("name", node2name[nodeid])
    n.add_feature("taxid", nodeid)
    n.add_feature("Order", node2order[nodeid])

    # updates node list and connections
    node2parentid[n] = parentid
    id2node[nodeid] = n
# Report how many nodes were created.
print len(id2node)
# Reconstruct tree topology from previously stored tree connections
print 'Reconstructing tree topology...'
for node in id2node.itervalues():
    parentid = node2parentid[node]
    parent = id2node[parentid]
    # node with taxid=1 is the root of the tree
def load_NCBI(species_file, names_file, nodes_file ):
    """Build the NCBI taxonomy tree and resolve the taxids of the wanted species.

    :argument species_file: text file with one wanted species name per line.
    :argument names_file: "names.dmp"-style file ("|"-separated fields:
      field 0 is the taxid, field 1 the name, field 3 the name class).
      A "<names_file>.bz2" copy is used when the plain file is absent.
    :argument nodes_file: "nodes.dmp"-style file ("|"-separated fields:
      field 0 is the taxid, field 1 the parent taxid).  A
      "<nodes_file>.bz2" copy is used when the plain file is absent.

    :returns: the tuple (t, id2node, all_wanted_species): the node whose
      taxid is "1", the dict taxid -> TreeNode, and the dict
      species name -> taxid.

    Side effects: progress is printed to stdout and sys.stdout is replaced
    by an unbuffered stream.  The process exits with status 8 when an input
    file is missing and with status 9 when a wanted species name is not
    found in the names file.
    """
    if not os.path.isfile(species_file):
        print "ERROR "+species_file+' can\'t be read. Exiting... '
        sys.exit(8)

    all_wanted_species = {}  # species_name -> taxid (string); -1 until resolved

    print "Reading wanted species from file: "+species_file
    ifile = open(species_file, 'r')
    try:
        for iline in ifile:
            all_wanted_species[iline.strip()] = -1
    finally:
        ifile.close()

    # This sets Unbuffered stdout/auto-flush
    sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)

    id2node = {}
    node2parentid = {}
    id2name = {}

    # Both dumps are opened up front, so a missing file is reported before
    # any parsing starts; the handles are closed on every exit path.
    nodes_handle = _open_taxonomy_dump(nodes_file)
    names_handle = None
    try:
        names_handle = _open_taxonomy_dump(names_file)

        # Reads taxid/names translation
        print 'Loading species names from "names.dmp" file...',
        for line in names_handle:
            fields = map(strip, line.strip().split("|"))
            nodeid, name = fields[0], fields[1]

            # Any name class (synonyms included) can resolve a wanted species.
            if name in all_wanted_species:
                all_wanted_species[name] = nodeid

            # Lines are redundant: synonyms share the same id on different
            # lines, so only the "scientific name" is kept as the node name.
            if fields[3] == 'scientific name':
                id2name[nodeid] = name

        print len(id2name)

        any_species_is_missing = False
        for species_name in all_wanted_species:
            if all_wanted_species[species_name] == -1:
                print "ERROR the species name \""+species_name+"\" was not found!"
                any_species_is_missing = True
        if any_species_is_missing:
            sys.exit(9)

        # Reads node connections in nodes.dmp
        print 'Loading node connections from "nodes.dmp" file...',
        for line in nodes_handle:
            fields = map(strip, line.strip().split("|"))
            nodeid, parentid = fields[0], fields[1]
            if nodeid == "" or parentid == "":
                # Pauses for the operator, as the original did.
                raw_input("Wrong nodeid!")

            # Creates a new TreeNode instance for each new node in file
            n = TreeNode()
            n.add_feature("name", id2name[nodeid])
            n.add_feature("taxid", nodeid)

            # updates node list and connections
            node2parentid[n] = parentid
            id2node[nodeid] = n

        print len(id2node)
    finally:
        if names_handle is not None:
            names_handle.close()
        nodes_handle.close()

    # Reconstruct tree topology from previously stored tree connections
    print 'Reconstructing tree topology...'
    for node in id2node.itervalues():
        parent = id2node[node2parentid[node]]
        # node with taxid=1 is the root of the tree
        if node.taxid == "1":
            t = node
        else:
            parent.add_child(node)
    # NOTE(review): when no node has taxid "1", `t` is never bound and this
    # return raises UnboundLocalError (unchanged from the original).
    return t, id2node, all_wanted_species


def _open_taxonomy_dump(path):
    # Open `path`, or its bzip2 copy "<path>.bz2"; exit with status 8 when neither exists.
    if os.path.exists(path):
        return open(path)
    compressed_path = path + '.bz2'
    # The original tested path+"bz2" (no dot) but opened path+'.bz2', so the
    # compressed fallback could never be selected correctly.
    if os.path.exists(compressed_path):
        import bz2
        return bz2.BZ2File(compressed_path)
    print path+' file is missing. '
    sys.exit(8)
 def __init__(self, newick=None, formating=0):
     """Initialize the node through ``TreeNode.__init__`` and reset the extra fields.

     :argument newick: forwarded positionally as the first TreeNode argument.
     :argument formating: forwarded positionally as the second TreeNode
       argument (presumably the newick format code -- TODO confirm).
     """
     TreeNode.__init__(self, newick, formating)
     # Extra per-node fields; they start out as None / False.
     self.abundance = None
     self.nabundance = None
     self.fulltree=False
示例#14
0
def build_tax_tree():
        import os
        import sys 
        from string import strip
        from ete2 import TreeNode, Tree
        #print sys.argv[1]
        #if len(sys.argv) == 1:
        #	print "Usage:  taxid2lineage file_with_taxids.txt"
        #else:
        #	f = open(sys.argv[1], 'r')

        # This sets Unbuffered stdout/auto-flush
        sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)

        id2node= {}
        id2rank={}
        node2parentid = {}
        all_ids = set([])
        all_nodes = []
        id2name= {}

        # Loads info from NCBI taxonomy files
        if os.path.exists("nodes.dmp"):
            NODESFILE = open('nodes.dmp')
        elif os.path.exists("nodes.dmp.bz2"):
            import bz2
            NODESFILE = bz2.BZ2File('nodes.dmp.bz2')
        else:
            print '"nodes.dmp" file is missing. Try to downloaded from: '

        if os.path.exists("names_scientific.dmp"):
            NAMESFILE = open('names_scientific.dmp')
        elif os.path.exists("names_scientific.dmp.bz2"):
            import bz2
            NAMESFILE = bz2.BZ2File('names_scientific.dmp.bz2')
        else:
            print '"names_scientific.dmp" file is missing. Try to downloaded from: '

        # Reads taxid/names transaltion
        #print 'Loading species names from "names_scientific.dmp" file...',
        for line in NAMESFILE:
            line = line.strip()
            fields = map(strip, line.split("|"))
            nodeid, name = fields[0], fields[1]
            id2name[nodeid] = name


        # Reads node connections in nodes.dmp
        #print 'Loading node connections form "nodes.dmp" file...', 
        for line in NODESFILE:
            line = line.strip()
            fields = map(strip, line.split("|"))
            nodeid, parentid,rankid = fields[0], fields[1], fields[2]
            id2rank[nodeid]=rankid
            if nodeid =="" or parentid == "":
                raw_input("Wrong nodeid!")

            # Stores node connections
            all_ids.update([nodeid, parentid])

            # Creates a new TreeNode instance for each new node in file
            n = TreeNode()
            # Sets some TreeNode attributes
            n.add_feature("name", id2name[nodeid])
            n.add_feature("taxid", nodeid)
            n.add_feature("rank",id2rank[nodeid])

            # updates node list and connections
            node2parentid[n]=parentid
            id2node[nodeid] = n
        #print len(id2node)

        # Reconstruct tree topology from previously stored tree connections
        #print 'Reconstructing tree topology...'
        for node in id2node.itervalues():
            parentid = node2parentid[node]
            parent = id2node[parentid]
            if node.taxid == "1":
                t = node
            else:
                parent.add_child(node)

        return id2node, id2name
##################################
# Child -> parent links restricted to the lineages of the taxa in `taxa`.
# Seeded with "1" mapped to itself so that node always gets an entry.
ass_node2parent = {"1": "1"}
for nodeid in taxa:
    parentid = node2parent[nodeid]
    while nodeid != parentid:  # build a new dictionary for only the taxa we identified in the sample
        # Record the link, then climb one level; the walk stops at a node
        # that is its own parent in node2parent.
        ass_node2parent[nodeid] = parentid
        nodeid = parentid
        parentid = node2parent[nodeid]

# TreeNode -> id of its parent, used to wire up the topology afterwards.
node2parentid = {}
for nodeid in ass_node2parent.keys():
    parentid = ass_node2parent[nodeid]
    # Stores node connections
    all_ids.update([nodeid, parentid])
    # Creates a new TreeNode instance for each retained node
    n = TreeNode()
    # Sets some TreeNode attributes (looked up in node2name / node2order)
    n.add_feature("name", node2name[nodeid])
    n.add_feature("taxid", nodeid)
    n.add_feature("Order", node2order[nodeid])

    # updates node list and connections
    node2parentid[n] = parentid
    id2node[nodeid] = n
# Report how many nodes were created.
print len(id2node)
# Reconstruct tree topology from previously stored tree connections
print 'Reconstructing tree topology...'
for node in id2node.itervalues():
    parentid = node2parentid[node]
    parent = id2node[parentid]
    # node with taxid=1 is the root of the tree