示例#1
0
def validate_tree(tree_path, msa_path):
    """Load a tree and an MSA, and pair every tree leaf with an MSA row.

    The tree is built with ``TreeNode(newick=tree_path, format=0)`` and the
    alignment is read with ``AlignIO.read(msa_path, 'fasta')``.  Each leaf is
    paired with the first alignment row whose ``description`` contains the
    leaf name as a substring.

    Returns a ``(tree, msa, leaf_information)`` tuple, where
    ``leaf_information`` maps each leaf name to the alignment row it was
    paired with.

    Raises ``Exception`` when a leaf name matches no alignment row, or when a
    leaf name that has already been paired is matched again.
    """
    tree = TreeNode(newick=tree_path, format=0)
    msa = AlignIO.read(msa_path, 'fasta')
    # leaf accession -> the alignment row that mentions it
    leaf_information = {}
    for leaf in tree.get_leaves():
        accession = leaf.name
        for record in msa:
            if accession not in record.description:
                continue
            # This is the first row mentioning the accession; the accession
            # must not have been claimed by an earlier leaf.
            if accession in leaf_information:
                raise Exception("%s is found in the tree/MSA twice, accessions must be unique" % (accession))
            leaf_information[accession] = record
            break
        else:
            # The scan finished without hitting ``break``: no row matched.
            raise Exception("%s is in the tree but not found in the MSA" % (accession))
    return (tree, msa, leaf_information)
  elif arg.startswith("--paralogs="):
    pz = arg.split("=")
    paralogsStr = pz[1]
  elif not arg.startswith('-'):
    if graphfile1 == '':
      graphfile1 = arg
    elif graphfile2 == '':
      graphfile2 = arg

# Build the species tree once, at module level, and index its leaves by name.
# speciesTree is used in every mode, so it is built right here.
# The speciesLeaves dict maps each species name to its leaf, which avoids
# calling search_nodes (slow) whenever a species has to be located.
# NOTE: speciesLeavesList and speciesLeaves are only bound when
# speciesTreeStr is not the empty string; otherwise only speciesTree exists
# (and stays None).
speciesTree = None

if speciesTreeStr != '':
  # NOTE(review): speciesTreeStr is handed to TreeNode as-is; whether it holds
  # a newick string or a file path is not visible here -- confirm at the caller.
  speciesTree = TreeNode(speciesTreeStr)
  speciesLeavesList = speciesTree.get_leaves()
  # name -> leaf lookup table; a later leaf with the same name overwrites an earlier one
  speciesLeaves = {}
  for leaf in speciesLeavesList:
    speciesLeaves[leaf.name] = leaf
 


class ConstraintGraph:
  """ Given a set of genes, and and two set of tuples that represent orthology and paralogy relationships between genes,
      builds a graph of orthologies, and a graph of paralogies.  It then becomes possible to build a DS-tree that satisfies
      all these constraints (or detect that it can't be done).
  """
  
  def __init__(self, genes, orthology_relations, paralogy_relations, speciesTree = None, geneSpeciesMapping = None, treelessGeneSpeciesMapping  = None):
    """Constructor.  The 2 graphs are built here, as adjacency lists (self.orthologs and self.paralogs, two dicts with key = gene-string, value = neighbors as a set of gene-strings).