示例#1
0
    def __init__(self, inputTree, distributionTrees=None):
        """
        SuperTreeInputTrees is a utility to create sets of input trees.
        The input trees are primarily to be used to evaluate super tree
        construction methods.

        Invocation removing a fixed number of taxa from each prospective input tree::

            stit = SuperTreeInputTrees(inputTree)
            stit.writeInputTreesToFile = True
            stit.outputFile = 'myInputtrees.tre'
            stit.noTaxaToRemove = 32
            stit.noOutputTrees = 10
            stit.generateInputTrees()


        Invocation using built in distribution gathered from real world super tree cases::

            stit = SuperTreeInputTrees(inputTree)
            stit.writeInputTreesToFile = True
            stit.outputFile = 'myInputtrees.tre'
            stit.useTaxonDistribution = True
            stit.generateInputTrees()

        The user can generate a distribution of their own by supplying a list of p4 trees or a tree file.
        The order of the trees is important, supertree and then all other trees. This goes for both list and
        file. Like so::

            stit = SuperTreeInputTrees(inputTree, distributionTrees='myTreefile.nex')
            stit.writeInputTreesToFile = True
            stit.outputFile = 'myInputtrees.tre'
            stit.useTaxonDistribution = True
            stit.generateInputTrees()

        Placeholders which allow access to data after completed computations::

            stit.outputTrees
            stit.dist

        Arguments:

        - *inputTree*: a p4 Tree object, or a string that is handed to
          ``read()`` and must yield exactly one tree.
        - *distributionTrees*: optional; a list of p4 Tree objects or a string
          that is handed to ``read()``. The first tree is taken as the
          supertree, the remaining ones as its input trees. A list passed in
          here is not modified. Supplying it switches
          ``useTaxonDistribution`` on.

        Raises P4Error if either argument is of an unsupported type, if a
        list contains something that is not a Tree, or if reading a file
        gives the wrong number of trees.
        """

        self.writeInputTreesToFile = False
        self.outputFile = 'inputtrees.tre'

        # Set to False if you want to have a set number of taxa in the output
        # trees
        self.useTaxonDistribution = False
        # Only meaningful if setting useTaxonDistribution = False
        self.noTaxaToRemove = 32
        self.noOutputTrees = 10

        gm = ['SuperTreeInputTrees()']

        if isinstance(inputTree, Tree):
            self.inputTree = inputTree  # not a list.
        elif isinstance(inputTree, str):
            var.trees = []
            read(inputTree)
            if len(var.trees) > 1:
                gm.append('Sorry, supply only one tree as supertree')
                raise P4Error(gm)
            if not var.trees:
                # Without this guard the pop() below fails with a bare
                # IndexError that says nothing about the cause.
                gm.append("No tree could be read from '%s'" % inputTree)
                raise P4Error(gm)
            # this was originally a list, ie [var.trees.pop()]
            self.inputTree = var.trees.pop()
        else:
            gm.append("Input tree was neither a p4 Tree nor a valid filename")
            # One-element tuple so that a tuple argument formats correctly.
            gm.append("Got %s" % (inputTree,))
            raise P4Error(gm)

        if not self.inputTree._taxNames:
            self.inputTree._setTaxNamesFromLeaves()

        self.outputTrees = []

        self.normalizedDist = []

        # Distributions gathered from real world supertree input
        # The dists are first a list of input tree taxon set sizes and the supertree taxon set size
        # Using this data we can normalize the dists to fit the size of trees
        # we want

        # BunnyRSVNormal set from Wilkinson et al 2005, Syst Biol 54:823
        #self.dist = [[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 12, 12, 13, 13, 13, 13, 13, 14, 14, 15, 17, 17, 18, 18, 18, 18, 18, 19, 19, 19, 20, 20, 20, 21, 22, 22, 23, 24, 25, 25, 25, 25, 25, 25, 26, 27, 28, 29, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 37, 38, 38, 40, 40, 41, 47, 51, 51, 52, 52, 52, 68, 70, 78, 78, 79, 80, 80], 80]

        # CanidaeRVS set from Wilkinson et al 2005, Syst Biol 54:823
        #self.dist = [[3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 6, 7, 8, 8, 9, 10, 11, 11, 11, 12, 16, 16, 20, 23, 24, 30, 30, 33, 34, 34, 34, 34, 34], 34]

        # CarnivoraRVS set from Wilkinson et al 2005, Syst Biol 54:823
        #self.dist = [[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12], 12]

        # DavideDinoMRP set from Wilkinson et al 2005, Syst Biol 54:823
        #self.dist = [[4, 4, 4, 5, 6, 6, 6, 7, 8, 8, 9, 9, 9, 10, 10, 10, 10, 10, 11, 11, 11, 12, 12, 12, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 17, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 20, 20, 20, 22, 23, 23, 24, 24, 25, 26, 27, 27, 28, 28, 29, 29, 29, 29, 29, 30, 30, 30, 31, 31, 31, 31, 33, 33, 33, 33, 36, 37, 37, 38, 38, 39, 42, 45, 47, 48, 50, 53, 53, 66, 70, 71, 74, 74, 75, 75, 76, 78, 78, 80, 86, 86, 92, 94, 96, 100, 101, 102, 102, 103, 105, 110, 111, 111, 139, 148, 149, 153, 173, 199, 204, 217, 240, 269, 270, 271, 272, 272, 273, 273, 273, 273, 274, 275], 277]

        # FelidaeRVS set from Wilkinson et al 2005, Syst Biol 54:823
        # This is the distribution in effect unless distributionTrees is given.
        self.dist = [[
            3, 3, 3, 3, 3, 4, 4, 4, 5, 6, 6, 6, 7, 7, 7, 7, 9, 9, 10, 10, 14,
            16, 17, 24, 25, 28, 29, 29, 30, 30, 32, 34, 36, 36, 36, 36, 36, 36,
            36, 36
        ], 36]

        # KennedyPageData set from Wilkinson et al 2005, Syst Biol 54:823
        #self.dist = [[14, 16, 17, 20, 30, 30, 90], 122]

        # ViverridaeRVS set from Wilkinson et al 2005, Syst Biol 54:823
        #self.dist = [[4, 5, 10, 16, 19, 33, 34, 34, 34], 34]

        if distributionTrees:
            self.useTaxonDistribution = True
            if isinstance(distributionTrees, list):
                for t in distributionTrees:
                    if not isinstance(t, Tree):
                        gm.append(
                            "Input trees should be a list of p4 Tree objects. Got %s"
                            % (t,))
                        raise P4Error(gm)
                # Slice instead of pop(0) so the caller's list is left intact.
                superTree = distributionTrees[0]
                inputTrees = distributionTrees[1:]
            elif isinstance(distributionTrees, str):
                # Filename (or other string understood by read()); the trees
                # are collected in var.trees, supertree first.
                var.trees = []
                read(distributionTrees)
                if len(var.trees) < 1:
                    gm.append(
                        'Sorry, at least one tree must be supplied as input tree'
                    )
                    raise P4Error(gm)
                superTree = var.trees.pop(0)
                inputTrees = var.trees
            else:
                gm.append(
                    "Distribution trees are neither a list of p4 Tree objects nor a valid filename."
                )
                gm.append("Got %s" % (distributionTrees,))
                raise P4Error(gm)
            self._generateDistribution(superTree, inputTrees)
示例#2
0
    def __init__(self, supertree, inputTrees):
        """
        Set up a support analysis of a supertree against its input trees.

        Arguments:

        - *supertree*: a p4 Tree object, or a string that is handed to
          ``read()`` and must yield exactly one tree.
        - *inputTrees*: a list of p4 Tree objects, or a string that is handed
          to ``read()`` and must yield at least one tree.

        Besides initialising the option flags and result placeholders, the
        constructor computes the mean and median pairwise taxon overlap of
        the input trees (``self.mean``, ``self.median``) and asks a
        CommonLeafSet for the splits, bitkeys, taxon names and taxon-to-bitkey
        mapping of the input trees together with the supertree.

        Side effect: ``var.warnReadNoFile`` is set to False and is not
        restored afterwards.

        Raises P4Error if an argument is of an unsupported type, if the list
        contains something that is not a Tree, or if reading a file gives the
        wrong number of trees.
        """

        #        There are two ways of decorating the supertree with the support values.
        #        Standard conforms to the consensus tree tradition, i.e. values are presented between
        #        0 to 100 percent. Non standard adheres to the few supertree papers regarding support values
        #        i.e -1 to 1.
        self.doStandardDecoration = True

        #        The decorated supertree can be saved to file
        self.doSaveDecoratedTree = False
        self.decoratedFilename = 'superTreeSupport.nex'

        #        There is an option to save a supertree decorated with index values instead of support values.
        #        This can then be used with a csv file containing the support values for each index.
        #        Further analysis of the support values can be performed and then matched to the indices in the
        #        decorated supertree
        self.doSaveIndexTree = False
        self.indexFilename = 'supertreeIndex.nex'
        self.csvFilename = 'supertreeIndex.csv'

        #        Draws the decorated supertree to screen
        self.doDrawTree = False

        #        Produces output to screen
        self.verbose = 1

        # Placeholders that allow access to the data after completing
        # calculations
        self.decoratedSuperTree = None
        self.indexSuperTree = None
        self.csvList = None

        #       Keeps track of splits for producing output
        self.indexIntersections = []
        self.csvValues = []
        self.intersections = []

        #        Let t be the number of input trees,
        #        s the number of input trees supporting a supertree clade,
        #        r the number of input trees that are irrelevant to the supertree clade,
        #        q the number of input trees that conflict with the supertree clade,
        #        p the number of input trees that permit the supertree clade,
        #        so that t = p + q + r + s.

        self.T = 0  # no. of input trees;
        self.L = 0  # no. of leaves;
        # coverage (average proportion of leaves in the input tree);
        self.C = 0.0
        self.SC = 0  # number of supertree clades;
        self.U = 0  # no. of unsupported supertree clades;
        # no. of unsupported supertree clades that conflict with at least one
        # input tree;
        self.UC = 0
        # no. of unsupported clades conflicting with all relevant input trees;
        self.UCC = 0
        # average qualitative support for supertree clades. Figures in
        # parentheses are ranges.
        self.QS = 0.0
        self.S = 0.0  # average support
        self.P = 0.0  # average permitted
        self.Q = 0.0  # average conflict
        self.R = 0.0  # average relevance
        self.wS = 0.0  # average weighted support
        self.wP = 0.0  # average weighted permittance
        self.V = 0.0  # average V for supertree clades, V = (s minus q)/(s + q)
        self.VV = 0.0  # V+ = (s minus q +p)/(s + q + p)
        self.Vv = 0.0  # V minus = (s minus q minus p)/(s + q + p)
        self.wV = 0.0  # wV = (ws minus q)/(ws + q)
        self.wVV = 0.0  # wVV = (ws minus q +wp)/(ws + q + wp)
        self.wVv = 0.0  # wVv = (ws minus q minus wp)/(ws + q + wp)

        gm = ['SuperTreeSupport()']

        # NOTE(review): global setting, changed here and never restored.
        var.warnReadNoFile = False

        if isinstance(inputTrees, list):
            for t in inputTrees:
                if not isinstance(t, Tree):
                    gm.append(
                        "Input trees should be a list of p4 Tree objects. Got %s"
                        % (t,))
                    raise P4Error(gm)
            self.inputTrees = inputTrees
        elif isinstance(inputTrees, str):
            var.trees = []
            read(inputTrees)
            if len(var.trees) < 1:
                gm.append(
                    'Sorry, at least one tree must be supplied as input tree')
                raise P4Error(gm)
            # var.trees is rebound (not cleared) below, so this list survives.
            self.inputTrees = var.trees
        else:
            gm.append(
                "Input trees are neither a list of p4 Tree objects nor a valid filename."
            )
            raise P4Error(gm)

        if isinstance(supertree, Tree):
            self.supertree = supertree  # not a list.
        elif isinstance(supertree, str):
            var.trees = []
            read(supertree)
            if len(var.trees) > 1:
                gm.append('Sorry, supply only one tree as supertree')
                raise P4Error(gm)
            if not var.trees:
                # Without this guard the pop() below fails with a bare
                # IndexError that says nothing about the cause.
                gm.append("No tree could be read from '%s'" % supertree)
                raise P4Error(gm)
            # this was originally a list, ie [var.trees.pop()]
            self.supertree = var.trees.pop()
        else:
            gm.append("Supertree was neither a p4 Tree nor a valid filename")
            # One-element tuple so that a tuple argument formats correctly.
            gm.append("Got %s" % (supertree,))
            raise P4Error(gm)

        for tree in self.inputTrees:
            if not tree._taxNames:
                tree._setTaxNamesFromLeaves()

        # Mean and median overlap of the input trees.
        # Build every taxon set once instead of once per pair.
        taxonSets = [set(tree.taxNames) for tree in self.inputTrees]
        overlapList = []
        for i in range(len(taxonSets) - 1):
            for j in range(i + 1, len(taxonSets)):
                overlapList.append(len(taxonSets[i].intersection(taxonSets[j])))

        if not overlapList:
            # Fewer than two input trees: there is no pair to compare.
            self.mean = 0
            self.median = 0
        else:
            self.mean = float(sum(overlapList)) / len(overlapList)
            overlapList.sort()
            # Integer division: a float index raises TypeError on Python 3.
            # For an even count this picks the upper of the two middle values.
            self.median = overlapList[len(overlapList) // 2]

        commonLeafSet = CommonLeafSet()
        self.splits = commonLeafSet.updateTreesToCommonLeafSet(
            [self.inputTrees, [self.supertree]])
        self.bitkeys = commonLeafSet.getCommonBitkeys()
        self.taxnames = commonLeafSet.getCommonTaxNames()
        self.taxa2Bitkey = commonLeafSet.getCommonTaxa2Bitkey()
def treeFinderMAPAnalysis(alignment,
                          groups,
                          gamma=True,
                          invariant=True,
                          bootstrap=False,
                          nreplicates=100,
                          remove_files=False,
                          run_analysis=True,
                          verbose=False):
    """
    Uses TreeFinder to estimate a Maximum Likelihood tree using the MAP
    substitution model for grouped amino-acids.

    - *alignment*: p4 alignment object of original (un-recoded) protein data from
      which the "groups" are derived
    - *groups*: list of grouped amino-acids, possibly resulting from
      :meth:`Alignment.getKosiolAISGroups()` or :meth:`Alignment.getMinmaxChiSqGroups()`
    - *gamma*: include gamma distribution of among-site rate variation
    - *bootstrap*: run bootstrap analysis
    - *nreplicates*: number of bootstrap replicates
    - *invariant*: include a proportion of invariant sites
    - *run_analysis*: run the analysis if TreeFinder in $PATH, else just write the
      control file
    - *remove_files*: remove analysis files. Only available if run_analysis=True
    - *verbose*: print progress, draw the resulting tree and print its likelihood

    Returns a tuple ``(tree, results)`` where *tree* is the p4 tree read back
    from the TreeFinder report and *results* is a dict of the values parsed
    from that report. With ``run_analysis=False`` the TreeFinder commands are
    printed and ``(None, None)`` is returned.

    Raises P4Error on invalid arguments, if ``tf`` cannot be found, if
    TreeFinder exits with a non-zero code, or if the report file has fewer
    than two lines.

    """

    gm = ["p4.alignment_recoding.treeFinderMAPAnalysis()"]

    if not isinstance(alignment, Alignment):
        gm.append("alignment must be a Alignment object")
        raise P4Error(gm)

    if alignment.dataType != "protein":
        gm.append("alignment should be the original protein data from "
                  "which the groups were defined. Doing nothing.")
        raise P4Error(gm)

    # Pair each flag with its name so the error names the offending parameter.
    flags = (
        ("gamma", gamma),
        ("invariant", invariant),
        ("bootstrap", bootstrap),
        ("remove_files", remove_files),
        ("run_analysis", run_analysis),
        ("verbose", verbose),
    )
    for flag_name, flag_value in flags:
        if not isinstance(flag_value, bool):
            gm.append("%s value must be either True or False (got %r)" %
                      (flag_name, flag_value))
            raise P4Error(gm)

    if not isinstance(nreplicates, int):
        gm.append("nreplicates must be an integer")
        raise P4Error(gm)

    if run_analysis and not p4.func.which2("tf"):
        gm.append("tf (treefinder) is not in your $PATH. "
                  "Cannot run analysis")
        raise P4Error(gm)

    datafile_name = "tf_data.phy"

    #tf commands
    tls = """ReconstructPhylogeny[
             "%(datafile)s",
             SubstitutionModel->MAP[%(map)s][Optimum,Optimum]%(ifH)s,
             WithEdgeSupport->%(bootstrap)s%(nreplicates)s
             ],
             "%(outfile)s",SaveReport"""

    # Rate heterogeneity suffix, keyed on (gamma, invariant).
    rate_suffix = {
        (True, True): ":GI[Optimum]",
        (True, False): ":G[Optimum]",
        (False, True): ":I[Optimum]",
        (False, False): "",
    }

    od = {}
    od["datafile"] = datafile_name
    od["ifH"] = rate_suffix[(gamma, invariant)]
    if bootstrap:
        od["bootstrap"] = "True"
        od["nreplicates"] = ",NReplicates->%i" % nreplicates
    else:
        od["bootstrap"] = "False"
        od["nreplicates"] = ""
    od["outfile"] = "tf_reconstruction.output"
    od["map"] = ",".join('"%s"' % group.upper() for group in groups)

    if not run_analysis:
        print(tls % od)
        return (None, None)

    #Write data file
    alignment.writePhylip(datafile_name)

    #Write control file
    tl_file = "tf_control.tl"
    with open(tl_file, "w") as fh:
        fh.write(tls % od)

    # stderr is merged into stdout when verbose, otherwise discarded. The
    # null-device handle is closed again once TreeFinder has finished.
    devnull = None if verbose else open(os.devnull, "w")
    try:
        child = subprocess.Popen(
            ["tf", tl_file],
            stderr=subprocess.STDOUT if verbose else devnull)

        if verbose:
            print("Running TreeFinder, this could take some time...", end=' ')
            sys.stdout.flush()

        # Blocks until TreeFinder exits.
        child.communicate()
    finally:
        if devnull is not None:
            devnull.close()

    if verbose:
        print("done.")
        sys.stdout.flush()

    if child.returncode != 0:
        gm.append("TreeFinder returned error code %s" % (child.returncode))
        raise P4Error(gm)

    with open(od["outfile"], "r") as fh:
        report_lines = fh.readlines()
    if len(report_lines) < 2:
        gm.append("TreeFinder report '%s' has fewer than two lines" %
                  od["outfile"])
        raise P4Error(gm)
    # All values are parsed from the second line of the report.
    line = report_lines[1]

    def _field(start_key, end_key):
        # Text after start_key, up to one character before end_key; that
        # character is presumably the separating comma.
        begin = line.index(start_key) + len(start_key)
        return line[begin:line.index(end_key) - 1]

    rd = {}
    #Likelihood (ends at the first comma of the line)
    rd["Likelihood"] = float(
        line[line.index("Likelihood->") + len("Likelihood->"):line.index(",")])
    #Tree
    rd["Phylogeny"] = _field("Phylogeny->", "SubstitutionModel->")
    #SubstitutionModel
    rd["SubstitutionModel"] = _field("SubstitutionModel->",
                                     "OSubstitutionModel->")
    #OSubstitutionModel
    rd["OSubstitutionModel"] = _field("OSubstitutionModel->",
                                      "OEdgeOptimizationOff->")
    #NSites (the key is spelled "Nsites"; kept because callers may use it)
    rd["Nsites"] = int(_field("NSites->", "NParameters->"))
    #NParameters
    rd["NParameters"] = int(_field("NParameters->", "AIC->"))
    #AIC
    rd["AIC"] = float(_field("AIC->", "AICc->"))
    #AICc
    rd["AICc"] = float(_field("AICc->", "HQ->"))
    #HQ
    rd["HQ"] = float(_field("HQ->", "BIC->"))
    #BIC
    rd["BIC"] = float(_field("BIC->", "Checksum->"))
    #LikelihoodTime
    rd["LikelihoodTime"] = float(
        _field("LikelihoodTime->", "LikelihoodMemory->"))
    #LikelihoodMemory (last field; the final three characters are dropped)
    memory_start = line.index("LikelihoodMemory->") + len("LikelihoodMemory->")
    rd["LikelihoodMemory"] = int(line[memory_start:-3])

    #Make a tree object: braces become parentheses, quotes are dropped
    tree = rd["Phylogeny"].replace("{", "(")
    tree = tree.replace("}", ")")
    tree = tree.replace("\"", "")
    tree = tree + ";"
    if bootstrap:
        #Tree viewer has the brlen before bootstrap value plus an extra colon
        # turn "xxx):0.00001:87.999,yyy" into "xxx)87.999:0.00001,yyy"
        patt = re.compile(r"\):([0-9]+\.[0-9e-]+):([0-9]+\.[0-9e-]*)")
        repl = r")\2:\1"
        tree = re.sub(patt, repl, tree)

    # The tree is passed to read() as a string; the flag is restored even if
    # read() fails.
    origw = var.warnReadNoFile
    var.warnReadNoFile = False
    try:
        read(tree)
    finally:
        var.warnReadNoFile = origw
    result_tree = var.trees.pop()

    if bootstrap:
        #Round floats to whole percentages
        for node in result_tree.iterInternalsNoRoot():
            node.name = "%2.f" % float(node.name)

    if remove_files:
        os.remove(tl_file)
        os.remove(datafile_name)
        os.remove(od["outfile"])

    if verbose:
        print("\n")
        result_tree.draw()
        print("\nLikelihood: %.4f\n" % rd["Likelihood"])

    return result_tree, rd