예제 #1
0
    def compoSummary(self):
        """Print a composition table for every data partition.

        For each part a header row of the part's symbols is printed,
        then one row per taxon giving its composition and its site
        count (as reported by pf.partSequenceSitesCount()), and finally
        a 'mean' row with the composition of the whole part and the
        total site count divided by self.nTax.  Nothing is returned.
        """

        print("\n\nData composition summary")
        print("========================\n")

        # The name column is right-justified and one character wider than
        # the longest taxon name, with a floor of 7 characters.
        nameWidth = max([7] + [len(tName) for tName in self.taxNames])
        nameFormat = '%' + '%i' % (nameWidth + 1) + 's'

        for partNum, part in enumerate(self.parts):
            print("Part %i" % partNum)

            # Header row: blank name column, one column per symbol, nSites.
            print("%s" % (' ' * (nameWidth + 1)), end=' ')
            for symbol in part.symbols:
                print("%10s" % symbol, end=' ')
            print("%10s" % 'nSites')

            totalSites = 0
            for taxNum in range(part.nTax):
                taxComp = part.composition([taxNum])
                seqSites = pf.partSequenceSitesCount(part.cPart, taxNum)
                totalSites += seqSites
                print(nameFormat % self.taxNames[taxNum], end=' ')

                # A non-empty sequence has a composition summing to about
                # 1.0.  Compare against 0.99 rather than testing for
                # exactly zero, to stay clear of floating point noise.
                if sum(taxComp) > 0.99:
                    for symNum in range(len(part.symbols)):
                        print("%10.4f" % taxComp[symNum], end=' ')
                else:
                    # Empty sequence; show dashes instead of zeros.
                    for _ in range(len(part.symbols)):
                        print("%10s" % '-', end=' ')
                print("%10s" % seqSites)

            # Summary row for the part as a whole.
            partComp = part.composition()
            print(nameFormat % 'mean', end=' ')
            for symNum in range(len(part.symbols)):
                print("%10.4f" % partComp[symNum], end=' ')
            print("%10.4f" % (float(totalSites) / self.nTax))
            print("\n")
예제 #2
0
    def compoChiSquaredTest(self,
                            verbose=1,
                            skipColumnZeros=0,
                            useConstantSites=1,
                            skipTaxNums=None,
                            getRows=0):
        """A chi square composition test for each data partition.

        So you could do, for example::

            read('myData.nex')

            # Calling Data() with no args tells it to make a Data object
            # using all the alignments in var.alignments
            d = Data()

            # Do the test.  By default it is verbose, and prints results.
            # Additionally, a list of lists is returned
            ret = d.compoChiSquaredTest()

            # With verbose on, it might print something like ---
            # Part 0: Chi-square = 145.435278, (dof=170) P = 0.913995

            print(ret)
            # The list of lists that it returns might be something like ---
            # [[145.43527849758556, 170, 0.91399521077908041]]
            # which has the same numbers as above, with one
            # inner list for each data partition.

        If your data has more than one partition::

            read('first.nex')
            read('second.nex')
            d = Data()
            d.compoChiSquaredTest()

            # Output something like ---
            # Part 0: Chi-square = 200.870463, (dof=48) P = 0.000000
            # Part 1: Chi-square = 57.794704, (dof=80) P = 0.971059
            # [[200.87046313430443, 48, 0.0], [57.794704451018163, 80, 0.97105866938683427]]

        where the last line is returned.  With *verbose* turned off,
        the ``Part N`` lines are not printed.

        This method returns a list of lists, one for each data
        partition.  If *getRows* is off, the default, then it is a
        list of 3-item lists, and if *getRows* is turned on then it is
        a list of 4-item lists.  In each inner list, the first is the
        X-squared statistic, the second is the degrees of freedom, and
        the third is the probability from chi-squared.  (The expected
        comes from the data.)  If *getRows* is turned on, the 4th item
        is a list of X-sq contributions from individual rows (ie
        individual taxa), that together sum to the X-sq for the whole
        partition as found in the first item.  Taxa that are skipped
        via *skipTaxNums* get a placeholder of 0.0 in that list, so it
        is always nTax long.  This latter way is the way that
        Tree-Puzzle does it.

        Note that this ostensibly tests whether the data are
        homogeneous in composition, but it does not work on sequences
        that are related.  That is, testing whether the X^2 stat is
        significant using the chi^2 curve has a high probability of
        type II error for phylogenetic sequences.

        However, the X-squared stat can be used in valid ways.  You
        can simulate data under the tree and model, and so generate a
        valid null distribution of X^2 values from the simulations, by
        which to assess the significance of the original X^2.  You can
        use this method to generate X^2 values.

        A problem arises when a composition of a character is zero.
        If that happens, we can't calculate X-squared because there
        will be a division by zero.  If *skipColumnZeros* is set to 1,
        then those columns are simply skipped.  They are silently
        skipped unless verbose is turned on.

        So let's say that your original data have all characters, but
        one of them has a very low value.  That is reflected in the
        model, and when you do simulations based on the model you
        occasionally get zeros for that character.  Here it is up to
        you: you could say that the data containing the zeros are
        validly part of the possibilities and so should be included,
        or you could say that the data containing the zeros are not
        valid and should be excluded.  You choose between these by
        setting *skipColumnZeros*.  Note that if you do not set
        *skipColumnZeros*, and then you analyse a partition that has
        column zeros, the result is None for that partition.

        Another problem occurs when a partition is completely missing
        a sequence.  Of course that sequence does not contribute to
        the stat.  However, in any simulations that you might do, that
        sequence *will* be there, and *will* contribute to the stat.
        So you will want to skip that sequence when you do your calcs
        from the simulation.  You can do that with the *skipTaxNums*
        arg, which is a list of lists.  The outer list is nParts long,
        and each inner list is a list of taxNums to exclude.

        If *useConstantSites* is turned off, the test is done on a new
        Data object made from subsets of self.alignments obtained with
        each alignment's constantMask(), and that result is returned.

        A P4Error is raised if *skipTaxNums* is malformed, if a
        sequence that is not skipped is blank in some partition, or if
        a partition has no sequences left to test.

        """

        if not useConstantSites:
            # Redo the whole thing on a Data object built from the
            # alignments subset with the inverse of their constant masks
            # (presumably that leaves the variable sites only).
            newData = Data([])
            aligs = [
                a.subsetUsingMask(a.constantMask(),
                                  theMaskChar='1',
                                  inverse=1)
                for a in self.alignments
            ]
            newData._fill(aligs)
            return newData.compoChiSquaredTest(
                verbose=verbose,
                skipColumnZeros=skipColumnZeros,
                useConstantSites=1,
                skipTaxNums=skipTaxNums,
                getRows=getRows)

        gm = ['Data.compoChiSquaredTest()']
        nColumnZeros = 0
        results = []

        # check skipTaxNums
        if skipTaxNums is not None:
            if not isinstance(skipTaxNums, list):
                gm.append("skipTaxNums should be a list of lists.")
                raise P4Error(gm)
            if len(skipTaxNums) != self.nParts:
                gm.append(
                    "skipTaxNums should be a list of lists, nParts long.")
                raise P4Error(gm)
            for s in skipTaxNums:
                if not isinstance(s, list):
                    gm.append("skipTaxNums should be a list of lists.")
                    raise P4Error(gm)
                for i in s:
                    if not isinstance(i, int):
                        gm.append(
                            "skipTaxNums inner list items should be tax numbers."
                        )
                        gm.append("Got %s" % i)
                        raise P4Error(gm)

        # One set of taxNums to skip for each part.  After the validation
        # above, skipTaxNums is either None or a list that is nParts long.
        if skipTaxNums:
            skipSets = [frozenset(s) for s in skipTaxNums]
        else:
            skipSets = [frozenset()] * self.nParts

        # Check for blank sequences.  Its a pain to force the user to do this.
        blankSeqNums = []
        for partNum in range(self.nParts):
            p = self.parts[partNum]
            skipped = skipSets[partNum]
            partBlankSeqNums = []
            for taxNum in range(self.nTax):
                if taxNum in skipped:
                    continue
                # no gaps, no missings
                if not pf.partSequenceSitesCount(p.cPart, taxNum):
                    partBlankSeqNums.append(taxNum)
            blankSeqNums.append(partBlankSeqNums)
        if any(blankSeqNums):
            gm.append(
                "These sequence numbers were found to be blank. They should be excluded."
            )
            gm.append("%s" % blankSeqNums)
            gm.append("Set the arg skipTaxNums to this list.")
            raise P4Error(gm)

        for partNum in range(self.nParts):
            gm = ['Data.compoChiSquaredTest()  Part %i' % partNum]
            p = self.parts[partNum]
            skipped = skipSets[partNum]

            # The observed table: one row of character counts (composition
            # times nSites) for each taxon that is not skipped.
            comps = []
            for taxNum in range(self.nTax):
                if taxNum in skipped:
                    continue
                oneComp = p.composition([taxNum])
                nSites = pf.partSequenceSitesCount(
                    p.cPart, taxNum)  # no gaps, no missings
                if not nSites:
                    # Should have been caught by the check above.
                    gm.append(
                        "(Zero-based) sequence %i is blank, and should be excluded."
                        % taxNum)
                    gm.append(
                        "You need to add the number %i to the arg skipTaxNums list of lists."
                        % taxNum)
                    gm.append(
                        "(I could do that automatically, but it is best if *you* do it, explicitly.)"
                    )
                    gm.append(
                        "You can use the Alignment method checkForBlankSequences(listSeqNumsOfBlanks=True)"
                    )
                    gm.append("to help you get those inner lists.")
                    raise P4Error(gm)
                for k in range(len(oneComp)):
                    oneComp[k] = oneComp[k] * nSites
                comps.append(oneComp)

            if not comps:
                # Without this, comps[0] below would be an IndexError.
                gm.append(
                    "No sequences are left to test in this part.  Are all the taxa skipped?"
                )
                raise P4Error(gm)

            # Here we calculate the X^2 stat.  But we want to check
            # for columns summing to zero.  So we can't use
            # p4.func.xSquared()
            nCols = len(comps[0])
            theSumOfRows = p4.func._sumOfRows(comps)
            theSumOfCols = p4.func._sumOfColumns(comps)

            for rowSum in theSumOfRows:
                if rowSum == 0.0:
                    gm.append("Zero in a row sum.  Programming error.")
                    raise P4Error(gm)

            isOk = 1
            columnZeros = []
            for colNum in range(len(theSumOfCols)):
                if theSumOfCols[colNum] != 0.0:
                    continue
                if skipColumnZeros:
                    columnZeros.append(colNum)
                else:
                    if verbose:
                        print(gm[0])
                        print("    Zero in a column sum.")
                        print(
                            "    And skipColumnZeros is not set, so I am refusing to do it at all."
                        )
                    isOk = 0
                    nColumnZeros += 1

            theExpected = p4.func._expected(theSumOfRows, theSumOfCols)

            if not isOk:
                # There is a zero in a column sum, and we were not asked to
                # skip such columns.
                # Maybe a bad idea.  Maybe it should just die, above.
                results.append(None)
                continue

            # columnZeros only ever has content when skipColumnZeros is set.
            if columnZeros and verbose:
                print(gm[0])
                print(
                    "    Skipping (zero-based) column number(s) %s, which sum to zero."
                    % columnZeros)
            usedCols = [j for j in range(nCols) if j not in columnZeros]

            xSq = 0.0
            xSq_rows = []  # nTax long, with 0.0 placeholders for skipped taxa
            k = 0  # row index into comps and theExpected, ie without the skips
            for taxNum in range(self.nTax):
                if taxNum in skipped:
                    # this taxon is not in comps.  Add a placeholder
                    xSq_rows.append(0.0)
                    continue
                xSq_row = 0.0
                for j in usedCols:
                    theDiff = comps[k][j] - theExpected[k][j]
                    xSq_row += (theDiff * theDiff) / theExpected[k][j]
                xSq += xSq_row
                xSq_rows.append(xSq_row)
                k += 1

            dof = (p.dim - len(columnZeros) - 1) * (len(comps) - 1)
            prob = pf.chiSquaredProb(xSq, dof)
            if verbose:
                print("Part %i: Chi-square = %f, (dof=%i) P = %f" %
                      (partNum, xSq, dof, prob))
                if getRows:
                    print("%20s  %7s  %s" %
                          ('taxName', 'xSq_row', 'P (like puzzle)'))
                    for tNum in range(self.nTax):
                        if tNum in skipped:
                            print("%20s    ---      ---" %
                                  self.taxNames[tNum])
                        else:
                            thisProb = pf.chiSquaredProb(
                                xSq_rows[tNum], p.dim - 1)
                            print("%20s  %7.5f  %7.5f" %
                                  (self.taxNames[tNum], xSq_rows[tNum],
                                   thisProb))
            if getRows:
                results.append([xSq, dof, prob, xSq_rows])
            else:
                results.append([xSq, dof, prob])

        if nColumnZeros and verbose:
            print("There were %i column zeros." % nColumnZeros)
        return results