示例#1
0
    def scoreChiSquareIndependence(self, syncNumber, get):
        """Compute a chiSquareIndependence score from the model's CountTable.

        The event data are not consulted here: neither ``syncNumber`` nor
        ``get`` is used.  The score is derived solely from the (possibly
        multi-dimensional) CountTable found under ``self.baseline``.  That
        table is built from data by the producer, so the producer must be
        running for this test statistic to be meaningful.

        Returns INVALID when the baseline's child is not a CountTable or
        NormalizedCountTable, when the chi-square helper yields None, or
        when the number of degrees of freedom is not positive.  Otherwise
        returns a dict keyed by SCORE_predictedValue, SCORE_pValue,
        SCORE_chiSquare and SCORE_degreesOfFreedom.  Whatever is returned
        is also stored in ``self.lastScore``.
        """

        self.resetLoggerLevels()

        # Until a count table exists (the producer will make it) there is
        # nothing to score.
        self.countTable = self.baseline.child()
        tableTypes = (pmml.CountTable, pmml.NormalizedCountTable)
        if not isinstance(self.countTable, tableTypes):
            self.lastScore = INVALID
            return self.lastScore

        # Descend through the nested dimensions, collecting one field name
        # per level and stopping at the first FieldValueCount.
        self.fields = fieldNames = []
        level = self.countTable.child(pmml.nonExtension)
        fieldNames.append(level.attrib["field"])
        while not isinstance(level, pmml.FieldValueCount):
            level = level.child(pmml.nonExtension)
            fieldNames.append(level.attrib["field"])

        # One (initially empty) margin per field, plus a float slot under
        # the None key (presumably the grand total; filled by the helper).
        totals = {None: 0.}
        totals.update((name, {}) for name in fieldNames)

        # every time: add up the n-field margins ("rows and columns" in
        # the 2-field case), then compute chi-square from them
        self._chiSquareIndependence_add(self.countTable, [], totals)
        chi2 = self._chiSquareIndependence_chi2(self.countTable, [], totals)

        # degrees of freedom: product over fields of (margin size - 1)
        ndf = 1
        for key in totals:
            if key is not None:
                ndf *= len(totals[key]) - 1

        if chi2 is None or ndf <= 0:
            self.lastScore = INVALID
            self.logger.debug(
                "scoreChiSquareIndependence: returning INVALID score")
            return self.lastScore

        probability = chiSquare_cdf(chi2, ndf)
        self.lastScore = {
            SCORE_predictedValue: probability,
            SCORE_pValue: 1. - probability,
            SCORE_chiSquare: chi2,
            SCORE_degreesOfFreedom: ndf,
        }
        return self.lastScore
示例#2
0
    def scoreChiSquareIndependence(self, syncNumber, get):
        """Return the chiSquareIndependence score of the stored CountTable.

        Neither ``syncNumber`` nor ``get`` is used: the event data are
        ignored and only the multi-dimensional CountTable in the PMML
        (the child of ``self.baseline``) is read.  Data are only used to
        make that table, so the producer has to be running for this test
        statistic to be useful.

        Returns INVALID if no CountTable/NormalizedCountTable is present
        yet, if the chi-square helper returns None, or if there are no
        positive degrees of freedom.  Otherwise returns a dict with the
        keys SCORE_predictedValue, SCORE_pValue, SCORE_chiSquare and
        SCORE_degreesOfFreedom.
        """

        table = self.baseline.child()
        self.countTable = table
        if not isinstance(table, (pmml.CountTable, pmml.NormalizedCountTable)):
            # the "first" time doesn't happen until we see a count table
            return INVALID

        # Record the field of every nested dimension, ending with the
        # first FieldValueCount encountered.
        names = []
        self.fields = names
        node = table
        while True:
            node = node.child(pmml.nonExtension)
            names.append(node.attrib["field"])
            if isinstance(node, pmml.FieldValueCount):
                break

        # An empty margin per field; the None key starts as a float zero
        # (presumably the overall total, accumulated by the helper below).
        totals = {None: 0.0}
        totals.update({name: {} for name in names})

        # every time: add up the n-field margins (which are "rows and
        # columns" in the 2-field case)
        self._chiSquareIndependence_add(table, [], totals)
        chi2 = self._chiSquareIndependence_chi2(table, [], totals)

        # degrees of freedom: multiply (margin size - 1) over all fields
        ndf = 1
        for field, margin in totals.items():
            if field is None:
                continue
            ndf *= len(margin) - 1

        if chi2 is None or ndf <= 0:
            return INVALID

        probability = chiSquare_cdf(chi2, ndf)
        return {
            SCORE_predictedValue: probability,
            SCORE_pValue: 1.0 - probability,
            SCORE_chiSquare: chi2,
            SCORE_degreesOfFreedom: ndf,
        }
示例#3
0
    def scoreHistogram(self, syncNumber, get):
        """Score one event with a chiSquareDistribution or scalarProduct.

        The event is first folded into the per-bin updators in
        ``self.updators``; the accumulated (observed) distribution is then
        compared with the expected counts stored in ``self.countTable``.

        Arguments:
            syncNumber: passed through to every updator's ``increment``.
            get: called with ``self.field`` and, when one is configured,
                ``self.weightField`` to fetch this event's value and weight
                (the weight defaults to 1. otherwise).

        Returns:
            For chiSquareDistribution, a dict keyed by SCORE_predictedValue
            (the chi-square CDF), SCORE_pValue, SCORE_chiSquare and
            SCORE_degreesOfFreedom.  For scalarProduct, a dict holding only
            SCORE_predictedValue, normalized per ``self.normalizationScheme``.
            INVALID when the totals or norms are not positive, when a
            NormalizedCountTable has a non-positive "sample", when no
            degrees of freedom remain, or when the test statistic or the
            normalization scheme is not one of those handled here.
        """

        value = get(self.field)
        if self.weightField is None:
            weight = 1.
        else:
            weight = get(self.weightField)

        # we can still calculate the consistency of the *accumulated*
        # distribution, even if this datapoint is invalid
        if not (value is INVALID or value is MISSING
                or weight is INVALID or weight is MISSING):
            # for histograms, increment all bins, but only the correct bin
            # gets a non-zero value
            found = False
            for binKey, updator in self.updators.items():
                if binKey == value:
                    updator.increment(syncNumber, weight)
                    found = True
                else:
                    updator.increment(syncNumber, 0.)

            # this might be a new bin
            if not found:
                updator = self.updateScheme.updator(SUMX)
                updator.increment(syncNumber, weight)
                self.updators[value] = updator

        fieldValueCounts = self.countTable.matches(pmml.FieldValueCount,
                                                   maxdepth=None)

        # chiSquareDistribution
        if self.testStatistic == self.CHISQUAREDISTRIBUTION:
            expectedTotal = 0.
            expectedValues = {}
            for fieldValueCount in fieldValueCounts:
                binKey = fieldValueCount.attrib["value"]
                count = fieldValueCount.attrib["count"]
                expectedTotal += count
                expectedValues[binKey] = count

            observedTotal = 0.
            for updator in self.updators.values():
                observedTotal += updator.sum()

            if expectedTotal <= 0. or observedTotal <= 0.:
                return INVALID

            # The table type is fixed for the whole call, so classify it
            # (and read "sample") once instead of once per bin.
            normalizedTable = isinstance(self.countTable,
                                         pmml.NormalizedCountTable)
            sample = None
            if normalizedTable:
                sample = self.countTable.attrib["sample"]
                if sample <= 0.:
                    return INVALID
            plainTable = isinstance(self.countTable, pmml.CountTable)

            chi2 = 0.
            if self.binsOfInterest is None:
                ndf = -1  # normalization removes one degree of freedom
            else:
                ndf = 0

            # NOTE: the set is built exactly as before so that the order in
            # which the floating-point terms are summed does not change.
            for binKey in set(expectedValues.keys()).union(
                    set(self.updators.keys())):
                if (self.binsOfInterest is not None
                        and binKey not in self.binsOfInterest):
                    continue

                expected = expectedValues.get(binKey, 0.)
                updator = self.updators.get(binKey, None)
                if updator is not None:
                    observed = updator.sum()
                else:
                    observed = 0.

                # bins that are empty on both sides contribute nothing and
                # do not count as a degree of freedom
                if expected > 0. or observed > 0.:
                    difference = (expected / expectedTotal -
                                  observed / observedTotal)
                    if plainTable:
                        chi2 += difference**2 / (
                            expected / expectedTotal**2 +
                            observed / observedTotal**2)
                    elif normalizedTable:
                        chi2 += difference**2 / (
                            expected / expectedTotal / sample +
                            observed / observedTotal**2)

                    ndf += 1

            if ndf > 0:
                probability = chiSquare_cdf(chi2, ndf)
                return {
                    SCORE_predictedValue: probability,
                    SCORE_pValue: 1. - probability,
                    SCORE_chiSquare: chi2,
                    SCORE_degreesOfFreedom: ndf
                }
            return INVALID

        # scalarProduct
        elif self.testStatistic == self.SCALARPRODUCT:
            expectedNorm2 = 0.
            dotProduct = 0.
            for fieldValueCount in fieldValueCounts:
                expected = fieldValueCount.attrib["count"]
                expectedNorm2 += expected**2

                binKey = fieldValueCount.attrib["value"]
                if expected > 0. and binKey in self.updators:
                    dotProduct += expected * self.updators[binKey].sum()

            observedNorm2 = 0.
            for updator in self.updators.values():
                observedNorm2 += updator.sum()**2

            # Both norms must be strictly positive; this single guard makes
            # every division below safe, so no per-scheme re-check is needed.
            if not (expectedNorm2 > 0. and observedNorm2 > 0.):
                return INVALID

            scheme = self.normalizationScheme
            if scheme is None:
                return {SCORE_predictedValue: dotProduct}

            if scheme is self.INDEPENDENT:
                return {
                    SCORE_predictedValue:
                    dotProduct / math.sqrt(expectedNorm2) /
                    math.sqrt(observedNorm2)
                }

            if scheme is self.SIZEWEIGHTED:
                return {
                    SCORE_predictedValue:
                    2. * dotProduct / (expectedNorm2 + observedNorm2)
                }

            # Unrecognized normalization scheme: report it as INVALID
            # rather than silently falling through and returning None.
            return INVALID

        # Unrecognized test statistic: INVALID instead of an implicit None.
        return INVALID
示例#4
0
    def scoreHistogram(self, syncNumber, get):
        """Score one event with a chiSquareDistribution or scalarProduct.

        The event is first folded into the per-bin updators in
        ``self.updators``; the accumulated (observed) distribution is then
        compared with the expected counts stored in ``self.countTable``.
        The result is stored in ``self.lastScore`` and returned; every
        INVALID result is also reported through ``self.logger.debug``.

        Arguments:
            syncNumber: passed through to every updator's ``increment``.
            get: called with ``self.field`` and, when one is configured,
                ``self.weightField`` to fetch this event's value and weight
                (the weight defaults to 1. otherwise).

        Returns:
            For chiSquareDistribution, a dict keyed by SCORE_predictedValue
            (the chi-square CDF), SCORE_pValue, SCORE_chiSquare and
            SCORE_degreesOfFreedom.  For scalarProduct, a dict holding only
            SCORE_predictedValue, normalized per ``self.normalizationScheme``.
            INVALID when the totals or norms are not positive, when a
            NormalizedCountTable has a non-positive "sample", when no
            degrees of freedom remain, or when the test statistic or the
            normalization scheme is not one of those handled here.
        """

        def invalidScore():
            # Single place for the INVALID bookkeeping: remember it, log
            # it, and hand it back to the caller.
            self.lastScore = INVALID
            self.logger.debug("scoreHistogram: returning INVALID score")
            return self.lastScore

        self.resetLoggerLevels()
        value = get(self.field)
        if self.weightField is None:
            weight = 1.
        else:
            weight = get(self.weightField)

        # we can still calculate the consistency of the *accumulated*
        # distribution, even if this datapoint is invalid
        if not (value is INVALID or value is MISSING
                or weight is INVALID or weight is MISSING):
            # for histograms, increment all bins, but only the correct bin
            # gets a non-zero value
            found = False
            for binKey, updator in self.updators.items():
                if binKey == value:
                    updator.increment(syncNumber, weight)
                    found = True
                else:
                    updator.increment(syncNumber, 0.)

            # this might be a new bin
            if not found:
                updator = self.updateScheme.updator(SUMX)
                updator.increment(syncNumber, weight)
                self.updators[value] = updator

        fieldValueCounts = self.countTable.matches(pmml.FieldValueCount, maxdepth=None)

        # chiSquareDistribution
        if self.testStatistic == self.CHISQUAREDISTRIBUTION:
            expectedTotal = 0.
            expectedValues = {}
            for fieldValueCount in fieldValueCounts:
                binKey = fieldValueCount.attrib["value"]
                count = fieldValueCount.attrib["count"]
                expectedTotal += count
                expectedValues[binKey] = count

            observedTotal = 0.
            for updator in self.updators.values():
                observedTotal += updator.sum()

            if expectedTotal <= 0. or observedTotal <= 0.:
                return invalidScore()

            # The table type is fixed for the whole call, so classify it
            # (and read "sample") once instead of once per bin.
            normalizedTable = isinstance(self.countTable, pmml.NormalizedCountTable)
            sample = None
            if normalizedTable:
                sample = self.countTable.attrib["sample"]
                if sample <= 0.:
                    return invalidScore()
            plainTable = isinstance(self.countTable, pmml.CountTable)

            chi2 = 0.
            if self.binsOfInterest is None:
                ndf = -1  # normalization removes one degree of freedom
            else:
                ndf = 0

            # NOTE: the set is built exactly as before so that the order in
            # which the floating-point terms are summed does not change.
            for binKey in set(expectedValues.keys()).union(set(self.updators.keys())):
                if self.binsOfInterest is not None and binKey not in self.binsOfInterest:
                    continue

                expected = expectedValues.get(binKey, 0.)
                updator = self.updators.get(binKey, None)
                if updator is not None:
                    observed = updator.sum()
                else:
                    observed = 0.

                # bins that are empty on both sides contribute nothing and
                # do not count as a degree of freedom
                if expected > 0. or observed > 0.:
                    difference = expected/expectedTotal - observed/observedTotal
                    if plainTable:
                        chi2 += difference**2 / (expected/expectedTotal**2 + observed/observedTotal**2)
                    elif normalizedTable:
                        chi2 += difference**2 / (expected/expectedTotal/sample + observed/observedTotal**2)

                    ndf += 1

            if ndf > 0:
                probability = chiSquare_cdf(chi2, ndf)
                pValue = 1. - probability
                self.lastScore = {SCORE_predictedValue: probability, SCORE_pValue: pValue, SCORE_chiSquare: chi2, SCORE_degreesOfFreedom: ndf}
                return self.lastScore
            return invalidScore()

        # scalarProduct
        elif self.testStatistic == self.SCALARPRODUCT:
            expectedNorm2 = 0.
            dotProduct = 0.
            for fieldValueCount in fieldValueCounts:
                expected = fieldValueCount.attrib["count"]
                expectedNorm2 += expected**2

                binKey = fieldValueCount.attrib["value"]
                if expected > 0. and binKey in self.updators:
                    dotProduct += expected * self.updators[binKey].sum()

            observedNorm2 = 0.
            for updator in self.updators.values():
                observedNorm2 += updator.sum()**2

            # Both norms must be strictly positive; this single guard makes
            # every division below safe, so no per-scheme re-check is needed.
            if not (expectedNorm2 > 0. and observedNorm2 > 0.):
                return invalidScore()

            scheme = self.normalizationScheme
            if scheme is None:
                self.lastScore = {SCORE_predictedValue: dotProduct}
            elif scheme is self.INDEPENDENT:
                self.lastScore = {SCORE_predictedValue: dotProduct/math.sqrt(expectedNorm2)/math.sqrt(observedNorm2)}
            elif scheme is self.SIZEWEIGHTED:
                self.lastScore = {SCORE_predictedValue: 2.*dotProduct/(expectedNorm2 + observedNorm2)}
            else:
                # Unrecognized normalization scheme: do not hand back the
                # stale score left over from a previous event.
                return invalidScore()

            return self.lastScore

        # Unrecognized test statistic: INVALID instead of an implicit None.
        return invalidScore()