Example #1
def getUpdateScheme(configOptions):
    """Return an UpdateScheme as specified by the user configOptions.

    Arguments:

        configOptions (XML object, defined in xmlbase):
            An XML element of type "Blending"; either
            <ConsumerBlending/> or <ProducerBlending/>; containing the
            weightings and default settings for the model update schemes.
    """
    if configOptions is None:
        return UpdateScheme("unweighted")

    params = configOptions.attrib.copy()
    scheme = "unweighted"
    if "method" in params:
        scheme = params["method"]

        if scheme == "computerTimeWindowSeconds":
            raise NotImplementedError

        elif scheme == "eventTimeWindow":
            scheme = "synchronized"

        del params["method"]

    if "windowLag" not in params:
        if scheme not in ("unweighted", "exponential"):
            params["windowLag"] = 0

    return UpdateScheme(scheme, **params)
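
A minimal sketch of the attribute handling above, using xml.etree.ElementTree from the standard library in place of xmlbase (the element name and attribute values are made up). Only the parameter munging is reproduced; the real function finishes by constructing UpdateScheme(scheme, **params), which is not exercised here.

import xml.etree.ElementTree as ET

# a hypothetical <ConsumerBlending/> element as it might appear in the configuration
configOptions = ET.fromstring('<ConsumerBlending method="eventTimeWindow" windowSize="100"/>')

params = configOptions.attrib.copy()
scheme = params.pop("method", "unweighted")
if scheme == "computerTimeWindowSeconds":
    raise NotImplementedError            # unsupported, as in the example above
elif scheme == "eventTimeWindow":
    scheme = "synchronized"              # renamed before the scheme is constructed
if scheme not in ("unweighted", "exponential") and "windowLag" not in params:
    params["windowLag"] = 0              # windowed schemes default to zero lag

assert scheme == "synchronized"
assert params == {"windowSize": "100", "windowLag": 0}
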
Example #2
    def _getUpdateScheme(self, configuration):
        """Return an UpdateScheme as specified by the user configuration.

        Arguments:

            configuration (XML object, defined in xmlbase):
                An XML element of type "Blending"; either
                <ConsumerBlending/> or <ProducerBlending/>; containing the
                weightings and default settings for the model update schemes.
        """

        if configuration is None: return UpdateScheme("unweighted")

        params = dict(configuration.attrib)
        scheme = "unweighted"

        if "method" in params:
            scheme = params["method"]
            if scheme == "eventTimeWindow": scheme = "synchronized"
            del params["method"]

        if scheme in ("window", "synchronized") and "windowLag" not in params:
            params["windowLag"] = 0

        return UpdateScheme(scheme, **params)
Example #3
    def initialize(self):
        """Initialize a baseline consumer.

        Unlike other consumers, this creates the score function
        dynamically, depending on the type of testStatistic.
        """

        testDistributions = self.segmentRecord.pmmlModel.child(
            pmml.TestDistributions)
        self.field = testDistributions.attrib["field"]
        testStatistic = testDistributions.attrib["testStatistic"]

        # updating can be configured in the Augustus configuration file and in the "windowSize" attribute in this segment
        # I will assume that the "windowSize" attribute can override CUSUM and GLR only

        # (the only other baseline consumer that has an intermediate state is chiSquareDistribution, which only makes
        # sense as UpdateScheme("synchronized"), and that one depends on the configuration of syncNumber, which is not in PMML)

        # the general case:
        self.updateScheme = self.engine.consumerUpdateScheme
        # the special case:
        if testStatistic in ("CUSUM", "GLR"):
            if "windowSize" in testDistributions.attrib and testDistributions.attrib[
                    "windowSize"] != 0:
                self.updateScheme = UpdateScheme(
                    "window",
                    windowSize=testDistributions.attrib["windowSize"],
                    windowLag=0)

        if testStatistic == "CUSUM":
            self.baseline = testDistributions.child(pmml.Baseline).child()
            self.alternate = testDistributions.child(pmml.Alternate).child()
            self.updator = self.updateScheme.updator(CUSUM)
            self.updator.resetValue = testDistributions.attrib["resetValue"]
            self.score = self.scoreCUSUM

            extension = testDistributions.child(pmml.Extension,
                                                exception=False)
            if extension is not None:
                init = extension.child(pmml.X_ODG_CUSUMInitialization,
                                       exception=False)
                if init is not None:
                    self.updator.initialize({CUSUM: [init.attrib["value"]]})

            self.pseudoField = self.field
            self.pseudoOutputAll = True

        elif testStatistic == "zValue":
            self.baseline = testDistributions.child(pmml.Baseline).child()
            if isinstance(self.baseline, pmml.GaussianDistribution):
                self.score = self.scoreZValueGaussian
            else:
                self.score = self.scoreZValue

            self.pseudoField = self.field
            self.pseudoOutputAll = True

        elif testStatistic in ("chiSquareDistribution", "scalarProduct"):
            self.updators = {}
            self.countTable = testDistributions.child(pmml.Baseline).child()

            if "weightField" in testDistributions.attrib:
                self.weightField = testDistributions.attrib["weightField"]
            else:
                self.weightField = None

            if "normalizationScheme" not in testDistributions.attrib:
                self.normalizationScheme = None
            elif testDistributions.attrib[
                    "normalizationScheme"] == "Independent":
                self.normalizationScheme = self.INDEPENDENT
            elif testDistributions.attrib[
                    "normalizationScheme"] == "SizeWeighted":
                self.normalizationScheme = self.SIZEWEIGHTED

            self.testStatistic = {
                "chiSquareDistribution": self.CHISQUAREDISTRIBUTION,
                "scalarProduct": self.SCALARPRODUCT,
            }[testStatistic]

            self.score = self.scoreHistogram

            self.pseudoField = (self.field, self.weightField)
            self.pseudoOutputAll = False

            # ODG extensions
            self.binsOfInterest = testDistributions.descendant(
                pmml.X_ODG_BinsOfInterest, exception=False)

        elif testStatistic == "chiSquareIndependence":
            self.baseline = testDistributions.child(pmml.Baseline)
            self.fields = None
            self.countTable = None
            self.score = self.scoreChiSquareIndependence

            self.pseudoField = None
            self.pseudoOutputAll = True

        # ODG extensions
        elif testStatistic == "GLR":
            self.baseline = testDistributions.child(pmml.Baseline).child()
            if not isinstance(
                    self.baseline,
                (pmml.GaussianDistribution, pmml.PoissonDistribution)):
                raise NotImplementedError("GLR has only been implemented for Gaussian and Poisson distributions")

            self.updator = self.updateScheme.updator(GLR)
            self.score = self.scoreGLR

            self.pseudoField = self.field
            self.pseudoOutputAll = False
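
A short, self-contained sketch of the scheme-selection rule described in the comments of initialize() above: the engine-wide consumer scheme is kept unless the test statistic is CUSUM or GLR and the segment carries a non-zero "windowSize" attribute. The helper name and the tuple it returns are illustrative only, not part of Augustus.

def chooseUpdateScheme(testStatistic, attrib, engineScheme="synchronized"):
    # a non-zero windowSize overrides the engine default, but only for CUSUM and GLR
    if testStatistic in ("CUSUM", "GLR") and attrib.get("windowSize", 0) != 0:
        return ("window", {"windowSize": attrib["windowSize"], "windowLag": 0})
    return (engineScheme, {})

assert chooseUpdateScheme("CUSUM", {"windowSize": 50}) == ("window", {"windowSize": 50, "windowLag": 0})
assert chooseUpdateScheme("zValue", {"windowSize": 50}) == ("synchronized", {})   # no override for zValue
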
Example #4
class ConsumerBaselineModel(ConsumerAlgorithm):
    CHISQUAREDISTRIBUTION = Atom("chiSquareDistribution")
    SCALARPRODUCT = Atom("scalarProduct")
    INDEPENDENT = Atom("Independent")
    SIZEWEIGHTED = Atom("SizeWeighted")

    def initialize(self):
        """Initialize a baseline consumer.

        Unlike other consumers, this creates the score function
        dynamically, depending on the type of testStatistic.
        """

        testDistributions = self.segmentRecord.pmmlModel.child(
            pmml.TestDistributions)
        self.field = testDistributions.attrib["field"]
        testStatistic = testDistributions.attrib["testStatistic"]

        # updating can be configured in the Augustus configuration file and in the "windowSize" attribute in this segment
        # I will assume that the "windowSize" attribute can override CUSUM and GLR only

        # (the only other baseline consumer that has an intermediate state is chiSquareDistribution, which only makes
        # sense as UpdateScheme("synchronized"), and that one depends on the configuration of syncNumber, which is not in PMML)

        # the general case:
        self.updateScheme = self.engine.consumerUpdateScheme
        # the special case:
        if testStatistic in ("CUSUM", "GLR"):
            if "windowSize" in testDistributions.attrib and testDistributions.attrib[
                    "windowSize"] != 0:
                self.updateScheme = UpdateScheme(
                    "window",
                    windowSize=testDistributions.attrib["windowSize"],
                    windowLag=0)

        if testStatistic == "CUSUM":
            self.baseline = testDistributions.child(pmml.Baseline).child()
            self.alternate = testDistributions.child(pmml.Alternate).child()
            self.updator = self.updateScheme.updator(CUSUM)
            self.updator.resetValue = testDistributions.attrib["resetValue"]
            self.score = self.scoreCUSUM

            extension = testDistributions.child(pmml.Extension,
                                                exception=False)
            if extension is not None:
                init = extension.child(pmml.X_ODG_CUSUMInitialization,
                                       exception=False)
                if init is not None:
                    self.updator.initialize({CUSUM: [init.attrib["value"]]})

            self.pseudoField = self.field
            self.pseudoOutputAll = True

        elif testStatistic == "zValue":
            self.baseline = testDistributions.child(pmml.Baseline).child()
            if isinstance(self.baseline, pmml.GaussianDistribution):
                self.score = self.scoreZValueGaussian
            else:
                self.score = self.scoreZValue

            self.pseudoField = self.field
            self.pseudoOutputAll = True

        elif testStatistic in ("chiSquareDistribution", "scalarProduct"):
            self.updators = {}
            self.countTable = testDistributions.child(pmml.Baseline).child()

            if "weightField" in testDistributions.attrib:
                self.weightField = testDistributions.attrib["weightField"]
            else:
                self.weightField = None

            if "normalizationScheme" not in testDistributions.attrib:
                self.normalizationScheme = None
            elif testDistributions.attrib[
                    "normalizationScheme"] == "Independent":
                self.normalizationScheme = self.INDEPENDENT
            elif testDistributions.attrib[
                    "normalizationScheme"] == "SizeWeighted":
                self.normalizationScheme = self.SIZEWEIGHTED

            self.testStatistic = {
                "chiSquareDistribution": self.CHISQUAREDISTRIBUTION,
                "scalarProduct": self.SCALARPRODUCT,
            }[testStatistic]

            self.score = self.scoreHistogram

            self.pseudoField = (self.field, self.weightField)
            self.pseudoOutputAll = False

            # ODG extensions
            self.binsOfInterest = testDistributions.descendant(
                pmml.X_ODG_BinsOfInterest, exception=False)

        elif testStatistic == "chiSquareIndependence":
            self.baseline = testDistributions.child(pmml.Baseline)
            self.fields = None
            self.countTable = None
            self.score = self.scoreChiSquareIndependence

            self.pseudoField = None
            self.pseudoOutputAll = True

        # ODG extensions
        elif testStatistic == "GLR":
            self.baseline = testDistributions.child(pmml.Baseline).child()
            if not isinstance(
                    self.baseline,
                (pmml.GaussianDistribution, pmml.PoissonDistribution)):
                raise NotImplementedError("GLR has only been implemented for Gaussian and Poisson distributions")

            self.updator = self.updateScheme.updator(GLR)
            self.score = self.scoreGLR

            self.pseudoField = self.field
            self.pseudoOutputAll = False

    ######################################## CUSUM

    def scoreCUSUM(self, syncNumber, get):
        """Score one event with a CUSUM testStatistic."""

        value = get(self.field)
        if value is INVALID or value is MISSING:
            return INVALID

        # accumulate the log-likelihood ratio of the alternate vs. the baseline distribution;
        # updator.cusum() then reports the current value of the CUSUM statistic
        self.updator.increment(
            syncNumber,
            self.alternate.logpdf(value) - self.baseline.logpdf(value))
        return {SCORE_predictedValue: self.updator.cusum()}

    ######################################## zValue

    def scoreZValueGaussian(self, syncNumber, get):
        """Score one event with a zValue testStatistic (Gaussian)."""

        value = get(self.field)
        if value is INVALID or value is MISSING:
            return INVALID

        if self.baseline.attrib["variance"] == 0.:
            return {SCORE_predictedValue: float("inf"), SCORE_pValue: 0.}

        elif self.baseline.attrib["variance"] < 0.:
            return INVALID

        zValue = (value - self.baseline.attrib["mean"]) / math.sqrt(
            self.baseline.attrib["variance"])
        probability = self.baseline.cdf(value)
        pValue = 1. - 2. * abs(probability - 0.5)  # two-sided tail probability: 2*min(cdf, 1 - cdf)

        return {SCORE_predictedValue: zValue, SCORE_pValue: pValue}

    def scoreZValue(self, syncNumber, get):
        """Score one event with a zValue testStatistic (non-Gaussian)."""

        value = get(self.field)
        if value is INVALID or value is MISSING:
            return INVALID

        probability = self.baseline.cdf(value)
        if probability <= 1e-16:
            zValue = -10.
        elif probability >= 1. - 1e-16:
            zValue = 10.
        else:
            zValue = math.sqrt(2.) * erfinv(2. * probability - 1.)
        pValue = 1. - 2. * abs(probability - 0.5)

        return {SCORE_predictedValue: zValue, SCORE_pValue: pValue}

    ######################################## chiSquareDistribution and scalarProduct

    def scoreHistogram(self, syncNumber, get):
        """Score one event with a chiSquareDistribution or scalarProduct."""

        value = get(self.field)
        if self.weightField is None:
            weight = 1.
        else:
            weight = get(self.weightField)

        # we can still calculate the consistency of the *accumulated* distribution, even if this datapoint is invalid
        if value is INVALID or value is MISSING or weight is INVALID or weight is MISSING:
            pass
        else:
            # for histograms, increment all bins, but only the correct bin gets a non-zero value
            found = False
            for bin, updator in self.updators.items():
                if bin == value:
                    updator.increment(syncNumber, weight)
                    found = True
                else:
                    updator.increment(syncNumber, 0.)

            # this might be a new bin
            if not found:
                updator = self.updateScheme.updator(SUMX)
                updator.increment(syncNumber, weight)
                self.updators[value] = updator

        fieldValueCounts = self.countTable.matches(pmml.FieldValueCount,
                                                   maxdepth=None)

        # chiSquareDistribution
        if self.testStatistic == self.CHISQUAREDISTRIBUTION:
            expectedTotal = 0.
            expectedValues = {}
            for fieldValueCount in fieldValueCounts:
                bin = fieldValueCount.attrib["value"]
                count = fieldValueCount.attrib["count"]
                expectedTotal += count
                expectedValues[bin] = count

            observedTotal = 0.
            for bin, updator in self.updators.items():
                observedTotal += updator.sum()

            if expectedTotal <= 0. or observedTotal <= 0. or (
                    isinstance(self.countTable, pmml.NormalizedCountTable)
                    and self.countTable.attrib["sample"] <= 0.):
                return INVALID

            chi2 = 0.
            if self.binsOfInterest is None:
                ndf = -1  # normalization removes one degree of freedom
            else:
                ndf = 0

            for bin in set(expectedValues.keys()).union(
                    set(self.updators.keys())):
                if self.binsOfInterest is not None:
                    if bin not in self.binsOfInterest:
                        continue

                expected = expectedValues.get(bin, 0.)
                updator = self.updators.get(bin, None)
                if updator is not None:
                    observed = updator.sum()
                else:
                    observed = 0.

                if expected > 0. or observed > 0.:
                    if isinstance(self.countTable, pmml.CountTable):
                        chi2 += (expected / expectedTotal -
                                 observed / observedTotal)**2 / (
                                     expected / expectedTotal**2 +
                                     observed / observedTotal**2)

                    elif isinstance(self.countTable,
                                    pmml.NormalizedCountTable):
                        sample = self.countTable.attrib["sample"]
                        chi2 += (expected / expectedTotal -
                                 observed / observedTotal)**2 / (
                                     expected / expectedTotal / sample +
                                     observed / observedTotal**2)

                    ndf += 1

            if ndf > 0:
                probability = chiSquare_cdf(chi2, ndf)
                pValue = 1. - probability
                return {
                    SCORE_predictedValue: probability,
                    SCORE_pValue: pValue,
                    SCORE_chiSquare: chi2,
                    SCORE_degreesOfFreedom: ndf
                }
            else:
                return INVALID

        # scalarProduct
        elif self.testStatistic == self.SCALARPRODUCT:
            expectedNorm2 = 0.
            dotProduct = 0.
            for fieldValueCount in fieldValueCounts:
                expected = fieldValueCount.attrib["count"]
                expectedNorm2 += expected**2

                bin = fieldValueCount.attrib["value"]
                if expected > 0. and bin in self.updators:
                    observed = self.updators[bin].sum()
                    dotProduct += expected * observed

            observedNorm2 = 0.
            for updator in self.updators.values():
                observed = updator.sum()
                observedNorm2 += observed**2

            if expectedNorm2 > 0. and observedNorm2 > 0.:
                if self.normalizationScheme is None:
                    return {SCORE_predictedValue: dotProduct}

                elif self.normalizationScheme is self.INDEPENDENT:
                    if expectedNorm2 <= 0. or observedNorm2 <= 0.:
                        return INVALID
                    return {
                        SCORE_predictedValue:
                        dotProduct / math.sqrt(expectedNorm2) /
                        math.sqrt(observedNorm2)
                    }

                elif self.normalizationScheme is self.SIZEWEIGHTED:
                    if expectedNorm2 + observedNorm2 <= 0.: return INVALID
                    return {
                        SCORE_predictedValue:
                        2. * dotProduct / (expectedNorm2 + observedNorm2)
                    }

            else:
                return INVALID

    ######################################## chiSquareIndependence

    def _chiSquareIndependence_add(self, pmmlNode, fieldValues, totals):
        if isinstance(
                pmmlNode,
            (pmml.CountTable, pmml.NormalizedCountTable, pmml.FieldValue)):
            for child in pmmlNode:
                self._chiSquareIndependence_add(
                    child, fieldValues + [child.attrib["value"]], totals)

        elif isinstance(pmmlNode, pmml.FieldValueCount):
            count = pmmlNode.attrib["count"]

            totals[None] += count
            for f, v in zip(self.fields, fieldValues):
                if v not in totals[f]:
                    totals[f][v] = 0.
                totals[f][v] += count

    def _chiSquareIndependence_chi2(self, pmmlNode, fieldValues, totals):
        if isinstance(
                pmmlNode,
            (pmml.CountTable, pmml.NormalizedCountTable, pmml.FieldValue)):
            output = 0.
            for child in pmmlNode:
                subchi2 = self._chiSquareIndependence_chi2(
                    child, fieldValues + [child.attrib["value"]], totals)
                if subchi2 is None: return None
                output += subchi2
            return output

        elif isinstance(pmmlNode, pmml.FieldValueCount):
            observed = pmmlNode.attrib["count"]

            if totals[None] == 0:
                return None
            else:
                if isinstance(self.countTable, pmml.NormalizedCountTable):
                    scale = self.countTable.attrib["sample"] / totals[None]
                else:
                    scale = 1.

                expected = 1. / (totals[None] * scale)**(len(self.fields) - 1)
                for f, v in zip(self.fields, fieldValues):
                    expected *= (totals[f][v] * scale)

                if expected == 0.:
                    return None
                else:
                    return (expected - (observed * scale))**2 / expected

    def scoreChiSquareIndependence(self, syncNumber, get):
        """Score one event with a chiSquareIndependence testStatistic.

        This reads from the multi-dimensional CountTable in PMML and
        ignores the data!  Data are only used to make the CountTable,
        so be sure to be running the producer if you want
        chiSquareIndependence.
        """

        # expect a CountTable (if it doesn't exist, the producer will make it)
        self.countTable = self.baseline.child()
        if not isinstance(self.countTable,
                          (pmml.CountTable, pmml.NormalizedCountTable)):
            return INVALID  # the "first" time doesn't happen until we see a count table

        self.fields = []
        dimension = self.countTable.child(pmml.nonExtension)
        while True:
            self.fields.append(dimension.attrib["field"])
            if isinstance(dimension, pmml.FieldValueCount): break
            dimension = dimension.child(pmml.nonExtension)

        totals = {None: 0.}
        for f in self.fields:
            totals[f] = {}

        # every time: add up the n-field margins (which are "rows and columns" in 2-field case)
        self._chiSquareIndependence_add(self.countTable, [], totals)
        chi2 = self._chiSquareIndependence_chi2(self.countTable, [], totals)

        ndf = 1
        for f, tot in totals.items():
            if f is not None:
                ndf *= (len(tot) - 1)

        if chi2 is not None and ndf > 0:
            probability = chiSquare_cdf(chi2, ndf)
            pValue = 1. - probability
            return {
                SCORE_predictedValue: probability,
                SCORE_pValue: pValue,
                SCORE_chiSquare: chi2,
                SCORE_degreesOfFreedom: ndf
            }
        else:
            return INVALID

    ######################################## ODG-extension: GLR

    def _scoreGLR_GaussianDistribution(self, s, N):
        return (s - N * self.baseline.attrib["mean"])**2 / N

    def _scoreGLR_PoissonDistribution(self, s, N):
        if s > 0.:
            return -math.log(self.baseline.attrib["mean"]) * s + math.log(s / N) * s + N * self.baseline.attrib["mean"] - s
        else:
            return -math.log(self.baseline.attrib["mean"]) * s + N * self.baseline.attrib["mean"] - s

    def scoreGLR(self, syncNumber, get):
        """Score one event with a GLR testStatistic.

        Output is the *current* best-guess of the turn-around time (as
        the corresponding syncNumber) and its log-likelihood ratio.
        """

        # Eq. 2.4.40 in Basseville and Nikiforov: http://www.irisa.fr/sisthem/kniga/ (partly in eventweighting.py)

        value = get(self.field)
        if value is not INVALID and value is not MISSING:
            self.updator.increment(syncNumber, value)

        if isinstance(self.baseline, pmml.GaussianDistribution):
            maximum_syncNumber, maximum = self.updator.glr(
                self._scoreGLR_GaussianDistribution)

            if maximum is None or self.baseline.attrib["variance"] < 0.:
                return INVALID
            elif self.baseline.attrib["variance"] == 0.:
                return {
                    SCORE_predictedValue: float("inf"),
                    SCORE_thresholdTime: maximum_syncNumber
                }
            else:
                return {
                    SCORE_predictedValue:
                    maximum / 2. / self.baseline.attrib["variance"],
                    SCORE_thresholdTime:
                    maximum_syncNumber
                }

        elif isinstance(self.baseline, pmml.PoissonDistribution):
            maximum_syncNumber, maximum = self.updator.glr(
                self._scoreGLR_PoissonDistribution)

            if maximum is None:
                return INVALID
            else:
                return {
                    SCORE_predictedValue: maximum,
                    SCORE_thresholdTime: maximum_syncNumber
                }
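
A worked numeric check of the zValue scoring used by scoreZValueGaussian above: the predicted value is the z-score and the p-value is the two-sided tail probability 1 - 2*|CDF(value) - 0.5|. The distribution parameters are invented, and math.erf stands in for the baseline distribution's cdf() method.

import math

mean, variance, value = 10.0, 4.0, 13.0

zValue = (value - mean) / math.sqrt(variance)                              # 1.5
cdf = 0.5 * (1.0 + math.erf((value - mean) / math.sqrt(2.0 * variance)))  # Gaussian CDF at the observed value
pValue = 1.0 - 2.0 * abs(cdf - 0.5)                                       # two-sided tail probability

print(zValue)              # 1.5
print(round(pValue, 4))    # 0.1336
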
Example #5
    def initialize(self):
        """Initialize a baseline consumer.

        Unlike other consumers, this creates the score function
        dynamically, depending on the type of testStatistic.
        """

        testDistributions = self.segmentRecord.pmmlModel.child(pmml.TestDistributions)
        self.field = testDistributions.attrib["field"]
        testStatistic = testDistributions.attrib["testStatistic"]

        # updating can be configured in the Augustus configuration file and in the "windowSize" attribute in this segment
        # I will assume that the "windowSize" attribute can override CUSUM and GLR only

        # (the only other baseline consumer that has an intermediate state is chiSquareDistribution, which only makes
        # sense as UpdateScheme("synchronized"), and that one depends on the configuration of syncNumber, which is not in PMML)

        # the general case:
        self.updateScheme = self.engine.consumerUpdateScheme
        # the special case:
        if testStatistic in ("CUSUM", "GLR"):
            if "windowSize" in testDistributions.attrib and testDistributions.attrib["windowSize"] != 0:
                self.updateScheme = UpdateScheme("window", windowSize=testDistributions.attrib["windowSize"], windowLag=0)

        if testStatistic == "CUSUM":
            self.baseline = testDistributions.child(pmml.Baseline).child()
            self.alternate = testDistributions.child(pmml.Alternate).child()
            self.updator = self.updateScheme.updator(CUSUM)
            self.updator.resetValue = testDistributions.attrib["resetValue"]
            self.score = self.scoreCUSUM

            extension = testDistributions.child(pmml.Extension, exception=False)
            if extension is not None:
                init = extension.child(pmml.X_ODG_CUSUMInitialization, exception=False)
                if init is not None:
                    self.updator.initialize({CUSUM: [init.attrib["value"]]})
                
            self.pseudoField = self.field
            self.pseudoOutputAll = True

        elif testStatistic == "zValue":
            self.baseline = testDistributions.child(pmml.Baseline).child()
            if isinstance(self.baseline, pmml.GaussianDistribution):
                self.score = self.scoreZValueGaussian
            else:
                self.score = self.scoreZValue

            self.pseudoField = self.field
            self.pseudoOutputAll = True

        elif testStatistic in ("chiSquareDistribution", "scalarProduct"):
            self.updators = {}
            self.countTable = testDistributions.child(pmml.Baseline).child()

            if "weightField" in testDistributions.attrib:
                self.weightField = testDistributions.attrib["weightField"]
            else:
                self.weightField = None

            if "normalizationScheme" not in testDistributions.attrib:
                self.normalizationScheme = None
            elif testDistributions.attrib["normalizationScheme"] == "Independent":
                self.normalizationScheme = self.INDEPENDENT
            elif testDistributions.attrib["normalizationScheme"] == "SizeWeighted":
                self.normalizationScheme = self.SIZEWEIGHTED

            self.testStatistic = {"chiSquareDistribution": self.CHISQUAREDISTRIBUTION,
                                  "scalarProduct": self.SCALARPRODUCT,
                                  }[testStatistic]

            self.score = self.scoreHistogram

            self.pseudoField = (self.field, self.weightField)
            self.pseudoOutputAll = False

            # ODG extensions
            self.binsOfInterest = testDistributions.descendant(pmml.X_ODG_BinsOfInterest, exception=False)

        elif testStatistic == "chiSquareIndependence":
            self.baseline = testDistributions.child(pmml.Baseline)
            self.fields = None
            self.countTable = None
            self.score = self.scoreChiSquareIndependence

            self.pseudoField = None
            self.pseudoOutputAll = True

        # ODG extensions
        elif testStatistic == "GLR":
            self.baseline = testDistributions.child(pmml.Baseline).child()
            if not isinstance(self.baseline, (pmml.GaussianDistribution, pmml.PoissonDistribution)):
                raise NotImplementedError("GLR has only been implemented for Gaussian and Poisson distributions")

            self.updator = self.updateScheme.updator(GLR)
            self.score = self.scoreGLR

            self.pseudoField = self.field
            self.pseudoOutputAll = False
Example #6
class ConsumerBaselineModel(ConsumerAlgorithm):
    CHISQUAREDISTRIBUTION = Atom("chiSquareDistribution")
    SCALARPRODUCT = Atom("scalarProduct")
    INDEPENDENT = Atom("Independent")
    SIZEWEIGHTED = Atom("SizeWeighted")

    def initialize(self):
        """Initialize a baseline consumer.

        Unlike other consumers, this creates the score function
        dynamically, depending on the type of testStatistic.
        """

        testDistributions = self.segmentRecord.pmmlModel.child(pmml.TestDistributions)
        self.field = testDistributions.attrib["field"]
        testStatistic = testDistributions.attrib["testStatistic"]

        # updating can be configured in the Augustus configuration file and in the "windowSize" attribute in this segment
        # I will assume that the "windowSize" attribute can override CUSUM and GLR only

        # (the only other baseline consumer that has an intermediate state is chiSquareDistribution, which only makes
        # sense as UpdateScheme("synchronized"), and that one depends on the configuration of syncNumber, which is not in PMML)

        # the general case:
        self.updateScheme = self.engine.consumerUpdateScheme
        # the special case:
        if testStatistic in ("CUSUM", "GLR"):
            if "windowSize" in testDistributions.attrib and testDistributions.attrib["windowSize"] != 0:
                self.updateScheme = UpdateScheme("window", windowSize=testDistributions.attrib["windowSize"], windowLag=0)

        if testStatistic == "CUSUM":
            self.baseline = testDistributions.child(pmml.Baseline).child()
            self.alternate = testDistributions.child(pmml.Alternate).child()
            self.updator = self.updateScheme.updator(CUSUM)
            self.updator.resetValue = testDistributions.attrib["resetValue"]
            self.score = self.scoreCUSUM

            extension = testDistributions.child(pmml.Extension, exception=False)
            if extension is not None:
                init = extension.child(pmml.X_ODG_CUSUMInitialization, exception=False)
                if init is not None:
                    self.updator.initialize({CUSUM: [init.attrib["value"]]})
                
            self.pseudoField = self.field
            self.pseudoOutputAll = True

        elif testStatistic == "zValue":
            self.baseline = testDistributions.child(pmml.Baseline).child()
            if isinstance(self.baseline, pmml.GaussianDistribution):
                self.score = self.scoreZValueGaussian
            else:
                self.score = self.scoreZValue

            self.pseudoField = self.field
            self.pseudoOutputAll = True

        elif testStatistic in ("chiSquareDistribution", "scalarProduct"):
            self.updators = {}
            self.countTable = testDistributions.child(pmml.Baseline).child()

            if "weightField" in testDistributions.attrib:
                self.weightField = testDistributions.attrib["weightField"]
            else:
                self.weightField = None

            if "normalizationScheme" not in testDistributions.attrib:
                self.normalizationScheme = None
            elif testDistributions.attrib["normalizationScheme"] == "Independent":
                self.normalizationScheme = self.INDEPENDENT
            elif testDistributions.attrib["normalizationScheme"] == "SizeWeighted":
                self.normalizationScheme = self.SIZEWEIGHTED

            self.testStatistic = {"chiSquareDistribution": self.CHISQUAREDISTRIBUTION,
                                  "scalarProduct": self.SCALARPRODUCT,
                                  }[testStatistic]

            self.score = self.scoreHistogram

            self.pseudoField = (self.field, self.weightField)
            self.pseudoOutputAll = False

            # ODG extensions
            self.binsOfInterest = testDistributions.descendant(pmml.X_ODG_BinsOfInterest, exception=False)

        elif testStatistic == "chiSquareIndependence":
            self.baseline = testDistributions.child(pmml.Baseline)
            self.fields = None
            self.countTable = None
            self.score = self.scoreChiSquareIndependence

            self.pseudoField = None
            self.pseudoOutputAll = True

        # ODG extensions
        elif testStatistic == "GLR":
            self.baseline = testDistributions.child(pmml.Baseline).child()
            if not isinstance(self.baseline, (pmml.GaussianDistribution, pmml.PoissonDistribution)):
                raise NotImplementedError("GLR has only been implemented for Gaussian and Poisson distributions")

            self.updator = self.updateScheme.updator(GLR)
            self.score = self.scoreGLR

            self.pseudoField = self.field
            self.pseudoOutputAll = False

    ######################################## CUSUM

    def scoreCUSUM(self, syncNumber, get):
        """Score one event with a CUSUM testStatistic."""

        self.resetLoggerLevels()
        value = get(self.field)
        if value is INVALID or value is MISSING:
            self.lastScore = INVALID
            self.logger.debug("scoreCUSUM: returning INVALID score")
            return self.lastScore

        self.updator.increment(syncNumber, self.alternate.logpdf(value) - self.baseline.logpdf(value))
        self.lastScore = {SCORE_predictedValue: self.updator.cusum()}
        return self.lastScore

    ######################################## zValue

    def scoreZValueGaussian(self, syncNumber, get):
        """Score one event with a zValue testStatistic (Gaussian)."""

        self.resetLoggerLevels()
        value = get(self.field)
        if value is INVALID or value is MISSING:
            self.lastScore = INVALID
            self.logger.debug("scoreZValueGaussian: returning INVALID score")
            return self.lastScore

        if self.baseline.attrib["variance"] == 0.:
            self.lastScore = {SCORE_predictedValue: float("inf"), SCORE_pValue: 0.}
            self.logger.debug("scoreZValueGaussian: returning infinite score")
            return self.lastScore

        elif self.baseline.attrib["variance"] < 0.:
            self.logger.debug("scoreZValueGaussian: returning INVALID score")
            self.lastScore = INVALID
            return self.lastScore

        zValue = (value - self.baseline.attrib["mean"]) / math.sqrt(self.baseline.attrib["variance"])
        probability = self.baseline.cdf(value)
        pValue = 1. - 2.*abs(probability - 0.5)

        self.lastScore = {SCORE_predictedValue: zValue, SCORE_pValue: pValue}
        return self.lastScore

    def scoreZValue(self, syncNumber, get):
        """Score one event with a zValue testStatistic (non-Gaussian)."""

        self.resetLoggerLevels()
        value = get(self.field)
        if value is INVALID or value is MISSING:
            self.lastScore = INVALID
            self.logger.debug("scoreZValue: returning INVALID score")
            return self.lastScore

        probability = self.baseline.cdf(value)
        if probability <= 1e-16:
            zValue = -10.
        elif probability >= 1. - 1e-16:
            zValue = 10.
        else:
            zValue = math.sqrt(2.)*erfinv(2.*probability - 1.)
        pValue = 1. - 2.*abs(probability - 0.5)

        self.lastScore = {SCORE_predictedValue: zValue, SCORE_pValue: pValue}
        return self.lastScore

    ######################################## chiSquareDistribution and scalarProduct

    def scoreHistogram(self, syncNumber, get):
        """Score one event with a chiSquareDistribution or scalarProduct."""

        self.resetLoggerLevels()
        value = get(self.field)
        if self.weightField is None:
            weight = 1.
        else:
            weight = get(self.weightField)

        # we can still calculate the consistency of the *accumulated* distribution, even if this datapoint is invalid
        if value is INVALID or value is MISSING or weight is INVALID or weight is MISSING:
            pass
        else:
            # for histograms, increment all bins, but only the correct bin gets a non-zero value
            found = False
            for bin, updator in self.updators.items():
                if bin == value:
                    updator.increment(syncNumber, weight)
                    found = True
                else:
                    updator.increment(syncNumber, 0.)

            # this might be a new bin
            if not found:
                updator = self.updateScheme.updator(SUMX)
                updator.increment(syncNumber, weight)
                self.updators[value] = updator
            
        fieldValueCounts = self.countTable.matches(pmml.FieldValueCount, maxdepth=None)

        # chiSquareDistribution
        if self.testStatistic == self.CHISQUAREDISTRIBUTION:
            expectedTotal = 0.
            expectedValues = {}
            for fieldValueCount in fieldValueCounts:
                bin = fieldValueCount.attrib["value"]
                count = fieldValueCount.attrib["count"]
                expectedTotal += count
                expectedValues[bin] = count

            observedTotal = 0.
            for bin, updator in self.updators.items():
                observedTotal += updator.sum()

            if expectedTotal <= 0. or observedTotal <= 0. or (isinstance(self.countTable, pmml.NormalizedCountTable) and self.countTable.attrib["sample"] <= 0.):
                self.lastScore = INVALID
                self.logger.debug("scoreHistogram: returning INVALID score")
                return self.lastScore

            chi2 = 0.
            if self.binsOfInterest is None:
                ndf = -1  # normalization removes one degree of freedom
            else:
                ndf = 0

            for bin in set(expectedValues.keys()).union(set(self.updators.keys())):
                if self.binsOfInterest is not None:
                    if bin not in self.binsOfInterest:
                        continue
                
                expected = expectedValues.get(bin, 0.)
                updator = self.updators.get(bin, None)
                if updator is not None:
                    observed = updator.sum()
                else:
                    observed = 0.

                if expected > 0. or observed > 0.:
                    if isinstance(self.countTable, pmml.CountTable):
                        chi2 += (expected/expectedTotal - observed/observedTotal)**2 / (expected/expectedTotal**2 + observed/observedTotal**2)

                    elif isinstance(self.countTable, pmml.NormalizedCountTable):
                        sample = self.countTable.attrib["sample"]
                        chi2 += (expected/expectedTotal - observed/observedTotal)**2 / (expected/expectedTotal/sample + observed/observedTotal**2)

                    ndf += 1

            if ndf > 0:
                probability = chiSquare_cdf(chi2, ndf)
                pValue = 1. - probability
                self.lastScore = {SCORE_predictedValue: probability, SCORE_pValue: pValue, SCORE_chiSquare: chi2, SCORE_degreesOfFreedom: ndf}
            else:
                self.lastScore = INVALID
                self.logger.debug("scoreHistogram: returning INVALID score")
            return self.lastScore

        # scalarProduct
        elif self.testStatistic == self.SCALARPRODUCT:
            expectedNorm2 = 0.
            dotProduct = 0.
            for fieldValueCount in fieldValueCounts:
                expected = fieldValueCount.attrib["count"]
                expectedNorm2 += expected**2

                bin = fieldValueCount.attrib["value"]
                if expected > 0. and bin in self.updators:
                    observed = self.updators[bin].sum()
                    dotProduct += expected * observed

            observedNorm2 = 0.
            for updator in self.updators.values():
                observed = updator.sum()
                observedNorm2 += observed**2

            if expectedNorm2 > 0. and observedNorm2 > 0.:
                if self.normalizationScheme is None:
                    self.lastScore = {SCORE_predictedValue: dotProduct}

                elif self.normalizationScheme is self.INDEPENDENT:
                    if expectedNorm2 <= 0. or observedNorm2 <= 0.:
                        self.lastScore = INVALID
                        self.logger.debug("scoreHistogram: returning INVALID score")
                        return self.lastScore
                    self.lastScore = {SCORE_predictedValue: dotProduct/math.sqrt(expectedNorm2)/math.sqrt(observedNorm2)}

                elif self.normalizationScheme is self.SIZEWEIGHTED:
                    if expectedNorm2 + observedNorm2 <= 0.:
                        self.lastScore = INVALID
                        self.logger.debug("scoreHistogram: returning INVALID score")
                        return self.lastScore
                    self.lastScore = {SCORE_predictedValue: 2.*dotProduct/(expectedNorm2 + observedNorm2)}

            else:
                self.lastScore = INVALID
                self.logger.debug("scoreHistogram: returning INVALID score")

            return self.lastScore

    ######################################## chiSquareIndependence

    def _chiSquareIndependence_add(self, pmmlNode, fieldValues, totals):
        if isinstance(pmmlNode, (pmml.CountTable, pmml.NormalizedCountTable, pmml.FieldValue)):
            for child in pmmlNode:
                self._chiSquareIndependence_add(child, fieldValues + [child.attrib["value"]], totals)

        elif isinstance(pmmlNode, pmml.FieldValueCount):
            count = pmmlNode.attrib["count"]

            totals[None] += count
            for f, v in zip(self.fields, fieldValues):
                if v not in totals[f]:
                    totals[f][v] = 0.
                totals[f][v] += count

    def _chiSquareIndependence_chi2(self, pmmlNode, fieldValues, totals):
        if isinstance(pmmlNode, (pmml.CountTable, pmml.NormalizedCountTable, pmml.FieldValue)):
            output = 0.
            for child in pmmlNode:
                subchi2 = self._chiSquareIndependence_chi2(child, fieldValues + [child.attrib["value"]], totals)
                if subchi2 is None: return None
                output += subchi2
            return output

        elif isinstance(pmmlNode, pmml.FieldValueCount):
            observed = pmmlNode.attrib["count"]

            if totals[None] == 0:
                return None
            else:
                if isinstance(self.countTable, pmml.NormalizedCountTable):
                    scale = self.countTable.attrib["sample"]/totals[None]
                else:
                    scale = 1.

                expected = 1./(totals[None] * scale)**(len(self.fields) - 1)
                for f, v in zip(self.fields, fieldValues):
                    expected *= (totals[f][v] * scale)

                if expected == 0.:
                    return None
                else:
                    return (expected - (observed*scale))**2 / expected

    def scoreChiSquareIndependence(self, syncNumber, get):
        """Score one event with a chiSquareIndependence testStatistic.

        This reads from the multi-dimensional CountTable in PMML and
        ignores the data!  Data are only used to make the CountTable,
        so be sure to be running the producer if you want
        chiSquareIndependence.
        """

        self.resetLoggerLevels()

        # expect a CountTable (if it doesn't exist, the producer will make it)
        self.countTable = self.baseline.child()
        if not isinstance(self.countTable, (pmml.CountTable, pmml.NormalizedCountTable)):
            self.lastScore = INVALID   # the "first" time doesn't happen until we see a count table
            return self.lastScore

        self.fields = []
        dimension = self.countTable.child(pmml.nonExtension)
        while True:
            self.fields.append(dimension.attrib["field"])
            if isinstance(dimension, pmml.FieldValueCount): break
            dimension = dimension.child(pmml.nonExtension)

        totals = {None: 0.}
        for f in self.fields:
            totals[f] = {}

        # every time: add up the n-field margins (which are "rows and columns" in 2-field case)
        self._chiSquareIndependence_add(self.countTable, [], totals)
        chi2 = self._chiSquareIndependence_chi2(self.countTable, [], totals)

        ndf = 1
        for f, tot in totals.items():
            if f is not None:
                ndf *= (len(tot) - 1)

        if chi2 is not None and ndf > 0:
            probability = chiSquare_cdf(chi2, ndf)
            pValue = 1. - probability
            self.lastScore = {SCORE_predictedValue: probability, SCORE_pValue: pValue, SCORE_chiSquare: chi2, SCORE_degreesOfFreedom: ndf}
        else:
            self.lastScore = INVALID
            self.logger.debug("scoreChiSquareIndependence: returning INVALID score")
        return self.lastScore

    ######################################## ODG-extension: GLR

    def _scoreGLR_GaussianDistribution(self, s, N):
        return (s - N*self.baseline.attrib["mean"])**2 / N

    def _scoreGLR_PoissonDistribution(self, s, N):
        if s > 0.:
            return -math.log(self.baseline.attrib["mean"])*s + math.log(s/N)*s + N*self.baseline.attrib["mean"] - s
        else:
            return -math.log(self.baseline.attrib["mean"])*s + N*self.baseline.attrib["mean"] - s

    def scoreGLR(self, syncNumber, get):
        """Score one event with a GLR testStatistic.

        Output is the *current* best-guess of the turn-around time (as
        the corresponding syncNumber) and its log-likelihood ratio.
        """

        self.resetLoggerLevels()

        # Eq. 2.4.40 in Basseville and Nikiforov: http://www.irisa.fr/sisthem/kniga/ (partly in eventweighting.py)
        value = get(self.field)
        if value is not INVALID and value is not MISSING:
            self.updator.increment(syncNumber, value)

        if isinstance(self.baseline, pmml.GaussianDistribution):
            maximum_syncNumber, maximum = self.updator.glr(self._scoreGLR_GaussianDistribution)

            if maximum is None or self.baseline.attrib["variance"] < 0.:
                self.lastScore = INVALID
                self.logger.debug("scoreGLR: returning INVALID score")
            elif self.baseline.attrib["variance"] == 0.:
                self.lastScore = {SCORE_predictedValue: float("inf"), SCORE_thresholdTime: maximum_syncNumber}
            else:
                self.lastScore = {SCORE_predictedValue: maximum/2./self.baseline.attrib["variance"], SCORE_thresholdTime: maximum_syncNumber}
            return self.lastScore

        elif isinstance(self.baseline, pmml.PoissonDistribution):
            maximum_syncNumber, maximum = self.updator.glr(self._scoreGLR_PoissonDistribution)

            if maximum is None:
                self.lastScore = INVALID
                self.logger.debug("scoreGLR: returning INVALID score")
            else:
                self.lastScore = {SCORE_predictedValue: maximum, SCORE_thresholdTime: maximum_syncNumber}
            return self.lastScore
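
A toy computation mirroring the chiSquareDistribution branch of scoreHistogram for a plain CountTable: each bin contributes a term comparing the expected and observed bin fractions, one degree of freedom is removed for the overall normalization, and the score is the chi-square CDF with its complement as the p-value. The bin counts are invented, and because this toy case ends up with a single degree of freedom, the closed-form CDF erf(sqrt(x/2)) is used in place of Augustus's chiSquare_cdf.

import math

expected = {"a": 40.0, "b": 60.0}     # counts taken from the PMML CountTable
observed = {"a": 50.0, "b": 50.0}     # sums accumulated by the per-bin updators

expectedTotal = sum(expected.values())
observedTotal = sum(observed.values())

chi2 = 0.0
ndf = -1                              # normalization removes one degree of freedom
for bin in set(expected) | set(observed):
    e = expected.get(bin, 0.0)
    o = observed.get(bin, 0.0)
    if e > 0.0 or o > 0.0:
        chi2 += (e/expectedTotal - o/observedTotal)**2 / (e/expectedTotal**2 + o/observedTotal**2)
        ndf += 1

probability = math.erf(math.sqrt(chi2 / 2.0))   # chi-square CDF, valid here because ndf == 1
print(round(chi2, 4))                 # 2.0202
print(ndf)                            # 1
print(round(1.0 - probability, 4))    # about 0.1552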