def scoreChiSquareIndependence(self, syncNumber, get):
    """Score one event with a chiSquareIndependence testStatistic.

    This reads from the multi-dimensional CountTable in PMML and
    ignores the data!  Data are only used to make the CountTable, so
    be sure to be running the producer if you want
    chiSquareIndependence.

    Arguments:
        syncNumber: unused by this method.
        get: unused by this method.

    Returns:
        INVALID when the baseline's child is not a CountTable or
        NormalizedCountTable, when no chi-square value could be
        computed, or when there are no degrees of freedom.  Otherwise a
        dictionary holding SCORE_predictedValue (the value of
        chiSquare_cdf), SCORE_pValue (one minus that value),
        SCORE_chiSquare and SCORE_degreesOfFreedom.

    Side effects:
        Overwrites self.countTable, self.fields and self.lastScore; the
        returned object is always self.lastScore.
    """
    self.resetLoggerLevels()

    # expect a CountTable (if it doesn't exist, the producer will make it)
    self.countTable = self.baseline.child()
    if not isinstance(self.countTable, (pmml.CountTable, pmml.NormalizedCountTable)):
        # the "first" time doesn't happen until we see a count table
        self.lastScore = INVALID
        return self.lastScore

    # collect one field name per nesting level, descending through
    # child(pmml.nonExtension) until a FieldValueCount is reached
    self.fields = []
    dimension = self.countTable.child(pmml.nonExtension)
    while True:
        self.fields.append(dimension.attrib["field"])
        if isinstance(dimension, pmml.FieldValueCount):
            break
        dimension = dimension.child(pmml.nonExtension)

    # totals[None] is a scalar accumulator; totals[field] is a per-field
    # dictionary filled in by _chiSquareIndependence_add
    totals = {None: 0.}
    for field in self.fields:
        totals[field] = {}

    # every time: add up the n-field margins (which are "rows and columns" in 2-field case)
    self._chiSquareIndependence_add(self.countTable, [], totals)
    chi2 = self._chiSquareIndependence_chi2(self.countTable, [], totals)

    # Degrees of freedom: product of (categories - 1) over the fields.
    # A field with fewer than two categories leaves nothing to test, so
    # it forces ndf to zero explicitly; a bare product would let two
    # empty margins (-1 * -1) cancel into a spurious positive count.
    # NOTE(review): this product is the textbook count for the two-field
    # case; confirm it is the intended count for more than two fields.
    ndf = 1
    for field, margin in totals.items():
        if field is None:
            continue
        categories = len(margin)
        if categories < 2:
            ndf = 0
            break
        ndf *= categories - 1

    if chi2 is not None and ndf > 0:
        probability = chiSquare_cdf(chi2, ndf)
        pValue = 1. - probability
        self.lastScore = {
            SCORE_predictedValue: probability,
            SCORE_pValue: pValue,
            SCORE_chiSquare: chi2,
            SCORE_degreesOfFreedom: ndf,
        }
    else:
        self.lastScore = INVALID
        self.logger.debug("scoreChiSquareIndependence: returning INVALID score")

    return self.lastScore
def scoreChiSquareIndependence(self, syncNumber, get):
    """Score one event with a chiSquareIndependence testStatistic.

    This reads from the multi-dimensional CountTable in PMML and
    ignores the data!  Data are only used to make the CountTable, so
    be sure to be running the producer if you want
    chiSquareIndependence.

    Arguments:
        syncNumber: unused by this method.
        get: unused by this method.

    Returns:
        INVALID when the baseline's child is not a CountTable or
        NormalizedCountTable, when no chi-square value could be
        computed, or when there are no degrees of freedom.  Otherwise a
        dictionary holding SCORE_predictedValue (the value of
        chiSquare_cdf), SCORE_pValue (one minus that value),
        SCORE_chiSquare and SCORE_degreesOfFreedom.

    Side effects:
        Overwrites self.countTable and self.fields.
    """
    # expect a CountTable (if it doesn't exist, the producer will make it)
    self.countTable = self.baseline.child()
    if not isinstance(self.countTable, (pmml.CountTable, pmml.NormalizedCountTable)):
        # the "first" time doesn't happen until we see a count table
        return INVALID

    # collect one field name per nesting level, descending through
    # child(pmml.nonExtension) until a FieldValueCount is reached
    self.fields = []
    dimension = self.countTable.child(pmml.nonExtension)
    while True:
        self.fields.append(dimension.attrib["field"])
        if isinstance(dimension, pmml.FieldValueCount):
            break
        dimension = dimension.child(pmml.nonExtension)

    # totals[None] is a scalar accumulator; totals[field] is a per-field
    # dictionary filled in by _chiSquareIndependence_add
    totals = {None: 0.0}
    for field in self.fields:
        totals[field] = {}

    # every time: add up the n-field margins (which are "rows and columns" in 2-field case)
    self._chiSquareIndependence_add(self.countTable, [], totals)
    chi2 = self._chiSquareIndependence_chi2(self.countTable, [], totals)

    # Degrees of freedom: product of (categories - 1) over the fields.
    # A field with fewer than two categories leaves nothing to test, so
    # it forces ndf to zero explicitly; a bare product would let two
    # empty margins (-1 * -1) cancel into a spurious positive count.
    # NOTE(review): this product is the textbook count for the two-field
    # case; confirm it is the intended count for more than two fields.
    ndf = 1
    for field, margin in totals.items():
        if field is None:
            continue
        categories = len(margin)
        if categories < 2:
            ndf = 0
            break
        ndf *= categories - 1

    if chi2 is None or ndf <= 0:
        return INVALID

    probability = chiSquare_cdf(chi2, ndf)
    pValue = 1.0 - probability
    return {
        SCORE_predictedValue: probability,
        SCORE_pValue: pValue,
        SCORE_chiSquare: chi2,
        SCORE_degreesOfFreedom: ndf,
    }
def scoreHistogram(self, syncNumber, get):
    """Score one event with a chiSquareDistribution or scalarProduct.

    The event is first folded into the per-bin updators (the observed
    histogram), then the accumulated observed histogram is compared with
    the expected counts stored in self.countTable.

    Arguments:
        syncNumber: passed through to every updator.increment call.
        get: callable; get(fieldName) yields the event's value for that
            field, which may be INVALID or MISSING.

    Returns:
        chiSquareDistribution: a dictionary with SCORE_predictedValue
        (the value of chiSquare_cdf), SCORE_pValue, SCORE_chiSquare and
        SCORE_degreesOfFreedom, or INVALID when a total (or the
        normalized table's "sample") is not positive or no degrees of
        freedom remain.

        scalarProduct: a dictionary with SCORE_predictedValue, or
        INVALID when either squared norm is not positive or the
        normalization scheme is not recognized.

        None when self.testStatistic is neither of the two.
    """
    value = get(self.field)
    if self.weightField is None:
        weight = 1.
    else:
        weight = get(self.weightField)

    # we can still calculate the consistency of the *accumulated*
    # distribution, even if this datapoint is invalid
    unusable = (value is INVALID or value is MISSING
                or weight is INVALID or weight is MISSING)
    if not unusable:
        # for histograms, increment all bins, but only the correct bin
        # gets a non-zero value
        found = False
        for binValue, updator in self.updators.items():
            if binValue == value:
                updator.increment(syncNumber, weight)
                found = True
            else:
                updator.increment(syncNumber, 0.)

        # this might be a new bin
        if not found:
            updator = self.updateScheme.updator(SUMX)
            updator.increment(syncNumber, weight)
            self.updators[value] = updator

    fieldValueCounts = self.countTable.matches(pmml.FieldValueCount, maxdepth=None)

    # chiSquareDistribution
    if self.testStatistic == self.CHISQUAREDISTRIBUTION:
        expectedTotal = 0.
        expectedValues = {}
        for fieldValueCount in fieldValueCounts:
            binValue = fieldValueCount.attrib["value"]
            count = fieldValueCount.attrib["count"]
            expectedTotal += count
            expectedValues[binValue] = count

        observedTotal = 0.
        for updator in self.updators.values():
            observedTotal += updator.sum()

        if expectedTotal <= 0. or observedTotal <= 0.:
            return INVALID

        # The table type cannot change while looping over bins, so it is
        # classified once.  CountTable is tested first in the per-bin
        # branch below, exactly as before.
        isCountTable = isinstance(self.countTable, pmml.CountTable)
        isNormalized = isinstance(self.countTable, pmml.NormalizedCountTable)
        sample = None
        if isNormalized:
            sample = self.countTable.attrib["sample"]
            if sample <= 0.:
                return INVALID

        chi2 = 0.
        if self.binsOfInterest is None:
            ndf = -1  # normalization removes one degree of freedom
        else:
            ndf = 0

        for binValue in set(expectedValues.keys()).union(set(self.updators.keys())):
            if self.binsOfInterest is not None and binValue not in self.binsOfInterest:
                continue

            expected = expectedValues.get(binValue, 0.)
            updator = self.updators.get(binValue, None)
            if updator is not None:
                observed = updator.sum()
            else:
                observed = 0.

            # bins that are empty on both sides carry no information
            if expected > 0. or observed > 0.:
                if isCountTable:
                    chi2 += (expected / expectedTotal - observed / observedTotal)**2 / (
                        expected / expectedTotal**2 + observed / observedTotal**2)
                elif isNormalized:
                    chi2 += (expected / expectedTotal - observed / observedTotal)**2 / (
                        expected / expectedTotal / sample + observed / observedTotal**2)
                ndf += 1

        if ndf <= 0:
            return INVALID

        probability = chiSquare_cdf(chi2, ndf)
        pValue = 1. - probability
        return {
            SCORE_predictedValue: probability,
            SCORE_pValue: pValue,
            SCORE_chiSquare: chi2,
            SCORE_degreesOfFreedom: ndf,
        }

    # scalarProduct
    elif self.testStatistic == self.SCALARPRODUCT:
        expectedNorm2 = 0.
        dotProduct = 0.
        for fieldValueCount in fieldValueCounts:
            expected = fieldValueCount.attrib["count"]
            expectedNorm2 += expected**2

            binValue = fieldValueCount.attrib["value"]
            if expected > 0. and binValue in self.updators:
                observed = self.updators[binValue].sum()
                dotProduct += expected * observed

        observedNorm2 = 0.
        for updator in self.updators.values():
            observed = updator.sum()
            observedNorm2 += observed**2

        if not (expectedNorm2 > 0. and observedNorm2 > 0.):
            return INVALID

        # Both squared norms are strictly positive from here on, so the
        # divisions below need no further zero checks.
        if self.normalizationScheme is None:
            return {SCORE_predictedValue: dotProduct}

        if self.normalizationScheme is self.INDEPENDENT:
            return {
                SCORE_predictedValue:
                    dotProduct / math.sqrt(expectedNorm2) / math.sqrt(observedNorm2)
            }

        if self.normalizationScheme is self.SIZEWEIGHTED:
            return {
                SCORE_predictedValue:
                    2. * dotProduct / (expectedNorm2 + observedNorm2)
            }

        # unrecognized normalization scheme: report it as unscorable
        # rather than silently falling off the end of the method
        return INVALID

    # neither supported test statistic: nothing is scored
    return None
def scoreHistogram(self, syncNumber, get):
    """Score one event with a chiSquareDistribution or scalarProduct.

    The event is first folded into the per-bin updators (the observed
    histogram), then the accumulated observed histogram is compared with
    the expected counts stored in self.countTable.

    Arguments:
        syncNumber: passed through to every updator.increment call.
        get: callable; get(fieldName) yields the event's value for that
            field, which may be INVALID or MISSING.

    Returns:
        chiSquareDistribution: a dictionary with SCORE_predictedValue
        (the value of chiSquare_cdf), SCORE_pValue, SCORE_chiSquare and
        SCORE_degreesOfFreedom, or INVALID when a total (or the
        normalized table's "sample") is not positive or no degrees of
        freedom remain.

        scalarProduct: a dictionary with SCORE_predictedValue, or
        INVALID when either squared norm is not positive or the
        normalization scheme is not recognized.

        In both cases the returned object is also stored in
        self.lastScore.  When self.testStatistic is neither of the two,
        None is returned and self.lastScore is left untouched.
    """
    self.resetLoggerLevels()

    def invalidScore():
        # single exit path for every "cannot score" outcome
        self.lastScore = INVALID
        self.logger.debug("scoreHistogram: returning INVALID score")
        return self.lastScore

    value = get(self.field)
    if self.weightField is None:
        weight = 1.
    else:
        weight = get(self.weightField)

    # we can still calculate the consistency of the *accumulated*
    # distribution, even if this datapoint is invalid
    unusable = (value is INVALID or value is MISSING
                or weight is INVALID or weight is MISSING)
    if not unusable:
        # for histograms, increment all bins, but only the correct bin
        # gets a non-zero value
        found = False
        for binValue, updator in self.updators.items():
            if binValue == value:
                updator.increment(syncNumber, weight)
                found = True
            else:
                updator.increment(syncNumber, 0.)

        # this might be a new bin
        if not found:
            updator = self.updateScheme.updator(SUMX)
            updator.increment(syncNumber, weight)
            self.updators[value] = updator

    fieldValueCounts = self.countTable.matches(pmml.FieldValueCount, maxdepth=None)

    # chiSquareDistribution
    if self.testStatistic == self.CHISQUAREDISTRIBUTION:
        expectedTotal = 0.
        expectedValues = {}
        for fieldValueCount in fieldValueCounts:
            binValue = fieldValueCount.attrib["value"]
            count = fieldValueCount.attrib["count"]
            expectedTotal += count
            expectedValues[binValue] = count

        observedTotal = 0.
        for updator in self.updators.values():
            observedTotal += updator.sum()

        if expectedTotal <= 0. or observedTotal <= 0.:
            return invalidScore()

        # The table type cannot change while looping over bins, so it is
        # classified once.  CountTable is tested first in the per-bin
        # branch below, exactly as before.
        isCountTable = isinstance(self.countTable, pmml.CountTable)
        isNormalized = isinstance(self.countTable, pmml.NormalizedCountTable)
        sample = None
        if isNormalized:
            sample = self.countTable.attrib["sample"]
            if sample <= 0.:
                return invalidScore()

        chi2 = 0.
        if self.binsOfInterest is None:
            ndf = -1  # normalization removes one degree of freedom
        else:
            ndf = 0

        for binValue in set(expectedValues.keys()).union(set(self.updators.keys())):
            if self.binsOfInterest is not None and binValue not in self.binsOfInterest:
                continue

            expected = expectedValues.get(binValue, 0.)
            updator = self.updators.get(binValue, None)
            if updator is not None:
                observed = updator.sum()
            else:
                observed = 0.

            # bins that are empty on both sides carry no information
            if expected > 0. or observed > 0.:
                if isCountTable:
                    chi2 += (expected/expectedTotal - observed/observedTotal)**2 / (
                        expected/expectedTotal**2 + observed/observedTotal**2)
                elif isNormalized:
                    chi2 += (expected/expectedTotal - observed/observedTotal)**2 / (
                        expected/expectedTotal/sample + observed/observedTotal**2)
                ndf += 1

        if ndf <= 0:
            return invalidScore()

        probability = chiSquare_cdf(chi2, ndf)
        pValue = 1. - probability
        self.lastScore = {
            SCORE_predictedValue: probability,
            SCORE_pValue: pValue,
            SCORE_chiSquare: chi2,
            SCORE_degreesOfFreedom: ndf,
        }
        return self.lastScore

    # scalarProduct
    elif self.testStatistic == self.SCALARPRODUCT:
        expectedNorm2 = 0.
        dotProduct = 0.
        for fieldValueCount in fieldValueCounts:
            expected = fieldValueCount.attrib["count"]
            expectedNorm2 += expected**2

            binValue = fieldValueCount.attrib["value"]
            if expected > 0. and binValue in self.updators:
                observed = self.updators[binValue].sum()
                dotProduct += expected * observed

        observedNorm2 = 0.
        for updator in self.updators.values():
            observed = updator.sum()
            observedNorm2 += observed**2

        if not (expectedNorm2 > 0. and observedNorm2 > 0.):
            return invalidScore()

        # Both squared norms are strictly positive from here on, so the
        # divisions below need no further zero checks.
        if self.normalizationScheme is None:
            self.lastScore = {SCORE_predictedValue: dotProduct}

        elif self.normalizationScheme is self.INDEPENDENT:
            self.lastScore = {
                SCORE_predictedValue:
                    dotProduct/math.sqrt(expectedNorm2)/math.sqrt(observedNorm2)
            }

        elif self.normalizationScheme is self.SIZEWEIGHTED:
            self.lastScore = {
                SCORE_predictedValue:
                    2.*dotProduct/(expectedNorm2 + observedNorm2)
            }

        else:
            # unrecognized normalization scheme: without this branch the
            # previous event's score would be returned as if it were
            # this event's
            return invalidScore()

        return self.lastScore

    # neither supported test statistic: nothing is scored and
    # self.lastScore keeps whatever it held before
    return None