def setProvenance(self, model, algoName, algorithm, userParameters):
    """Record the algorithm name and its effective parameters on a model.

    Sets ``model.attrib["algorithmName"]`` to ``algoName``, then merges
    ``algorithm.defaultParams`` with ``userParameters`` (user values win)
    and appends one X-ODG-AlgorithmParameter element per parameter, in
    sorted key order, to the model's Extension element.  Parameter
    elements already present in an existing Extension are removed first.
    """
    model.attrib["algorithmName"] = algoName

    parameters = dict(algorithm.defaultParams)
    parameters.update(userParameters)

    extension = model.child(pmml.Extension, exception=False)
    if extension is None:
        # NOTE(review): the new Extension is not attached to `model` in this
        # method -- confirm that this is intended or handled by the caller.
        extension = pmml.newInstance("Extension")
    else:
        # Drop stale parameter records so they are not duplicated below.
        extension.children = [
            c for c in extension.children
            if not isinstance(c, pmml.X_ODG_AlgorithmParameter)
        ]
    # NOTE(review): assumed to apply to new and existing extensions alike.
    extension.extender("ODG")

    # sorted() works on both Python 2 lists and Python 3 dict views, unlike
    # calling .sort() on the result of .keys().
    for key in sorted(parameters):
        ap = pmml.newInstance(
            "X-ODG-AlgorithmParameter",
            attrib={"name": key, "value": parameters[key]},
            base=pmml.X_ODG_PMML)
        extension.children.append(ap)
def _compoundAnd(self, *predicates):
    """Combine predicates with a logical "and".

    A single predicate is returned unchanged; two or more are wrapped in
    a CompoundPredicate whose booleanOperator is "and".  Calling with no
    predicates raises Exception.
    """
    count = len(predicates)

    if count == 1:
        return predicates[0]

    if count == 0:
        raise Exception("Encountered a list of zero predicates in SegmentationScheme's _compoundAnd; this should not ever be possible.")

    return pmml.newInstance(
        "CompoundPredicate",
        attrib={"booleanOperator": "and"},
        children=predicates)
def setProvenance(self, model, algoName, algorithm, userParameters):
    """Record the algorithm name and its effective parameters on a model.

    Sets ``model.attrib["algorithmName"]`` to ``algoName``, then merges
    ``algorithm.defaultParams`` with ``userParameters`` (user values win)
    and appends one X-ODG-AlgorithmParameter element per parameter, in
    sorted key order, to the model's Extension element.  Parameter
    elements already present in an existing Extension are removed first.
    """
    model.attrib["algorithmName"] = algoName

    parameters = dict(algorithm.defaultParams)
    parameters.update(userParameters)

    extension = model.child(pmml.Extension, exception=False)
    if extension is None:
        # NOTE(review): the new Extension is not attached to `model` in this
        # method -- confirm that this is intended or handled by the caller.
        extension = pmml.newInstance("Extension")
    else:
        # Drop stale parameter records so they are not duplicated below.
        extension.children = [
            c for c in extension.children
            if not isinstance(c, pmml.X_ODG_AlgorithmParameter)
        ]

    # sorted() works on both Python 2 lists and Python 3 dict views, unlike
    # calling .sort() on the result of .keys().
    for key in sorted(parameters):
        ap = pmml.newInstance(
            "X-ODG-AlgorithmParameter",
            attrib={"name": key, "value": parameters[key]},
            base=pmml.X_ODG_PMML)
        extension.children.append(ap)
def _simplePredicate(self, field, value, operator):
    """Create a SimplePredicate element, run its post_validate(), and return it."""
    attributes = {"field": field, "value": value, "operator": operator}
    predicate = pmml.newInstance("SimplePredicate", attrib=attributes)
    predicate.post_validate()
    return predicate
def _compoundAnd(self, *predicates):
    """Combine predicates with a logical "and".

    A single predicate is returned unchanged; two or more are wrapped in
    a CompoundPredicate whose booleanOperator is "and".

    Raises:
        Exception: if called with no predicates at all.
    """
    if len(predicates) == 0:
        # Call-form raise: valid on both Python 2 and Python 3, whereas the
        # `raise Exception, "..."` statement form is Python-2-only syntax.
        raise Exception(" ".join([
            "Encountered a list of zero predicates in",
            "SegmentationScheme's _compoundAnd; this",
            "should not ever be possible."]))
    elif len(predicates) == 1:
        return predicates[0]
    else:
        return pmml.newInstance(
            "CompoundPredicate",
            attrib={"booleanOperator": "and"},
            children=predicates)
def _compoundRange(self, rangeTuple):
    """Turn a (field, low, high, closure) tuple into a range predicate.

    The lower bound is inclusive when ``closure`` starts with 'c' and the
    upper bound is inclusive when it ends with 'd' (presumably values such
    as "closedOpen" -- verify against callers).  When ``high`` is None only
    the lower-bound predicate is returned; when ``low`` is None only the
    upper-bound predicate; otherwise both are joined in an "and"
    CompoundPredicate.
    """
    fieldName, lower, upper, closure = rangeTuple

    if closure.startswith('c'):
        lowerOperator = "greaterOrEqual"
    else:
        lowerOperator = "greaterThan"

    if closure.endswith('d'):
        upperOperator = "lessOrEqual"
    else:
        upperOperator = "lessThan"

    # One-sided ranges need just a single simple predicate.
    if upper is None:
        return self._simplePredicate(field=fieldName, operator=lowerOperator, value=lower)
    if lower is None:
        return self._simplePredicate(field=fieldName, operator=upperOperator, value=upper)

    lowerBound = self._simplePredicate(field=fieldName, operator=lowerOperator, value=lower)
    upperBound = self._simplePredicate(field=fieldName, operator=upperOperator, value=upper)
    return pmml.newInstance(
        "CompoundPredicate",
        attrib={"booleanOperator": "and"},
        children=[lowerBound, upperBound])
def updateHistogram(self, syncNumber, get):
    """Update a binned-histogram baseline (chiSquareDistribution or scalarProduct).

    Reads the value of ``self.field`` (and of ``self.weightField`` when one
    is set; the weight is 1.0 otherwise) through ``get``.  Returns False,
    without incrementing any updator, when either is INVALID or MISSING.
    Otherwise every known bin is incremented (only the matching bin by a
    non-zero amount), the per-bin "count" attributes and the table's
    "sample" attribute are refreshed, and True is returned.
    """
    self.resetLoggerLevels()

    if self.first:
        self._updateHistogram_first()
        self.first = False

    value = get(self.field)
    if value is INVALID or value is MISSING:
        self.logger.debug("updateHistogram: returning False (INVALID or MISSING data)")
        return False

    weight = 1.
    if self.weightField is not None:
        weight = get(self.weightField)
        if weight is INVALID or weight is MISSING:
            self.logger.debug("updateHistogram: returning False (INVALID or MISSING weight)")
            return False

    # A value seen for the first time gets its own bin and updator.
    if value not in self.pmmlEntries:
        # FIXME: should field=self.field???
        countNode = pmml.newInstance(
            "FieldValueCount",
            attrib={"field": self.field, "value": value, "count": 0})
        self.countTable.children.append(countNode)
        self.pmmlEntries[value] = countNode
        self.updators[value] = self.engine.producerUpdateScheme.updator(SUMX)

    # Every bin is incremented; only the matching bin gets a non-zero amount.
    for binValue, binUpdator in self.updators.items():
        binUpdator.increment(syncNumber, weight if binValue == value else 0.)
        self.pmmlEntries[binValue].attrib["count"] = binUpdator.sum()

    self.total_updator.increment(syncNumber, weight)
    self.countTable.attrib["sample"] = self.total_updator.sum()
    return True
def updateHistogram(self, syncNumber, get):
    """Update a binned-histogram baseline (chiSquareDistribution or scalarProduct).

    Reads the value of ``self.field`` (and of ``self.weightField`` when one
    is set; the weight is 1.0 otherwise) through ``get``.  Returns False,
    without incrementing any updator, when either is INVALID or MISSING.
    Otherwise every known bin is incremented (only the matching bin by a
    non-zero amount), the per-bin "count" attributes and the table's
    "sample" attribute are refreshed, and True is returned.
    """
    self.resetLoggerLevels()

    if self.first:
        self._updateHistogram_first()
        self.first = False

    value = get(self.field)
    if value is INVALID or value is MISSING:
        self.logger.debug("updateHistogram: returning False (INVALID or MISSING data)")
        return False

    weight = 1.
    if self.weightField is not None:
        weight = get(self.weightField)
        if weight is INVALID or weight is MISSING:
            self.logger.debug("updateHistogram: returning False (INVALID or MISSING weight)")
            return False

    # A value seen for the first time gets its own bin and updator.
    if value not in self.pmmlEntries:
        # FIXME: should field=self.field???
        countNode = pmml.newInstance(
            "FieldValueCount",
            attrib={"field": self.field, "value": value, "count": 0})
        self.countTable.children.append(countNode)
        self.pmmlEntries[value] = countNode
        self.updators[value] = self.engine.producerUpdateScheme.updator(SUMX)

    # Every bin is incremented; only the matching bin gets a non-zero amount.
    for binValue, binUpdator in self.updators.items():
        binUpdator.increment(syncNumber, weight if binValue == value else 0.)
        self.pmmlEntries[binValue].attrib["count"] = binUpdator.sum()

    self.total_updator.increment(syncNumber, weight)
    self.countTable.attrib["sample"] = self.total_updator.sum()
    return True
def _simplePredicate(self, field, value, operator):
    """Create a SimplePredicate element, run its post_validate(), and return it."""
    attributes = {"field": field, "value": value, "operator": operator}
    predicate = pmml.newInstance("SimplePredicate", attrib=attributes)
    predicate.post_validate()
    return predicate
def update(self, syncNumber, get):
    """Fold one event into the Naive Bayes counts.

    Returns False (after a debug log message) when the output value is
    INVALID or MISSING, or when any Bayes input evaluates to INVALID;
    returns True otherwise.  Note that the output-value histogram has
    already been incremented by the time INVALID inputs are detected.
    Inputs that evaluate to MISSING are skipped individually.
    """
    self.resetLoggerLevels()
    if self.first:
        self.firstUpdate()

    ### get the output value
    outputValue = get(self.outputField)
    if outputValue is INVALID or outputValue is MISSING:
        self.logger.debug("NaiveBayes.update: returning False (INVALID or MISSING data)")
        return False

    # output values are compared as strings because that is how they're
    # referenced by TargetValueCount["value"] and OutputField["value"]
    outputValue = str(outputValue)

    ### first sighting of this output value: create it in every
    ### representation (this happens relatively rarely)
    if outputValue not in self.outputUpdators:
        # updator
        self.outputUpdators[outputValue] = self.engine.producerUpdateScheme.updator(SUMX)

        # PMML
        outputCounts = self.model.bayesOutput.targetValueCounts
        newTargetCount = pmml.newInstance(
            "TargetValueCount",
            attrib={"value": outputValue, "count": 0.})
        outputCounts.tvcMap[outputValue] = newTargetCount
        outputCounts.children.append(newTargetCount)

        # consumer cache
        self.model.targetIndex[outputValue] = len(self.model.targetCategories)
        self.model.targetCategories.append(outputValue)
        self.model.targetCounts = numpy.append(self.model.targetCounts, 0.)

    ### update the output values histogram (this happens very frequently)
    outputTvcMap = self.model.bayesOutput.targetValueCounts.tvcMap
    targetCounts = self.model.targetCounts
    targetIndex = self.model.targetIndex

    for seenOutput, outputUpdator in self.outputUpdators.items():
        # updator: every value is incremented, only the match by one
        outputUpdator.increment(syncNumber, 1. if seenOutput == outputValue else 0.)
        latest = outputUpdator.sum()

        # PMML
        outputTvcMap[seenOutput].attrib["count"] = latest

        # consumer cache
        targetCounts[targetIndex[seenOutput]] = latest

    ### get the input values; INVALID input -> skip all input fields,
    ### MISSING input -> skip only the missing field
    inputValues = [bi.evaluate(get) for bi in self.model.bayesInputs]
    if INVALID in inputValues:
        self.logger.debug("NaiveBayes.update: returning False (INVALID Bayes input fields)")
        return False

    for inputField, inputValue in zip(self.inputFields, inputValues):
        if inputValue is MISSING:
            continue

        bayesInput = self.model.bayesInput[inputField]
        fieldPairUpdators = self.inputPairUpdators[inputField]
        fieldDenomUpdators = self.inputDenomUpdators[inputField]

        ### first sighting of this input value: create it in every
        ### representation (this happens relatively rarely)
        if inputValue not in fieldPairUpdators:
            # updator
            fieldPairUpdators[inputValue] = {}

            # PMML
            tv = pmml.newInstance("TargetValueCounts")
            tv.tvcMap = {}
            pc = pmml.newInstance("PairCounts", attrib={"value": inputValue}, children=[tv])
            pc.targetValueCounts = tv
            bayesInput.pcMap[inputValue] = pc
            bayesInput.tvcMap[inputValue] = {}
            bayesInput.children.append(pc)

            # consumer cache
            bayesInput.pairCounts[inputValue] = {}

        ### descend to the structures belonging to this input value
        valueUpdators = fieldPairUpdators[inputValue]
        pairCountsNode = bayesInput.pcMap[inputValue]
        valueTvcMap = bayesInput.tvcMap[inputValue]
        pairCounts = bayesInput.pairCounts[inputValue]

        ### first sighting of this input value/output value combination
        if outputValue not in valueUpdators:
            # updator
            valueUpdators[outputValue] = self.engine.producerUpdateScheme.updator(SUMX)

            # PMML
            pairTargetCount = pmml.newInstance(
                "TargetValueCount",
                attrib={"value": outputValue, "count": 0.})
            valueTvcMap[outputValue] = pairTargetCount
            pairCountsNode.targetValueCounts.children.append(pairTargetCount)

            # consumer cache
            pairCounts[outputValue] = 0.

        ### update the output values histogram for this input value
        ### (this happens very frequently)
        for seenOutput, pairUpdator in valueUpdators.items():
            # updator
            pairUpdator.increment(syncNumber, 1. if seenOutput == outputValue else 0.)
            latest = pairUpdator.sum()

            # PMML
            valueTvcMap[seenOutput].attrib["count"] = latest

            # consumer cache
            pairCounts[seenOutput] = latest

        ### first time this inputField sees this outputValue
        ### (there is no corresponding PMML)
        denominator = bayesInput.denominators
        if outputValue not in fieldDenomUpdators:
            # updator
            fieldDenomUpdators[outputValue] = self.engine.producerUpdateScheme.updator(SUMX)

            # consumer cache
            denominator[outputValue] = 0.

        ### update the denominator histogram for this inputField
        ### (there is no corresponding PMML; this happens very frequently)
        for seenOutput, denomUpdator in fieldDenomUpdators.items():
            # updator
            denomUpdator.increment(syncNumber, 1. if seenOutput == outputValue else 0.)

            # consumer cache
            denominator[seenOutput] = denomUpdator.sum()

    return True
def update(self, syncNumber, get):
    """Fold one event into the Naive Bayes counts.

    Returns False (after a debug log message) when the output value is
    INVALID or MISSING, or when any Bayes input evaluates to INVALID;
    returns True otherwise.  Note that the output-value histogram has
    already been incremented by the time INVALID inputs are detected.
    Inputs that evaluate to MISSING are skipped individually.
    """
    self.resetLoggerLevels()
    if self.first:
        self.firstUpdate()

    ### get the output value
    outputValue = get(self.outputField)
    if outputValue is INVALID or outputValue is MISSING:
        self.logger.debug("NaiveBayes.update: returning False (INVALID or MISSING data)")
        return False

    # output values are compared as strings because that is how they're
    # referenced by TargetValueCount["value"] and OutputField["value"]
    outputValue = str(outputValue)

    ### first sighting of this output value: create it in every
    ### representation (this happens relatively rarely)
    if outputValue not in self.outputUpdators:
        # updator
        self.outputUpdators[outputValue] = self.engine.producerUpdateScheme.updator(SUMX)

        # PMML
        outputCounts = self.model.bayesOutput.targetValueCounts
        newTargetCount = pmml.newInstance(
            "TargetValueCount",
            attrib={"value": outputValue, "count": 0.})
        outputCounts.tvcMap[outputValue] = newTargetCount
        outputCounts.children.append(newTargetCount)

        # consumer cache
        self.model.targetIndex[outputValue] = len(self.model.targetCategories)
        self.model.targetCategories.append(outputValue)
        self.model.targetCounts = numpy.append(self.model.targetCounts, 0.)

    ### update the output values histogram (this happens very frequently)
    outputTvcMap = self.model.bayesOutput.targetValueCounts.tvcMap
    targetCounts = self.model.targetCounts
    targetIndex = self.model.targetIndex

    for seenOutput, outputUpdator in self.outputUpdators.items():
        # updator: every value is incremented, only the match by one
        outputUpdator.increment(syncNumber, 1. if seenOutput == outputValue else 0.)
        latest = outputUpdator.sum()

        # PMML
        outputTvcMap[seenOutput].attrib["count"] = latest

        # consumer cache
        targetCounts[targetIndex[seenOutput]] = latest

    ### get the input values; INVALID input -> skip all input fields,
    ### MISSING input -> skip only the missing field
    inputValues = [bi.evaluate(get) for bi in self.model.bayesInputs]
    if INVALID in inputValues:
        self.logger.debug("NaiveBayes.update: returning False (INVALID Bayes input fields)")
        return False

    for inputField, inputValue in zip(self.inputFields, inputValues):
        if inputValue is MISSING:
            continue

        bayesInput = self.model.bayesInput[inputField]
        fieldPairUpdators = self.inputPairUpdators[inputField]
        fieldDenomUpdators = self.inputDenomUpdators[inputField]

        ### first sighting of this input value: create it in every
        ### representation (this happens relatively rarely)
        if inputValue not in fieldPairUpdators:
            # updator
            fieldPairUpdators[inputValue] = {}

            # PMML
            tv = pmml.newInstance("TargetValueCounts")
            tv.tvcMap = {}
            pc = pmml.newInstance("PairCounts", attrib={"value": inputValue}, children=[tv])
            pc.targetValueCounts = tv
            bayesInput.pcMap[inputValue] = pc
            bayesInput.tvcMap[inputValue] = {}
            bayesInput.children.append(pc)

            # consumer cache
            bayesInput.pairCounts[inputValue] = {}

        ### descend to the structures belonging to this input value
        valueUpdators = fieldPairUpdators[inputValue]
        pairCountsNode = bayesInput.pcMap[inputValue]
        valueTvcMap = bayesInput.tvcMap[inputValue]
        pairCounts = bayesInput.pairCounts[inputValue]

        ### first sighting of this input value/output value combination
        if outputValue not in valueUpdators:
            # updator
            valueUpdators[outputValue] = self.engine.producerUpdateScheme.updator(SUMX)

            # PMML
            pairTargetCount = pmml.newInstance(
                "TargetValueCount",
                attrib={"value": outputValue, "count": 0.})
            valueTvcMap[outputValue] = pairTargetCount
            pairCountsNode.targetValueCounts.children.append(pairTargetCount)

            # consumer cache
            pairCounts[outputValue] = 0.

        ### update the output values histogram for this input value
        ### (this happens very frequently)
        for seenOutput, pairUpdator in valueUpdators.items():
            # updator
            pairUpdator.increment(syncNumber, 1. if seenOutput == outputValue else 0.)
            latest = pairUpdator.sum()

            # PMML
            valueTvcMap[seenOutput].attrib["count"] = latest

            # consumer cache
            pairCounts[seenOutput] = latest

        ### first time this inputField sees this outputValue
        ### (there is no corresponding PMML)
        denominator = bayesInput.denominators
        if outputValue not in fieldDenomUpdators:
            # updator
            fieldDenomUpdators[outputValue] = self.engine.producerUpdateScheme.updator(SUMX)

            # consumer cache
            denominator[outputValue] = 0.

        ### update the denominator histogram for this inputField
        ### (there is no corresponding PMML; this happens very frequently)
        for seenOutput, denomUpdator in fieldDenomUpdators.items():
            # updator
            denomUpdator.increment(syncNumber, 1. if seenOutput == outputValue else 0.)

            # consumer cache
            denominator[seenOutput] = denomUpdator.sum()

    return True
def initialize(self, **params):
    """Initialize a baseline producer.

    Unlike other producers, this creates the update function dynamically,
    depending on the testStatistic: ``self.update`` is bound to
    ``updateDistribution`` (CUSUM, zValue, GLR), ``updateHistogram``
    (chiSquareDistribution, scalarProduct) or
    ``updateChiSquareIndependence`` (chiSquareIndependence).

    Recognized keyword parameters are ``updateExisting`` (falls back to
    ``self.defaultParams["updateExisting"]``) and, for CUSUM only,
    ``alternateField`` and ``alternateValue``.

    Raises:
        NotImplementedError: ``alternateField`` or ``alternateValue`` was
            given for a testStatistic other than CUSUM.
        TypeError: any other parameter is left unconsumed.
    """
    testDistributions = self.model.child(pmml.TestDistributions)
    self.field = testDistributions.attrib["field"]

    if "updateExisting" in params:
        self.updateExisting = pmml.boolCheck(params["updateExisting"])
        del params["updateExisting"]
    else:
        self.updateExisting = pmml.boolCheck(self.defaultParams["updateExisting"])

    self.first = True

    # Reuse the element fetched above rather than looking it up again.
    testStatistic = testDistributions.attrib["testStatistic"]

    if testStatistic in ("CUSUM", "zValue", "GLR"):
        self.baseline = testDistributions.child(pmml.Baseline).child()
        self.update = self.updateDistribution

        if testStatistic == "CUSUM":
            # Consume the CUSUM-only parameters so that the leftover checks
            # at the end of this method do not reject them.
            self.alternateField = params.pop("alternateField", None)
            self.alternateValue = params.pop("alternateValue", None)

            extension = testDistributions.child(pmml.Extension, exception=False)
            if extension is None:
                extension = pmml.newInstance("Extension")
                testDistributions.children.append(extension)
            # NOTE(review): assumed to apply to new and existing extensions alike.
            extension.extender("ODG")

            self.cusumInitialization = extension.child(pmml.X_ODG_CUSUMInitialization, exception=False)
            if self.cusumInitialization is None:
                self.cusumInitialization = pmml.newInstance(
                    "X-ODG-CUSUMInitialization",
                    attrib={"value": 0.},
                    base=pmml.X_ODG_PMML)
                extension.children.append(self.cusumInitialization)
            elif not self.updateExisting:
                # Starting fresh: discard the stored initialization value.
                self.cusumInitialization.attrib["value"] = 0.

        else:
            self.alternateField = None
            self.alternateValue = None
            self.cusumInitialization = None

    elif testStatistic in ("chiSquareDistribution", "scalarProduct"):
        self.weightField = testDistributions.attrib.get("weightField", None)
        self.countTable = testDistributions.child(pmml.Baseline).child(
            lambda x: isinstance(x, (pmml.CountTable, pmml.NormalizedCountTable)))
        self.update = self.updateHistogram

    elif testStatistic == "chiSquareIndependence":
        self.baseline = testDistributions.child(pmml.Baseline)
        self.fields = None
        self.countTable = None
        self.updators = {}
        self.total_updator = self.engine.producerUpdateScheme.updator(SUMX)
        self.update = self.updateChiSquareIndependence

    # Call-form raise statements are valid on both Python 2 and Python 3;
    # the `raise X, "msg"` statement form is Python-2-only syntax.
    if "alternateField" in params:
        raise NotImplementedError("The 'alternateField' producerParameter is only used by CUSUM")
    if "alternateValue" in params:
        raise NotImplementedError("The 'alternateValue' producerParameter is only used by CUSUM")

    if len(params) > 0:
        raise TypeError("Unrecognized parameters %s" % params)
def initialize(self, **params):
    """Initialize a baseline producer.

    Unlike other producers, this creates the update function dynamically,
    depending on the testStatistic: ``self.update`` is bound to
    ``updateDistribution`` (CUSUM, zValue, GLR), ``updateHistogram``
    (chiSquareDistribution, scalarProduct) or
    ``updateChiSquareIndependence`` (chiSquareIndependence).

    Recognized keyword parameters are ``updateExisting`` (falls back to
    ``self.defaultParams["updateExisting"]``) and, for CUSUM only,
    ``alternateField`` and ``alternateValue``.  Leftover
    ``alternateField``/``alternateValue`` raise NotImplementedError; any
    other leftover parameter raises TypeError.
    """
    testDistributions = self.model.child(pmml.TestDistributions)
    self.field = testDistributions.attrib["field"]

    if "updateExisting" in params:
        requestedFlag = params.pop("updateExisting")
    else:
        requestedFlag = self.defaultParams["updateExisting"]
    self.updateExisting = pmml.boolCheck(requestedFlag)

    self.first = True

    testStatistic = self.model.child(pmml.TestDistributions).attrib["testStatistic"]

    if testStatistic in ("CUSUM", "zValue", "GLR"):
        self.baseline = testDistributions.child(pmml.Baseline).child()
        self.update = self.updateDistribution

        if testStatistic != "CUSUM":
            self.alternateField = None
            self.alternateValue = None
            self.cusumInitialization = None
        else:
            # Consume the CUSUM-only parameters so the leftover checks
            # below do not reject them.
            self.alternateField = params.pop("alternateField", None)
            self.alternateValue = params.pop("alternateValue", None)

            extension = testDistributions.child(pmml.Extension, exception=False)
            if extension is None:
                extension = pmml.newInstance("Extension")
                testDistributions.children.append(extension)

            self.cusumInitialization = extension.child(pmml.X_ODG_CUSUMInitialization, exception=False)
            if self.cusumInitialization is None:
                self.cusumInitialization = pmml.newInstance(
                    "X-ODG-CUSUMInitialization",
                    attrib={"value": 0.},
                    base=pmml.X_ODG_PMML)
                extension.children.append(self.cusumInitialization)
            elif not self.updateExisting:
                # Starting fresh: discard the stored initialization value.
                self.cusumInitialization.attrib["value"] = 0.

    elif testStatistic in ("chiSquareDistribution", "scalarProduct"):
        self.weightField = testDistributions.attrib.get("weightField", None)
        self.countTable = testDistributions.child(pmml.Baseline).child(
            lambda x: isinstance(x, (pmml.CountTable, pmml.NormalizedCountTable)))
        self.update = self.updateHistogram

    elif testStatistic == "chiSquareIndependence":
        self.baseline = testDistributions.child(pmml.Baseline)
        self.fields = None
        self.countTable = None
        self.updators = {}
        self.total_updator = self.engine.producerUpdateScheme.updator(SUMX)
        self.update = self.updateChiSquareIndependence

    if "alternateField" in params:
        raise NotImplementedError("The 'alternateField' producerParameter is only used by CUSUM")
    if "alternateValue" in params:
        raise NotImplementedError("The 'alternateValue' producerParameter is only used by CUSUM")

    if params:
        raise TypeError("Unrecognized parameters %s" % params)