def evaluate(self, dataTable, functionTable, performanceTable, arguments):
    """Apply this user-defined function to a list of argument expressions.

    Every argument is evaluated against C{dataTable}, cast to the type
    declared by the matching ParameterField when the two differ, and
    bound under the parameter's name in a sub-table of C{dataTable}.
    The function's body expression is then evaluated in that sub-table
    and its result is cast to the type declared on this element, if a
    different one is declared.

    @type dataTable: DataTable
    @param dataTable: The input DataTable, containing any fields that might be used to evaluate this expression.
    @type functionTable: FunctionTable
    @param functionTable: The FunctionTable, containing any functions that might be called in this expression.
    @type performanceTable: PerformanceTable
    @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
    @type arguments: list
    @param arguments: Objects exposing C{evaluate(dataTable, functionTable, performanceTable)}, one per ParameterField, in order.
    @rtype: DataColumn
    @return: The result of the calculation as a DataColumn.
    @raise PmmlValidationError: If the number of arguments differs from the number of ParameterFields.
    """

    # Arguments are evaluated before this function's own timer starts.
    columns = []
    for expression in arguments:
        columns.append(expression.evaluate(dataTable, functionTable, performanceTable))

    timerName = "user-defined \"%s\"" % self.name
    performanceTable.begin(timerName)

    parameters = self.childrenOfTag("ParameterField")
    if len(columns) != len(parameters):
        raise defs.PmmlValidationError("Apply function=\"%s\" has %d arguments but the corresponding DefineFunction has %d parameters" % (self.name, len(columns), len(parameters)))

    # Bind each (possibly cast) argument to its parameter name in a local scope.
    subTable = dataTable.subTable()
    for column, parameter in zip(columns, parameters):
        actual = column.fieldType
        wantedDataType = parameter.get("dataType", actual.dataType)
        wantedOptype = parameter.get("optype", actual.optype)
        if wantedDataType != actual.dataType or wantedOptype != actual.optype:
            column = FieldCastMethods.cast(FakeFieldType(wantedDataType, wantedOptype), column)
        subTable.fields[parameter["name"]] = column

    # The body expression is timed separately, so pause this timer around it.
    body = self.childOfClass(PmmlExpression)
    performanceTable.pause(timerName)
    result = body.evaluate(subTable, functionTable, performanceTable)
    performanceTable.unpause(timerName)

    resultType = result.fieldType
    wantedDataType = self.get("dataType", resultType.dataType)
    wantedOptype = self.get("optype", resultType.optype)
    if wantedDataType != resultType.dataType or wantedOptype != resultType.optype:
        result = FieldCastMethods.cast(FakeFieldType(wantedDataType, wantedOptype), result)

    performanceTable.end(timerName)
    return result
def calculate(self, dataTable, functionTable=None, performanceTable=None):
    """Calculate a DerivedField and store it in the input DataTable.

    This method modifies the input DataTable: the computed column is
    assigned to C{dataTable.fields[self.name]}.

    If the DerivedField declares a dataType or optype that differs from
    what its EXPRESSION produced, or if it has any Value children, the
    output is cast.  This is a potentially expensive and often unwanted
    operation, so the cast is reported in the PerformanceTable under the
    DerivedField's name to help the user debug their PMML.

    @type dataTable: DataTable
    @param dataTable: The pre-built DataTable.
    @type functionTable: FunctionTable or None
    @param functionTable: A table of functions.  If None, a new FunctionTable is created.
    @type performanceTable: PerformanceTable or None
    @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.  If None, a FakePerformanceTable is used.
    @rtype: DataColumn
    @return: The column that was stored in C{dataTable.fields} under this field's name.
    """

    functionTable = FunctionTable() if functionTable is None else functionTable
    performanceTable = FakePerformanceTable() if performanceTable is None else performanceTable

    # The expression is evaluated before the "DerivedField" timer starts.
    expression = self.childOfClass(PmmlExpression)
    result = expression.evaluate(dataTable, functionTable, performanceTable)

    performanceTable.begin("DerivedField")

    producedDataType = result.fieldType.dataType
    producedOptype = result.fieldType.optype

    needsCast = (self.get("dataType", producedDataType) != producedDataType
                 or self.get("optype", producedOptype) != producedOptype
                 or len(self.childrenOfTag("Value")) != 0)

    if needsCast:
        castTimer = "cast (\"%s\")" % self.name
        performanceTable.begin(castTimer)
        dataTable.fields[self.name] = FieldCastMethods.cast(FieldType(self), result)
        performanceTable.end(castTimer)
    else:
        dataTable.fields[self.name] = result

    performanceTable.end("DerivedField")
    return dataTable.fields[self.name]
def evaluate(self, dataTable, functionTable, performanceTable):
    """Evaluate the child expression and cast its result to this element's type.

    After the cast, the element's C{invalidValueTreatment} is applied to
    the mask and its C{mapMissingTo} is applied to the data and mask.

    @type dataTable: DataTable
    @param dataTable: The input DataTable, containing any fields that might be used to evaluate this expression.
    @type functionTable: FunctionTable
    @param functionTable: The FunctionTable, containing any functions that might be called in this expression.
    @type performanceTable: PerformanceTable
    @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
    @rtype: DataColumn
    @return: The result of the calculation as a DataColumn.
    """

    # The child expression is evaluated before the "CastExpression" timer starts.
    inner = self.childOfClass(PmmlExpression)
    uncast = inner.evaluate(dataTable, functionTable, performanceTable)

    performanceTable.begin("CastExpression")

    castColumn = FieldCastMethods.cast(FieldType(self), uncast)
    fieldType = castColumn.fieldType

    treatedMask = FieldCastMethods.applyInvalidValueTreatment(
        castColumn.mask, self.get("invalidValueTreatment"))
    data, finalMask = FieldCastMethods.applyMapMissingTo(
        fieldType, castColumn.data, treatedMask, self.get("mapMissingTo"))

    performanceTable.end("CastExpression")
    return DataColumn(fieldType, data, finalMask)
def format(self, subTable, functionTable, performanceTable, score):
    """Extract or post-process output for the output field of a DataTable.

    The C{feature} attribute selects where the output comes from:

      - absent: the existing field named C{name} in C{subTable};
      - "predictedValue": C{score[None]};
      - "predictedDisplayValue": C{score[None]} with every value
        converted to a string;
      - "transformedValue": the result of the child EXPRESSION;
      - "decision": the result of the child EXPRESSION, mapped row by
        row onto the Decision element whose C{value} it equals;
      - anything else: C{score[feature]}, if the model produced it.

    Except for "predictedDisplayValue" and "decision", the result is
    cast when this element declares a different dataType or optype.
    Unless C{feature} is absent, the result is also stored in
    C{subTable.fields} under C{displayName} (falling back to C{name}).

    @type subTable: DataTable
    @param subTable: The DataTable associated with this local lexical scope.
    @type functionTable: FunctionTable or None
    @param functionTable: A table of functions.
    @type performanceTable: PerformanceTable or None
    @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
    @type score: dict
    @param score: Dictionary mapping PMML score "feature" strings to DataColumns.  This dictionary always contains a None key, which is the basic feature ("predictedValue").
    @rtype: DataColumn
    @return: The output that would go into an output field of a DataTable.
    @raise PmmlValidationError: If a "transformedValue" or "decision" field lacks its EXPRESSION, a "decision" field lacks its Decisions block, or the feature is not available in C{score}.
    """

    performanceTable.begin("OutputField")

    feature = self.get("feature")
    if feature is None:
        dataColumn = subTable.fields[self["name"]]

    elif feature == "predictedValue":
        dataColumn = score[None]

    elif feature == "predictedDisplayValue":
        original = score[None]
        toString = original.fieldType.valueToString
        data = NP("empty", len(subTable), dtype=NP.dtype(object))
        for i, x in enumerate(original.data):
            data[i] = toString(x)
        dataColumn = DataColumn(FakeFieldType("string", "continuous"), data, None)

    elif feature == "transformedValue":
        expression = self.childOfClass(PmmlExpression)
        if expression is None:
            raise defs.PmmlValidationError("OutputField with feature \"transformedValue\" requires an EXPRESSION")

        # The expression has its own timers; do not count it as OutputField time.
        performanceTable.pause("OutputField")
        dataColumn = expression.evaluate(subTable, functionTable, performanceTable)
        performanceTable.unpause("OutputField")

    elif feature == "decision":
        decisions = self.childOfTag("Decisions")
        if decisions is None:
            raise defs.PmmlValidationError("OutputField with feature \"decision\" requires a Decisions block")

        # Report a missing expression as a validation error, as the
        # "transformedValue" branch does, rather than failing with an
        # AttributeError on None.
        expression = self.childOfClass(PmmlExpression)
        if expression is None:
            raise defs.PmmlValidationError("OutputField with feature \"decision\" requires an EXPRESSION")

        performanceTable.pause("OutputField")
        dataColumn = expression.evaluate(subTable, functionTable, performanceTable)
        performanceTable.unpause("OutputField")

        if dataColumn.mask is None:
            valid = None
        else:
            valid = NP(dataColumn.mask == defs.VALID)

        # Every row starts out MISSING and becomes VALID once a Decision matches it.
        fieldType = FakeFieldType("object", "any")
        data = NP("empty", len(subTable), dtype=fieldType.dtype)
        mask = NP(NP("ones", len(subTable), dtype=defs.maskType) * defs.MISSING)

        for decision in decisions.childrenOfTag("Decision"):
            value = dataColumn.fieldType.stringToValue(decision["value"])

            selection = NP(dataColumn.data == value)
            if valid is not None:
                # In-place AND: only rows that were valid in the expression result can match.
                NP("logical_and", selection, valid, selection)

            # Element-by-element assignment of the Decision object itself.
            for i in xrange(len(data)):
                if selection[i]:
                    data[i] = decision

            mask[selection] = defs.VALID

        # A mask with no nonzero entries is replaced by None.
        if not mask.any():
            mask = None

        dataColumn = DataColumn(fieldType, data, mask)

    elif feature in score:
        dataColumn = score[feature]

    else:
        # Name the enclosing model (the grandparent element) in the error message.
        model = self.getparent()
        if model is not None:
            model = model.getparent()

        if model is None:
            model = "(orphaned OutputField; no parent model)"
        else:
            model = model.t

        raise defs.PmmlValidationError("Models of type %s do not produce \"%s\" features (or at least, it is not yet implemented by Augustus)" % (model, feature))

    dataType = self.get("dataType", dataColumn.fieldType.dataType)
    optype = self.get("optype", dataColumn.fieldType.optype)
    if (dataType != dataColumn.fieldType.dataType or optype != dataColumn.fieldType.optype) and feature not in ("predictedDisplayValue", "decision"):
        dataColumn = FieldCastMethods.cast(FakeFieldType(dataType, optype), dataColumn)

    if feature is not None:
        subTable.fields[self.get("displayName", self["name"])] = dataColumn

    performanceTable.end("OutputField")
    return dataColumn
def replaceField(self, dataTable, functionTable, performanceTable):
    """Replace a field in the DataTable for outlier removal, missing value handling, and invalid value treatment.

    Nothing happens if the DataTable has no field with this element's
    name.  Otherwise the column is cast if this element declares a
    different optype, the C{lowValue}/C{highValue} bounds are applied
    according to C{outliers}, then C{invalidValueTreatment} and
    C{missingValueReplacement} are applied, and the resulting column
    replaces the original in C{dataTable.fields}.

    @type dataTable: DataTable
    @param dataTable: The pre-built DataTable.
    @type functionTable: FunctionTable
    @param functionTable: A table of functions.  Not used by this method.
    @type performanceTable: PerformanceTable
    @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
    """

    column = dataTable.fields.get(self.name)
    if column is None:
        return

    performanceTable.begin("MiningField")

    declaredOptype = self.get("optype", column.fieldType.optype)
    if declaredOptype != column.fieldType.optype:
        column = FieldCastMethods.cast(FakeFieldType(column.fieldType.dataType, declaredOptype), column)

    fieldType = column.fieldType
    data, mask = column.data, column.mask
    outliers = self.get("outliers")

    # The low and high bounds get the same treatment, differing only in
    # which side of the bound counts as an outlier.
    bounds = (
        ("lowValue", lambda values, limit: values < limit),
        ("highValue", lambda values, limit: values > limit),
    )
    for attribute, beyond in bounds:
        limit = self.get(attribute)
        if limit is None:
            continue
        limit = fieldType.stringToValue(limit)

        if outliers == "asMissingValues":
            selection = NP(beyond(column.data, limit))
            mask = FieldCastMethods.outliersAsMissing(mask, column.mask, selection)

        elif outliers == "asExtremeValues":
            selection = NP(beyond(column.data, limit))
            # Copy once before the first write so the source column's array is left untouched.
            if data is column.data:
                data = NP("copy", data)
                data.setflags(write=True)
            data[selection] = limit

    mask = FieldCastMethods.applyInvalidValueTreatment(mask, self.get("invalidValueTreatment"))
    data, mask = FieldCastMethods.applyMapMissingTo(fieldType, data, mask, self.get("missingValueReplacement"))

    dataTable.fields.replaceField(self.name, DataColumn(fieldType, data, mask))
    performanceTable.end("MiningField")
def format(self, subTable, functionTable, performanceTable, score):
    """Extract or post-process output for the output field of a DataTable.

    The C{feature} attribute selects where the output comes from:

      - absent: the existing field named C{name} in C{subTable};
      - "predictedValue": C{score[None]};
      - "predictedDisplayValue": C{score[None]} with every value
        converted to a string;
      - "transformedValue": the result of the child EXPRESSION;
      - "decision": the result of the child EXPRESSION, mapped row by
        row onto the Decision element whose C{value} it equals;
      - anything else: C{score[feature]}, if the model produced it.

    Except for "predictedDisplayValue" and "decision", the result is
    cast when this element declares a different dataType or optype.
    Unless C{feature} is absent, the result is also stored in
    C{subTable.fields} under C{displayName} (falling back to C{name}).

    @type subTable: DataTable
    @param subTable: The DataTable associated with this local lexical scope.
    @type functionTable: FunctionTable or None
    @param functionTable: A table of functions.
    @type performanceTable: PerformanceTable or None
    @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
    @type score: dict
    @param score: Dictionary mapping PMML score "feature" strings to DataColumns.  This dictionary always contains a None key, which is the basic feature ("predictedValue").
    @rtype: DataColumn
    @return: The output that would go into an output field of a DataTable.
    @raise PmmlValidationError: If a "transformedValue" or "decision" field lacks its EXPRESSION, a "decision" field lacks its Decisions block, or the feature is not available in C{score}.
    """

    performanceTable.begin("OutputField")

    feature = self.get("feature")
    if feature is None:
        dataColumn = subTable.fields[self["name"]]

    elif feature == "predictedValue":
        dataColumn = score[None]

    elif feature == "predictedDisplayValue":
        original = score[None]
        toString = original.fieldType.valueToString
        data = NP("empty", len(subTable), dtype=NP.dtype(object))
        for i, x in enumerate(original.data):
            data[i] = toString(x)
        dataColumn = DataColumn(FakeFieldType("string", "continuous"), data, None)

    elif feature == "transformedValue":
        expression = self.childOfClass(PmmlExpression)
        if expression is None:
            raise defs.PmmlValidationError(
                "OutputField with feature \"transformedValue\" requires an EXPRESSION"
            )

        # The expression has its own timers; do not count it as OutputField time.
        performanceTable.pause("OutputField")
        dataColumn = expression.evaluate(subTable, functionTable, performanceTable)
        performanceTable.unpause("OutputField")

    elif feature == "decision":
        decisions = self.childOfTag("Decisions")
        if decisions is None:
            raise defs.PmmlValidationError(
                "OutputField with feature \"decision\" requires a Decisions block"
            )

        # Report a missing expression as a validation error, as the
        # "transformedValue" branch does, rather than failing with an
        # AttributeError on None.
        expression = self.childOfClass(PmmlExpression)
        if expression is None:
            raise defs.PmmlValidationError(
                "OutputField with feature \"decision\" requires an EXPRESSION"
            )

        performanceTable.pause("OutputField")
        dataColumn = expression.evaluate(subTable, functionTable, performanceTable)
        performanceTable.unpause("OutputField")

        if dataColumn.mask is None:
            valid = None
        else:
            valid = NP(dataColumn.mask == defs.VALID)

        # Every row starts out MISSING and becomes VALID once a Decision matches it.
        fieldType = FakeFieldType("object", "any")
        data = NP("empty", len(subTable), dtype=fieldType.dtype)
        mask = NP(
            NP("ones", len(subTable), dtype=defs.maskType) * defs.MISSING)

        for decision in decisions.childrenOfTag("Decision"):
            value = dataColumn.fieldType.stringToValue(decision["value"])

            selection = NP(dataColumn.data == value)
            if valid is not None:
                # In-place AND: only rows that were valid in the expression result can match.
                NP("logical_and", selection, valid, selection)

            # Element-by-element assignment of the Decision object itself.
            for i in xrange(len(data)):
                if selection[i]:
                    data[i] = decision

            mask[selection] = defs.VALID

        # A mask with no nonzero entries is replaced by None.
        if not mask.any():
            mask = None

        dataColumn = DataColumn(fieldType, data, mask)

    elif feature in score:
        dataColumn = score[feature]

    else:
        # Name the enclosing model (the grandparent element) in the error message.
        model = self.getparent()
        if model is not None:
            model = model.getparent()

        if model is None:
            model = "(orphaned OutputField; no parent model)"
        else:
            model = model.t

        raise defs.PmmlValidationError(
            "Models of type %s do not produce \"%s\" features (or at least, it is not yet implemented by Augustus)"
            % (model, feature))

    dataType = self.get("dataType", dataColumn.fieldType.dataType)
    optype = self.get("optype", dataColumn.fieldType.optype)
    if (dataType != dataColumn.fieldType.dataType
            or optype != dataColumn.fieldType.optype) and feature not in (
                "predictedDisplayValue", "decision"):
        dataColumn = FieldCastMethods.cast(FakeFieldType(dataType, optype), dataColumn)

    if feature is not None:
        subTable.fields[self.get("displayName", self["name"])] = dataColumn

    performanceTable.end("OutputField")
    return dataColumn
def replaceField(self, dataTable, functionTable, performanceTable):
    """Replace a field in the DataTable for outlier removal, missing value handling, and invalid value treatment.

    Nothing happens if the DataTable has no field with this element's
    name.  Otherwise the column is cast if this element declares a
    different optype, the C{lowValue}/C{highValue} bounds are applied
    according to C{outliers}, then C{invalidValueTreatment} and
    C{missingValueReplacement} are applied, and the resulting column
    replaces the original in C{dataTable.fields}.

    @type dataTable: DataTable
    @param dataTable: The pre-built DataTable.
    @type functionTable: FunctionTable
    @param functionTable: A table of functions.  Not used by this method.
    @type performanceTable: PerformanceTable
    @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
    """

    column = dataTable.fields.get(self.name)
    if column is None:
        return

    performanceTable.begin("MiningField")

    declaredOptype = self.get("optype", column.fieldType.optype)
    if declaredOptype != column.fieldType.optype:
        column = FieldCastMethods.cast(
            FakeFieldType(column.fieldType.dataType, declaredOptype), column)

    fieldType = column.fieldType
    data, mask = column.data, column.mask
    outliers = self.get("outliers")

    # The low and high bounds get the same treatment, differing only in
    # which side of the bound counts as an outlier.
    bounds = (
        ("lowValue", lambda values, limit: values < limit),
        ("highValue", lambda values, limit: values > limit),
    )
    for attribute, beyond in bounds:
        limit = self.get(attribute)
        if limit is None:
            continue
        limit = fieldType.stringToValue(limit)

        if outliers == "asMissingValues":
            selection = NP(beyond(column.data, limit))
            mask = FieldCastMethods.outliersAsMissing(
                mask, column.mask, selection)

        elif outliers == "asExtremeValues":
            selection = NP(beyond(column.data, limit))
            # Copy once before the first write so the source column's array is left untouched.
            if data is column.data:
                data = NP("copy", data)
                data.setflags(write=True)
            data[selection] = limit

    mask = FieldCastMethods.applyInvalidValueTreatment(
        mask, self.get("invalidValueTreatment"))
    data, mask = FieldCastMethods.applyMapMissingTo(
        fieldType, data, mask, self.get("missingValueReplacement"))

    dataTable.fields.replaceField(
        self.name, DataColumn(fieldType, data, mask))
    performanceTable.end("MiningField")