Пример #1
0
    def evaluate(self, dataTable, functionTable, performanceTable, arguments):
        """Evaluate the function, using a DataTable as input.

        @type dataTable: DataTable
        @param dataTable: The input DataTable, containing any fields that might be used to evaluate this expression.
        @type functionTable: FunctionTable
        @param functionTable: The FunctionTable, containing any functions that might be called in this expression.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @rtype: DataColumn
        @return: The result of the calculation as a DataColumn.
        """

        arguments = [x.evaluate(dataTable, functionTable, performanceTable) for x in arguments]
        performanceTable.begin("user-defined \"%s\"" % self.name)

        parameters = self.childrenOfTag("ParameterField")

        if len(arguments) != len(parameters):
            raise defs.PmmlValidationError("Apply function=\"%s\" has %d arguments but the corresponding DefineFunction has %d parameters" % (self.name, len(arguments), len(parameters)))

        subTable = dataTable.subTable()

        for argument, parameter in zip(arguments, parameters):
            dataType = parameter.get("dataType", argument.fieldType.dataType)
            optype = parameter.get("optype", argument.fieldType.optype)
            if dataType != argument.fieldType.dataType or optype != argument.fieldType.optype:
                argument = FieldCastMethods.cast(FakeFieldType(dataType, optype), argument)

            subTable.fields[parameter["name"]] = argument

        performanceTable.pause("user-defined \"%s\"" % self.name)
        dataColumn = self.childOfClass(PmmlExpression).evaluate(subTable, functionTable, performanceTable)
        performanceTable.unpause("user-defined \"%s\"" % self.name)

        dataType = self.get("dataType", dataColumn.fieldType.dataType)
        optype = self.get("optype", dataColumn.fieldType.optype)
        if dataType != dataColumn.fieldType.dataType or optype != dataColumn.fieldType.optype:
            dataColumn = FieldCastMethods.cast(FakeFieldType(dataType, optype), dataColumn)

        performanceTable.end("user-defined \"%s\"" % self.name)
        return dataColumn
Пример #2
0
    def calculate(self, dataTable, functionTable=None, performanceTable=None):
        """Calculate a DerivedField.

        This method modifies the input DataTable.

        If the data types between the DerivedField and its EXPRESSION
        are not matched, the DerivedField will need to cast the output.
        This is a potentially expensive and often unwanted operation.
        When a DerivedField casts, it reports the cast in the
        PerformanceTable with DerivedField name, to help the user
        debug their PMML.

        @type dataTable: DataTable
        @param dataTable: The pre-built DataTable.
        @type functionTable: FunctionTable or None
        @param functionTable: A table of functions.  Initially, it contains only the built-in functions, but any user functions defined in PMML would be added to it.
        @type performanceTable: PerformanceTable or None
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @rtype: DataTable
        @return: A DataTable containing the result, usually a modified version of the input.
        """

        if functionTable is None:
            functionTable = FunctionTable()
        if performanceTable is None:
            performanceTable = FakePerformanceTable()

        dataColumn = self.childOfClass(PmmlExpression).evaluate(dataTable, functionTable, performanceTable)
        performanceTable.begin("DerivedField")

        dataType = dataColumn.fieldType.dataType
        optype = dataColumn.fieldType.optype
        if self.get("dataType", dataType) == dataType and self.get("optype", optype) == optype and len(self.childrenOfTag("Value")) == 0:
            dataTable.fields[self.name] = dataColumn

        else:
            performanceTable.begin("cast (\"%s\")" % self.name)
            dataTable.fields[self.name] = FieldCastMethods.cast(FieldType(self), dataColumn)
            performanceTable.end("cast (\"%s\")" % self.name)

        performanceTable.end("DerivedField")

        return dataTable.fields[self.name]
Пример #3
0
    def evaluate(self, dataTable, functionTable, performanceTable):
        """Evaluate the expression, using a DataTable as input.

        @type dataTable: DataTable
        @param dataTable: The input DataTable, containing any fields that might be used to evaluate this expression.
        @type functionTable: FunctionTable
        @param functionTable: The FunctionTable, containing any functions that might be called in this expression.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @rtype: DataColumn
        @return: The result of the calculation as a DataColumn.
        """

        dataColumn = self.childOfClass(PmmlExpression).evaluate(dataTable, functionTable, performanceTable)
        performanceTable.begin("CastExpression")

        dataColumn = FieldCastMethods.cast(FieldType(self), dataColumn)
        mask = FieldCastMethods.applyInvalidValueTreatment(dataColumn.mask, self.get("invalidValueTreatment"))
        data, mask = FieldCastMethods.applyMapMissingTo(dataColumn.fieldType, dataColumn.data, mask, self.get("mapMissingTo"))

        performanceTable.end("CastExpression")
        return DataColumn(dataColumn.fieldType, data, mask)
Пример #4
0
    def format(self, subTable, functionTable, performanceTable, score):
        """Extract or post-process output for the output field of a DataTable.

        @type subTable: DataTable
        @param subTable: The DataTable associated with this local lexical scope.
        @type functionTable: FunctionTable or None
        @param functionTable: A table of functions.
        @type performanceTable: PerformanceTable or None
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @type score: dict
        @param score: Dictionary mapping PMML score "feature" strings to DataColumns.  This dictionary always contains a None key, which is the basic feature ("predictedValue").
        @rtype: DataColumn
        @return: The output that would go into an output field of a DataTable.
        """

        performanceTable.begin("OutputField")

        feature = self.get("feature")
        if feature is None:
            dataColumn = subTable.fields[self["name"]]

        elif feature == "predictedValue":
            dataColumn = score[None]

        elif feature == "predictedDisplayValue":
            original = score[None]
            toString = original.fieldType.valueToString
            data = NP("empty", len(subTable), dtype=NP.dtype(object))
            for i, x in enumerate(original.data):
                data[i] = toString(x)
            dataColumn = DataColumn(FakeFieldType("string", "continuous"), data, None)

        elif feature == "transformedValue":
            expression = self.childOfClass(PmmlExpression)
            if expression is None:
                raise defs.PmmlValidationError("OutputField with feature \"transformedValue\" requires an EXPRESSION")
            
            performanceTable.pause("OutputField")
            dataColumn = expression.evaluate(subTable, functionTable, performanceTable)
            performanceTable.unpause("OutputField")

        elif feature == "decision":
            decisions = self.childOfTag("Decisions")
            if decisions is None:
                raise defs.PmmlValidationError("OutputField with feature \"decision\" requires a Decisions block")

            performanceTable.pause("OutputField")
            dataColumn = self.childOfClass(PmmlExpression).evaluate(subTable, functionTable, performanceTable)
            performanceTable.unpause("OutputField")

            if dataColumn.mask is None:
                valid = None
            else:
                valid = NP(dataColumn.mask == defs.VALID)

            fieldType = FakeFieldType("object", "any")
            data = NP("empty", len(subTable), dtype=fieldType.dtype)
            mask = NP(NP("ones", len(subTable), dtype=defs.maskType) * defs.MISSING)

            for decision in decisions.childrenOfTag("Decision"):
                value = dataColumn.fieldType.stringToValue(decision["value"])

                selection = NP(dataColumn.data == value)
                if valid is not None:
                    NP("logical_and", selection, valid, selection)

                for i in xrange(len(data)):
                    if selection[i]:
                        data[i] = decision

                mask[selection] = defs.VALID
            
            if not mask.any():
                mask = None

            dataColumn = DataColumn(fieldType, data, mask)

        elif feature in score:
            dataColumn = score[feature]

        else:
            model = self.getparent()
            if model is not None: model = model.getparent()

            if model is None:
                model = "(orphaned OutputField; no parent model)"
            else:
                model = model.t

            raise defs.PmmlValidationError("Models of type %s do not produce \"%s\" features (or at least, it is not yet implemented by Augustus)" % (model, feature))

        dataType = self.get("dataType", dataColumn.fieldType.dataType)
        optype = self.get("optype", dataColumn.fieldType.optype)
        if (dataType != dataColumn.fieldType.dataType or optype != dataColumn.fieldType.optype) and feature not in ("predictedDisplayValue", "decision"):
            dataColumn = FieldCastMethods.cast(FakeFieldType(dataType, optype), dataColumn)

        if feature is not None:
            subTable.fields[self.get("displayName", self["name"])] = dataColumn

        performanceTable.end("OutputField")
        return dataColumn
Пример #5
0
    def replaceField(self, dataTable, functionTable, performanceTable):
        """Replace a field in the DataTable for outlier removal,
        missing value handling, and invalid value treatment.

        @type dataTable: DataTable
        @param dataTable: The pre-built DataTable.
        @type functionTable: FunctionTable
        @param functionTable: A table of functions.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        """

        dataColumn = dataTable.fields.get(self.name)
        if dataColumn is None:
            return

        performanceTable.begin("MiningField")

        optype = self.get("optype", dataColumn.fieldType.optype)
        if optype != dataColumn.fieldType.optype:
            dataColumn = FieldCastMethods.cast(FakeFieldType(dataColumn.fieldType.dataType, optype), dataColumn)

        data = dataColumn.data
        mask = dataColumn.mask

        outliers = self.get("outliers")
        
        lowValue = self.get("lowValue")
        if lowValue is not None:
            lowValue = dataColumn.fieldType.stringToValue(lowValue)

            if outliers == "asMissingValues":
                selection = NP(dataColumn.data < lowValue)
                mask = FieldCastMethods.outliersAsMissing(mask, dataColumn.mask, selection)

            elif outliers == "asExtremeValues":
                selection = NP(dataColumn.data < lowValue)
                if data is dataColumn.data:
                    data = NP("copy", data)
                    data.setflags(write=True)
                    data[selection] = lowValue

        highValue = self.get("highValue")
        if highValue is not None:
            highValue = dataColumn.fieldType.stringToValue(highValue)

            if outliers == "asMissingValues":
                selection = NP(dataColumn.data > highValue)
                mask = FieldCastMethods.outliersAsMissing(mask, dataColumn.mask, selection)

            elif outliers == "asExtremeValues":
                selection = NP(dataColumn.data > highValue)
                if data is dataColumn.data:
                    data = NP("copy", data)
                    data.setflags(write=True)
                    data[selection] = highValue

        mask = FieldCastMethods.applyInvalidValueTreatment(mask, self.get("invalidValueTreatment"))
        data, mask = FieldCastMethods.applyMapMissingTo(dataColumn.fieldType, data, mask, self.get("missingValueReplacement"))

        dataTable.fields.replaceField(self.name, DataColumn(dataColumn.fieldType, data, mask))
        performanceTable.end("MiningField")
Пример #6
0
    def format(self, subTable, functionTable, performanceTable, score):
        """Extract or post-process output for the output field of a DataTable.

        @type subTable: DataTable
        @param subTable: The DataTable associated with this local lexical scope.
        @type functionTable: FunctionTable or None
        @param functionTable: A table of functions.
        @type performanceTable: PerformanceTable or None
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        @type score: dict
        @param score: Dictionary mapping PMML score "feature" strings to DataColumns.  This dictionary always contains a None key, which is the basic feature ("predictedValue").
        @rtype: DataColumn
        @return: The output that would go into an output field of a DataTable.
        """

        performanceTable.begin("OutputField")

        feature = self.get("feature")
        if feature is None:
            dataColumn = subTable.fields[self["name"]]

        elif feature == "predictedValue":
            dataColumn = score[None]

        elif feature == "predictedDisplayValue":
            original = score[None]
            toString = original.fieldType.valueToString
            data = NP("empty", len(subTable), dtype=NP.dtype(object))
            for i, x in enumerate(original.data):
                data[i] = toString(x)
            dataColumn = DataColumn(FakeFieldType("string", "continuous"),
                                    data, None)

        elif feature == "transformedValue":
            expression = self.childOfClass(PmmlExpression)
            if expression is None:
                raise defs.PmmlValidationError(
                    "OutputField with feature \"transformedValue\" requires an EXPRESSION"
                )

            performanceTable.pause("OutputField")
            dataColumn = expression.evaluate(subTable, functionTable,
                                             performanceTable)
            performanceTable.unpause("OutputField")

        elif feature == "decision":
            decisions = self.childOfTag("Decisions")
            if decisions is None:
                raise defs.PmmlValidationError(
                    "OutputField with feature \"decision\" requires a Decisions block"
                )

            performanceTable.pause("OutputField")
            dataColumn = self.childOfClass(PmmlExpression).evaluate(
                subTable, functionTable, performanceTable)
            performanceTable.unpause("OutputField")

            if dataColumn.mask is None:
                valid = None
            else:
                valid = NP(dataColumn.mask == defs.VALID)

            fieldType = FakeFieldType("object", "any")
            data = NP("empty", len(subTable), dtype=fieldType.dtype)
            mask = NP(
                NP("ones", len(subTable), dtype=defs.maskType) * defs.MISSING)

            for decision in decisions.childrenOfTag("Decision"):
                value = dataColumn.fieldType.stringToValue(decision["value"])

                selection = NP(dataColumn.data == value)
                if valid is not None:
                    NP("logical_and", selection, valid, selection)

                for i in xrange(len(data)):
                    if selection[i]:
                        data[i] = decision

                mask[selection] = defs.VALID

            if not mask.any():
                mask = None

            dataColumn = DataColumn(fieldType, data, mask)

        elif feature in score:
            dataColumn = score[feature]

        else:
            model = self.getparent()
            if model is not None: model = model.getparent()

            if model is None:
                model = "(orphaned OutputField; no parent model)"
            else:
                model = model.t

            raise defs.PmmlValidationError(
                "Models of type %s do not produce \"%s\" features (or at least, it is not yet implemented by Augustus)"
                % (model, feature))

        dataType = self.get("dataType", dataColumn.fieldType.dataType)
        optype = self.get("optype", dataColumn.fieldType.optype)
        if (dataType != dataColumn.fieldType.dataType
                or optype != dataColumn.fieldType.optype) and feature not in (
                    "predictedDisplayValue", "decision"):
            dataColumn = FieldCastMethods.cast(FakeFieldType(dataType, optype),
                                               dataColumn)

        if feature is not None:
            subTable.fields[self.get("displayName", self["name"])] = dataColumn

        performanceTable.end("OutputField")
        return dataColumn
Пример #7
0
    def replaceField(self, dataTable, functionTable, performanceTable):
        """Replace a field in the DataTable for outlier removal,
        missing value handling, and invalid value treatment.

        @type dataTable: DataTable
        @param dataTable: The pre-built DataTable.
        @type functionTable: FunctionTable
        @param functionTable: A table of functions.
        @type performanceTable: PerformanceTable
        @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation.
        """

        dataColumn = dataTable.fields.get(self.name)
        if dataColumn is None:
            return

        performanceTable.begin("MiningField")

        optype = self.get("optype", dataColumn.fieldType.optype)
        if optype != dataColumn.fieldType.optype:
            dataColumn = FieldCastMethods.cast(
                FakeFieldType(dataColumn.fieldType.dataType, optype),
                dataColumn)

        data = dataColumn.data
        mask = dataColumn.mask

        outliers = self.get("outliers")

        lowValue = self.get("lowValue")
        if lowValue is not None:
            lowValue = dataColumn.fieldType.stringToValue(lowValue)

            if outliers == "asMissingValues":
                selection = NP(dataColumn.data < lowValue)
                mask = FieldCastMethods.outliersAsMissing(
                    mask, dataColumn.mask, selection)

            elif outliers == "asExtremeValues":
                selection = NP(dataColumn.data < lowValue)
                if data is dataColumn.data:
                    data = NP("copy", data)
                    data.setflags(write=True)
                    data[selection] = lowValue

        highValue = self.get("highValue")
        if highValue is not None:
            highValue = dataColumn.fieldType.stringToValue(highValue)

            if outliers == "asMissingValues":
                selection = NP(dataColumn.data > highValue)
                mask = FieldCastMethods.outliersAsMissing(
                    mask, dataColumn.mask, selection)

            elif outliers == "asExtremeValues":
                selection = NP(dataColumn.data > highValue)
                if data is dataColumn.data:
                    data = NP("copy", data)
                    data.setflags(write=True)
                    data[selection] = highValue

        mask = FieldCastMethods.applyInvalidValueTreatment(
            mask, self.get("invalidValueTreatment"))
        data, mask = FieldCastMethods.applyMapMissingTo(
            dataColumn.fieldType, data, mask,
            self.get("missingValueReplacement"))

        dataTable.fields.replaceField(
            self.name, DataColumn(dataColumn.fieldType, data, mask))
        performanceTable.end("MiningField")